Python crawler: scrape all the movies on the ygdy8.com site and write them to a file and a database

Copyright notice: this is an original post by the author; please credit the source when reposting. Comments, pointers, and technical exchange are welcome; personal email kangvcar@126.com. https://blog.csdn.net/kangvcar/article/details/78256777

Project code: === forks and stars welcome ===

https://github.com/kangvcar/pyproject/blob/master/Spiders/Spider_ygdy8.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2017-10-13 18:47:39
# @Author  : kangvcar (kangvcar@126.com)
# @Link    : http://www.github.com/kangvcar/
# @Version : $Id$

import re
from bs4 import BeautifulSoup
import urllib2

class movieItem(object):
	''' Container for the fields scraped for each movie '''
	MovieName = None   # movie title
	UpdateTime = None  # date the entry was last updated
	Summary = None     # short synopsis
	
class getMovieInfo(object):
	''' Scrape the movie listings on the www.ygdy8.net site '''
	def __init__(self, url):
		self.url = url
		self.sumPages = self.getSumPage(self.url)  # total number of listing pages
		self.urls = self.getUrls(self.sumPages)    # URL of every listing page
		self.items = self.spider(self.urls)        # one movieItem per movie
		self.pipelines(self.items)                 # write the items out
		
	def getSumPage(self, url):
		''' Get the total number of listing pages '''
		response = urllib2.urlopen(url)
		html = response.read()
		# The site is GBK-encoded, so decode before matching the Chinese pager text.
		s = u'共(.*?)页'
		dSumpage = re.compile(s)
		fSumpage = re.search(dSumpage, html.decode('gbk')).group(1)
		# print fSumpage
		return fSumpage
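		# Illustration (the count here is hypothetical): the pager on a listing
		# page contains text like "共25页", so group(1) above captures "25".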
		
	def getUrls(self, sumPages):
		''' Build the URL of every listing page '''
		urls = []
		ul = self.url.split('_')
		for page in range(1, int(sumPages)+1):
			# Swap the trailing "<n>.html" chunk for the current page number.
			ul[-1] = str(page) + '.html'
			url = '_'.join(ul)
			urls.append(url)
		return urls
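	# For the seed URL http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html this
	# yields list_23_1.html, list_23_2.html, ... through list_23_<sumPages>.html.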
	
	def spider(self, urls):
		''' Scrape the fields defined on movieItem from every listing page '''
		items = []
		for url in urls:
			html = self.getContentPage(url)
			if html is None:
				continue
			soup = BeautifulSoup(html, 'lxml')
			# Each movie on a listing page sits in its own <table class="tbspan">.
			tables = soup.find_all('table', attrs={'class':'tbspan'})
			for table in tables:
				item = movieItem()
				try:
					# item.MovieName = unicode(''.join(table.b.get_text().split()))
					item.MovieName = unicode(table.b.get_text().replace('\n', ''))
					# The date label on the site uses a full-width colon (：) as the separator.
					item.UpdateTime = unicode(table.find_all('tr')[2].get_text().split()[0].split(u'：')[1])
					item.Summary = unicode(table.find_all('tr')[3].get_text().strip())
				except Exception:
					# Skip tables that don't match the expected layout.
					continue
				else:
					items.append(item)
		return items
	
	def pipelines(self, items):
		''' Write the scraped items to a text file '''
		filename = u'国内电影.txt'  # "domestic movies"
		with open(filename, 'w') as fp:
			for item in items:
				# Labels: 片名 = title, 更新时间 = update time, 影片摘要 = synopsis.
				fp.write('片名:%s \n更新时间:%s \n影片摘要:%s \n\n' % (item.MovieName.encode('utf-8'), item.UpdateTime.encode('utf-8'), item.Summary.encode('utf-8')))
	
	def getContentPage(self, url):
		''' Fetch the raw HTML of a page, or None if the request fails '''
		try:
			response = urllib2.urlopen(url, timeout=4)
		except Exception:
			return None
		else:
			html = response.read()
			return html

ZXDY = getMovieInfo('http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html')
# Other sections can be scraped by swapping in their seed URL, e.g.:
# ZXDY = getMovieInfo('http://www.ygdy8.net/html/gndy/oumei/list_7_2.html')
# http://www.ygdy8.net/html/gndy/dyzz/list_23_1.html
# http://www.ygdy8.net/html/gndy/china/list_4_2.html
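The title also promises writing to a database, which the script above does not do yet. A minimal sketch of such a pipeline, assuming SQLite via the standard-library sqlite3 module (the database file name, table name, and columns below are my own choices, not from the project):

import sqlite3

def pipelines_db(items, dbname='movies.db'):
	''' Sketch: persist the scraped items in a SQLite database '''
	conn = sqlite3.connect(dbname)
	cur = conn.cursor()
	# Schema is assumed here; one row per movie, mirroring movieItem's fields.
	cur.execute('CREATE TABLE IF NOT EXISTS movies '
	            '(name TEXT, update_time TEXT, summary TEXT)')
	cur.executemany('INSERT INTO movies VALUES (?, ?, ?)',
	                [(i.MovieName, i.UpdateTime, i.Summary) for i in items])
	conn.commit()
	conn.close()

Wiring it in would be one line: call pipelines_db(self.items) at the end of __init__, right after self.pipelines(self.items).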

