Python crawler for the Douban Top 250

Page API: https://movie.douban.com/top250?start=0&filter=
Modules used: urllib, re, csv
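For reference, the list is served 25 films per page and the start query parameter steps through it (start = 0, 25, ..., 225). A minimal paging sketch, assuming the same browser-style User-Agent header that the full script below sends:

from urllib import request

# Douban serves the Top 250 as 10 pages of 25 films; `start` is the page offset.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}

for start in range(0, 250, 25):
    url = 'https://movie.douban.com/top250?start=%d&filter=' % start
    req = request.Request(url, headers=HEADERS)
    html = request.urlopen(req).read().decode('utf-8')
    print(start, len(html))  # rough check that each page came back non-empty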

After tinkering with it for a whole morning it finally works, though a few small issues remain.


(Entry 218 triggers a bug.) Specifically: that entry's listing has no 主演 (starring) field, so the regular expression captures far more text than it should; entries that do list a 主演 are extracted normally.

So the crawler only takes the top 200 entries. The full Python implementation is below; suggestions from more experienced readers are welcome.
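As an aside, one way to avoid capping the crawl at 200 would be to split each page into per-film chunks and extract the fields one by one, falling back to a default when 主演 is missing. This is only a rough sketch built on the same markup assumptions as the long regex below; the helper names, field names and the '未知' default are mine, not part of the original script.

import re

def _field(chunk, pattern, default=''):
    # Search within a single film's chunk; return the default if the field is absent.
    m = re.search(pattern, chunk, re.S)
    return m.group(1).strip() if m else default

def parse_items(page):
    # Parse one result page film by film, so a missing 主演 field cannot
    # make a match spill over into the next film's block.
    results = []
    for chunk in re.split(r'<div\s+class="item">', page)[1:]:
        results.append({
            'rank': _field(chunk, r'<em class="">(.*?)</em>'),
            'title': _field(chunk, r'<span class="title">(.*?)</span>'),
            'director': _field(chunk, r'导演:\s*(.*?)(?:&nbsp;|<br)'),
            'actors': _field(chunk, r'主演:\s*(.*?)<br>', default='未知'),
            'rating': _field(chunk, r'property="v:average">(.*?)</span>'),
        })
    return results

The script as actually written keeps the single long pattern and simply stops after 200 entries: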

#! /usr/bin/python3
# -*- coding:UTF-8 -*-
from urllib import request
import re
import csv


class MovieTopForDouBan(object):
    def __init__(self):
        self.start = 0
        self.param = '&filter='
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'}
        self.file_path = 'D:\\'
        self.head = ['排名', '名称', '别名', '其他名称', '导演', '主演', '年份', '地区', '类型', '平均分', '人数', '短评']
        self.movie_list = []

    def get_page(self):
        # Fetch one page of 25 films and advance self.start to the next page.
        try:
            url = 'https://movie.douban.com/top250?start=' + str(self.start) + self.param
            req = request.Request(url, headers=self.headers)
            response = request.urlopen(req)
            page = response.read().decode('utf-8')
            page_num = (self.start + 25) // 25
            print('正在抓取第' + str(page_num) + '页数据...')
            self.start += 25
            return page
        except request.URLError as e:
            if hasattr(e, 'reason'):
                print('抓取失败,失败原因:', e.reason)

    def get_movie_info(self):
        # One long pattern that pulls all twelve fields out of each film's <div class="item"> block.
        pattern = re.compile(u'<div.*?class="item">.*?<em class="">(.*?)</em>'
                             u'.*?<span.*?class="title">(.*?)</span>'
                             u'.*?<span.*?class="title">(.*?)</span>'
                             u'.*?<span.*?class="other">(.*?)</span>'
                             u'.*?<div.*?class="bd">.*?<p.*?class="">'
                             u'.*?导演:(.*?)&nbsp;.*?主演: (.*?)<br>'
                             u'(.*?)&nbsp;/&nbsp;(.*?)&nbsp;/&nbsp;(.*?)</p>.*?<div.*?class="star">'
                             u'.*?<span.*?class="rating_num".*?property="v:average">(.*?)</span>'
                             u'.*?<span>(.*?)人评价</span>.*?</div>'
                             u'.*?<span.*?class="inq">(.*?)</span>.*?</p>', re.S)
        # Only the first 200 entries (8 pages): entry 218, 初恋这件小事, has no 主演 field and breaks the pattern.
        while self.start <= 176:
            page = self.get_page()
            movies = re.findall(pattern, page)
            for movie in movies:
                data = list(movie)
                # Strip the leading &nbsp;/&nbsp; separators and surrounding whitespace.
                data[2] = data[2].lstrip('&nbsp;/&nbsp;')
                data[3] = data[3].lstrip('&nbsp;/&nbsp;')
                data[6] = data[6].lstrip()
                data[8] = data[8].rstrip()
                self.movie_list.append(data)

    def write_text(self):
        print('开始向文件写入数据....')
        with open(self.file_path + 'movie_info.txt', 'w', encoding='utf-8') as file_TopText:
            try:
                for movie in self.movie_list:
                    file_TopText.write('电影排名:' + movie[0] + '\r\n')
                    file_TopText.write('电影名称:' + movie[1] + '\r\n')
                    file_TopText.write('外文名称:' + movie[2] + '\r\n')
                    file_TopText.write('电影别名:' + movie[3] + '\r\n')
                    file_TopText.write('导演姓名:' + movie[4] + '\r\n')
                    file_TopText.write('主演姓名:' + movie[5] + '\r\n')
                    file_TopText.write('上映年份:' + movie[6] + '\r\n')
                    file_TopText.write('制作国家/地区:' + movie[7] + '\r\n')
                    file_TopText.write('电影类别:' + movie[8] + '\r\n')
                    file_TopText.write('电影评分:' + movie[9] + '\r\n')
                    file_TopText.write('参评人数:' + movie[10] + '\r\n')
                    file_TopText.write('简短影评:' + movie[11] + '\r\n\r\n')
                print('抓取结果写入文件成功...')
            except Exception as e:
                print(e)
        print('数据写入完毕....')

    def write_csv_file(self):
        path = self.file_path + 'movie_info.csv'
        common = 0
        try:
            with open(path, 'w', newline='', encoding='utf-8') as csv_file:
                writer = csv.writer(csv_file, dialect='excel')
                if self.head is not None:
                    writer.writerow(self.head)
                for row in self.movie_list:
                    writer.writerow(row)
                    common += 1
            print("将CSV文件写入路径%s成功。" % path)
        except Exception as e:
            print("将CSV文件写入路径: %s, 信息: %s" % (path, e))
        print(common)

    def main(self):
        print('开始从豆瓣电影抓取数据........')
        self.get_movie_info()
        self.write_text()
        # self.write_csv_file()
        print('数据抓取完毕...')


if __name__ == '__main__':
    movie = MovieTopForDouBan()
    movie.main()
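A side note on the commented-out write_csv_file(): a plain UTF-8 CSV without a BOM often shows the Chinese headers as mojibake when opened directly in Excel. If the CSV output is enabled, one common workaround (my suggestion, not something the original script does) is to write with the 'utf-8-sig' encoding, e.g. as a standalone helper:

import csv

def write_csv(path, head, rows):
    # 'utf-8-sig' prepends a BOM so Excel detects the file as UTF-8.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f, dialect='excel')
        writer.writerow(head)
        writer.writerows(rows)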

A movie_info.txt file is generated in the root of drive D:.
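A quick way to sanity-check the run (a throwaway snippet of mine, assuming the D:\ path configured above) is to count the rank lines that were written:

with open('D:\\movie_info.txt', encoding='utf-8') as f:
    print(sum(1 for line in f if line.startswith('电影排名')))  # one line per film written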
