from urllib.request import urlopen,Request
import re
#=============================返回页面内容
#============================取页
def getPage(num):
    """Fetch one page of the Douban movie Top 250 list and return its text.

    Args:
        num: Page index. The ``start`` query parameter sent to the server
            is ``num * 25`` (presumably 25 entries per page -- confirm
            against the site).

    Returns:
        The response body decoded as UTF-8.

    Raises:
        Whatever ``urlopen`` or ``bytes.decode`` raise (for example
        ``urllib.error.URLError`` or ``UnicodeDecodeError``); nothing is
        caught here.
    """
    # The offset must be parenthesized: ``%`` and ``*`` have the same
    # precedence and group left to right, so ``'...%s' % num * 25`` formats
    # the URL with ``num`` and then repeats the whole string 25 times.
    url = 'https://movie.douban.com/top250?start=%s&filter=' % (num * 25)
    # A browser-like User-Agent is sent with the request.
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    }
    ret = Request(url, headers=header)
    # Use the response as a context manager so the connection is closed
    # even if reading or decoding fails.
    with urlopen(ret) as res:
        return res.read().decode('utf-8')
def parsePage(res):
    """Lazily yield one dict per match of the module-level ``byte`` pattern.

    Args:
        res: Text to scan (passed to ``byte.finditer``).

    Yields:
        A dict with the keys ``'id'``, ``'name'``, ``'rating_num'`` and
        ``'pingfen人数'``, filled from the named groups ``id``, ``name``,
        ``rating_num`` and ``num`` of each match.
    """
    # (output key, regex group name) pairs, in the order the keys are emitted.
    key_to_group = (
        ('id', 'id'),
        ('name', 'name'),
        ('rating_num', 'rating_num'),
        ('pingfen人数', 'num'),
    )
    for match in byte.finditer(res):
        yield {key: match.group(group) for key, group in key_to_group}
def saveData(ret):
    """Append ``ret`` followed by a newline to ``douban.txt`` (UTF-8).

    Args:
        ret: String to write; it is concatenated with ``'\\n'``.
    """
    # Append mode: earlier contents of the file are kept.
    with open('douban.txt', 'a', encoding='utf-8') as out_file:
        record_line = ret + '\n'
        out_file.write(record_line)
# Pattern for one ``<div class="item">`` entry. Named groups:
#   id         -- digits following ``<em class="">``
#   name       -- word characters following the first ``<span class="title">``
#   rating_num -- ``d.d`` value in the ``rating_num`` span
#   num        -- digits followed by the literal text ``人评价``
# Raw strings are used so ``\d``, ``\w`` and ``\.`` reach the regex engine
# without relying on invalid string escapes (a SyntaxWarning on newer
# Pythons). ``re.S`` lets ``.`` match newlines, so the lazy ``.*?`` gaps can
# span multiple lines of markup.
byte = re.compile(
    r'<div class="item">.*?<em class="">(?P<id>\d+)'
    r'.*?<span class="title">(?P<name>\w+)'
    r'.*?<span class="rating_num" property="v:average">(?P<rating_num>\d\.\d)'
    r'.*?<span>(?P<num>\d+人评价)</span>',
    re.S,
)
# For page indices 0 through 9: download the page, extract every record and
# append each record's ``str()`` form to the output file via ``saveData``.
for page_index in range(10):
    page_text = getPage(page_index)
    for record in parsePage(page_text):
        saveData(str(record))