import requests
import time
from lxml import etree
import json
# Fetch a web page
def getpage(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a ``requests`` failure).
    """
    # The request is sent with a mobile Chrome User-Agent header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Mobile Safari/537.36'
    }
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures (connection error, timeout, bad URL)
        # are turned into "no page"; KeyboardInterrupt and programming
        # errors are no longer swallowed.
        return None
    if res.status_code != 200:
        return None
    return res.text
# Parse a web page
def parsepage(html):
    """Yield one record per ``<div class="item">`` found in *html*.

    Args:
        html: HTML source of a listing page. Empty or ``None`` input
            yields nothing.

    Yields:
        A dict with the keys ``title``, ``index``, ``score``, ``actor`` and
        ``image``. Each value is the list of matches returned by the
        corresponding XPath query, so it may be empty.
    """
    if not html:
        # Nothing to parse.
        return
    tree = etree.HTML(html)
    if tree is None:
        return
    for item in tree.xpath('//div[@class="item"]'):
        yield {
            'title': item.xpath('.//span[@class="title"]/text()'),
            # `item` is already the div.item node and the descendant axis
            # does not include the context node, so searching for another
            # div.item below it never matched. Query <em> directly.
            'index': item.xpath('.//em/text()'),
            'score': item.xpath('.//span[@class="rating_num"]/text()'),
            'actor': item.xpath('.//p[@class=""]/text()'),
            'image': item.xpath('.//img[@width="100"]/@src'),
        }
# Write to file
def writefile(item, path='豆瓣.json'):
    """Append *item* as one JSON line to *path*.

    Args:
        item: JSON-serialisable dict; must contain a ``'title'`` key, which
            is echoed in the progress message.
        path: Output file, opened in append mode with UTF-8 encoding.
            Defaults to the file name the script has always used.

    Raises:
        KeyError: If *item* has no ``'title'`` key.
        TypeError: If *item* cannot be serialised to JSON.
    """
    print('正在写入数据{}...'.format(item['title']))
    # Serialise before opening so a bad record never creates or touches the
    # output file. ensure_ascii=False keeps non-ASCII text readable.
    line = json.dumps(item, ensure_ascii=False)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line + '\n')
# Main function
def main(offset):
    """Fetch the listing page starting at *offset* and store every entry.

    The page is downloaded with ``getpage``; if that returns nothing the
    function stops after printing the progress message. Otherwise each
    record produced by ``parsepage`` is handed to ``writefile``.
    """
    page_url = f'https://movie.douban.com/top250?start={offset}'
    page_source = getpage(page_url)
    print('正在解析程序.....')
    if not page_source:
        return
    for record in parsepage(page_source):
        writefile(record)
if __name__ == "__main__":
    # Walk the start offsets 0, 25, ..., 225 (ten pages), pausing two
    # seconds after each page.
    for page_number in range(10):
        main(offset=page_number * 25)
        time.sleep(2)