爬取豆瓣top250

import requests
from lxml import etree
import pandas as pd

# Walk the Douban Top 250 listing page by page and collect, for every entry,
# the text matched by the XPath expressions below into four parallel lists.

PAGE_SIZE = 25        # the `start` offset advances by this much per request
TOTAL_ENTRIES = 250   # stop once the offset reaches this value
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled connection hangs forever


def _page_url(start):
    """Return the listing URL whose `start` query parameter is *start*."""
    return "https://movie.douban.com/top250?start="+str(start)+"&filter="


def _clean(text):
    """Return *text* without non-breaking spaces or surrounding whitespace."""
    return text.replace("\xa0", "").strip()


num = 0
url = _page_url(num)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/"
                         "537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
all_name = []
all_star = []
all_actor = []
all_types = []

while num < TOTAL_ENTRIES:
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error response instead of parsing an error page,
    # which would silently leave the result lists empty or misaligned.
    resp.raise_for_status()

    e = etree.HTML(resp.text)
    name = e.xpath('//div[@class="hd"]/a/span[1]/text()')
    star = e.xpath('//div[@class="star"]/span[2]/text()')
    actor_type = e.xpath('//div[@class="bd"]/p[1]/text()')

    # The paragraph's text nodes are consumed in pairs: even positions go to
    # the "actor" list, odd positions to the "types" list.
    # NOTE(review): assumes every entry yields exactly two text nodes — confirm
    # against the live markup, otherwise the lists drift out of alignment.
    actor = [_clean(each) for each in actor_type[::2]]
    types = [_clean(each) for each in actor_type[1::2]]

    all_name.extend(name)
    all_star.extend(star)
    all_actor.extend(actor)
    all_types.extend(types)

    num += PAGE_SIZE
    url = _page_url(num)

posted @ 2023-04-06 13:34 jzm1/ 阅读(20) 评论(0) 收藏举报

刷新页面返回顶部

爬取豆瓣top250

公告