进击的爬虫-001-猫眼电影爬取
猫眼电影 Top 100 爬取
import requests
import re
def get_html(url, data, timeout=10):
    """Fetch *url* with a GET request and return the response body as text.

    Args:
        url: Address to request.
        data: Mapping sent as the URL query string (passed to ``params``).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not inspected, so an error page is returned like any other body.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            the timeout expires.
    """
    response = requests.get(url, params=data, timeout=timeout)
    return response.text
# Exploratory single-movie parsing (commented out):
# movie_re = re.compile('<dd>\s*.*\s*.*?title="(.*?)"')
# movies = re.finditer(movie_re, ret) # find the ten movies on the current page; yields an iterator
# movie_obj = next(movies).group() # take the first movie via next()
# print(movie_obj)
#
# movie_name = re.findall('s="name".*?title="(.*?)"', movie_obj) # title of the first movie
# print(movie_name[0])
#
# movie_star = re.findall('s="star">\s*(.*?)\s*<', movie_obj) # starring cast of the movie
# print(movie_star[0])
#
# movie_releasetime = re.search('s="releasetime">上映时间:(?P<time>.*?)<', movie_obj) # release time of the movie
# print(movie_releasetime.group('time'))
# Patterns are compiled once at import time. Raw strings keep "\d" and "\s"
# from being read as (invalid) string escape sequences.
_MOVIE_BLOCK_RE = re.compile(r'<dd>[\d\D]*?</dd>')
_MOVIE_NAME_RE = re.compile(r's="name".*?title="(.*?)"')
_MOVIE_STAR_RE = re.compile(r's="star">\s*(.*?)\s*<')
# Accept either an ASCII or a full-width colon after the label.
_MOVIE_RELEASETIME_RE = re.compile(
    r's="releasetime">上映时间[:：](?P<time>.*?)<'
)


def get_info(html_res):
    """Print one summary line per ``<dd>...</dd>`` movie entry in *html_res*.

    For every ``<dd>`` element the title (``title="..."`` following
    ``s="name"``), the text after ``s="star">`` and the release time after
    ``s="releasetime">上映时间:`` are extracted with regular expressions.

    Entries without a title are skipped. A missing cast or release time is
    rendered as an empty string instead of raising ``IndexError`` /
    ``AttributeError``.

    Args:
        html_res: HTML text of a board page.

    Returns:
        The list of summary lines that were printed, in document order.
    """
    movie_infos = []
    for block in _MOVIE_BLOCK_RE.finditer(html_res):
        fragment = block.group()

        name_match = _MOVIE_NAME_RE.search(fragment)
        if name_match is None:
            # A <dd> without a title is not a movie entry; nothing to report.
            continue
        movie_name = name_match.group(1)

        star_match = _MOVIE_STAR_RE.search(fragment)
        movie_star = star_match.group(1) if star_match else ''

        time_match = _MOVIE_RELEASETIME_RE.search(fragment)
        movie_releasetime = time_match.group('time') if time_match else ''

        movieinfo = f'电影名:{movie_name}, {movie_star}, 上映时间:{movie_releasetime}'
        print(movieinfo)
        movie_infos.append(movieinfo)
    return movie_infos
# Request the board ten times, advancing the ``offset`` query parameter by 10
# on each request (0, 10, ..., 90), and print the movies found in each page.
url = 'https://maoyan.com/board/4'
data = dict(offset=0)
for page in range(10):
    data['offset'] = 10 * page
    get_info(get_html(url, data))

浙公网安备 33010602011771号