from urllib.request import urlopen
import re
"""
Crawler: scrape the Douban movie Top 250 listing and store the scraped results in a file.
"""
def get_request(url):
    """
    Fetch a URL and return the response body as text.

    The response is opened in a ``with`` block so the underlying connection
    is closed as soon as the body has been read, even if reading or decoding
    fails.

    :param url: address to open, passed straight to ``urlopen``
    :return: the full response body decoded as UTF-8
    :raises UnicodeDecodeError: if the body is not valid UTF-8
    """
    with urlopen(url) as response:
        return response.read().decode('utf-8')
# Compiled once at import time. Each match describes one '<div class="item">'
# entry; the four groups are, in order: id (rank digits), name (text of the
# first title span), arg (rating text) and cou (the text that precedes "评价").
# re.S lets '.' cross newlines, since one entry spans many lines of HTML.
_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em.*?>(?P<id>\d+).*?<span class="title">.*?(?P<name>.*?)</span>'
    r'.*?<span class="rating_num".*?>(?P<arg>.*?)</span>.*?<span>(?P<cou>.*?)评价</span>',
    re.S,
)


def get_findall(s):
    """
    Extract every movie entry found in a page of HTML source.

    The rank group matches one or more digits (``\\d+``). The previous
    pattern used ``\\d.``, which captured exactly one digit plus whatever
    character followed it, so rank ``1`` came back as ``1<`` and rank
    ``100`` as ``10``.

    :param s: HTML source text to search
    :return: list of ``(id, name, arg, cou)`` string tuples, one per matched
        entry, in document order; an empty list when nothing matches
    """
    return _ITEM_PATTERN.findall(s)
def get_ur(w):
    """
    Download one listing page and parse the movie entries out of it.

    :param w: value substituted into the ``start`` query parameter of the
        listing URL
    :return: whatever ``get_findall`` returns for the downloaded page source
    """
    page_url = 'https://movie.douban.com/top250?start=%s&filter=' % w
    # Fetch the page source, then hand it directly to the parser.
    return get_findall(get_request(page_url))
OUTPUT_PATH = '电影'
# NOTE(review): the original loop never advanced its counter, so it requested
# start=0 forever. Stepping by 25 assumes the site serves 25 entries per page
# -- confirm, and confirm that 30 is the intended upper bound for `start`.
PAGE_SIZE = 25
MAX_START = 30


def main():
    """
    Scrape the listing pages and save all parsed entries to ``OUTPUT_PATH``.

    Pages are requested for ``start`` = 0, ``PAGE_SIZE``, ... below
    ``MAX_START``. Each page's entries are printed as they arrive and
    collected into one list. The file is opened once, after all pages are
    fetched, so later pages no longer overwrite earlier ones. When nothing
    was scraped the file is still created/truncated but left empty, matching
    the original behavior for an empty result.
    """
    movies = []
    for start in range(0, MAX_START, PAGE_SIZE):
        page = get_ur(start)
        print(page)
        movies.extend(page)

    try:
        # The with-block closes the file; no explicit close() is needed.
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            if movies:
                f.write(str(movies))
    except OSError as err:
        # Keep the script best-effort, but report what actually went wrong.
        print('failed to write %s: %s' % (OUTPUT_PATH, err))


if __name__ == '__main__':
    main()