使用 re 模块简单爬取豆瓣电影 Top250

from urllib.request import  urlopen,Request
import re
#=============================返回页面内容
#============================取页
def getPage(num):
    """Download one page of the Douban movie Top 250 listing.

    Args:
        num: Zero-based page index. The request's ``start`` query
            parameter is set to ``num * 25``.

    Returns:
        The response body decoded as UTF-8 text.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the
            request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The offset must be parenthesised: ``%`` and ``*`` share the same
    # precedence and associate left-to-right, so ``fmt % num * 25`` formats
    # the URL first and then repeats the whole string 25 times.
    url = 'https://movie.douban.com/top250?start=%s&filter=' % (num * 25)
    # Send a browser-like User-Agent (presumably the site rejects urllib's
    # default one -- not verifiable from this file).
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    }
    req = Request(url, headers=header)
    # Use the response as a context manager so the connection is closed
    # even if reading or decoding raises.
    with urlopen(req) as res:
        return res.read().decode('utf-8')
def parsePage(res):
    """Lazily extract movie records from a page of HTML text.

    Scans ``res`` with the module-level compiled pattern ``byte`` and
    yields one dict per match. Each dict maps the keys ``'id'``,
    ``'name'``, ``'rating_num'`` and ``'pingfen人数'`` to the text captured
    by the pattern's ``id``, ``name``, ``rating_num`` and ``num`` groups.
    """
    # (output key, regex group name); the tuple order fixes the key order
    # of every yielded dict.
    fields = (
        ('id', 'id'),
        ('name', 'name'),
        ('rating_num', 'rating_num'),
        ('pingfen人数', 'num'),
    )
    for match in byte.finditer(res):
        yield {key: match.group(group) for key, group in fields}

def saveData(ret):
    """Append ``ret`` followed by a newline to ``douban.txt`` (UTF-8).

    The file is opened in append mode in the current working directory,
    so it is created on first use and earlier contents are kept.
    """
    with open('douban.txt', mode='a', encoding='utf-8') as out_file:
        out_file.writelines((ret, '\n'))



# Compiled pattern for one movie entry in the listing HTML. Named groups:
#   id         - digits following <em class="">
#   name       - word characters following <span class="title">
#   rating_num - a "d.d" score following the rating_num span
#   num        - digits plus the literal "人评价" inside a bare <span>
# re.S lets ".*?" cross line breaks between the tags.
# Written as a raw string so \d, \w and \. reach the regex engine directly
# instead of relying on Python's handling of invalid string escapes.
byte = re.compile(r'<div class="item">.*?<em class="">(?P<id>\d+).*?<span class="title">(?P<name>\w+).*?<span class="rating_num" property="v:average">(?P<rating_num>\d\.\d).*?<span>(?P<num>\d+人评价)</span>', re.S)
# Walk page indices 0..9; every record parsed from a page is stringified
# and appended to the output file through saveData.
for i in range(10):
    for el in parsePage(getPage(i)):
        saveData(str(el))

 

posted @ 2021-01-21 11:44  苦行僧冬*婷  阅读(60)  评论(0)    收藏  举报