Python--day18--爬虫

完整代码(使用 requests 库):
# Scrape the Douban Top 250 movie list and append one JSON record per movie
# to the file "move_info7" (one JSON object per line).
import json
import re

# Compiled once at import time instead of on every parsePage() call.
# Raw strings keep "\d" from being treated as an (invalid) string escape.
# re.S lets ".*?" span newlines, since one movie entry covers many HTML lines.
_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def getPage(url: str, timeout: float = 10) -> str:
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The decoded response body.
    """
    # Imported here so that parsePage() stays usable (and testable) on
    # machines where the third-party ``requests`` package is not installed.
    import requests

    response = requests.get(url, timeout=timeout)
    return response.text


def parsePage(s: str):
    """Yield one dict per movie entry found in the HTML text *s*.

    Each dict has the string-valued keys ``id``, ``title``, ``rating_num``
    and ``comment_num``. ``comment_num`` is the text that precedes "评价"
    inside the matching ``<span>``. Nothing is yielded when *s* contains no
    matching entry.
    """
    for match in _ITEM_PATTERN.finditer(s):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }


def main(num) -> None:
    """Fetch the list page starting at offset *num* and append its movies to disk.

    Every movie is printed and written to "move_info7" as one JSON line.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    print(ret)
    # The context manager closes the file even if parsing or writing fails;
    # the original left one open handle behind per page.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # ensure_ascii=False keeps Chinese titles readable in the file.
            data = json.dumps(obj, ensure_ascii=False)
            f.write(data + "\n")


if __name__ == '__main__':
    # 10 pages of 25 movies each: start offsets 0, 25, ..., 225.
    for count in range(0, 250, 25):
        main(count)
简化版(使用标准库 urllib 代替 requests):
# Simplified Douban Top 250 scraper that relies only on the standard library
# (urllib instead of requests). Appends one record per movie to "move_info7".
import re
import json
from urllib.request import urlopen

# Compiled once at import time instead of on every parsePage() call.
# Raw strings keep "\d" from being treated as an (invalid) string escape.
# re.S lets ".*?" span newlines, since one movie entry covers many HTML lines.
_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<div class="pic">.*?<em .*?>(?P<id>\d+).*?<span class="title">(?P<title>.*?)</span>'
    r'.*?<span class="rating_num" .*?>(?P<rating_num>.*?)</span>.*?<span>(?P<comment_num>.*?)评价</span>',
    re.S,
)


def getPage(url: str, timeout: float = 10) -> str:
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The response body as a string.
    """
    # The context manager closes the HTTP response, which the original
    # left open after reading.
    with urlopen(url, timeout=timeout) as response:
        return response.read().decode('utf-8')


def parsePage(s: str):
    """Yield one dict per movie entry found in the HTML text *s*.

    Each dict has the string-valued keys ``id``, ``title``, ``rating_num``
    and ``comment_num``. ``comment_num`` is the text that precedes "评价"
    inside the matching ``<span>``. Nothing is yielded when *s* contains no
    matching entry.
    """
    for match in _ITEM_PATTERN.finditer(s):
        yield {
            "id": match.group("id"),
            "title": match.group("title"),
            "rating_num": match.group("rating_num"),
            "comment_num": match.group("comment_num"),
        }


def main(num) -> None:
    """Fetch the list page starting at offset *num* and append its movies to disk.

    Every movie is printed and written to "move_info7", one record per line.
    """
    url = 'https://movie.douban.com/top250?start=%s&filter=' % num
    response_html = getPage(url)
    ret = parsePage(response_html)
    print(ret)
    # The context manager closes the file even if parsing or writing fails;
    # the original left one open handle behind per page.
    with open("move_info7", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            # NOTE: str(obj) writes the Python dict repr (single quotes), not
            # JSON; kept as-is so the output format of this version is unchanged.
            data = str(obj)
            f.write(data + "\n")


# Guarded so that importing this module does not start the crawl.
if __name__ == '__main__':
    # 10 pages of 25 movies each: start offsets 0, 25, ..., 225.
    for count in range(0, 250, 25):
        main(count)
浙公网安备 33010602011771号