1 import requests
2 import json
3 from requests.exceptions import ConnectionError
4 from json.decoder import JSONDecodeError
5 from lxml import etree
6 import re
7 #from config import *
8 import pymongo
9 import os
10 from hashlib import md5
11 from multiprocessing import Pool
12
# Module-level MongoDB client and the database handle used by save_to_mongo.
client = pymongo.MongoClient('localhost')
db = client['toutiao1']
def get_page_parse(offset, keyword):
    """Fetch one page of search results as raw response text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status or a failed request).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
    }
    url = "https://www.toutiao.com/search_content/"
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, params=params, timeout=10)
    except requests.exceptions.RequestException:
        # Base class: covers connection failures as well as timeouts.
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None
35
def parse_page_index(text):
    """Yield the article URLs contained in a search-results JSON document.

    Args:
        text: JSON document as a string. ``None``, empty or malformed input
            yields nothing instead of raising.

    Yields:
        The ``article_url`` value of every entry in the document's ``data``
        list that carries that key.
    """
    if not text:
        # A failed fetch hands us None; there is nothing to parse.
        return
    try:
        payload = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(payload, dict):
        return
    items = payload.get('data')
    if not isinstance(items, list):
        # ``data`` can be absent or null in the payload.
        return
    for item in items:
        if isinstance(item, dict) and "article_url" in item:
            yield item["article_url"]
46 #html=get_page_parse(0,'街拍')
47 #for url in parse_page_index(html):
48 #if len(url)>9:
49 #print(url)
def get_page_detail(url):
    """Fetch an article page and return its source text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text on HTTP 200, otherwise ``None``
        (non-200 status or a failed request).
    """
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        # Base class: covers connection failures, timeouts and invalid URLs.
        print('Error occurred')
        return None
    if response.status_code == 200:
        return response.text
    return None
59
def parse_page_details(html, url):
    """Extract the title and gallery image URLs from an article page.

    Every image URL that is found is also passed to ``download_image``.

    Args:
        html: Page source as text, or ``None`` when the fetch failed.
        url: Address the page came from; echoed back in the result.

    Returns:
        A dict with ``title`` (the result of the ``//head/title/text()``
        xpath query), ``url`` and ``images_urls``, or ``None`` when the page
        carries no parsable gallery data.
    """
    if not html:
        return None
    tree = etree.HTML(html)
    title = tree.xpath('//head/title/text()') if tree is not None else []
    pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),\s+siblingList', re.S)
    match = pattern.search(html)
    if match is None:
        # Not a gallery article.
        return None
    # The gallery JSON is embedded as an escaped string literal; dropping the
    # backslashes turns it back into plain JSON.
    raw = match.group(1).replace('\\', "")
    try:
        gallery = json.loads(raw)
    except JSONDecodeError:
        return None
    if not isinstance(gallery, dict):
        return None
    images_urls = [
        item.get('url')
        for item in gallery.get("sub_images") or []
        if isinstance(item, dict) and item.get('url')
    ]
    for images_url in images_urls:
        download_image(images_url)
    return {
        'title': title,
        'url': url,
        'images_urls': images_urls,
    }
78
def save_to_mongo(resuit):
    """Insert one scraped record into the ``toutiao1`` collection.

    Args:
        resuit: Document (dict) to store.

    Returns:
        ``True`` when the insert was acknowledged, ``False`` otherwise
        (including when ``resuit`` is empty or ``None``).
    """
    if not resuit:
        return False
    # Collection.insert() is deprecated and no longer exists in PyMongo 4;
    # insert_one() is the supported single-document call.
    outcome = db['toutiao1'].insert_one(resuit)
    if outcome.acknowledged:
        print("yes")
        return True
    return False
85
def download_image(url):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image to fetch.

    Returns:
        Always ``None``; failures are reported on stdout and otherwise ignored.
    """
    print('brgain', url)
    try:
        # Without a timeout a stalled connection would block the worker forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        # Base class: covers connection failures, timeouts and invalid URLs.
        print('Error occurred')
        return None
    if response.status_code == 200:
        save_image(response.content)
    return None
96
def save_image(content):
    """Write image bytes to ``<cwd>/pictyre/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same path and an existing file is left untouched.

    Args:
        content: Raw image bytes.
    """
    directory = os.path.join(os.getcwd(), 'pictyre')
    # The target directory is not guaranteed to exist; open() would fail
    # with FileNotFoundError otherwise.
    os.makedirs(directory, exist_ok=True)
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(directory, file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
103
104
def main(offset):
    """Crawl one search-results page and process every article it lists.

    Args:
        offset: Result offset passed on to ``get_page_parse``.
    """
    text = get_page_parse(offset, '街拍')
    if text is None:
        # The index request failed; nothing to crawl for this offset.
        return
    for url in parse_page_index(text):
        if not url:
            continue
        html = get_page_detail(url)
        if html is None:
            continue
        try:
            parse_page_details(html, url)
        except (JSONDecodeError, KeyError):
            # An article without parsable gallery data must not abort the
            # remaining articles on the page.
            continue
        # NOTE: results are not persisted; the save_to_mongo call is disabled.
113
if __name__ == '__main__':
    # One task per search-results offset.
    offsets = [offset * 20 for offset in range(1, 2)]
    # The context manager releases the worker processes once map() returns.
    with Pool() as pool:
        pool.map(main, offsets)