1 import json
2 import os
3 from urllib.parse import urlencode
4 import pymongo
5 import requests
6 from bs4 import BeautifulSoup
7 from requests.exceptions import ConnectionError
8 import re
9 from multiprocessing import Pool
10 from hashlib import md5
11 from json.decoder import JSONDecodeError
12 from config import *
13
# Shared MongoDB handle. connect=False defers opening the socket until
# first use — needed because this module is used with multiprocessing.Pool
# below, and each forked worker must establish its own connection rather
# than inherit the parent's.
client = pymongo.MongoClient(MONGO_URL, connect=False)
# MONGO_URL / MONGO_DB come from config (star-imported above).
db = client[MONGO_DB]
16
17
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: pagination offset (the site pages in steps of 20).
        keyword: search term to query for.

    Returns:
        str | None: the response body on HTTP 200, None on any
        non-200 status or network failure.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        # Without a timeout a stalled server hangs the pool worker forever.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except (ConnectionError, requests.exceptions.Timeout):
        print('Error occurred')
        return None
38
39
def download_image(url):
    """Download a single image and persist it via save_image().

    Args:
        url: absolute image URL.

    Returns:
        None in all cases; network failures are swallowed (best-effort
        download, matching the crawler's tolerate-and-continue style).
    """
    print('Downloading', url)
    try:
        # Timeout prevents one dead image host from blocking the worker.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except (ConnectionError, requests.exceptions.Timeout):
        return None
49
50
def save_image(content):
    """Write image bytes into the working directory, named by content MD5.

    Naming by MD5 deduplicates identical images across articles: if the
    file already exists it is left untouched.

    Args:
        content: raw image bytes (saved with a .jpg extension — the
            actual format is assumed, not verified).
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # `with` closes the file on exit; the original's extra f.close()
        # inside the block was redundant and has been dropped.
        with open(file_path, 'wb') as f:
            f.write(content)
58
59
def parse_page_index(text):
    """Yield each article URL found in an index-page JSON response.

    Args:
        text: raw JSON body from get_page_index(); may be None/empty
            when that request failed.

    Yields:
        The 'article_url' value of every entry under the 'data' key
        (None for entries missing that key, matching .get semantics).
        Missing or malformed input yields nothing.
    """
    if not text:
        # get_page_index() returns None on failure; json.loads(None)
        # would raise TypeError, which JSONDecodeError does not catch.
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if data and 'data' in data:
        for item in data['data']:
            yield item.get('article_url')
68
69
def get_page_detail(url):
    """Fetch one article page's HTML.

    Args:
        url: article URL yielded by parse_page_index().

    Returns:
        str | None: page HTML on HTTP 200, None on any non-200 status
        or network failure.
    """
    try:
        # Timeout keeps a hung article request from stalling the worker.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except (ConnectionError, requests.exceptions.Timeout):
        print('Error occurred')
        return None
79
80
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from an article page.

    Side effect: every gallery image found is downloaded immediately via
    download_image().

    Args:
        html: article page HTML; may be None when the fetch failed.
        url: the article URL (echoed into the result dict).

    Returns:
        dict | None: {'title', 'url', 'images'} when a gallery is
        present, otherwise None.
    """
    if not html:
        # get_page_detail() returns None on failure; BeautifulSoup(None)
        # would raise, so bail out early.
        return None
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    # Raw string: the \( \) escapes are regex syntax, and non-raw form
    # triggers invalid-escape-sequence warnings on modern Python.
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # The gallery JSON is embedded backslash-escaped inside the page
        # script; stripping backslashes makes it parseable.
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images,
            }
    return None
98
99
def save_to_mongo(result):
    """Insert one result document into the configured Mongo collection.

    Args:
        result: dict produced by parse_page_detail().

    Returns:
        bool: True when the insert succeeded (insert_one raises on
        failure, so in practice False is unreachable on the happy path).
    """
    # Collection.insert() was deprecated in pymongo 3 and removed in
    # pymongo 4; insert_one() is the supported single-document insert.
    if db[MONGO_TABLE].insert_one(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False
105
106
def main(offset):
    """Crawl one index page end-to-end: fetch, parse, download, persist.

    Args:
        offset: pagination offset handed to get_page_index().
    """
    text = get_page_index(offset, KEYWORD)
    if text is None:
        # Index fetch failed; parse_page_index(None) would raise TypeError.
        return
    for url in parse_page_index(text):
        html = get_page_detail(url)
        if html is None:
            # Article fetch failed; parse_page_detail would crash on None.
            continue
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)
114
115
if __name__ == '__main__':
    # One offset per task; offsets step by 20 to match the page size
    # requested in get_page_index().
    offsets = [page * 20 for page in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()