1 # coding: utf-8
2 import html
3 import json
4 import os
5 import re
6 import sys
7 from datetime import datetime
8 from pprint import pprint
9
10 import parsel
11 import requests
12 import schedule
13 from selenium.webdriver import Chrome
14 from selenium.webdriver.chrome.options import Options
15 import random
16 import time
17 import threading
18 from concurrent.futures import ThreadPoolExecutor
19
20
class WebDriver():
    """Headless-Chrome wrapper: loads one URL, auto-scrolls the article
    container to trigger lazy loading, and hands back the rendered HTML.
    """

    def __init__(self, url):
        self.url = url
        self.option = Options()
        # hide the usual Selenium automation fingerprint
        self.option.add_argument('--disable-blink-features=AutomationControlled')
        # NOTE(review): some Chrome versions expect '--headless' (with dashes) — confirm
        self.option.add_argument('headless')
        self.option.add_argument('--disable-gpu')
        self.web = Chrome(options=self.option)

    def run(self):
        """Open self.url, scroll down in fixed random-sized steps, and
        return the page source after the content has lazy-loaded."""
        self.web.get(url=self.url)
        self.web.maximize_window()
        self.web.implicitly_wait(20)
        time.sleep(1)

        # one stride is drawn once and reused for the whole scroll pass
        stride = random.randint(100, 150)
        for offset in range(0, 10000, stride):
            script = f"document.querySelector('#zjxw_pc').scrollTo(0, {offset})"
            self.web.execute_script(script)
            time.sleep(random.random())
        return self.pageSource()

    def pageSource(self):
        """Return the current DOM serialized as HTML."""
        return self.web.page_source
44
45
class Spider(WebDriver):
    """Column spider: renders a column listing with Selenium (inherited from
    WebDriver), then fetches each listed article over plain HTTP and stores
    it as a standalone HTML file.
    """

    def parseHomePage(self, htmlSource):
        """Return the article URLs listed on a rendered column page."""
        selector = parsel.Selector(htmlSource)
        items = selector.css('div.channelList_article_container__3dui5 div.listItems_article_item__3B3fT')
        return [item.css('a::attr(href)').get() for item in items]

    def paseArticlePage(self, articleUrl):
        """Fetch one article and return (title, html_content, 'YYYY-MM-DD'),
        or None when the page carries no usable __NEXT_DATA__ article payload.

        (Method name kept as-is — 'pase' typo — because callers use it.)
        """
        from faker import Faker  # fresh random user-agent per request
        headers = {'user-agent': Faker().user_agent()}
        resp = requests.get(url=articleUrl, headers=headers)
        time.sleep(random.random())  # polite jitter between article fetches

        selector = parsel.Selector(resp.text)
        try:
            json_data = json.loads(selector.css('script#__NEXT_DATA__::text').get())
            articleData = json_data['props']['pageProps']['article']
        except (TypeError, KeyError):
            # TypeError: page has no __NEXT_DATA__ script (css().get() gave None);
            # KeyError: payload present but without the expected article node.
            return None

        docTitle = articleData['docTitle']
        # publishedAt appears to be a millisecond epoch; the first 10 digits are seconds
        dateObj = datetime.fromtimestamp(int(str(articleData['publishedAt'])[0:10]))
        publishedAt = dateObj.strftime('%Y-%m-%d %H:%M:%S')
        content = articleData['content']
        source = articleData['source']
        author = articleData['author']
        author = f'<p>作者:{author}</p>' if author else ''
        if articleData['webLink'] != '':
            # video article: prepend the (unescaped) <video> markup to the body
            video = html.unescape(''.join(selector.css('video').getall()))
            # BUGFIX: original emitted '<div div align=...>' with a stray 'div' token
            video = f'<div align="center" style="text-align: left;">{video}</div>'
            content = video + content

        date = publishedAt.split(' ', 1)[0]
        pub_date = publishedAt + '@' + source
        column_url = self.url
        head = (f'<p class="column_url"><a href="{column_url}">栏目地址:{column_url}</a></p>\n'
                f'<p class="chapter_url"><a href="{articleUrl}">文章详细地址:{articleUrl}</a></p>\n'
                f'<p class="Summary"></p>\n'
                f'<p class="pub_data">{pub_date}</p>\n')
        content = f'<div class="content">{author}{content}</div>'
        return docTitle, head + '\n' + content, date

    def save_content(self, path, title, content, date):
        """Write one article as {path}\\{date}\\{sanitized title}.html.

        Untitled articles fall back to a timestamp-derived filename; articles
        with neither title nor content are skipped.
        """
        new_date = str(date).replace('/', '-')
        if title is None:
            if not content:
                return
            # timestamp fallback name (original sliced the fractional digits
            # of str(time.time()), which is fragile and collision-prone)
            title = str(time.time_ns())
        # keep only characters that are safe in a Windows filename
        new_title = ''.join(re.findall(r'([\u2E80-\u9FFF0-9a-zA-Z“”:?!《》,-]+)', title))
        new_title = new_title.replace('.', '点').replace('BR', '')

        # makedirs(exist_ok=True) is idempotent and race-safe: several column
        # threads may hit the same date folder at once, where os.mkdir would
        # raise FileExistsError. The original also doubled every backslash in
        # `path` first, which Windows merely tolerates; the plain path is used.
        filedspath = f'{path}\\{new_date}\\'
        os.makedirs(filedspath, exist_ok=True)
        filedsname = filedspath + f'{new_title}.html'
        with open(filedsname, mode='w', encoding='utf-8') as f:
            f.write('<!DOCTYPE html>\n<html>\n')
            f.write('<head><meta charset="UTF-8"></head>\n')
            f.write('<body>\n')
            f.write(f'<h1 align="center">{title}</h1>\n')
            f.write(content)
            f.write('\n</body>\n</html>')
124
125
class PicSpider(Spider):
    """Spider variant for the image-list column, whose listing page uses a
    different CSS container class than regular text columns."""

    def parseHomePage(self, htmlSource):
        """Return the article URLs listed on a rendered image-list page."""
        anchors = parsel.Selector(htmlSource).css(
            'div.channelList_image_article_container__3rM_q div a')
        return [anchor.css('::attr(href)').get() for anchor in anchors]
135
136
137 # 头条
def runTouTiao(path):
    """Scrape the 头条 (headline) column and save each article under {path}\\头条."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=52e5f902cf81d754a434fb50')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\头条', title, content, date)
            print(title, date, articleUrl)
150
151
152 # 天下
def runTianXia(path):
    """Scrape the 天下 (world) column and save each article under {path}\\天下."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=5d4ba90a159bb84750661d51')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\天下', title, content, date)
            print(title, date, articleUrl)
165
166
167 # 浙江
def runZheJiang(path):
    """Scrape the 浙江 (Zhejiang) column and save each article under {path}\\浙江."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=5d4ba8cd159bb84750661d50')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\浙江', title, content, date)
            print(title, date, articleUrl)
180
181
182 # 战疫
def runZhanYi(path):
    """Scrape the 战疫 (pandemic) column and save each article under {path}\\战疫."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=5e2e4410b4a13d092b0dc969')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\战疫', title, content, date)
            print(title, date, articleUrl)
195
196
197 # 观点
def runGuanDian(path):
    """Scrape the 观点 (opinion) column and save each article under {path}\\观点."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=584e6ac7e200b2098f871d3a')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\观点', title, content, date)
            print(title, date, articleUrl)
210
211
212 # 生活
def runShengHuo(path):
    """Scrape the 生活 (lifestyle) column and save each article under {path}\\生活."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=5534eb21498e2ca4bf9f3c34')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\生活', title, content, date)
            print(title, date, articleUrl)
225
226
227 # 图片
def runTuPian(path):
    """Scrape the 图片 (pictures) column (PicSpider layout) and save each
    article under {path}\\图片."""
    spider = PicSpider(url='https://zj.zjol.com.cn/image-list')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\图片', title, content, date)
            print(title, date, articleUrl)
240
241
242 # 杭州
def runHangZhou(path):
    """Scrape the 杭州 (Hangzhou) local column and save each article under {path}\\杭州."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845624e4b08e9fb1cdfc17')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\杭州', title, content, date)
            print(title, date, articleUrl)
255
256
257 # 宁波
def runNingBo(path):
    """Scrape the 宁波 (Ningbo) local column and save each article under {path}\\宁波."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845a6fe4b08e9fb1cdfcac')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\宁波', title, content, date)
            print(title, date, articleUrl)
270
271
272 # 温州
def runWenZhou(path):
    """Scrape the 温州 (Wenzhou) local column and save each article under {path}\\温州."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845aaee4b08e9fb1cdfcb4')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\温州', title, content, date)
            print(title, date, articleUrl)
285
286
287 # 湖州
def runHuZhou(path):
    """Scrape the 湖州 (Huzhou) local column and save each article under {path}\\湖州."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845b49e4b08e9fb1cdfcc1')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\湖州', title, content, date)
            print(title, date, articleUrl)
300
301
302 # 嘉兴
def runJiaXing(path):
    """Scrape the 嘉兴 (Jiaxing) local column and save each article under {path}\\嘉兴."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845af4e4b08e9fb1cdfcbd')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\嘉兴', title, content, date)
            print(title, date, articleUrl)
315
316
317 # 绍兴
def runShaoXing(path):
    """Scrape the 绍兴 (Shaoxing) local column and save each article under {path}\\绍兴."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845b81e4b08e9fb1cdfccf')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\绍兴', title, content, date)
            print(title, date, articleUrl)
330
331
332 # 金华
def runJinHua(path):
    """Scrape the 金华 (Jinhua) local column and save each article under {path}\\金华."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845bd9e4b08e9fb1cdfcda')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\金华', title, content, date)
            print(title, date, articleUrl)
345
346
347 # 衢州
def runQuZhou(path):
    """Scrape the 衢州 (Quzhou) local column and save each article under {path}\\衢州."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845c2ae4b08e9fb1cdfce3')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\衢州', title, content, date)
            print(title, date, articleUrl)
360
361
362 # 舟山
def runZhouShan(path):
    """Scrape the 舟山 (Zhoushan) local column and save each article under {path}\\舟山."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845c65e4b08e9fb1cdfce7')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\舟山', title, content, date)
            print(title, date, articleUrl)
375
376
377 # 台州
def runTaiZhou(path):
    """Scrape the 台州 (Taizhou) local column and save each article under {path}\\台州."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845c96e4b08e9fb1cdfcec')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\台州', title, content, date)
            print(title, date, articleUrl)
390
391
392 # 丽水 来源
def runLiShui(path):
    """Scrape the 丽水 (Lishui) local column and save each article under {path}\\丽水."""
    spider = Spider(url='https://zj.zjol.com.cn/local?id=53845cd2e4b08e9fb1cdfcf0')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\丽水', title, content, date)
            print(title, date, articleUrl)
405
406
407 # 义乌
def runYiWu(path):
    """Scrape the 义乌 (Yiwu) local column and save each article under {path}\\义乌.

    (Original used a copy-pasted local name `tianxia`; renamed for clarity.)
    """
    spider = Spider(url='https://zj.zjol.com.cn/local?id=5428f31b498e0d3c0109194e')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\义乌', title, content, date)
            print(title, date, articleUrl)
420
421
422 # 视频
def runShiPin(path):
    """Scrape the 视频 (video) column and save each article under {path}\\视频."""
    spider = Spider(url='https://zj.zjol.com.cn/?id=57d690e7e200b20fbb4af09f')
    for articleUrl in spider.parseHomePage(htmlSource=spider.run()):
        parsed = spider.paseArticlePage(articleUrl)
        if parsed is not None:  # None => the page had no article payload
            title, content, date = parsed
            spider.save_content(f'{path}\\视频', title, content, date)
            print(title, date, articleUrl)
435
436
def runAll(path='E:\\data\\zjxww'):
    """Run every column scraper, at most five concurrently.

    `path` is the output root; it defaults to the original hard-coded
    location so existing callers (schedule, __main__) keep working.
    """
    column_list = [runTouTiao, runTianXia, runZheJiang, runZhanYi, runGuanDian,
                   runShengHuo, runTuPian, runHangZhou, runNingBo, runWenZhou,
                   runHuZhou, runJiaXing, runShaoXing, runJinHua, runQuZhou,
                   runZhouShan, runTaiZhou, runLiShui, runYiWu, runShiPin]

    # the with-block waits for all submitted column jobs to finish
    with ThreadPoolExecutor(max_workers=5) as pool:
        for column in column_list:
            pool.submit(column, path)
449
450
def start(minutes=0):
    """Run the scheduler loop forever: runAll() fires every `minutes` minutes,
    with a once-per-minute countdown printed between runs.

    NOTE(review): original indentation was lost; this nesting is the most
    plausible reconstruction — confirm against the original file.
    """
    # remember the configured interval so the countdown can be restarted
    flg = minutes
    schedule.every(minutes).minutes.do(runAll)
    while True:
        if minutes != 0:
            # still counting down until the next scheduled run
            print(f'等待{minutes}分钟')
        else:
            # countdown exhausted: reset it and let schedule fire the pending job
            minutes = flg
            schedule.run_pending()

        time.sleep(60)  # one tick per minute
        minutes = minutes - 1
464
465
def exists():
    """Ensure the output root and one sub-folder per column exist (idempotent).

    Uses os.makedirs(exist_ok=True), which collapses the original duplicated
    if/else branches and cannot race with concurrent folder creation the way
    os.mkdir could.
    """
    column_zh_list = ['头条', '天下', '浙江', '战疫', '观点', '生活', '图片', '杭州', '宁波', '温州',
                      '湖州', '嘉兴', '绍兴', '金华', '衢州', '舟山', '台州', '丽水', '义乌', '视频', ]

    os.makedirs('E:\\data\\zjxww', exist_ok=True)
    for i in column_zh_list:
        os.makedirs(f'E:\\data\\zjxww\\{i}', exist_ok=True)
480
481
if __name__ == "__main__":
    # CLI: argv[1] = interval in minutes; optional argv[2] == 1 => scrape once
    # immediately before entering the schedule loop.
    argc = len(sys.argv)
    if argc == 2:
        exists()
        start(int(sys.argv[1]))
    if argc == 3:
        exists()
        if int(sys.argv[2]) == 1:
            runAll()
        start(int(sys.argv[1]))

# runHuZhou('E:\\data\\zjxww')