1 # coding=utf-8
2 import numpy as np
3 import pandas as pd
4 import sys
5
6 from selenium import webdriver
7 import time
8 import requests
9 import re
10 from openpyxl.workbook import Workbook
11 import matplotlib.pyplot as plt
12 import matplotlib
13
# Accumulators filled by the scraping loop below; all start as ten
# distinct empty lists (one generator-produced list per name).
#   urls/titles      - everything found on the front page
#   urls_new/titles_new - only the articles whose detail page parsed
#   days/comments/authors/sources/ty - per-article fields
#   comment          - declared but not used in this script
(urls, urls_new, titles, titles_new, days,
 comments, authors, sources, comment, ty) = ([] for _ in range(10))
def save_to_file(file_name, contents):
    """Write *contents* (a str) to *file_name* as UTF-8 text.

    A context manager guarantees the handle is closed even if the write
    fails.  The encoding is pinned to UTF-8 because the fetched page is
    decoded as UTF-8 below; relying on the platform default (e.g. cp936
    on a Chinese Windows install) could raise UnicodeEncodeError for
    characters outside that codec.
    """
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)
28
url = "https://www.ithome.com/"
# headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
#          '(KHTML,like Gecko) Chrome/50.0.2661.102 Safari/537.36 QIHU 360EE'}
# Spoof a desktop-browser User-Agent so the site serves the normal page.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0"}
rep = requests.get(url, headers=headers)
rep.encoding = "utf-8"  # page is UTF-8; force it before .text decodes
strw = rep.text
save_to_file('ithome.html', strw)

# First isolate the "new list" container, then each column block inside it.
p = re.compile(r'<div class="lst lst-1 new-list">(.*?)</div>\s*?</div>')
m = p.findall(strw)
print(len(m[0]))
p = re.compile(r'<div class=\"block \d{4} new-list-\d{1}\"(?: style=\".*?\")?><ul>(.*?)</ul></div>')
m2 = p.findall(m[0])
print(len(m2))

# NOTE(review): a positional driver path was removed in Selenium 4 —
# if the installed selenium rejects this call, switch to
# webdriver.Chrome(service=Service(<path>)).
browser = webdriver.Chrome(r'D:\谷歌\Google\Chrome\Application\chromedriver.exe')

# Extract (url, title) pairs from every column block.  The inner matches
# get their own name so the outer list being iterated is not shadowed
# (the original rebound `m2` inside the `for i in m2` loop).
for block in m2:
    links = re.findall(r'</span><span class=\"title\">.*?href=\"(.*?)\">(?:<.*?>)?(.*?)(?:</font>)?</a></span></li>', block)
    for link, title in links:
        urls.append(link)
        titles.append(title)
print(len(urls))

try:
    for i in range(len(urls)):
        print(u'读取中' + urls[i])
        browser.get(urls[i])
        time.sleep(1)  # give the article page a moment to render
        strw2 = browser.page_source
        # The news category is the first path segment of the article URL.
        p2 = re.compile(r'https://\w+?.ithome.com/(?:html/)?(.*?)/.*?')
        cat = p2.findall(urls[i])
        print(cat)
        # Date / source / author / comment-count from one span cluster.
        p = re.compile(r'<span id="pubtime_baidu">(\d*-\d*-\d*).*?</span><span id="source_baidu">'
                       r'来源:<a href=".*?" .*?>(.*?)</a></span><span id="author_baidu">'
                       r'作者:(?:<strong>)?(.*?)(?:</strong>)?</span>.*?<span id="commentcount">(.*?)</span>')
        m = p.findall(strw2)
        print(m)
        # Keep a row only when every field parsed (the original indexed
        # the category match unconditionally and could raise IndexError),
        # so all result columns stay the same length.
        if len(m) > 0 and len(cat) > 0:
            days.append(m[0][0])
            sources.append(m[0][1])
            authors.append(m[0][2])
            urls_new.append(urls[i])
            comments.append(m[0][3])
            titles_new.append(titles[i])
            ty.append(cat[0])
finally:
    browser.quit()  # always release the ChromeDriver/Chrome processes

print("读取结束")
data = {'日期': days, '作者': authors, '来源': sources, '标题': titles_new,
        '链接': urls_new, '评论数量': comments, '新闻类型': ty}
df = pd.DataFrame(data, columns=['日期', '作者', '来源', '标题', '链接', '评论数量', '新闻类型'])
# print(df)
# The `encoding` argument to to_excel was removed in pandas 2.0 (xlsx is
# always UTF-8 via openpyxl), so it is no longer passed here.
df.to_excel(r'ShuJuPa.xlsx', sheet_name='数据爬取结果')