
1 import requests
2 from requests.exceptions import RequestException
3 from lxml import etree
4 import csv
5 import re
6 import time
7 from urllib import parse
8 import time
9
10
def get_page(url):
    """
    Fetch the HTML source of a page.

    :param url: the URL to request
    :return: response body as text on HTTP 200, otherwise None
    """
    headers = {
        # Desktop browser UA so Sogou serves the normal search-result markup.
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    }
    try:
        # Fix: the original had no timeout, so a stalled connection could
        # hang the crawler forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Any network failure is treated as "page unavailable".
        return None
28
29 def timeswitch(chuo):
30
31 tupTime = time.localtime(chuo) # 秒时间戳
32 stadardTime = time.strftime("%Y-%m-%d %H:%M:%S", tupTime)
33 return stadardTime
34
def parse_page(text):
    """
    Parse one Sogou WeChat search-result page.

    :param text: raw HTML of the result page (may be None if the fetch failed)
    :return: zip of (title, article_url, summary, account_name, publish_time)
    """
    # Robustness: get_page() returns None on failure; yield no rows then.
    if not text:
        return zip()

    html = etree.HTML(text)

    # Article titles: the <a> may contain highlight markup, so take the
    # full string value of the node instead of a single text() child.
    titles = []
    for node in html.xpath("//div[@class='txt-box']/h3/a"):
        titles.append(node.xpath("string(.)"))
    print(titles)

    # Article links are site-relative; prefix the site root.
    # Bug fix: the original built every entry from ALL hrefs joined together
    # ("".join(list(base_url) + wangzhi)) and ignored the loop variable;
    # each entry must be base_url + its own relative href.
    base_url = 'https://weixin.sogou.com'
    urls = []
    for href in html.xpath("//div[@class='txt-box']/h3//@href"):
        urls.append(base_url + href)
    print(urls)

    # Article summaries.
    summaries = []
    for node in html.xpath("//p[@class='txt-info']"):
        summaries.append(node.xpath("string(.)"))
    print(summaries)

    # Official-account (公众号) names.
    accounts = html.xpath("//a[@class='account']/text()")
    print(accounts)

    # Publish time: the span text embeds a Unix timestamp; pull out the
    # digits and format them as a readable local date.
    publish_times = []
    for node in html.xpath("//div[2]/div/span"):
        raw = node.xpath("string(.)")
        digits = ''.join(re.findall(r"\d+\.?\d*", raw))
        publish_times.append(timeswitch(int(digits)))
    print(publish_times)

    # zip truncates to the shortest list, keeping the columns row-aligned.
    return zip(titles, urls, summaries, accounts, publish_times)
97
98
99
100
def change_page1(number):
    """
    Build the search-result URL for a given page number.

    :param number: 1-based page index
    :return: full Sogou WeChat search URL for that page
    """
    prefix = ('https://weixin.sogou.com/weixin?oq=&query=python&_sug_type_=1'
              '&sut=0&lkt=0%2C0%2C0&s_from=input&ri=1&_sug_=n&type=2'
              '&sst0=1604564741184&page=')
    suffix = '&ie=utf8&p=40040108&dp=1&w=01015002&dr=1'
    return '{}{}{}'.format(prefix, number, suffix)
110
111
def save_to_csv(result, filename):
    """
    Append one row to a CSV file.

    :param result: iterable of cell values making up a single row
    :param filename: path of the CSV file to append to
    """
    # utf-8-sig writes a BOM so Excel opens Chinese text correctly;
    # newline="" lets the csv module control line endings itself.
    with open(filename, 'a', encoding='utf-8-sig', newline="") as out:
        csv.writer(out, dialect='excel').writerow(result)
122
123
def main():
    """
    Entry point: write the CSV header, then crawl result pages 1-5 and
    append each parsed article row to message.csv.
    """
    # Fix: the original opened and closed the file manually; 'with'
    # guarantees the handle is closed even if writing the header fails.
    # newline="" prevents blank lines between rows on Windows.
    with open('message.csv', 'a+', encoding='utf-8-sig', newline="") as f:
        csv.writer(f).writerow(["文章名称", "文章链接地址", "摘要", "公众号名称", "发布时间"])

    for number in range(1, 6):
        url = change_page1(number)
        text = get_page(url)
        result = parse_page(text)
        for row in result:
            save_to_csv(row, filename='message.csv')


if __name__ == '__main__':
    main()