from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import urllib.error
import sqlite3
import setting
import time
import re


# Open a URL and return the raw response body
def ask_url(para_url, para_head):
    request = urllib.request.Request(url=para_url, headers=para_head)
    para_html = b''  # reading the response object yields bytes
    try:
        response = urllib.request.urlopen(request)
        para_html = response.read()
    except urllib.error.URLError as e:
        print(e.reason)

    return para_html
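
# Usage sketch: setting.head_package is assumed to be a request-headers dict
# defined in setting.py (e.g. {'User-Agent': 'Mozilla/5.0 ...'}). ask_url
# returns raw bytes (b'' on failure), so callers decode it themselves:
#   html = ask_url('https://www.baidu.com', setting.head_package).decode('utf-8')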


# Connect to the database and execute one statement
def connect(para_db, para_sql, para_args=()):
    conn = sqlite3.connect(para_db)
    c = conn.cursor()
    c.execute(para_sql, para_args)
    conn.commit()  # be sure to commit after executing!
    c.close()
    conn.close()
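
# A minimal sketch of the athlete_info schema the insert in data_handle()
# assumes; the column names come from this script, but the text affinities
# are guesses since the real DDL lives outside this file.
sql_create_athlete = '''
    create table if not exists athlete_info(
        name1 text,
        name2 text,
        instruction text,
        baike_url text,
        img_url text
    )
'''
# e.g. connect('athlete.db', sql_create_athlete)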


# Read error.txt into an error_list
def creat_error_list(txt_path):
    error_list = []
    with open(txt_path, 'r', encoding='utf-8') as file:  # closed automatically
        for i in file:
            error_list.append(i.strip())
    return error_list


# Write error_list out to error_url.txt
def creat_error_txt(error_list):
    with open('error_url.txt', 'w', encoding='utf-8') as error_file:
        for url in error_list:
            error_file.write(url + '\n')


# Open the source txt file and preprocess its data
def initial_txt_data_process(file_name):
    # Read and decode the txt file
    with open(file_name, 'rb') as file:
        read = file.read().decode('utf-8')

    # Extract classes and names with the patterns from setting.py
    class1 = re.findall(setting.re_txtFind_class, read)
    name1 = re.findall(setting.re_txtFind_name, read)

    # Clean up: drop stray carriage returns; split each name line on '、'
    class_list = [i.strip('\r') for i in class1]  # class list (for precise search)
    top_name_list = [i.strip('\r').split('、') for i in name1]  # 2-D name list: one inner list per class

    # Return the class list and the name list
    return class_list, top_name_list
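
# Expected input shape (an assumption inferred from the '、' split and from
# class i being paired with name line i in get_url below; the actual
# setting.re_txtFind_* patterns live outside this file), e.g.
#   游泳
#   张三、李四、王五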


# Step one: build the Baidu search URLs (argument: the source txt file)
def get_url(file_name):
    url_list = []
    class_list, top_name_list = initial_txt_data_process(file_name)  # classes and names
    for i in range(len(class_list)):
        for name in top_name_list[i]:
            natural_url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + urllib.parse.quote(
                '%s(%s)百度百科' % (name, class_list[i]))
            url_list.append(natural_url)
    print('URL list built, %d entries in total' % len(url_list))
    return url_list
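
# For illustration (hypothetical values): urllib.parse.quote turns a query
# such as '张三(游泳)百度百科' into its percent-encoded form, so each entry
# looks like 'https://www.baidu.com/s?ie=UTF-8&wd=%E5%BC%A0%E4%B8%89...'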


# Data processing: the actual crawl
def data_handle(url_list, error_list):
    for index in range(len(url_list)):
        athlete = []
        # time.sleep(5)
        # First page: the Baidu search results
        try:
            html_first = ask_url(url_list[index], setting.head_package).decode('utf-8')
        except Exception as e:
            print('Baidu search raised an exception:', e)
            error_list.append(url_list[index])
            continue

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html_first, 'html.parser')

        # Take the first hit from the left-hand results column
        content_left = soup.find_all('div', id='content_left')
        first_link = re.findall(setting.re_link, str(content_left))
        try:
            first_link = first_link[0]
        except IndexError:
            print('Hit the Baidu captcha, this crawl failed!')
            if url_list[index] not in error_list:
                error_list.append(url_list[index])
            continue

        # Second page: the Baidu Baike article
        # time.sleep(5)
        try:
            html_sec = ask_url(first_link, setting.head_package).decode('utf-8')
        except Exception as e:
            print('An exception occurred:', e)
            continue

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html_sec, 'html.parser')

        # Extract the names
        content = soup.find_all('dd', class_="lemmaWgt-lemmaTitle-title")
        temp_name1 = re.findall(setting.re_name1, str(content))
        temp_name2 = re.findall(setting.re_name2, str(content))
        temp_name1 = temp_name1[0] if temp_name1 else ' '  # fall back to a blank
        temp_name2 = temp_name2[0] if temp_name2 else ' '

        athlete.append(temp_name1)
        athlete.append(temp_name2)

        # Extract the summary
        content = soup.find_all('div', class_="lemma-summary")
        result = re.sub(setting.re_summary, '', str(content))
        result = re.sub(setting.re_remove, '', result)
        athlete.append(result.strip())

        # Record the Baike link that was used
        athlete.append(first_link)

        # Extract the image link from the summary-pic tag
        content = soup.find_all('div', class_="summary-pic")
        result = re.findall(setting.re_summary_img, str(content))
        athlete.append(result[0] if result else ' ')

        # Write the record to the database; ? placeholders avoid the quoting
        # bugs of string-formatted SQL
        sql_insert_athlete = '''
            insert into athlete_info(
                name1, name2, instruction, baike_url, img_url
            ) values (?, ?, ?, ?, ?)
        '''
        connect('athlete.db', sql_insert_athlete, tuple(athlete))
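

# A minimal driver sketch tying the steps together. The input file name
# 'athletes.txt' is a placeholder, not a name taken from this project;
# substitute the real source file. Failed search URLs are persisted so a
# later run can retry them via creat_error_list().
if __name__ == '__main__':
    error_list = []
    url_list = get_url('athletes.txt')  # hypothetical input file
    data_handle(url_list, error_list)
    if error_list:
        creat_error_txt(error_list)  # writes error_url.txt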