Web Scraping in Practice - Crawling Athlete Information from Baidu Baike (func.py)
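func.py leans on a companion setting module for the request headers and the regular expressions it uses throughout. That file is not shown in this post, so the sketch below is hypothetical: only the attribute names are taken from func.py, and every value is a placeholder.

# setting.py - hypothetical sketch; replace each r'...' with the real pattern
head_package = {'User-Agent': 'Mozilla/5.0'}  # request headers for ask_url

re_txtFind_class = r'...'  # captures a category line in the source txt
re_txtFind_name = r'...'   # captures a line of names in the source txt
re_link = r'...'           # first Baike link inside Baidu's content_left div
re_name1 = r'...'          # lemma title (primary name) on the Baike page
re_name2 = r'...'          # lemma subtitle (secondary name), may be absent
re_summary = r'...'        # markup to strip from the lemma-summary block
re_remove = r'...'         # leftover noise to remove from the summary text
re_summary_img = r'...'    # image URL inside the summary-pic div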

func.py (function definitions)

from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import sqlite3
import setting
import time
import re
import os

# Open a URL and return the raw response body
def ask_url(para_url, para_head):
    request = urllib.request.Request(url=para_url, headers=para_head)
    para_html = b''  # read() on the response returns raw bytes
    try:
        response = urllib.request.urlopen(request)
        para_html = response.read()
    except urllib.error.URLError as e:  # URLError also covers HTTPError
        print(e.reason)

    return para_html

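# Usage sketch for ask_url (the header dict below is a stand-in; the real
# headers live in setting.head_package):
#
#     demo_head = {'User-Agent': 'Mozilla/5.0'}
#     raw = ask_url('https://www.baidu.com', demo_head)
#     html = raw.decode('utf-8')
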
# Connect to the database and execute a single statement
def connect(para_db, para_sql, para_args=()):
    conn = sqlite3.connect(para_db)
    c = conn.cursor()
    c.execute(para_sql, para_args)  # para_args fills any ? placeholders
    conn.commit()  # be sure to commit after executing!
    c.close()
    conn.close()

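# Usage sketch for connect(): create the athlete_info table that
# data_handle() fills. The post never shows the schema, so the columns
# below are inferred from the INSERT statement further down:
#
#     connect('athlete.db', '''
#         create table if not exists athlete_info (
#             name1 text, name2 text, instruction text,
#             baike_url text, img_url text
#         )
#     ''')
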
# Read error.txt back in as error_list
def creat_error_list(txt_path):
    error_list = []
    file = open(txt_path, 'r', encoding='utf-8')
    for i in file:
        error_list.append(i.strip())
    file.close()  # remember to close the file
    return error_list

# Write error_list out as error_url.txt
def creat_error_txt(error_list):
    error_file = open('error_url.txt', 'w', encoding='utf-8')
    for url in error_list:
        error_file.write(url)
        error_file.write('\n')
    error_file.close()

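# The two helpers above form a round trip, so failed URLs survive between
# runs. A retry pass could look like this (the loop is a sketch, not part
# of the original code):
#
#     retry_urls = creat_error_list('error_url.txt')
#     new_errors = []
#     data_handle(retry_urls, new_errors)
#     creat_error_txt(new_errors)
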
# Open the txt file to crawl and preprocess its data
def initial_txt_data_process(file_name):
    # initialize the lists
    class_list = []  # category list (used to make the search precise)
    top_name_list = []  # name list (2D: one sub-list of names per category)

    # open the txt file
    file = open(file_name, 'rb')
    read = file.read().decode('utf-8')  # decode the raw bytes
    file.close()  # remember to close the file

    # find the categories and the names
    class1 = re.findall(setting.re_txtFind_class, read)
    name1 = re.findall(setting.re_txtFind_name, read)

    # clean up the matches
    for i in class1:
        class_list.append(i.strip('\r'))

    for i in range(0, len(name1)):
        name1[i] = name1[i].strip('\r')
        name1[i] = name1[i].split()  # the delimiter was lost in transcription; split on any whitespace
        top_name_list.append(name1[i])  # redundant: the processed name1 already has the required shape

    # return the stripped category list and the (2D) name list
    return class_list, top_name_list

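# Shape sketch with illustrative values (not from the original data set):
#
#     class1, top_name_list = initial_txt_data_process('athletes.txt')
#     # class1        -> ['篮球运动员', '羽毛球运动员']
#     # top_name_list -> [['姚明', '易建联'], ['林丹']]
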
# Step one - build the Baidu search URLs (argument: the txt file to crawl)
def get_url(file_name):
    url_list = []
    class1, top_name_list = initial_txt_data_process(file_name)  # categories and names
    for i in range(0, len(class1)):
        for j in range(0, len(top_name_list[i])):
            # search query: name(category) 百度百科, percent-encoded as UTF-8
            natural_url = 'https://www.baidu.com/s?ie=UTF-8&wd=' + urllib.parse.quote(
                '%s(%s)百度百科' % (top_name_list[i][j], class1[i]))
            url_list.append(natural_url)
    print('URL list ready, %d entries in total' % len(url_list))
    return url_list

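# One generated entry, illustratively for name 姚明 in category 篮球运动员
# (urllib.parse.quote percent-encodes the UTF-8 query):
#
#     https://www.baidu.com/s?ie=UTF-8&wd=%E5%A7%9A%E6%98%8E%EF%BC%88...
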
# Data processing - the actual crawl
def data_handle(url_list, error_list):
    for index in range(0, len(url_list)):
        athlete = []
        # time.sleep(5)
        # first page - the Baidu search results
        try:
            html_first = ask_url(url_list[index], setting.head_package).decode('utf-8')
        except Exception as e:
            print('Baidu search raised an exception:', e)
            error_list.append(url_list[index])
            continue

        # parse the html with Beautiful Soup
        soup = BeautifulSoup(html_first, 'html.parser')

        # take the first hit from the left-hand results column
        content_left = soup.find_all('div', id='content_left')
        first_link = re.findall(setting.re_link, str(content_left))
        try:
            first_link = first_link[0]
        except IndexError:
            print("Hit Baidu's verification page, this crawl failed!")
            if url_list[index] not in error_list:
                error_list.append(url_list[index])
            continue

        # second page - the Baidu Baike entry
        # time.sleep(5)
        try:
            html_sec = ask_url(first_link, setting.head_package).decode('utf-8')
        except Exception as e:
            print('An exception occurred:', e)
            continue

        # parse the html with Beautiful Soup
        soup = BeautifulSoup(html_sec, 'html.parser')

        # extract the names
        content = soup.find_all('dd', class_="lemmaWgt-lemmaTitle-title")
        temp_name1 = re.findall(setting.re_name1, str(content))
        temp_name2 = re.findall(setting.re_name2, str(content))
        if temp_name1:
            temp_name1 = temp_name1[0]
        else:
            temp_name1 = ' '  # fall back to a blank so the columns stay aligned
        if temp_name2:
            temp_name2 = temp_name2[0]
        else:
            temp_name2 = ' '

        athlete.append(temp_name1)
        athlete.append(temp_name2)

        # extract the summary
        content = soup.find_all('div', class_="lemma-summary")
        result = re.sub(setting.re_summary, '', str(content))
        result = re.sub(setting.re_remove, '', result)
        result = result.strip()

        athlete.append(result)

        # record the Baike link itself
        athlete.append(first_link)

        # extract the image link
        content = soup.find_all('div', class_="summary-pic")
        result = re.findall(setting.re_summary_img, str(content))
        if result:
            result = result[0]
        else:
            result = ' '
        athlete.append(result)

        # connect to the database and insert the row; ? placeholders keep
        # quotes inside names or summaries from breaking the statement
        sql_insert_athlete = '''
            insert into athlete_info(
            name1, name2, instruction, baike_url, img_url
            ) values (?, ?, ?, ?, ?)
        '''
        connect('athlete.db', sql_insert_athlete, tuple(athlete))

 

posted @ 2021-04-23 16:37  vosoland