import json
import os
import re
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
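# Both HTML parsers are used below: BeautifulSoup for pulling text out of
# HTML fragments, PyQuery for CSS-selector lookups against the full page.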
class BaiDuPerson:
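    """Scrape one person's Baidu Baike entry and save the parsed data as JSON."""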
    def __init__(self, name):
        self.temp_url = 'https://baike.baidu.com/search/word?word='  # search endpoint; the URL-quoted name is appended
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
        }
        self.response = ''  # decoded HTML of the entry page
        self.save_path = r'E:\百度json文件'  # output directory for the per-person JSON files
        self.name = name
        self.run()  # scrape and persist immediately on construction
def get_response(self):
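        """Fetch the Baike entry for self.name and store the decoded HTML."""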
url = self.temp_url + quote(self.name)
response = requests.get(url=url, headers=self.headers)
self.response = response.content.decode('utf8')
    def check_ambiguity(self):
        """Check whether the name is ambiguous, i.e. refers to several people."""
        doc = pq(self.response)
        # Baike renders a polysemant list when one name maps to multiple entries.
        ul = doc('.polysemantList-wrapper.cmn-clearfix')
        return bool(ul)
    def get_introduction(self):
        """
        Return the summary paragraph at the top of the entry.
        """
        soup = BeautifulSoup(self.response, "lxml")
        try:
            result = soup.select(".lemma-summary")[0].text
        except IndexError:
            result = ''
        return result
    def get_person_lifetime(self):
        """
        Extract the biography data, keyed by section heading.
        """
res = self.response.split('<h2 class="title-text"')
h2_dict = {}
        if len(res) == 1:
            # No section headings on the page: treat the whole article as a
            # single '生平' (biography) section.
            doc = pq(self.response)
            content = doc('.para').text()
            h2_dict['生平'] = content
else:
for h2 in res[1:]:
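                # Trim trailing page furniture (photo album, reference list,
                # recommendations, comments, sidebar) from the fragment.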
if '<div class="album-list">' in h2:
h2 = h2.split('<div class="album-list">')[0]
if '<dt class="reference-title"' in h2:
h2 = h2.split('<dt class="reference-title"')[0]
if '<div class="rs - container - foot"' in h2:
h2 = h2.split('<div class="rs - container - foot"')[0]
if '<div class="tashuo-bottom"' in h2:
h2 = h2.split('<div class="tashuo-bottom"')[0]
if '<div class="go-auth-box"' in h2:
h2 = h2.split('<div class="go-auth-box"')[0]
if '<div class="side-content">' in h2:
h2 = h2.split('<div class="side-content">')[0]
h2 = '<h2 class="title-text"' + h2
soup = BeautifulSoup(h2, "lxml")
h2_key = soup.find("h2").get_text().replace(self.name, '').strip()
h3_dict = {}
if "<h3" in h2:
for h3 in h2.split("<h3")[1:]:
tmp3 = {}
h3 = "<h3" + h3
soup = BeautifulSoup(h3, "lxml")
                        h3_text = soup.find("h3").get_text()
                        h3_title = h3_text.replace(self.name, '').strip()
if "<ul" in h3:
res = h3.split("<ul")
ul_dict = {}
for ul in res[1:]:
ul = "<ul" + ul
soup = BeautifulSoup(ul, "lxml")
ul_title = soup.find("ul").get_text().replace(self.name, '').strip()
tmp1 = {}
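                                # The list's values sit in the <div> nodes that
                                # follow the closing </ul> tag.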
for item in ul.split("</ul>")[1:]:
                                    v_list = []  # holds the multiple relation values under this key
soup = BeautifulSoup(item, "lxml")
ul_vlist = soup.find_all("div")
                                    for i in ul_vlist:
                                        ul_v = i.get_text().replace("\xa0", '')
                                        # Drop citation superscripts such as [1].
                                        ul_v = re.sub(r"\[\d+\]", "", ul_v)
                                        if ul_v:
                                            v_list.append(ul_v)
tmp1[ul_title] = v_list
ul_dict.update(tmp1)
h3_dict.update(ul_dict)
else:
                            h3_v = soup.get_text().replace(h3_text, "").replace("\xa0", '')
                            h3_v = re.sub(r"\[\d+\]", "", h3_v)  # drop citation markers such as [1]
                            tmp3[h3_title] = [h3_v]
h3_dict.update(tmp3)
tmp2 = {h2_key: h3_dict}
h2_dict.update(tmp2)
else:
                    h2_v = soup.get_text().replace(soup.find("h2").get_text(), "").replace("\xa0", '')
                    h2_v = re.sub(r"\[\d+\]", "", h2_v)  # drop citation markers such as [1]
                    # Keep non-empty lines and skip the "编辑" (edit) link text.
                    h2_v_list = [item for item in h2_v.split("\n") if item and item != '编辑']
tmp = {h2_key: h2_v_list}
h2_dict.update(tmp)
return h2_dict
def get_relationship(self):
"""
获取人物关系
"""
relationship = []
soup = BeautifulSoup(self.response, "lxml")
res_ship = soup.select(".info .name")
res_value = soup.select(".info .title")
        for ship, value in zip(res_ship, res_value):
            relationship.append([self.name, ship.string, value.string])
return relationship
def get_person_details(self):
"""获取人物标签栏数据"""
doc = pq(self.response)
        keys = [dt.text().replace(' ', '') for dt in doc('.basic-info.cmn-clearfix dt').items()]
        values = [dd.text().replace(' ', '') for dd in doc('.basic-info.cmn-clearfix dd').items()]
        person_detail_dict = dict(zip(keys, values))
return person_detail_dict
    def get_name(self):
        """Return the person's name as shown in the page's <h1> title."""
        soup = BeautifulSoup(self.response, "lxml")
        try:
            name = soup.find("h1").text
        except AttributeError:
            name = ''
        return name
def run(self):
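        """Fetch the page; log ambiguous names, otherwise save the parsed record as JSON."""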
self.get_response()
check_ambiguity_result = self.check_ambiguity()
        if check_ambiguity_result:
            # Log ambiguous names to 有歧义.txt ("ambiguous") for manual review.
            with open('有歧义.txt', 'a', encoding='utf8') as f:
                f.write(self.name + '\n')
else:
introduction = self.get_introduction()
person_name = self.get_name()
relationship = self.get_relationship()
person_lifetime = self.get_person_lifetime()
person_detail = self.get_person_details()
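            # Merge everything into one record; the biography sections join the
            # fixed keys at the top level.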
person_information = dict()
person_information['Introduction'] = introduction
person_information['Rel'] = relationship
person_information['Details'] = person_detail
person_information.update(person_lifetime)
            with open(os.path.join(self.save_path, person_name + '.json'), 'w', encoding='utf8') as f:
                f.write(json.dumps(person_information, ensure_ascii=False))
if __name__ == '__main__':
    name = '裴寂'  # sample query: Pei Ji, an early Tang statesman
BaiDuPerson(name)