Baike Crawler

What it does: uses a feature string tied to the keyword to find the Baidu Baike entry that best matches that keyword.
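
For example, the crawler defined below is meant to be used roughly like this (a usage sketch; which infobox keys come back depends on the Baike page, and the summary text is stored under 'introduction'):

info = craw_bk('朝阳区', feature='长春市')  # '长春市' picks the Chaoyang District in Changchun
print(info.get('name'), info.get('introduction'))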

import re
from urllib.parse import quote, urljoin
import requests
from bs4 import BeautifulSoup
from lxml import etree

s1, s2 = '\u4e00', '\u9fa5'  # range of CJK unified ideographs (Chinese characters)
d1, d2 = '0', '9'            # range of decimal digits
po = ',、;():.-():-'          # punctuation kept when normalizing scraped text


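# get_str_baike() strips citation markers such as "[1]" from scraped text and
# keeps only Chinese characters, digits, ASCII letters and common punctuation.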
def get_str_baike(s):
    ans = ''
    s = re.sub(r'\[[^\[]*\]', '', s)  # drop citation markers like [1]
    for ch in s:
        if ch in po or s1 <= ch <= s2 or d1 <= ch <= d2 or 'A' <= ch <= 'Z' or 'a' <= ch <= 'z':
            ans += ch
    return ans


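# craw_bk() looks up `key` on Baidu Baike; `feature` is a hint string used to
# pick the right entry when several entities share the same name.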
def craw_bk(key, feature=''):
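    # Fetch `url` with a desktop User-Agent and return it parsed as an lxml
    # tree, so the search-result page can be queried with XPath.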
    def rt_response(url):
        session = requests.session()
        session.headers['User-Agent'] = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
                                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                                         'Chrome/34.0.1847.131 Safari/537.36')
        html = session.get(url)
        html.encoding = 'utf-8'
        return etree.HTML(html.text)

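    # Download a page as text with a mobile User-Agent; returns "" on any
    # request error so callers can fall back gracefully.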
    def get_raw_html(url, code='UTF-8'):
        head = {
            'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36"
        }
        try:
            r = requests.get(url, headers=head, timeout=10)
            r.encoding = code
            html = r.text
        except requests.RequestException:
            print("open error", url)
            return ""
        return html

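    # Parse a lemma page: collect the entry title ('name'), the infobox
    # name/value pairs and the summary paragraphs ('introduction').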
    def get_key_val(html):
        ans = dict()
        soup = BeautifulSoup(html, 'lxml')
        dd = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        if dd:
            ans['name'] = dd.find('h1').get_text()
        dt = soup.find_all('dt', class_='basicInfo-item name')
        dd = soup.find_all('dd', class_='basicInfo-item value')
        for name_tag, value_tag in zip(dt, dd):  # infobox field / value pairs
            field = name_tag.get_text().strip('\n')
            value = value_tag.get_text().strip('\n')
            # keep only the Chinese characters of the field name as the dict key
            field = ''.join([ch for ch in field if s1 <= ch <= s2])
            ans[field] = value
        div = soup.find('div', class_='lemma-summary')
        if div:
            pa = div.find_all('div', class_='para')
            txt = '\n'.join([it.get_text() for it in pa])
            li = txt.strip('\n').split('\n')
            txt = '\n'.join([it for it in li if it.strip()])  # drop blank lines
            ans['introduction'] = txt
        return ans

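    # Fall back to Baike's search page: query for "key + feature" and follow
    # the first result, if any.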
    def search_find(key, feature):
        key = quote(key + feature)
        url = 'http://baike.baidu.com/search/none?word={}'.format(key)
        response = rt_response(url)
        hrefs = response.xpath('//a[@class="result-title"]/@href')
        if hrefs:
            href = urljoin(url, hrefs[0])
            url = href + '?noadapt=1'
            html = get_raw_html(url, code='UTF-8')
            ans = get_key_val(html)
            return ans
        else:
            return None
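    # Main flow: try the direct lemma URL first; if that page does not mention
    # `feature`, fall back to the search-based lookup.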
    s = quote(key)
    url = 'http://baike.baidu.com/item/' + s + '?noadapt=1'
    html = get_raw_html(url)
    soup = BeautifulSoup(html, 'lxml')
    s = soup.find('div', class_="main-content")
    if s and feature in s.get_text():  # the feature string appears on the page: direct hit
        ans = get_key_val(html)
    else:
        ans = search_find(key, feature)  # otherwise search and take the first result
    if ans is None:  # neither lookup found anything
        return {}
    for k, v in ans.items():  # normalize every extracted string
        ans[k] = get_str_baike(v)
    return ans


if __name__ == '__main__':
    ans = craw_bk('朝阳区', feature='长春市')
    for key, val in ans.items():
        print(f'{key}:{val}')