#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import requests
import re
# Matches bracketed numeric citation markers such as "[1]", "[12]" or "[1-3]",
# as well as newline characters. Compiled once at import time.
_SUP_RE = re.compile(r'\[\d+(?:-\d+)?\]|\n')


def remove_sup(string):
    """Strip citation markers and newlines from a piece of text.

    Args:
        string: Text that may contain markers like ``[1]`` or ``[2-4]``.

    Returns:
        ``string`` with every bracketed numeric marker and every newline
        character removed. Brackets holding anything other than a number
        or a ``start-end`` number range are left untouched.
    """
    # The range part is optional so single-digit markers ("[1]") match too.
    return _SUP_RE.sub('', string)
def query(content):
    """Fetch the summary text of a Baidu Baike entry.

    Args:
        content: Entry title to look up; it is percent-encoded into the URL.

    Returns:
        The text of every ``div.para`` found inside the page's
        ``div.lemma-summary`` element, passed through ``remove_sup`` and
        each followed by a newline. An empty string is returned when the
        page contains no ``lemma-summary`` element.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
    """
    url = 'https://baike.baidu.com/item/' + urllib.parse.quote(content)
    # A browser-like User-Agent is sent with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    # Use a context manager so the connection is closed even if decoding fails.
    with urllib.request.urlopen(req) as response:
        text = response.read().decode('utf-8')
    soup = BeautifulSoup(text, "html.parser")
    summary = soup.find("div", attrs={'class': 'lemma-summary'})
    if summary is None:
        # find() returns None when no element matches; report "no summary"
        # instead of failing with AttributeError on .find_all below.
        return ''
    paragraphs = summary.find_all("div", attrs={'class': 'para'})
    return ''.join(remove_sup(p.get_text()) + '\n' for p in paragraphs)
if __name__ == "__main__":
    # Script entry point: prompt for an organisation name (the prompt's
    # example is Tsinghua University) and print the introduction that
    # query() retrieves for it from Baidu Baike.
    print('请输入要查询的单位(如清华大学)')
    content = input('请输入:')
    result = query(content)
    print(result)