import requests
import re
from lxml import etree
# HTTP request headers sent with every page fetch; the User-Agent string
# identifies the client as a desktop Chrome browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
}
def _extract_poems(text):
    """Pull the poem records out of the HTML source of one listing page.

    Args:
        text: HTML source of the page, as a string.

    Returns:
        A list of dicts with the keys 'title', 'dynasty', 'author' and
        'content', in the order the matches appear in ``text``.
    """
    # re.DOTALL makes '.' match newlines as well, so a pattern can span
    # markup that is broken over several lines.
    # (An equivalent XPath for the titles would be
    # "//div[@class='cont']//b/text()" via lxml.)
    titles = re.findall(
        r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # Text of the first <a> following <p class="source"> is collected as the
    # dynasty, text of the second <a> as the author.
    dynasties = re.findall(
        r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    authors = re.findall(
        r'<p\sclass="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    content_tags = re.findall(
        r'<div\sclass="contson".*?>(.*?)</div>', text, re.DOTALL)

    # Drop any inline tags (e.g. <br />) left inside the poem body and trim
    # the surrounding whitespace.
    contents = [
        re.sub(r'<.*?>', "", fragment).strip() for fragment in content_tags
    ]

    # NOTE(review): the four lists are matched independently and paired up
    # purely by position; zip() also stops at the shortest list. If an entry
    # on the page lacks one of the fields, later records may be misaligned
    # or dropped -- confirm against the live markup.
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        for title, dynasty, author, content
        in zip(titles, dynasties, authors, contents)
    ]


def parse_page(url):
    """Download one listing page, print every poem found on it and return them.

    Args:
        url: Address of the page to fetch.

    Returns:
        The list of poem dicts extracted from the page (keys 'title',
        'dynasty', 'author', 'content'). The same dicts are printed to
        stdout, one per line.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Without a timeout requests waits indefinitely on an unresponsive
    # server, which would stall the whole crawl on a single page.
    response = requests.get(url, headers=headers, timeout=10)
    poems = _extract_poems(response.text)
    for poem in poems:
        print(poem)
    return poems
def main():
    """Process listing pages 1 through 10 of gushiwen.org in order."""
    url_template = "https://www.gushiwen.org/default_%s.aspx"
    first_page, last_page = 1, 10
    for page_number in range(first_page, last_page + 1):
        # Each page is handed to parse_page as soon as its URL is built.
        parse_page(url_template % page_number)
# Script entry point: start the crawl only when this file is executed
# directly, so that importing it as a module triggers no requests.
if __name__ == '__main__':
    main()