import requests
import re
import time
import json
from requests.exceptions import RequestException
def get_html_page(url, timeout=10):
    """Fetch ``url`` with a browser-like User-Agent and return the body text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as ``str`` when the status code is 200; ``None``
        for any other status or when the request raises ``RequestException``
        (which includes timeouts).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Best-effort fetch: callers treat None as "page unavailable".
        return None
    if response.status_code == 200:
        return response.text
    return None
def get_parse_page(html):
    """Yield one dict per ranking entry found in ``html``.

    Each ``<li data-rid...>`` entry is matched with a regular expression that
    captures three groups: the text before ``<cite>`` in the first ``<span>``,
    the text of the ``data-bid`` link, and the text of the ``data-eid`` link.

    Args:
        html: Page markup as ``str``. ``None`` or an empty string yields
            nothing, so a failed download does not raise ``TypeError``.

    Yields:
        ``{'rank': ..., 'title': ..., 'author': ...}`` with the captured
        strings, in document order.
    """
    if not html:
        return
    # Raw strings keep the \s / \S regex escapes from being read as
    # (invalid) string escapes; the pattern text itself is unchanged.
    pattern = re.compile(
        r'<li data-rid.*?>[\s\S.]*?<span class=.*?>(.*?)<cite>[\s\S.]*?<h4><a.*?data-bid=.*?>(.*?)</a>'
        r'</h4>[\s\S]*?<p class="author">[\s\S]*?<img.*?data-eid=.*?>(.*?)</a>',
        re.S,
    )
    for rank, title, author in pattern.findall(html):
        yield {
            'rank': rank,
            'title': title,
            'author': author,
        }
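# NOTE(review): unused alternative pattern, kept for reference; it appears to capture title, author and the intro paragraph - confirm before reuse.
#<h4><a.*?data-bid=.*?>(.*?)</a></h4>[\s\S]*?<p class="author">[\s\S]*?<a.*?target="_blank">(.*?)</a><em>.*?</span>[\s\S.]*?<p class="intro">[\s\S]*?(.*?)[\s\S]*?</p>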
def write_to_file(content):
    """Append ``content`` to ``result.txt`` as a single JSON line.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    with open('result.txt', mode='a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')
def main(page):
    """Download one page of the qidian.com ``rank/yuepiao`` listing and save it.

    Args:
        page: Page number appended to the URL's ``page`` query parameter.

    The page is fetched with ``get_html_page``, parsed with
    ``get_parse_page``, and every parsed entry is passed to
    ``write_to_file``. A page that could not be fetched is skipped.
    """
    url = 'https://www.qidian.com/rank/yuepiao?page=' + str(page)
    html = get_html_page(url)
    if html is None:
        # get_html_page returns None on a request error or non-200 status;
        # handing None to the regex parser would raise TypeError and abort
        # the remaining pages, so skip this page instead.
        return
    for record in get_parse_page(html):
        write_to_file(record)
if __name__ == '__main__':
    # Scrape pages 1 through 5, pausing one second after each page.
    for page_number in range(1, 6):
        main(page_number)
        time.sleep(1)