import requests as r
import re,encodings
import time
from lxml import etree
def pa(url, name, timeout=30):
    """Fetch one chapter page and append its title and text to a file.

    Args:
        url: Absolute URL of the chapter page to download.
        name: Path of the text file to append to; created if it is missing.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without it a stalled connection would block forever.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    response = r.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a chapter.
    response.raise_for_status()
    response.encoding = 'UTF-8'
    html = etree.HTML(response.text)

    # Look up the chapter title.
    titles = html.xpath('//*[@id="wrapper"]/div[3]/div/div[2]/h1/text()')
    if not titles:
        # A page without the expected heading is skipped rather than
        # aborting the whole download with an IndexError.
        print(url + ":\t未找到章节标题，已跳过")
        return
    zhangjie = titles[0]
    print(zhangjie)

    # Collect the text nodes of the chapter body via XPath.
    content = '\n'.join(html.xpath('//*[@id="content"]/text()'))

    # Append-only: the file is never read back here, so 'a' is sufficient.
    with open(name, 'a', encoding="UTF-8") as txt:
        txt.write(zhangjie + "\n")
        # Trailing newline keeps the next chapter's title on its own line.
        txt.write(content + "\n")
    print(zhangjie + ":\t写入成功")
def main():
    """Download the chapters linked from the book's index page into one file.

    The index page is fetched, the first ``<h1>`` on it becomes the output
    file name (``./<title>.txt``), and every chapter link after the first
    eight is passed to ``pa`` with a one-second pause between requests.

    Raises:
        SystemExit: if the index page contains no ``<h1>`` element to name
            the output file after.
        requests.RequestException: if the index request fails, times out,
            or returns an HTTP error status.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urljoin

    mulu_url = 'http://www.yuetutu.com/cbook_22694/'
    # Same browser-like User-Agent that pa() sends for chapter pages.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    # Number of leading links to ignore.
    # NOTE(review): presumably these are "latest chapter" shortcuts that
    # duplicate entries further down the list -- confirm against the site.
    skip_count = 8

    s = r.get(mulu_url, headers=headers, timeout=30)
    s.raise_for_status()
    s.encoding = 'utf-8'
    text = s.text
    html = etree.HTML(text)

    title_match = re.search(r'<h1>(.*?)</h1>', text)
    if title_match is None:
        raise SystemExit("no <h1> title found on index page: " + mulu_url)
    # The title comes from a remote page; replace characters that are not
    # valid in file names so it cannot break or redirect the output path.
    title = re.sub(r'[\\/:*?"<>|]', '_', title_match.group(1))
    name = "./%s.txt" % title

    mulu = html.xpath('//*[@id="list"]/dl/dd/a/@href')
    print(name)
    print(mulu)

    for href in mulu[skip_count:]:
        # urljoin handles both site-absolute and relative hrefs.
        pa(urljoin(mulu_url, href), name)
        # Throttle requests to stay polite towards the server.
        time.sleep(1)


if __name__ == '__main__':
    main()