import requests
import os
from lxml import html
import time
def get_title_url(tree):
    '''Level 1: extract the history-book links from the site index page.

    :param tree: lxml HTML tree of the site's index page.
    :return: (url_list, name_list) tuple — relative URLs like
        ``/book/sanguoyanyi.html`` and titles like ``三国演义``.
    '''
    # All book anchors live in the third 'index-li' section ("史书典籍").
    anchor_base = "//div[@class='index-li'][3]/ul/li/a"
    book_urls = tree.xpath(anchor_base + "/@href")
    book_names = tree.xpath(anchor_base + "/text()")
    return book_urls, book_names
def get_article_url(tree):
    '''Level 2: extract the chapter links from a book's table of contents.

    :param tree: lxml HTML tree of a book page.
    :return: (url_list, name_list) tuple — relative URLs like
        ``/book/sanguoyanyi/1.html`` and chapter titles like
        ``第一回·宴桃园豪杰三结义 斩黄巾英雄首立功``.
    '''
    # Chapter anchors are listed inside the 'book-mulu' (table of contents) div.
    toc_anchor = "//div[@class='book-mulu']/ul/li/a"
    chapter_urls = tree.xpath(toc_anchor + "/@href")
    chapter_names = tree.xpath(toc_anchor + "/text()")
    return chapter_urls, chapter_names
def get_article(tree):
    '''Level 3: extract the chapter body text from a chapter page.

    :param tree: lxml HTML tree of a chapter page
        (e.g. ``/book/sanguoyanyi/1.html``).
    :return: all paragraph texts of the chapter joined into one string.
    '''
    paragraphs = tree.xpath("//div[@class='chapter_content']/p/text()")
    # Concatenate every <p> text node with no separator.
    return ''.join(piece for piece in paragraphs)
def get_request(url, headers, timeout=30):
    '''Fetch *url* and return the page parsed as an lxml HTML tree.

    :param url: absolute URL to fetch.
    :param headers: dict of HTTP headers (user-agent, etc.).
    :param timeout: seconds before the request is aborted. Without a
        timeout, ``requests.get`` can block forever on a stalled
        connection; the default keeps existing call sites working.
    :return: lxml ``HtmlElement`` root of the parsed page.
    '''
    response = requests.get(url=url, headers=headers, timeout=timeout)
    tree = html.fromstring(response.text)
    return tree
def save_mkdir(two):
    '''Create the output directory ``史书典籍/<two>`` (both levels) if missing.

    :param two: book name used as the second-level directory name.
    '''
    # os.makedirs with exist_ok=True replaces the original racy
    # exists()/mkdir() pairs and creates both directory levels atomically
    # from the caller's point of view.
    os.makedirs('史书典籍/' + two, exist_ok=True)
def police_2(a):
    '''Level-2 (book) resume checkpoint.

    Return True if book index *a* should be crawled now, False if a
    previous (interrupted) run already progressed past it. Progress is
    persisted in ``史书典籍/police_2.txt``.

    Fixes vs. the original: the checkpoint file is now written on the
    first run too (originally ``b is None`` returned True without ever
    creating the file, so resuming never worked), and the fragile
    identity comparison ``b is ''`` (a SyntaxWarning on modern Python)
    is replaced by a truthiness test. The redundant ``f.close()`` inside
    ``with`` blocks is gone.

    :param a: current book index.
    :return: True to crawl, False to skip.
    '''
    saved = None
    if os.path.exists('史书典籍/police_2.txt'):
        with open('史书典籍/police_2.txt', 'r') as f:
            saved = f.read()
    # Non-empty checkpoint: skip indexes a previous run already covered.
    if saved and a < int(saved):
        return False
    # Record progress (also bootstraps the file on the first run).
    with open('史书典籍/police_2.txt', 'w') as f:
        f.write(str(a))
    return True
def police_3(a):
    '''Level-3 (chapter) resume checkpoint.

    Return True if chapter index *a* should be crawled now, False if a
    previous (interrupted) run already progressed past it. Progress is
    persisted in ``史书典籍/police_3.txt``.

    Fixes vs. the original: the checkpoint file is now written on the
    first run too (originally ``b is None`` returned True without ever
    creating the file, so resuming never worked), and the fragile
    identity comparison ``b is ''`` (a SyntaxWarning on modern Python)
    is replaced by a truthiness test. The redundant ``f.close()`` inside
    ``with`` blocks is gone.

    :param a: current chapter index.
    :return: True to crawl, False to skip.
    '''
    saved = None
    if os.path.exists('史书典籍/police_3.txt'):
        with open('史书典籍/police_3.txt', 'r') as f:
            saved = f.read()
    # Non-empty checkpoint: skip indexes a previous run already covered.
    if saved and a < int(saved):
        return False
    # Record progress (also bootstraps the file on the first run).
    with open('史书典籍/police_3.txt', 'w') as f:
        f.write(str(a))
    return True
def main():
    '''Entry point: crawl the "史书典籍" (history classics) section of
    shicimingju.com — index page → each book's table of contents → each
    chapter — saving every chapter as a UTF-8 text file under
    ``史书典籍/<book>/<chapter>.txt``.
    '''
    # Site root URL.
    root = 'http://www.shicimingju.com'
    # Pretend to be a desktop Chrome browser.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36'
    }
    # Fetch the index page.
    tree1 = get_request(root, headers)
    # Level 1: book names and relative URLs.
    History_book_url_list, History_book_name_list = get_title_url(tree1)
    # Level 2: walk every book.
    for i in range(len(History_book_url_list)):
        # Skip books a previous interrupted run already passed.
        if police_2(i) is False:
            continue
        url2 = root + History_book_url_list[i]
        print("爬取>>>" + History_book_name_list[i] + '开始')
        tree2 = get_request(url2, headers)
        # Chapter names and relative URLs for this book.
        book_url_list, book_name_list = get_article_url(tree2)
        # Ensure the output folder for this book exists.
        save_mkdir(History_book_name_list[i])
        # Level 3: download every chapter.
        for j in range(len(book_url_list)):
            if police_3(j) is False:
                continue
            time.sleep(1)  # throttle: be polite to the server
            url3 = root + book_url_list[j]
            print("爬取:" + book_name_list[j])
            tree3 = get_request(url3, headers)
            txt = get_article(tree3)
            txt_name = book_name_list[j]
            # Strip spaces and '·' so the chapter title is a safe filename.
            file_path = '史书典籍/{}/{}.txt'.format(
                History_book_name_list[i],
                txt_name.replace(' ', '').replace('·', ''))
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(txt)
        # BUG FIX: the chapter checkpoint (police_3.txt) is shared across
        # books; without resetting it here, the leading chapters of the
        # NEXT book would be skipped. Remove it once this book completes.
        if os.path.exists('史书典籍/police_3.txt'):
            os.remove('史书典籍/police_3.txt')
        print("爬取>>>" + History_book_name_list[i] + '结束')
# Run the crawler only when executed as a script (not on import).
if __name__ == '__main__':
    main()