- 普通方式编写
from lxml import html
import requests
import os
# Home page of the site; the book names and links are read from its
# `sub-menu` list further down.
url_first = "http://www.daomubiji.com/"
def get_text(url, timeout=10):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    response = requests.request('get', url, timeout=timeout)
    return response.text
# Fetch the home page and collect every book listed in its sub-menu.
text = get_text(url_first)
etree = html.etree.HTML(text)
# Names of all books.
book_name = etree.xpath("//ul[@class='sub-menu']/li/a/text()")
# Links to all books.
book_link = etree.xpath("//ul[@class='sub-menu']/li/a/@href")
try:
    # zip() pairs each name with its link and stops at the shorter list, so
    # XPath results of different lengths cannot raise IndexError.
    for name, book_url in zip(book_name, book_link):
        second = get_text(book_url)
        etree_second = html.etree.HTML(second)
        # Chapter links and chapter titles of the current book.
        link = etree_second.xpath("//article[@class='excerpt excerpt-c3']/a/@href")
        book_title = etree_second.xpath("//article[@class='excerpt excerpt-c3']/a/text()")
        print(name)
        # One directory per book, created on first use.
        if not os.path.exists(name):
            os.mkdir(name)
        for chapter_url, title in zip(link, book_title):
            book_content = get_text(chapter_url)
            content_etree = html.etree.HTML(book_content)
            # Text of every paragraph in the chapter body.
            book_p = content_etree.xpath('//article[@class="article-content"]/p/text()')
            print(book_p)
            try:
                # Explicit UTF-8 so the Chinese text does not depend on the
                # platform's default encoding.
                with open('{}/{}.txt'.format(name, title), 'w+', encoding='utf-8') as f:
                    f.writelines(data + "\n" for data in book_p)
            except Exception as exc:
                # Best effort: a chapter that cannot be written must not
                # abort the rest of the crawl, but the cause is reported.
                print("编写出现异常", exc)
        # NOTE(review): the original indentation was lost; these debug prints
        # are assumed to run once per book, after its chapters - confirm.
        print(link, len(link))
        print(book_title, len(book_title))
except Exception as exc:
    # Top-level boundary of the script: report the cause instead of hiding it.
    print("未知错误", exc)
- 面向对象方式编写
from lxml import html
import requests
import os
class DaoMuNote:
    """Small scraping helper: fetch pages, parse them and save text to disk."""

    def __init__(self, url=None):
        """Remember *url* as the default page for ``get_response_text``."""
        self.url = url

    def get_response_text(self, url=None, timeout=10):
        """Fetch a page with an HTTP GET and return its body as text.

        Args:
            url: Page to download; falls back to ``self.url`` when omitted.
            timeout: Seconds to wait for the server. Without a timeout a
                stalled connection would block the caller indefinitely.

        Returns:
            The decoded response body (``response.text``).
        """
        if url is None:
            url = self.url
        response = requests.request('get', url=url, timeout=timeout)
        return response.text

    def create_etree(self, text):
        """Parse the HTML string *text* and return the resulting element tree."""
        return html.etree.HTML(text)

    def create_files(self, file_name):
        """Ensure the directory *file_name* exists and return its name.

        The name is also stored on the instance as ``self.file_name``.
        """
        self.file_name = file_name
        # Try to create the directory and tolerate "already exists" instead
        # of checking first, which would leave a gap between check and create.
        try:
            os.mkdir(file_name)
        except FileExistsError:
            pass
        return file_name

    def write_text(self, file_name, files, text):
        """Append *text* plus a newline to ``<file_name>/<files>.txt``.

        Args:
            file_name: Directory that holds the file.
            files: File name without the ``.txt`` extension.
            text: Line to append.

        Errors are reported on stdout and never raised.
        """
        try:
            # Explicit UTF-8 so the Chinese text does not depend on the
            # platform's default encoding.
            with open('{}/{}.txt'.format(file_name, files), 'a+', encoding='utf-8') as f:
                f.write(text + '\n')
        except Exception as exc:
            # Best effort by contract: report the cause but do not raise.
            print("编写出现错误", exc)

    # Original (misspelled) name, kept so existing callers keep working.
    writr_text = write_text
url = "http://www.daomubiji.com/"
# Create the scraper helper for the site.
note = DaoMuNote(url=url)
# Home page: link and title of every book in the sub-menu.
first_content = note.get_response_text(note.url)
first_etree = note.create_etree(first_content)
first_links = first_etree.xpath("//ul[@class='sub-menu']/li/a/@href")
first_title = first_etree.xpath("//ul[@class='sub-menu']/li/a/text()")
try:
    # Visit every book page. zip() keeps links and titles in lockstep and
    # stops at the shorter list, so mismatched lengths cannot raise IndexError.
    for book_link, book_title in zip(first_links, first_title):
        print(book_link)
        second_content = note.get_response_text(book_link)
        second_etree = note.create_etree(second_content)
        # Chapter links and chapter titles of the current book.
        second_links = second_etree.xpath("//article[@class='excerpt excerpt-c3']/a/@href")
        second_titles = second_etree.xpath("//article[@class='excerpt excerpt-c3']/a/text()")
        # One directory per book, named after the book title.
        file_name = note.create_files(book_title)
        for chapter_link, chapter_title in zip(second_links, second_titles):
            third_content = note.get_response_text(chapter_link)
            third_etree = note.create_etree(third_content)
            # Text of every paragraph in the chapter body.
            book_content = third_etree.xpath('//article[@class="article-content"]/p/text()')
            print(book_content)
            print(file_name, chapter_title)
            # writr_text appends one trailing newline, so joining the
            # paragraphs with "\n" yields the same file content as writing
            # them one by one, while opening the file once per chapter.
            if book_content:
                note.writr_text(file_name, chapter_title, "\n".join(book_content))
except Exception as e:
    # Top-level boundary of the script: report the failure and stop.
    print("出现异常",e)