# -*- coding: utf-8 -*-
"""
# File : 获取西游记数据.py
# Time :2022/4/24 17:38
# Author :希维
# version :python 3.8
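# Description: Scrape the chapters of Journey to the West (西游记) from www.gdwxcn.com and save each chapter as a UTF-8 text file.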
"""
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from lxml import etree
import os
def getTitleLink(url, timeout=10):
    """
    Fetch the table-of-contents page and collect chapter titles and links.

    :param url: address of the table-of-contents page to download.
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error; without it a stalled connection would block
        the script indefinitely.
    :return: dict mapping chapter title -> absolute chapter URL, in the
        order the anchors appear on the page. Anchors without an ``href``
        or without any visible text are skipped.
    """
    # Function-scope import: keeps this block self-contained.
    from urllib.parse import urljoin

    headers = {
        'user-Agent': UserAgent().random
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # The page is decoded as GBK; undecodable bytes are dropped, not raised.
    resp = response.content.decode('gbk', errors='ignore')
    e = etree.HTML(resp)
    # NOTE(review): etree.HTML appears to return None for an empty document
    # in some lxml versions - guard so we never call .xpath on None.
    chapters = [] if e is None else e.xpath("/html/body/div[5]/div[2]/ul/li/a")
    print('chapters', chapters)
    titleLink = {}  # chapter title -> absolute chapter link
    for each in chapters:
        href = each.get('href')
        if not href:
            # Previously this produced the bogus link ".../None".
            continue
        # itertext() also picks up text nested in child tags, where
        # each.text alone would be None.
        title = ''.join(each.itertext()).strip()
        if not title:
            continue
        # urljoin resolves against the site root like the old string
        # concatenation did, but avoids "//" for root-relative hrefs and
        # leaves already-absolute hrefs intact.
        titleLink[title] = urljoin('https://www.gdwxcn.com/', href)
    return titleLink
def getText(url, timeout=10):
    """
    Download one chapter page and return its paragraph text.

    :param url: address of the chapter page.
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error; without it a stalled connection would block
        the script indefinitely.
    :return: list of text strings selected by the paragraph XPath below
        (empty when nothing matches or the page could not be parsed).
    """
    headers = {
        'user-Agent': UserAgent().random
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # The page is decoded as GBK; undecodable bytes are dropped, not raised.
    resp = response.content.decode('gbk', errors='ignore')
    e = etree.HTML(resp)
    # NOTE(review): etree.HTML appears to return None for an empty document
    # in some lxml versions - guard so we never call .xpath on None.
    if e is None:
        text = []
    else:
        text = e.xpath('/html/body/div[5]/div/div[1]/p/text()')
    print('text:', text)
    return text
def mkdir(path):
    """
    Make sure the directory ``path`` exists.

    When nothing exists at ``path`` the directory is created (together with
    any missing parent directories) and two progress lines are printed.
    When something already exists there, a single notice is printed and the
    file system is left untouched.
    """
    if os.path.exists(path):
        print("--- There is this folder! ---")
        return

    # makedirs also creates any intermediate directories that are missing.
    os.makedirs(path)
    print("--- new folder... ---")
    print("--- OK ---")
def main(output_dir=None):
    """
    Download every chapter of the novel and save each one as a text file.

    :param output_dir: directory that receives the chapter files. When
        omitted, the original hard-coded location is used, so existing
        callers of ``main()`` behave as before.

    Each chapter is written to ``<index><title>.txt`` (UTF-8), where
    ``index`` starts at 1 and follows the order returned by
    ``getTitleLink``.
    """
    if output_dir is None:
        output_dir = "D:\\Users\\Donal\\Documents\\pythonProject\\爬虫\\西游记小说爬取/西游记"
    url = 'https://www.gdwxcn.com/gdxs/xyj/'
    titleLink = getTitleLink(url)
    mkdir(output_dir)  # create the output folder if it is missing

    # Characters that Windows does not allow in file names; a title that
    # contains one would otherwise make open() fail.
    forbidden = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for i, (title, link) in enumerate(titleLink.items(), start=1):
        # Fetch first so a failed download does not leave an empty file.
        paragraphs = getText(link)
        file_name = str(i) + str(title).translate(forbidden) + '.txt'
        # The with-statement closes the file; no explicit close() needed.
        with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as f:
            f.write(''.join(paragraphs))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()