# encoding=utf-8
import json  # json module: read, parse and generate JSON-formatted content
import time
from random import randint
import requests  # HTTP library used to send the network requests
from bs4 import BeautifulSoup  # helper for parsing page content
from lxml import etree
import re  # regular expressions
import csv
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change the default encoding of standard output
# Global variables holding the scraped fields
booknamelist = []
authorlist = []
typelist = []
contentlist = []
novel = []
def get_data(url):
"""
获取数据
:param url: 请求网址
:return:返回请求的页面内容
"""
# 请求头,模拟浏览器,否则请求会返回418
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'}
time.sleep(randint(1, 8))#控制爬虫速率以保证不会很快被网站察觉否则爬两页就会被封
resp = requests.get(url=url, headers=header) # 发送请求
resp.encoding = 'utf-8'
if resp.status_code == 200:
# 如果返回成功,则返回内容
return resp.text
else:
# 否则,打印错误状态码,并返回空
print('返回状态码:', resp.status_code)
return ''
# Write the collected data to a CSV file
def writeIntoCSVFile(fileName):
    '''
    :param fileName: path of the CSV file to save
    :return: None
    '''
    # newline='' prevents the csv writer from inserting blank lines between rows
    # Reference: https://blog.csdn.net/weixin_44064937/article/details/105745398
    with open(fileName, 'w', newline='', encoding='utf-8') as f:
        csv_writer = csv.writer(f)
        # Write the header row
        csv_writer.writerow(['作者', '书名', '类型', '简介', '小说'])
        for i in range(len(booknamelist)):
            csv_writer.writerow(
                [str(authorlist[i]).strip(), str(booknamelist[i]).strip(), str(typelist[i]).strip(),
                 str(contentlist[i]).strip(), str(novel[i])])  # strip() removes surrounding whitespace
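# A minimal sanity-check sketch (not part of the original scraper): read the file
# written at the bottom of this script back with the standard csv module to
# confirm the rows were saved as expected.
#
# with open('data/cultural2.txt', newline='', encoding='utf-8') as f:
#     for row in csv.reader(f):
#         print(row)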
# Crawl the listing pages (pages 30-99 here; the full category has roughly 600 pages / 9000 records, adjust the range as needed)
for i in range(30, 100):
    url = 'http://read.nlc.cn/yuewen/index?&pageNo=' + str(i) + '&categoryId=14500'  # the query parameters can be adjusted for other categories
    html = get_data(url=url)  # fetch the page
    root = etree.HTML(html)
    booknames = root.xpath('//li/a/span[@class="right"]/span[@class="tt"]/text()')  # book titles
    authors = root.xpath('//li/a/span[@class="right"]/span[@class="txt1"]/text()')  # author info
    types = root.xpath('//li/a/span[@class="right"]/span[@class="txt1"]/i/text()')  # category/type
    contents = root.xpath('//li/a/span[@class="right"]/span[@class="txt2"]/text()')  # synopsis
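    # The XPaths above assume each list item roughly follows this markup
    # (inferred from the selectors themselves, not verified against the live page):
    # <li><a href="...">
    #   <span class="right">
    #     <span class="tt">title</span>
    #     <span class="txt1">author <i>type</i></span>
    #     <span class="txt2">synopsis</span>
    #   </span>
    # </a></li>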
    for bookname in booknames:
        # strip() returns a new string, so append the stripped value instead of discarding it
        booknamelist.append(str(bookname).strip())
        # print(bookname)
    for author in authors:
        author = str(author).strip()
        if author:  # the extracted text contains whitespace-only entries; keep only real author names
            authorlist.append(author)
        # print(author)
    for booktype in types:  # renamed so the built-in `type` is not shadowed
        typelist.append(str(booktype).strip())
        # print(booktype)
    for content in contents:
        novel.append("传记")  # category label ("biography") for this crawl; change it when scraping a different category
        contentlist.append(str(content).strip())
        # print(content)
print("第", i, "页爬取完成")
# print("bookname:list:", booknamelist)
# print("authorlist:", authorlist)
writeIntoCSVFile(fileName='data/cultural2.txt')
print('done')