import requests
import re
import time
from bs4 import BeautifulSoup
import csv
import xlrd
from xlutils.copy import copy
import random
# Suppress urllib3 InsecureRequestWarning noise: every request below is made
# with verify=False (the target site's TLS certificate is not validated).
requests.packages.urllib3.disable_warnings()
class Spider():
    """Scrape bid-announcement titles for one keyword from dlzb.com search.

    ``getPage`` walks the paginated search results, extracts announcement
    titles and appends them to an existing ``.xls`` workbook; ``writeXlx``
    offers a CSV alternative for a single row.
    """

    def __init__(self, keyworks, csv_name="123.csv", xls_path="123.xls"):
        """
        :param keyworks: search keyword appended to the site's search URL.
        :param csv_name: CSV file ``writeXlx`` appends to (default kept for
            backward compatibility with the original hard-coded name).
        :param xls_path: existing ``.xls`` workbook ``writeXLSAppend``
            appends to — the file must already exist with at least one sheet.
        """
        self.kw = keyworks
        # Browser-like headers (incl. session cookie) so the site serves the
        # normal HTML result page instead of blocking the scraper.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Cookie": "D3z_vi-ds=f1f6d61ffd02c29c1cd832a363888be3; __jsluid_s=0b360d705e0e333a682280ae3b03bf90; Hm_lvt_c909c1510b4aebf2db610b8d191cbe91=1655284406; Hm_lpvt_c909c1510b4aebf2db610b8d191cbe91=1655285546",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        }
        self.csv_name = csv_name
        self.url = "https://www.dlzb.com/zb/search.php?kw=" + str(self.kw)
        self.path = xls_path

    def getContent(self, nextUrl):
        """Fetch one result page and return the ``requests`` response.

        verify=False skips TLS certificate validation (warnings are silenced
        at module level); a timeout is set so a stalled connection cannot
        hang the crawl indefinitely.
        """
        return requests.get(nextUrl, headers=self.headers, verify=False,
                            timeout=30)

    def writeXlx(self, title, kw):
        """Append one ``[title, kw]`` row to the CSV file.

        ``with`` guarantees the file handle is closed even if the write
        raises (the original leaked the handle on error).
        """
        with open(self.csv_name, 'a+', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([title, kw])

    def getPage(self, p, total):
        """Crawl result pages ``p`` .. ``p + total - 1``.

        Stops early at the first page whose HTML contains no result list
        (i.e. past the last page of results). Extracted titles are appended
        to the ``.xls`` workbook after every page.
        """
        for num in range(total):
            nextUrl = self.url + "&page=" + str(num + p)
            print(nextUrl)
            content = self.getContent(nextUrl)
            # re.S lets .*? match across newlines, so the whole <ul> body
            # is captured even though it spans many lines.
            res = re.findall(r'<ul class=\"gclist_ul listnew\">(.*?)<\/ul>',
                             content.text, re.S)
            if not res:
                # No result list on this page: we've run past the last page.
                print(self.kw, "end*************")
                print(content.text)
                print(res)
                break
            titleList = re.findall(r'<a class=\"gccon_title\".*?>(.*?)<\/a>',
                                   res[0], re.S)
            data = []
            for title in titleList:
                # BeautifulSoup strips any HTML tags embedded in the title.
                soup = BeautifulSoup(title, 'html.parser')
                print(str(self.kw) + "-第" + str(num + p) + "页",
                      soup.get_text())
                data.append([soup.get_text(), self.kw])
            self.writeXLSAppend(data)
            # Random delay between pages to avoid hammering the server.
            time.sleep(random.uniform(1, 10))

    def writeXLSAppend(self, value):
        """Append the rows in ``value`` below the existing data of the first
        sheet of ``self.path``.

        :param value: list of rows, each row a list of cell values.

        xlrd can only read, so the workbook is copied into a writable xlwt
        object, extended, and saved back over the original file. The
        workbook must already exist — this method does not create it.
        """
        workbook = xlrd.open_workbook(self.path)
        sheets = workbook.sheet_names()
        worksheet = workbook.sheet_by_name(sheets[0])
        rows_old = worksheet.nrows  # existing row count -> append offset
        new_workbook = copy(workbook)  # xlrd (read-only) -> xlwt (writable)
        new_worksheet = new_workbook.get_sheet(0)
        for i, row in enumerate(value):
            for j, cell in enumerate(row):
                new_worksheet.write(i + rows_old, j, cell)
        new_workbook.save(self.path)
        print("xls格式表格【追加】写入数据成功!")
if __name__ == '__main__':
    # Example search URL:
    # https://www.dlzb.com/zb/search.php?kw=%E6%99%BA%E8%83%BD%E5%AE%A2%E6%9C%8D
    # Renamed from `list` (which shadowed the builtin) to `keywords`.
    keywords = ['人工智能', '知识图谱', '计算机视觉', '图像识别',
                '文本挖掘', '文本分析', '知识问答', '神经网络']
    for kw in keywords:
        print(kw)
        # Crawl from page 1, up to 1000 pages; getPage stops early once a
        # page has no results. getPage returns None, so don't bind it.
        Spider(kw).getPage(1, 1000)