import requests
from bs4 import BeautifulSoup
import re
from lxml import etree
import time
import csv
# Every request in this script is sent with verify=False, so silence the
# warnings urllib3 would otherwise emit for each of those calls.
requests.packages.urllib3.disable_warnings()

# Name of the CSV file that scraped rows are appended to.
csv_name = "123.csv"

# HTTP headers sent with every request; the User-Agent is a desktop Chrome string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.198 Safari/537.36"
    ),
}
"""1 第一步"""
def get_page_total(p1, total):
    """Step 1: walk the search-result pages and scrape each one.

    Pages are numbered from zero; every index in ``range(p1, total)`` is
    visited, and the ``pn`` query parameter is set to ``index * 10``.

    Args:
        p1: Index of the first result page to fetch.
        total: Index at which to stop (exclusive).
    """
    # Only the ``pn`` value changes between pages, so keep the fixed parts aside.
    url_head = "https://xueshu.baidu.com/s?wd=journaluri%3A%2820bd239813882ced%29%20applied%20energy&pn="
    url_tail = "&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&sc_hit=1"

    for page_index in range(p1, total):
        offset = str(page_index * 10)
        print("当前第" + str(page_index) + "页,共需要" + str(total) + "条")
        get_url(url_head + offset + url_tail)
        # Pause between result pages to avoid hammering the server.
        time.sleep(1)
"""2 第二步"""
# Collect the article links found on a results page, then follow each one to its detail page.
"""
content = ['https://xueshu.baidu.com/usercenter/paper/show?paperid=e04cdee2122f75b0011cc9e7b452d72b&site=xueshu_se',
'https://xueshu.baidu.com/usercenter/paper/show?paperid=9ccc121c6260e006c41c32f04ddf2e85&site=xueshu_se'] ...
"""
def get_url(url):
    """Step 2: fetch one search-results page and scrape the articles it lists.

    The detail-page addresses are taken from the ``href`` attribute of every
    anchor found under an ``<h3 class="t c_font">`` heading and are passed,
    as a list, to ``get_page_content``.

    Args:
        url: Address of the search-results page to download.

    Raises:
        requests.exceptions.RequestException: If the request fails or does
            not complete within the timeout.
    """
    # Without a timeout, requests waits indefinitely on a stalled server and
    # the whole crawl hangs; give up after 10 s to connect / 30 s without data.
    response = requests.get(url, headers=headers, verify=False, timeout=(10, 30))
    tree = etree.HTML(response.text)
    detail_links = tree.xpath('//h3[@class="t c_font"]//a/@href')
    get_page_content(detail_links)
"""3"""
def get_page_content(detail_url):
    """Step 3: scrape authors and abstract from each article detail page.

    For every link, the page is downloaded, the author names and the abstract
    text fragments are extracted with XPath, each group is joined with commas,
    and the resulting two-cell row is appended to the CSV file via ``f_csv``.

    Args:
        detail_url: Iterable of detail-page URLs (one results page yields
            about ten of them).

    Raises:
        requests.exceptions.RequestException: If a request fails or does not
            complete within the timeout.
    """
    for link in detail_url:
        # Without a timeout, requests waits indefinitely on a stalled server
        # and the crawl hangs; give up after 10 s to connect / 30 s without data.
        response = requests.get(link, headers=headers, verify=False, timeout=(10, 30))
        tree = etree.HTML(response.text)

        authors = tree.xpath('//p[@class="author_text"]//span//text()')
        abstract = tree.xpath('//p[@class="abstract"]//text()')

        print("开始写入csv")
        f_csv([','.join(authors), ','.join(abstract)])
        # Pause between detail pages to avoid hammering the server.
        time.sleep(1)
"""4"""
def f_csv(data, path=None):
    """Step 4: append a single row to a CSV file.

    Args:
        data: Sequence of cell values making up the row, e.g. ``["111", "222"]``.
        path: File to append to. When omitted, the module-level ``csv_name``
            is used, which is what existing callers rely on.
    """
    target = csv_name if path is None else path
    # The ``with`` block closes the file even if writing the row raises.
    # newline='' leaves line endings to the csv module, as its docs require.
    with open(target, "a", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(data)
"""run 爬虫"""
# Write the CSV header row (author, abstract) before any data rows.
csv_head = ["作者", "摘要"]
f_csv(csv_head)

# Start the crawl: arguments are the first page index and the (exclusive)
# stop index, so (0, 1) fetches only the first results page.
get_page_total(0, 1)

# Shape of the scraped values before they are comma-joined into a row:
#   authors  -> ['M Poeschl', 'S Ward', 'P Owende']
#   abstract -> ['The energy efficiency of, different, biogas systems']