# Method 1: save comments as a plain-text file
def saveText(files):
    """Write every comment string in *files* to discuss.text (UTF-8).

    Comments are concatenated with no separator, exactly as the original
    one-write-per-item loop did; the file is overwritten on each call.
    """
    with open("discuss.text", "w", encoding="utf-8") as f:
        # writelines batches the writes in one call instead of a Python loop.
        f.writelines(files)
# Method 2: save comments to Excel with pandas
def saveText(f):
    """Dump the collected comment strings *f* into comment_on.xlsx."""
    pd.DataFrame(f).to_excel("comment_on.xlsx")
# Scrape short comments of a Douban book
import requests
from lxml import etree
import sys
import pandas as pd
def getHtmlUrl(urls, pages=100):
    """Build the list of paginated comment-page URLs.

    Args:
        urls: base URL ending in the page-number query (e.g. "...?p=").
        pages: number of pages to generate (default 100, the original
            hard-coded count, kept for backward compatibility).

    Returns:
        list[str]: [urls + "1", urls + "2", ..., urls + str(pages)].
    """
    # The original filled a throwaway dict and copied it into a list;
    # a comprehension produces the same sequence directly.
    return [urls + str(page) for page in range(1, pages + 1)]
def getHtmlText(url, header, max_items=21):
    """Fetch one comment page and return its comment text snippets.

    Args:
        url: full page URL to fetch.
        header: HTTP headers dict (User-Agent) passed to requests.get.
        max_items: number of <li> slots to probe per page (default 21,
            the original hard-coded count, kept for backward compatibility).

    Returns:
        list[str]: xpath text hits for each li slot, in page order; empty
        when the page holds no comments (the caller uses that as a stop
        signal).
    """
    r = requests.get(url=url, headers=header)
    tree = etree.HTML(r.text)
    files = []
    for i in range(1, max_items + 1):
        # extend() appends in place; the original rebuilt the list with
        # `files = files + file[i]` on every iteration.
        files.extend(tree.xpath(
            '//*[@id="comments"]/ul[1]/li[' + str(i) + ']/div[2]/p/span/text()'))
    return files
# def saveText(files):
# with open("discuss.text","w",encoding="utf-8") as f:
# for i in files:
# f.write(i)
def saveText(f):
    """Persist the scraped comments *f* to comment_on.xlsx via pandas."""
    frame = pd.DataFrame(f)
    frame.to_excel("comment_on.xlsx")
if __name__ == '__main__':
    urls = "https://book.douban.com/subject/34876107/comments/hot?p="
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    url = getHtmlUrl(urls)
    f = []
    for page_url in url:
        files = getHtmlText(page_url, header)
        if not files:
            # BUG FIX: the original called sys.exit() here, which terminated
            # the program and threw away every comment already collected.
            # Break instead so the partial results below still get saved.
            break
        f = f + files
    saveText(f)