Getting Started with Web Scraping: Saving with pandas, Douban Short Comments

Saving as a text file

def saveText(files):
    # write every comment to a UTF-8 text file, one comment per line
    with open("discuss.text", "w", encoding="utf-8") as f:
        for i in files:
            f.write(i + "\n")
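
To try it out, a minimal usage sketch (the sample strings are made-up placeholders, not real Douban data):

comments = ["a first sample comment", "a second sample comment"]
saveText(comments)   # writes the comments to discuss.text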

Saving with pandas

def saveText(f):
    # turn the list of comments into a DataFrame and write it to an Excel file
    excel=pd.DataFrame(f)
    excel.to_excel("comment_on.xlsx")
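
Note that to_excel needs an Excel writer backend such as openpyxl installed (pip install openpyxl). A slightly friendlier variant, sketched here as an assumption rather than what the post used, names the column and drops the row index:

def saveExcel(f):
    # f is assumed to be a flat list of comment strings
    excel = pd.DataFrame(f, columns=["comment"])
    excel.to_excel("comment_on.xlsx", index=False)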

Scraping Douban short comments

import requests
from lxml import etree
import sys
import pandas as pd

def getHtmlUrl(urls):
    # build the URLs of comment pages 1-100 by appending the page number to the base URL
    url = {}
    urls_html = []
    for i in range(100):
        url[i] = urls + str(i + 1)
        urls_html.append(url[i])
    return urls_html
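
# The same list of page URLs can also be built more compactly with a list
# comprehension; a minimal equivalent sketch (the "pages" parameter is an
# addition here, not part of the original post):
def getPageUrls(base, pages=100):
    # builds ...?p=1 through ...?p=pages
    return [base + str(i) for i in range(1, pages + 1)]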

def getHtmlText(url, header):
    # fetch one comment page and pull the text of every short comment via XPath
    files = []
    r = requests.get(url=url, headers=header)
    s = etree.HTML(r.text)
    for i in range(20):  # a comment page lists at most 20 entries
        # the span under li[n]/div[2]/p holds the n-th comment's text
        files = files + s.xpath('//*[@id="comments"]/ul[1]/li[' + str(i + 1) + ']/div[2]/p/span/text()')
    return files
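
# The positional XPath above breaks as soon as Douban changes the list markup.
# A more robust sketch (assuming the comment text still sits in
# <span class="short"> elements inside the #comments block) grabs every
# comment on the page in one query:
def getComments(url, header):
    r = requests.get(url=url, headers=header)
    s = etree.HTML(r.text)
    # every short-comment span, regardless of its position in the list
    return s.xpath('//*[@id="comments"]//span[@class="short"]/text()')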

# def saveText(files):
#     with open("discuss.text","w",encoding="utf-8") as f:
#         for i in files:
#             f.write(i)

def saveText(f):
    # turn the collected comments into a DataFrame and write them to an Excel file
    excel=pd.DataFrame(f)
    excel.to_excel("comment_on.xlsx")


if __name__ == '__main__':
    urls = "https://book.douban.com/subject/34876107/comments/hot?p="
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    url = getHtmlUrl(urls)
    # print(getHtmlText(url, header))
    f = []
    for i in range(100):
        files = getHtmlText(url[i], header)
        if files == []:
            # an empty page means we are past the last page (or got blocked), so stop
            sys.exit()
        else:
            f = f + files
        saveText(f)  # save everything collected so far after every page