爬虫初探-笔趣阁小说下载

# -*- coding: utf-8 -*-
"""
Created on Tue Dec  1 12:31:07 2020

@author: zhaolulu
"""
import pandas as pd
import requests
from lxml import etree


headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
}

def url_read(url, timeout=10):
    """Download a page and return its body as text.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection error, timeout, invalid URL, ...). 'failed' is
            printed before the error is re-raised.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Report the failure, then propagate the real error: there is no
        # response object to decode, so continuing would only fail later
        # with a less helpful message.
        print('failed')
        raise
    return response.content.decode('utf-8')

if __name__=='__main__':
    # Script entry point: list the newest novels from the home page, then
    # download every chapter of one hard-coded novel and save each chapter's
    # text to its own numbered file under ../book.
    import os
    from urllib.parse import urljoin

    # Biquge novel site
    url='http://www.xbiquge.la/'
    text = url_read(url)
    print("============================================")
    selector=etree.HTML(text)
    # URLs of the newest novels listed on the home page
    ret=selector.xpath('//*[@id="newscontent"]/div[1]/ul/li/span[2]/a//@href')
    for note_url in ret:
        print(note_url)
    # One of those URLs is used here as a test
    #print(ret[0]) # http://www.xbiquge.la/62/62585/
    novel_url = 'http://www.xbiquge.la/62/62585/'
    n_text = url_read(novel_url)
    n_html=etree.HTML(n_text)
    xpath_ret = n_html.xpath('//*[@id="list"]/dl/dd/a/@href')

    # Build the output path portably and make sure the directory exists;
    # a hard-coded '..\\book\\' prefix only works on Windows and fails when
    # the directory is missing.
    out_dir = os.path.join('..', 'book')
    os.makedirs(out_dir, exist_ok=True)

    # Counts only chapters that actually had content, so the saved files
    # are numbered contiguously from 0.
    index = 0
    for t_url in xpath_ret:
        # The actual chapter content. urljoin resolves the href against the
        # novel page, so root-relative, page-relative and absolute links all
        # produce a valid URL.
        f_url = urljoin(novel_url, t_url)
        print(f_url)
        article = url_read(f_url)
        article_text=etree.HTML(article)
        article_detail=article_text.xpath('//*[@id="content"]/text()')
        if len(article_detail) > 0:
            pd.Series(article_detail).to_csv(os.path.join(out_dir, str(index)))
            index=index+1
posted @ 2020-12-03 09:38  咸鱼人生&  阅读(146)  评论(0)  编辑  收藏  举报