Crawling dynamic web pages with Python

Example: crawling the novel 圣墟 from the 笔趣阁 site

1. Crawling the URLs of the novel's chapters

from bs4 import BeautifulSoup
from selenium import webdriver
import re


def book_url():
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy. Be careful: there must be no spaces around the "=".
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds for elements to load
    driver.get(r'http://www.xbiquge.la/13/13959/')
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # Pull each chapter's relative URL and title out of the <div id="list"> block
    url = re.findall('<a href="(.*)">', str(soup.find_all('div', id='list')))
    word = re.findall('<a.*>(.*)</a>', str(soup.find_all('div', id='list')))
    # Map chapter title -> relative URL
    word_dict = dict(zip(word, url))
    driver.quit()
    return word_dict
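
For a quick sanity check, here is a minimal usage sketch; the actual titles and relative URLs printed depend on whatever the live page returns:

if __name__ == '__main__':
    chapters = book_url()
    # Print the first three title -> relative-URL pairs returned by book_url()
    for title, href in list(chapters.items())[:3]:
        print(title, href)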

2. Crawling the content of the first 200 chapters and writing it to a txt file

from bs4 import BeautifulSoup
from selenium import webdriver
import re
import codecs
import crawling.pro_2.py1 as py1  # the module from step 1, which provides book_url()


def url():
    word_dict = py1.book_url()
    word = []
    for i in word_dict.values():
        word.append(i)
    return word


def book(url):
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy. Be careful: there must be no spaces around the "=",
    # i.e. not --proxy-server = http://202.20.16.82:10152
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds for elements to load
    driver.get('http://www.xbiquge.la/' + url)
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # Strip the HTML wrappers around the chapter text in <div id="content">
    a = str(soup.find_all('div', id='content'))
    a = re.sub(r'<div id="content">', '', a)
    a = re.sub(r'</p></div>', '', a)
    a = re.sub(r'\xa0', '', a)
    a = re.sub(r'<p><a href=', '', a)
    a = re.sub(r'target="_blank">', '', a)
    a = re.sub(r'</a>', '', a)
    line = a.split("<br/>")
    # The chapter title lives in <div class="bookname"><h1>...</h1>
    name = re.findall('<h1>(.*)</h1', str(soup.find_all('div', class_='bookname')))
    name = re.sub("'", '', str(name))
    f = codecs.open('小说圣墟.txt', 'a', 'utf-8')
    # Drop the entries that are just a newline
    kong_list = []
    for j in line:
        if j == '\n':
            kong_list.append(j)
    for k in kong_list:
        line.remove(k)
    print(name, end='\n', file=f)
    for i in line:
        text = re.sub(r'\n', '', i)
        print(text, file=f)
    f.close()
    driver.quit()


if __name__ == '__main__':
    url_list = url()
    del url_list[200:]  # keep only the first 200 chapters
    for chapter_url in url_list:
        book(chapter_url)
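
As an aside, the chain of re.sub calls above can usually be replaced by BeautifulSoup's own text extraction. The following is only a sketch, assuming the page keeps the <div id="content"> layout; the helper name extract_content is made up for illustration:

from bs4 import BeautifulSoup


def extract_content(page_source):
    # Parse the page and pull the visible chapter text out of <div id="content">,
    # letting BeautifulSoup drop the tags and &nbsp; entities for us
    soup = BeautifulSoup(page_source, 'html.parser')
    content_div = soup.find('div', id='content')
    if content_div is None:
        return []
    lines = content_div.get_text('\n').split('\n')
    return [ln.replace('\xa0', '').strip() for ln in lines if ln.strip()]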
 