Crawling dynamic web pages with Python

Example: crawling the novel 圣墟 from the 笔趣阁 site

1. Crawling the URLs of the novel's chapters

from bs4 import BeautifulSoup
from selenium import webdriver
import re


def book_url():
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy. Be careful: there must be no spaces around the "=".
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds for elements to load
    driver.get(r'http://www.xbiquge.la/13/13959/')
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # Pull each chapter's relative URL and title out of the <div id="list"> block
    url = re.findall('<a href="(.*)">', str(soup.find_all('div', id='list')))
    word = re.findall('<a.*>(.*)</a>', str(soup.find_all('div', id='list')))
    # Map chapter title -> relative URL
    word_dict = dict(zip(word, url))
    driver.quit()
    return word_dict
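
For a quick sanity check, here is a minimal usage sketch; the actual titles and relative URLs printed depend on whatever the live page returns:

if __name__ == '__main__':
    chapters = book_url()
    # Print the first three title -> relative-URL pairs returned by book_url()
    for title, href in list(chapters.items())[:3]:
        print(title, href)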

2. Crawling the content of the first 200 chapters and writing it to a txt file

from bs4 import BeautifulSoup
from selenium import webdriver
import re
import codecs
import crawling.pro_2.py1 as py1  # the module from step 1, which provides book_url()


def url():
    word_dict = py1.book_url()
    word = []
    for i in word_dict.values():
        word.append(i)
    return word


def book(url):
    chromeOptions = webdriver.ChromeOptions()
    # Set a proxy. Be careful: there must be no spaces around the "=",
    # i.e. not --proxy-server = http://202.20.16.82:10152
    chromeOptions.add_argument("--proxy-server=http://202.20.16.82:10152")
    driver = webdriver.Chrome(options=chromeOptions)
    driver.implicitly_wait(30)  # implicit wait: block up to 30 seconds for elements to load
    driver.get('http://www.xbiquge.la/' + url)
    txt = driver.page_source
    soup = BeautifulSoup(txt, 'html.parser')
    # Strip the HTML wrappers around the chapter text in <div id="content">
    a = str(soup.find_all('div', id='content'))
    a = re.sub(r'<div id="content">', '', a)
    a = re.sub(r'</p></div>', '', a)
    a = re.sub(r'\xa0', '', a)
    a = re.sub(r'<p><a href=', '', a)
    a = re.sub(r'target="_blank">', '', a)
    a = re.sub(r'</a>', '', a)
    line = a.split("<br/>")
    # The chapter title lives in <div class="bookname"><h1>...</h1>
    name = re.findall('<h1>(.*)</h1', str(soup.find_all('div', class_='bookname')))
    name = re.sub("'", '', str(name))
    f = codecs.open('小说圣墟.txt', 'a', 'utf-8')
    # Drop the entries that are just a newline
    kong_list = []
    for j in line:
        if j == '\n':
            kong_list.append(j)
    for k in kong_list:
        line.remove(k)
    print(name, end='\n', file=f)
    for i in line:
        text = re.sub(r'\n', '', i)
        print(text, file=f)
    f.close()
    driver.quit()


if __name__ == '__main__':
    url_list = url()
    del url_list[200:]  # keep only the first 200 chapters
    for chapter_url in url_list:
        book(chapter_url)
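
As an aside, the chain of re.sub calls above can usually be replaced by BeautifulSoup's own text extraction. The following is only a sketch, assuming the page keeps the <div id="content"> layout; the helper name extract_content is made up for illustration:

from bs4 import BeautifulSoup


def extract_content(page_source):
    # Parse the page and pull the visible chapter text out of <div id="content">,
    # letting BeautifulSoup drop the tags and &nbsp; entities for us
    soup = BeautifulSoup(page_source, 'html.parser')
    content_div = soup.find('div', id='content')
    if content_div is None:
        return []
    lines = content_div.get_text('\n').split('\n')
    return [ln.replace('\xa0', '').strip() for ln in lines if ln.strip()]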
 