Python爬虫 #014 Selenium实战案例
爬虫会占用网站资源,对网站属于一种攻击,所以爬取需适当,用于学习即可
1. 爬取读书网书名及简介
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.dushu.com/book/1163_1.html'
driver = webdriver.Chrome()
driver.get(url)
# Implicit wait: poll up to 10 s for elements to appear before raising.
driver.implicitly_wait(10)

# Append mode so reruns do not clobber earlier results; utf-8 for Chinese text.
# `with` guarantees the file is closed even if a lookup raises mid-scrape.
with open(r'D:\books.txt', mode='a', encoding='utf-8') as f:
    # Scrape the first 10 pages.
    for page in range(1, 11):
        # Scroll to the very bottom so lazily-rendered content loads;
        # otherwise element lookups below can fail with NoSuchElementException.
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # NOTE: find_elements returns a (possibly empty) list,
        # while find_element returns a single match or raises.
        # Selenium 4 removed find_elements_by_xpath — use By.XPATH.
        books = driver.find_elements(By.XPATH, '//div[@class="bookslist"]/ul/li')
        for book in books:
            title = book.find_element(By.XPATH, './/h3/a').text
            author = book.find_element(By.XPATH, './div/p[1]').text
            content = book.find_element(By.XPATH, './div/p[2]').text
            f.write('《' + title + '》' + ' ' + author + '\n' + content + '\n')
        print('page%d' % page)
        # The "previous" and "next" buttons share the same class attribute,
        # but page 1 only has a "next" button, so the XPath index differs.
        if page == 1:
            nextpage = driver.find_element(
                By.XPATH, '//div[@class="pages"]/a[@class="disabled"]')
        else:
            nextpage = driver.find_element(
                By.XPATH, '//div[@class="pages"]/a[@class="disabled"][2]')
        nextpage.click()

driver.close()
2. 爬取网易云热评
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
class YunSpider(object):
    """Scrape hot comments (user + text) from a NetEase Cloud Music song page."""

    def __init__(self, url):
        # URL of the song page to scrape.
        self.url = url
        self.driver = webdriver.Chrome()

    # Fetch the page and harvest comments across 10 pages.
    def getContent(self):
        """Load the page, collect user/comment pairs, and persist each one."""
        self.driver.get(self.url)
        # The comment list lives inside an <iframe> — think of an IFrame as a
        # box inside the page; we must enter it before locating elements.
        # Index 0 selects the first frame. (switch_to_frame was removed in
        # Selenium 4; switch_to.frame is the supported API.)
        self.driver.switch_to.frame(0)
        # Scroll down so the comment section is rendered.
        self.driver.execute_script('window.scrollBy(0,8000)')
        # Harvest comments, then advance, for 10 pages.
        for page in range(10):
            # Selenium 4 removed find_elements_by_xpath — use By.XPATH.
            itms = self.driver.find_elements(
                By.XPATH, '//div[@class="cmmts j-flag"]/div[@class="itm"]')
            for itm in itms:
                user = itm.find_element(By.XPATH, './/a[@class="s-fc7"]').text
                content = itm.find_element(
                    By.XPATH, './/div[@class="cnt f-brk"]').text
                # Persist one record; saveData is a staticmethod so calling
                # it via self keeps the call site uniform with instance use.
                self.saveData(user, content)
            # Advance to the next comment page and give it a moment to load.
            nextpage = self.driver.find_element(By.LINK_TEXT, '下一页')
            nextpage.click()
            time.sleep(1)
            # range() starts at 0 and we have already clicked "next",
            # so the page now displayed is page + 2.
            print('page%d' % (page + 2))

    # Append one comment record to the output file.
    @staticmethod
    def saveData(user, content):
        """Append 'user :content' to D:\\yun.txt (file must be creatable)."""
        with open(r'D:\yun.txt', mode='a', encoding='utf-8') as f:
            f.write(user + ' :' + content + '\n')
if __name__ == '__main__':
    # Hot-comment page for one song; the '#/song?...' fragment is resolved
    # client-side by the site's JavaScript.
    song_url = 'https://music.163.com/#/song?id=1429908253'
    spider = YunSpider(song_url)
    spider.getContent()
本文来自博客园,作者:{枫_Null},转载请注明原文链接:https://www.cnblogs.com/fengNull/articles/16655277.html

浙公网安备 33010602011771号