豆瓣最佳影评-星级转换
总的来说,爬取豆瓣信息并不算难,网上的教程一抓一大把;但自己写的代码终究和别人的不一样,尤其是自己一点一点想出来、一行一行敲出来的那种酸爽感觉。
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from lxml import etree
import csv
# Output CSV shared by the whole script. It is opened in append mode so that
# rows from repeated runs accumulate in the same file.
file = open('douban1.csv', 'a', newline='', encoding='utf-8')
# NOTE (original author): states this is slower than using `with open` - unverified.
writer = csv.writer(file)
# Append mode positions the stream at the end of the file, so a position of 0
# means the file is new or empty. Only then is the header written; otherwise
# every run would insert another header row in the middle of the data.
if file.tell() == 0:
    writer.writerow(['星级', '内容'])
def id(url):
    """Return the review ids found on the page at *url*.

    The page is downloaded with ``requests`` and parsed with lxml; the result
    is the list of ``data-rid`` attribute values of every ``div`` that sits
    directly under a ``div`` whose class is ``main-bd``.

    Note: this function's name shadows the builtin ``id`` for the whole module.
    """
    response = requests.get(url)
    tree = etree.HTML(response.text)
    return tree.xpath('//div[@class="main-bd"]/div/@data-rid')
def next_url(url):
    """Find the "next page" link on *url* and continue scraping there.

    The page is downloaded and the ``href`` of the paginator's "next" anchor
    is looked up. When a link exists it is appended to the Douban movie host
    and handed to ``sel``. When no link exists the function simply returns,
    which ends the crawl.

    Args:
        url: Address of the review-list page that has just been scraped.

    Returns:
        None.
    """
    rsp = requests.get(url)
    html = etree.HTML(rsp.text)
    hrefs = html.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href')
    if not hrefs:
        # No "next" anchor was found (e.g. on the final page). Indexing the
        # empty result used to raise IndexError and crash the run at the end.
        return
    sel('https://movie.douban.com' + hrefs[0])
def star(url, i):
    """Return the header ``span`` class strings for the review with id *i*.

    The page at *url* is downloaded and parsed; the XPath selects the
    ``class`` attribute of every ``span`` inside the ``main-hd`` header of the
    ``div`` whose ``data-cid`` equals *i*. The result is a list of strings
    (empty when nothing matches).
    """
    page = requests.get(url)
    tree = etree.HTML(page.text)
    prefix = '//div[@data-cid="'
    suffix = '"]/div[@class="main review-item"]/header[@class="main-hd"]/span/@class'
    return tree.xpath(prefix + i + suffix)
def translate(star):
    """Convert a rating CSS class into its Chinese star label.

    Only the first element of *star* is inspected. It is compared against the
    known class strings in order and the matching label is returned. When
    nothing matches the function returns ``None``.
    """
    labels = (
        ('allstar50 main-title-rating', '五星'),
        ('allstar40 main-title-rating', '四星'),
        ('allstar30 main-title-rating', '三星'),
        ('allstar20 main-title-rating', '二星'),
        ('allstar10 main-title-rating', '一星'),
        # Some reviews carry no star rating at all.
        ('main-meta', '默认好评'),
    )
    for css_class, label in labels:
        if star[0] == css_class:
            return label
    return None
def sel(url):
    """Scrape one review-list page with Selenium and append its reviews to the CSV.

    For every review id returned by ``id(url)`` the review's "expand" link is
    clicked in a Chrome session, the rendered page is re-parsed, and one
    ``(star label, review text)`` row is printed and written through the
    module-level ``writer``. The browser is then shut down and
    ``next_url(url)`` is called to continue with the following page.

    Differences from the earlier implementation, all of them bug fixes:
      * the expand link is scrolled into view and clicked through JavaScript
        instead of scrolling to ``id * 1200`` (the id is a string, so that
        expression repeated the id text 1200 times rather than computing an
        offset);
      * each review's text is read from that review's own container, so a
        star label is no longer paired with another review's text;
      * the loop variables are no longer overwritten by an inner loop;
      * the complete text is written (it used to be cut at the first ``*``);
      * the browser is shut down even when an exception is raised.

    Args:
        url: Address of the review-list page to scrape.
    """
    # Selenium 3 style constructor with a machine-specific chromedriver path.
    brow = webdriver.Chrome(r"D:\Python\Scripts\chromedriver.exe")
    try:
        brow.get(url)
        for rid in id(url):
            toggle = brow.find_element_by_xpath(
                '//div[@class="short-content"]/a[@id="toggle-' + rid + '-copy"]'
            )
            # Bring the link into the viewport and trigger it from script, so
            # the click cannot land on an element that overlaps the link.
            brow.execute_script("arguments[0].scrollIntoView(true);", toggle)
            brow.execute_script("arguments[0].click();", toggle)
            # Wait two seconds; otherwise page_source is still the markup
            # from before the click.
            time.sleep(2)

            soup = BeautifulSoup(brow.page_source, "lxml")
            # NOTE(review): assumes the expanded text is rendered inside the
            # same div[data-cid=<id>] container that star() queries - confirm
            # against the live markup.
            container = soup.find('div', attrs={'data-cid': rid})
            body = None
            if container is not None:
                body = container.find('div', class_='review-content clearfix')
            if body is None:
                print('no expanded content found for review', rid)
                continue

            # Trim the text, then drop line breaks, tabs and non-breaking spaces.
            text = body.get_text().strip()
            for unwanted in ('\n', '\t', '\xa0', '\r'):
                text = text.replace(unwanted, '')

            star_classes = star(url, rid)
            label = translate(star_classes) if star_classes else None
            if label is None:
                # Rating markup missing or unrecognised: keep the review text
                # and leave the star column blank instead of crashing.
                label = ''

            params = (label, text)
            print(params)
            writer.writerow(params)
    finally:
        # quit() ends the whole driver session, not just the current window.
        brow.quit()
    next_url(url)
if __name__ == '__main__':
    # Entry point: start at the first page of Douban's "best reviews" list.
    url = 'https://movie.douban.com/review/best/'
    try:
        sel(url)
    finally:
        # The CSV handle is opened at module level and was never closed.
        # Closing it here flushes buffered rows even when scraping fails.
        file.close()
'''
selenium.common.exceptions.WebDriverException: Message: unknown error: Element <a href="javascript:;" id="toggle-9590829-copy" class="unfold" title="...">展开</a> is not clickable at point (120, 586). Other element would receive the click: <div class="review-content clearfix" data-author="夜第七章" data-url="https://movie.douban.com/review/9592082/" data-original="1">...</div>
(Session info: chrome=54.0.2840.99)
(Driver info: chromedriver=2.27.440174 (e97a722caafc2d3a8b807ee115bfb307f7d2cfd9),platform=Windows NT 10.0.14393 x86_64)
错误原因:要点击的"展开"链接在点击坐标处被其他元素(已展开的 review-content)遮挡,点击会落到遮挡它的元素上;解决办法:先 sleep 等待页面加载,再用 window.scrollTo(0,x) 把链接滚动到可点击的位置
'''
浙公网安备 33010602011771号