一个关于豆瓣影评的爬虫,涉及:模拟登录、翻页抓取。直接上代码:
import re
import time
import requests
import xlsxwriter
from bs4 import BeautifulSoup
# Browser-like headers so Douban serves normal pages; the Referer matches
# the login entry point used below.
headers = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'),
    'Referer': 'https://www.douban.com/accounts/login?source=movie',
}

# A single shared session so the login cookies persist across all requests.
s = requests.Session()
def log_in(login_url):
    """Log in to Douban via *login_url* using the module-level session ``s``.

    Fetches the login page; if a captcha image is present it is saved to
    ``douban.jpg`` and the user is prompted for the solution, otherwise the
    captcha fields are simply omitted from the form (Douban does not always
    show one).  Side effect: ``s`` gains the authenticated cookies.
    """
    # verify=False skips SSL certificate verification (original author's
    # workaround -- consider removing once certificates validate).
    page = s.get("https://www.douban.com/accounts/login?source=movie",
                 headers=headers, verify=False).text

    payload = {
        "source": "movie",
        "redir": "https://movie.douban.com/",
        "form_email": "你的邮箱",
        "form_password": "你的密码",
        "login": "登录",
    }

    # The captcha is optional; only prompt for it when the page contains one.
    # (Previously re.findall(...)[0] raised IndexError on captcha-free logins.)
    img_match = re.search(
        r'<img id="captcha_image" src="(.*?)" alt="captcha" class="captcha_image"/>',
        page)
    if img_match:
        img_url = img_match.group(1)
        print(img_url)
        with open("douban.jpg", 'wb') as f:
            f.write(s.get(img_url).content)
        # The hidden captcha-id must be posted back along with the answer.
        id_match = re.search(
            r'<input type="hidden" name="captcha-id" value="(.*?)"/>', page)
        capid = id_match.group(1)
        print(capid)
        payload["captcha-solution"] = input("输入验证码:")
        payload["captcha-id"] = capid

    resp = s.post(login_url, data=payload, verify=False)  # 绕过了SSL验证
    print(resp.status_code)
# Worksheet row counter shared with get_data(); it is incremented before each
# write, so data rows start at row 1 (row 0 is never written).
i = 0
def get_data(url):
    """Scrape comment pages starting at *url*, writing one row per comment
    into the module-level ``worksheet`` (columns: date, star, vote, content).

    Follows the "后一页" (next page) link iteratively until it disappears.
    Uses the global row counter ``i``; returns None.
    """
    global i
    # Compile patterns once, outside the page loop; the date pattern needs a
    # raw string so \d is a regex escape, not a (deprecated) string escape.
    date_pat = re.compile(r"\d{4}-\d\d-\d\d")
    next_pat = re.compile(r'<a href="?(.*?)" .*? class="next">后一页</a>')
    base_url = "https://movie.douban.com/subject/25958717/comments"

    # Iterate instead of recursing: long comment threads would otherwise
    # blow the recursion limit.
    while url:
        time.sleep(2)  # throttle requests to be polite to the server
        print("#" * 50)
        print(i)
        try:
            data = s.get(url, headers=headers).text
        except requests.RequestException:
            # One retry after a pause, then give up cleanly.  (Previously a
            # bare except closed the workbook and fell through to use an
            # undefined `data`, crashing with a NameError.)
            try:
                time.sleep(3)
                print("正在尝试重新加载页面...")
                data = s.get(url, headers=headers).text
            except requests.RequestException:
                return

        soup = BeautifulSoup(data, "lxml")
        for comment in soup.findAll("div", {"class": "comment-item"}):
            info = comment.find("span", {"class": "comment-info"})
            try:
                date = date_pat.findall(
                    info.find("span", {"class": ""}).get_text())[0]
                star = info.find("span")["class"][0][-2:-1]
                vote = comment.find("span",
                                    {"class": "comment-vote"}).find("span").get_text()
                content = comment.find("div",
                                       {"class": "comment"}).find("p").get_text()
            except (AttributeError, IndexError, KeyError, TypeError):
                # Malformed or partial comment markup (e.g. no rating, no
                # date): skip just this row instead of aborting the whole
                # crawl -- this was the likely cause of ~1/5 of the data
                # going missing.
                continue
            i += 1
            worksheet.write(i, 0, date)
            worksheet.write(i, 1, star)
            worksheet.write(i, 2, vote)
            print(content)
            worksheet.write(i, 3, content)

        # Follow the next-page link; stop when the page has none.
        next_links = next_pat.findall(data)
        if next_links:
            url = base_url + str(next_links[0]).replace("amp;", "")
            print("正在抓取" + url + "...")
        else:
            url = None
# Create the output workbook and size the four columns
# (date, star, vote, content).
workbook = xlsxwriter.Workbook('海蒂和爷爷影评.xlsx')
worksheet = workbook.add_worksheet()
for col, width in (('A:A', 20), ('B:B', 10), ('C:C', 10), ('D:D', 500)):
    worksheet.set_column(col, width)

login_url = "https://accounts.douban.com/login"
log_in(login_url)
# get_data() fills `worksheet` as a side effect and returns None, so its
# result is not bound to anything.
get_data("https://movie.douban.com/subject/25958717/comments")
workbook.close()
这里有两个问题:
1. 首先,登录时可能不需要验证码(此时自然也抓不到验证码图片),在抓取验证码处加上 try(或先判断是否存在)即可。
2. 数据抓取不全:总是有约 1/5 的数据抓不到,目前尚未解决,欢迎看到的大神指点!
浙公网安备 33010602011771号