202107292331 - Crawler: scraping bilibili 鬼畜 (guichu remix) videos
Challenge 1: parsing the HTML with re.compile patterns
Challenge 2: rotating User-Agents and proxy IPs to get past anti-crawling measures
# -*- coding:utf-8 -*-
import random
from time import sleep
from bs4 import BeautifulSoup  # parse the page, extract the data
import re                      # regular expressions for field matching
import urllib.request, urllib.error  # build the requests, fetch the pages
import xlwt                    # write the results to an Excel workbook

def main():
    baseUrl = "https://search.bilibili.com/video?keyword=%E9%AC%BC%E7%95%9C&page="
    # crawl the search result pages
    dataList = getData(baseUrl)
    # save the data to an Excel file
    saveData(dataList)

# compiled patterns, one per field of a search-result <li> item
find_title = re.compile(r'title="(.*?)">')
find_des = re.compile(r'<div class="des hide">(.*?)</div>', re.S)
find_watch = re.compile(r'<span class="so-icon watch-num" title="观看"><i class="icon-playtime"></i>(.*?)</span>', re.S)
find_hide = re.compile(r'<span class="so-icon hide" title="弹幕"><i class="icon-subtitle"></i>(.*?)</span>', re.S)
find_time = re.compile(r'<span class="so-icon time" title="上传时间"><i class="icon-date"></i>(.*?)</span>', re.S)
find_up = re.compile(r'target="_blank">(.*?)</a></span></div></div></li>')
find_href = re.compile(r'<a class="img-anchor" href="//(.*?)" target="_blank"')
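
# A quick sanity check of find_title against a hand-written snippet (the markup
# is an assumption reconstructed from the patterns above, not copied from a live page):
# >>> snippet = '<a class="img-anchor" href="//www.bilibili.com/video/xxx" title="【鬼畜】example">'
# >>> re.findall(find_title, snippet)
# ['【鬼畜】example']
# Note that other tags inside an item also carry title="..." attributes (e.g.
# title="观看"), so taking match [0] in getData relies on the anchor's title
# coming first in the item's markup.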

def getData(baseUrl):
    dataList = []
    # walk the first 50 result pages
    for i in range(1, 51):
        url = baseUrl + str(i)
        html = askUrl(url)
        sleep(5)  # throttle requests so the crawl stays polite
        if not html:  # askUrl returns an empty string on failure; skip that page
            continue
        # parse the page and pull the fields out of each result item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('li', class_="video-item matrix"):
            # one video's fields: title, description, views, danmaku count, upload date, uploader
            item = str(item)
            title = re.findall(find_title, item)[0].strip()
            des = re.findall(find_des, item)[0].strip()
            watch = re.findall(find_watch, item)[0].strip()
            hide = re.findall(find_hide, item)[0].strip()
            time = re.findall(find_time, item)[0].strip()
            up = re.findall(find_up, item)[0].strip()
            href = re.findall(find_href, item)[0].strip()
            data = [title, up, time, watch, hide, href, des]
            dataList.append(data)
        print("page_" + str(i) + " done!")
    return dataList

def askUrl(url):
    # spoof the browser headers: pick a random User-Agent per request
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0"
    ]
    user_agent = random.choice(USER_AGENTS)
    print(user_agent)
    # pick a random proxy per request (free proxies, so expect some to be dead)
    proxy_list = [
        {"http": "218.75.69.50:57903"},
        {"http": "121.40.108.76:80"},
        {"http": "36.112.139.146:3128"},
        {"http": "60.191.11.249:3128"},
        {"http": "115.239.99.59:8118"},
        {"http": "115.29.199.16:8118"},
        {"http": "61.164.39.69:53281"},
        {"http": "182.92.110.245:8118"},
        {"http": "58.246.3.178:53281"},
        {"http": "122.234.95.67:9000"}
    ]
    proxy = random.choice(proxy_list)
    print(proxy)
    handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(handler)
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    html = ""
    try:
        response = opener.open(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
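
# Hedged sketch (an addition, not part of the original flow): free proxies drop
# out constantly, and askUrl already re-rolls both the UA and the proxy on every
# call, so a thin retry wrapper is often enough to salvage a failed page.
def askUrlWithRetry(url, retries=3):  # hypothetical helper, not in the original script
    for attempt in range(retries):
        html = askUrl(url)
        if html:
            return html
        sleep(2)  # brief back-off before re-rolling the UA and proxy
    return ""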

def saveData(dataList):
    work_book = xlwt.Workbook(encoding="utf-8")
    work_sheet = work_book.add_sheet("sheet1")
    for i in range(len(dataList)):
        work_sheet.write(i, 0, "video_" + str(i))  # row label in column 0
        for j in range(len(dataList[i])):
            work_sheet.write(i, j + 1, dataList[i][j])
    work_book.save("test.xls")
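
# Resulting sheet layout, one row per video:
#   col 0    col 1  col 2     col 3        col 4  col 5    col 6  col 7
#   video_i  title  uploader  upload date  views  danmaku  link   description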

if __name__ == "__main__":
    main()
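
The compiled patterns above are tightly coupled to the exact attribute order of the markup (Challenge 1): one reordered attribute and a findall comes back empty. A looser alternative is to navigate the tree BeautifulSoup has already built instead of regex-matching str(item). The sketch below is an assumption based only on the class names visible in the patterns (img-anchor, des, watch-num), not verified against the live page, and parse_item is a hypothetical helper:

from bs4 import BeautifulSoup

# Alternative extraction: navigate the parsed tree instead of regexing str(item).
# Class names (img-anchor, des, watch-num) mirror the compiled patterns above;
# they are assumptions about the older search markup, not checked against the live site.
def parse_item(item):
    anchor = item.find('a', class_='img-anchor')
    title = anchor.get('title', '').strip() if anchor else ''
    href = anchor.get('href', '').lstrip('/') if anchor else ''
    des_tag = item.find('div', class_='des')
    watch_tag = item.find('span', class_='watch-num')
    des = des_tag.get_text(strip=True) if des_tag else ''
    watch = watch_tag.get_text(strip=True) if watch_tag else ''
    return [title, href, watch, des]

# Usage inside getData's inner loop (replacing the re.findall block):
# for item in soup.find_all('li', class_='video-item matrix'):
#     title, href, watch, des = parse_item(item)

Tree navigation of this kind survives attribute reordering and whitespace changes that would silently break the compiled patterns.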
