202107292331 - Crawler: scraping bilibili 鬼畜 (guichu remix) videos
Challenge 1: parsing the HTML with re.compile patterns
Challenge 2: rotating User-Agents and proxy IPs to get past anti-crawling measures
# -*- coding:utf-8 -*-
import random
from time import sleep
from bs4 import BeautifulSoup  # parse the page, extract the data
import re                      # regular expressions for field matching
import urllib.request, urllib.error  # build the requests, fetch the pages
import xlwt                    # write the results to an Excel workbook

def main():
    baseUrl = "https://search.bilibili.com/video?keyword=%E9%AC%BC%E7%95%9C&page="
    # crawl the search result pages
    dataList = getData(baseUrl)
    # save the data to an Excel file
    saveData(dataList)

# compiled patterns, one per field of a search-result <li> item
find_title = re.compile(r'title="(.*?)">')
find_des = re.compile(r'<div class="des hide">(.*?)</div>', re.S)
find_watch = re.compile(r'<span class="so-icon watch-num" title="观看"><i class="icon-playtime"></i>(.*?)</span>', re.S)
find_hide = re.compile(r'<span class="so-icon hide" title="弹幕"><i class="icon-subtitle"></i>(.*?)</span>', re.S)
find_time = re.compile(r'<span class="so-icon time" title="上传时间"><i class="icon-date"></i>(.*?)</span>', re.S)
find_up = re.compile(r'target="_blank">(.*?)</a></span></div></div></li>')
find_href = re.compile(r'<a class="img-anchor" href="//(.*?)" target="_blank"')
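
# A quick sanity check of find_title against a hand-written snippet (the markup
# is an assumption reconstructed from the patterns above, not copied from a live page):
# >>> snippet = '<a class="img-anchor" href="//www.bilibili.com/video/xxx" title="【鬼畜】example">'
# >>> re.findall(find_title, snippet)
# ['【鬼畜】example']
# Note that other tags inside an item also carry title="..." attributes (e.g.
# title="观看"), so taking match [0] in getData relies on the anchor's title
# coming first in the item's markup.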

def getData(baseUrl):
    dataList = []
    # walk the first 50 result pages
    for i in range(1, 51):
        url = baseUrl + str(i)
        html = askUrl(url)
        sleep(5)  # throttle requests so the crawl stays polite
        if not html:  # askUrl returns an empty string on failure; skip that page
            continue
        # parse the page and pull the fields out of each result item
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('li', class_="video-item matrix"):
            # one video's fields: title, description, views, danmaku count, upload date, uploader
            item = str(item)
            title = re.findall(find_title, item)[0].strip()
            des = re.findall(find_des, item)[0].strip()
            watch = re.findall(find_watch, item)[0].strip()
            hide = re.findall(find_hide, item)[0].strip()
            time = re.findall(find_time, item)[0].strip()
            up = re.findall(find_up, item)[0].strip()
            href = re.findall(find_href, item)[0].strip()
            data = [title, up, time, watch, hide, href, des]
            dataList.append(data)
        print("page_" + str(i) + " done!")
    return dataList

def askUrl(url):
    # spoof the browser headers: pick a random User-Agent per request
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0"
    ]
    user_agent = random.choice(USER_AGENTS)
    print(user_agent)
    # pick a random proxy per request (free proxies, so expect some to be dead)
    proxy_list = [
        {"http": "218.75.69.50:57903"},
        {"http": "121.40.108.76:80"},
        {"http": "36.112.139.146:3128"},
        {"http": "60.191.11.249:3128"},
        {"http": "115.239.99.59:8118"},
        {"http": "115.29.199.16:8118"},
        {"http": "61.164.39.69:53281"},
        {"http": "182.92.110.245:8118"},
        {"http": "58.246.3.178:53281"},
        {"http": "122.234.95.67:9000"}
    ]
    proxy = random.choice(proxy_list)
    print(proxy)
    handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(handler)
    request = urllib.request.Request(url, headers={"User-Agent": user_agent})
    html = ""
    try:
        response = opener.open(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
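
# Hedged sketch (an addition, not part of the original flow): free proxies drop
# out constantly, and askUrl already re-rolls both the UA and the proxy on every
# call, so a thin retry wrapper is often enough to salvage a failed page.
def askUrlWithRetry(url, retries=3):  # hypothetical helper, not in the original script
    for attempt in range(retries):
        html = askUrl(url)
        if html:
            return html
        sleep(2)  # brief back-off before re-rolling the UA and proxy
    return ""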

def saveData(dataList):
    work_book = xlwt.Workbook(encoding="utf-8")
    work_sheet = work_book.add_sheet("sheet1")
    for i in range(len(dataList)):
        work_sheet.write(i, 0, "video_" + str(i))  # row label in column 0
        for j in range(len(dataList[i])):
            work_sheet.write(i, j + 1, dataList[i][j])
    work_book.save("test.xls")
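
# Resulting sheet layout, one row per video:
#   col 0    col 1  col 2     col 3        col 4  col 5    col 6  col 7
#   video_i  title  uploader  upload date  views  danmaku  link   description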

if __name__ == "__main__":
    main()
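
The compiled patterns above are tightly coupled to the exact attribute order of the markup (Challenge 1): one reordered attribute and a findall comes back empty. A looser alternative is to navigate the tree BeautifulSoup has already built instead of regex-matching str(item). The sketch below is an assumption based only on the class names visible in the patterns (img-anchor, des, watch-num), not verified against the live page, and parse_item is a hypothetical helper:

from bs4 import BeautifulSoup

# Alternative extraction: navigate the parsed tree instead of regexing str(item).
# Class names (img-anchor, des, watch-num) mirror the compiled patterns above;
# they are assumptions about the older search markup, not checked against the live site.
def parse_item(item):
    anchor = item.find('a', class_='img-anchor')
    title = anchor.get('title', '').strip() if anchor else ''
    href = anchor.get('href', '').lstrip('/') if anchor else ''
    des_tag = item.find('div', class_='des')
    watch_tag = item.find('span', class_='watch-num')
    des = des_tag.get_text(strip=True) if des_tag else ''
    watch = watch_tag.get_text(strip=True) if watch_tag else ''
    return [title, href, watch, des]

# Usage inside getData's inner loop (replacing the re.findall block):
# for item in soup.find_all('li', class_='video-item matrix'):
#     title, href, watch, des = parse_item(item)

Tree navigation of this kind survives attribute reordering and whitespace changes that would silently break the compiled patterns.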
