Python b站市集爬取(xls文档)

最近刚学Python,欢迎指正!
 
 代码如下:
from bs4 import BeautifulSoup
import re
import urllib.request , urllib.error
import xlwt
import requests
import json
from time import sleep

# 市集网址
# https://mall.bilibili.com/neul-next/index.html?page=magic-market_index

def main():
    """Scrape bilibili market listings into an XLS workbook.

    POSTs the filter payload to the market list API, follows the ``nextId``
    cursor returned by each response, and writes one row per listing
    (item ID, name, original price, current price, discount ratio, wanted
    flag) to ``bili市集.xls`` in the current directory. The workbook is saved
    after every page so an interrupted run keeps its progress.

    The loop ends when the API returns no further cursor, when the row
    counter reaches 1000 (checked once per page, so the final page may push
    it past that), or after too many consecutive failed requests.
    """
    baseurl = "https://mall.bilibili.com/mall-magic-c/internet/c2c/v2/list"  # list API endpoint
    savepath = "bili市集.xls"    # workbook is created in the current directory
    max_failures = 10           # consecutive failed/empty responses tolerated before giving up
    retry_delay = 2             # seconds to wait before retrying a failed request
    request_timeout = 10        # seconds; without a timeout a stalled connection hangs forever

    # Request headers imitating a browser session on the bilibili market page.
    # NOTE: the Cookie below is a personal session credential; replace it with
    # your own, otherwise the API will not return data.
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36 Edg/127.0.0.0",
        "Referer":"https://mall.bilibili.com/neul-next/index.html?page=magic-market_index&spm_id_from=333.976.0.0&nextId=8rSbdqlNjZJpaF/pfRJDoRfGMgJf09RRVpwPjOZWsCk=",
        "Cookie": "buvid3=2E10289A-87CA-96D1-47AA-7482214B36E005450infoc; b_nut=1699060605; i-wanna-go-back=-1; b_ut=7; _uuid=9710F85CE-3BDC-101FD-B9C10-B46B5DA4548A05342infoc; enable_web_push=DISABLE; home_feed_column=5; buvid4=1537D9E3-F562-905A-78FB-CA1D79DB7F6506120-023110409-Cn343QTXf6%2BjSxYPEUmyhw%3D%3D; CURRENT_FNVAL=4048; rpdid=|(u)~J|Yl|R~0J'uYmm|Y|Ylm; DedeUserID=527939461; DedeUserID__ckMd5=2ffc916fcabf277f; header_theme_version=CLOSE; buvid_fp_plain=undefined; hit-dyn-v2=1; fingerprint=56c9072e75e90bcc4912bee7160e9492; buvid_fp=56c9072e75e90bcc4912bee7160e9492; PVID=1; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW2; b_lsid=A7F2D1FF_1912FC2D156; bsource=search_bing; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjMzNDIzODIsImlhdCI6MTcyMzA4MzEyMiwicGx0IjotMX0.iP50pBtEYpPfPmPmSozweDLSSYOwwkKK2rEvKpYkZHs; bili_ticket_expires=1723342322; SESSDATA=833d7897%2C1738635183%2C517f7%2A82CjCQYnRlN6wNV735KPFTyLULeos2ijAKiTJhD48Hxd1Zsrla4ITnMIcEtw3PSn5uDKMSVjdPZ05MazdTXzJQMUFOdmxpTldpTWpNMTNjZV9FNnEydjFhNUliSWVSaER3MTN1S2t6bnlvUGhUdlRQU1FqUVBLdmdFVnFNbDRMekphdlpDaE54TDRnIIEC; bili_jct=60791c407d93bd2ab6232cd56a93d90c; sid=8q7fuwa6; browser_resolution=1700-837; bp_t_offset_527939461=963124327111196672; msource=pc_web; deviceFingerprint=ab6cf5caf2d71946e86a8ef11c47fc21; from=pc_ticketlist; kfcFrom=market_detail; Hm_lvt_8d8d2f308d6e6dffaf586bd024670861=1723085019; Hm_lpvt_8d8d2f308d6e6dffaf586bd024670861=1723085019; HMACCOUNT=E7DD35196FF03284"
    }

    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('bili市集', cell_overwrite_ok=True)
    col = ("商品ID", "名称", "原价", "现价", '折扣', '需求')  # header row
    for i, title in enumerate(col):
        sheet.write(0, i, title)

    count = 1          # next worksheet row to fill; also 1 + number of listings written
    nextId = 'null'    # cursor for the first request; later requests reuse the value the API returns
    failures = 0       # consecutive requests that produced no usable page

    while True:
        print(count)
        # Category filter. Figures: 2312, models: 2066, merchandise: 2331, 3C digital: 2273
        json_data = {
            'categoryFilter': '2312',
            'nextId': nextId,
            'sortType': 'TIME_DESC',
            'priceFilters': ["5000-20000"],   # price filter in cents: 50-200 yuan
            # "discountFilters": ["50-70"],   # discount filter
        }

        # `page` stays None unless this request yields a well-formed payload,
        # so a failed request can never re-write the previous page's rows.
        page = None
        try:
            response = requests.post(baseurl, headers=headers, json=json_data,
                                     timeout=request_timeout)
            response.raise_for_status()
            res = json.loads(response.text)
            page = res.get('data') if isinstance(res, dict) else None
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP 错误发生: {http_err}")
        except requests.exceptions.ConnectionError as conn_err:
            print(f"连接错误发生: {conn_err}")
        except requests.exceptions.Timeout as timeout_err:
            print(f"请求超时发生: {timeout_err}")
        except requests.exceptions.RequestException as req_err:
            print(f"请求错误发生: {req_err}")
        except ValueError as err:
            # Response body was not valid JSON.
            print(f"发生其他错误: {err}")

        if not isinstance(page, dict):
            # Request failed or the API answered without a data object:
            # back off and retry the same cursor, but not forever.
            failures += 1
            if failures >= max_failures:
                print("连续请求失败次数过多,停止爬取")
                book.save(savepath)
                break
            sleep(retry_delay)
            continue

        failures = 0
        nextId = page.get('nextId')
        items = page.get('data') or []   # a page may carry no listings

        for item in items:
            if not isinstance(item, dict):
                print('列表错误', item)
                break
            # Discount ratio = current price / original price; leave the cell
            # blank when the original price is zero instead of dividing by it.
            market_price = float(item['showMarketPrice'])
            if market_price:
                discount = round(float(item['showPrice']) / market_price, 2)
            else:
                discount = ''
            # Flag listings whose name matches one of the wanted keywords.
            wanted = '是' if contains_keywords(item['c2cItemsName']) else ''
            row = [item['c2cItemsId'], item['c2cItemsName'],
                   item['showMarketPrice'], item['showPrice'], discount, wanted]
            for j, value in enumerate(row):
                sheet.write(count, j, value)
            count += 1

        # Persist after every page.
        book.save(savepath)

        # Stop once the cursor is exhausted (after the last page has been
        # written) or once roughly 1000 rows have been collected.
        if nextId is None or count >= 1000:
            break


# 关键词判断,是否是有需求的
def contains_keywords(text):
    """Return True if *text* contains any of the wanted keywords, else False.

    Matching is a plain, case-sensitive substring test.
    """
    wanted = ("初音", "GSC", "炽天使")
    return any(word in text for word in wanted)




if __name__ == "__main__":
    # Run the scraper only when executed as a script, then report completion.
    main()
    print("爬取完毕!")

 

 
posted @ 2024-08-12 16:26  杯今  阅读(697)  评论(0)    收藏  举报