用 Python 爬取 B 站市集商品并保存为 XLS 文档
最近刚学Python,欢迎指正!
代码如下:
import json
import os
import re
import urllib.error
import urllib.request
from time import sleep

import requests
import xlwt
from bs4 import BeautifulSoup

# Marketplace page whose listing API is scraped:
# https://mall.bilibili.com/neul-next/index.html?page=magic-market_index
LIST_API_URL = "https://mall.bilibili.com/mall-magic-c/internet/c2c/v2/list"

# The XLS file is created in the current working directory.
SAVE_PATH = "bili市集.xls"
SHEET_NAME = 'bili市集'
COLUMN_TITLES = ("商品ID", "名称", "原价", "现价", '折扣', '需求')

# The row counter starts at 1 (row 0 holds the titles); scraping stops once
# the counter reaches this value.
MAX_ROWS = 1000

REQUEST_TIMEOUT = 10           # seconds; without it a stalled request hangs forever
RETRY_DELAY = 2                # seconds to wait before retrying a failed page
MAX_CONSECUTIVE_FAILURES = 5   # give up instead of retrying indefinitely

# Item names containing any of these substrings are flagged as "wanted".
KEYWORDS = ("初音", "GSC", "炽天使")

# NOTE(review): this is a hard-coded browser session cookie. Replace it with
# your own (set the BILI_COOKIE environment variable) or the API will reject
# the request; the embedded value should be treated as leaked and rotated.
DEFAULT_COOKIE = (
    "buvid3=2E10289A-87CA-96D1-47AA-7482214B36E005450infoc; "
    "b_nut=1699060605; "
    "i-wanna-go-back=-1; "
    "b_ut=7; "
    "_uuid=9710F85CE-3BDC-101FD-B9C10-B46B5DA4548A05342infoc; "
    "enable_web_push=DISABLE; "
    "home_feed_column=5; "
    "buvid4=1537D9E3-F562-905A-78FB-CA1D79DB7F6506120-023110409-Cn343QTXf6%2BjSxYPEUmyhw%3D%3D; "
    "CURRENT_FNVAL=4048; "
    "rpdid=|(u)~J|Yl|R~0J'uYmm|Y|Ylm; "
    "DedeUserID=527939461; "
    "DedeUserID__ckMd5=2ffc916fcabf277f; "
    "header_theme_version=CLOSE; "
    "buvid_fp_plain=undefined; "
    "hit-dyn-v2=1; "
    "fingerprint=56c9072e75e90bcc4912bee7160e9492; "
    "buvid_fp=56c9072e75e90bcc4912bee7160e9492; "
    "PVID=1; "
    "FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW2; "
    "b_lsid=A7F2D1FF_1912FC2D156; "
    "bsource=search_bing; "
    "bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjMzNDIzODIsImlhdCI6MTcyMzA4MzEyMiwicGx0IjotMX0.iP50pBtEYpPfPmPmSozweDLSSYOwwkKK2rEvKpYkZHs; "
    "bili_ticket_expires=1723342322; "
    "SESSDATA=833d7897%2C1738635183%2C517f7%2A82CjCQYnRlN6wNV735KPFTyLULeos2ijAKiTJhD48Hxd1Zsrla4ITnMIcEtw3PSn5uDKMSVjdPZ05MazdTXzJQMUFOdmxpTldpTWpNMTNjZV9FNnEydjFhNUliSWVSaER3MTN1S2t6bnlvUGhUdlRQU1FqUVBLdmdFVnFNbDRMekphdlpDaE54TDRnIIEC; "
    "bili_jct=60791c407d93bd2ab6232cd56a93d90c; "
    "sid=8q7fuwa6; "
    "browser_resolution=1700-837; "
    "bp_t_offset_527939461=963124327111196672; "
    "msource=pc_web; "
    "deviceFingerprint=ab6cf5caf2d71946e86a8ef11c47fc21; "
    "from=pc_ticketlist; "
    "kfcFrom=market_detail; "
    "Hm_lvt_8d8d2f308d6e6dffaf586bd024670861=1723085019; "
    "Hm_lpvt_8d8d2f308d6e6dffaf586bd024670861=1723085019; "
    "HMACCOUNT=E7DD35196FF03284"
)


def _build_headers():
    """Return the browser-like request headers sent to the listing API.

    The cookie is taken from the ``BILI_COOKIE`` environment variable when it
    is set, otherwise the value embedded in this file is used.
    """
    return {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Mobile Safari/537.36 Edg/127.0.0.0",
        "Referer": "https://mall.bilibili.com/neul-next/index.html?page=magic-market_index&spm_id_from=333.976.0.0&nextId=8rSbdqlNjZJpaF/pfRJDoRfGMgJf09RRVpwPjOZWsCk=",
        "Cookie": os.environ.get("BILI_COOKIE", DEFAULT_COOKIE),
    }


def _fetch_page(headers, next_id):
    """Request one page of listings.

    Args:
        headers: HTTP headers to send with the request.
        next_id: Paging cursor returned by the previous page ('null' for the
            first request).

    Returns:
        ``(items, next_id)`` on success, where ``items`` is the page's item
        list (empty when the API returned none) and ``next_id`` is the cursor
        for the following page (``None`` when there are no more pages).
        ``None`` when the response has no usable ``data`` object, which the
        caller treats as a retryable failure.

    Raises:
        requests.exceptions.RequestException: network, timeout or HTTP errors.
        ValueError: the response body is not valid JSON.
    """
    # Category filter. Figures: 2312, models: 2066, merchandise: 2331,
    # 3C digital: 2273.
    payload = {
        'categoryFilter': '2312',
        'nextId': next_id,
        'sortType': 'TIME_DESC',
        # Price filter: 50-200 yuan here; real prices need two extra zeros.
        'priceFilters': ["5000-20000"],
        # Optional discount filter, e.g. "discountFilters": ["50-70"]
    }
    response = requests.post(
        LIST_API_URL, headers=headers, json=payload, timeout=REQUEST_TIMEOUT
    )
    # Surface 4xx/5xx responses as HTTPError instead of parsing an error page.
    response.raise_for_status()
    body = json.loads(response.text)

    data = body.get('data') if isinstance(body, dict) else None
    if not isinstance(data, dict):
        return None
    # A null item list is treated as an empty page rather than crashing.
    return data.get('data') or [], data.get('nextId')


def _build_row(item):
    """Convert one API item dict into the six spreadsheet cell values."""
    show_price = float(item['showPrice'])
    market_price = float(item['showMarketPrice'])
    # Discount = current price / original price; left blank when the original
    # price is zero to avoid a division by zero.
    discount = round(show_price / market_price, 2) if market_price else ''
    wanted = '是' if contains_keywords(item['c2cItemsName']) else ''
    return [
        item['c2cItemsId'],
        item['c2cItemsName'],
        item['showMarketPrice'],
        item['showPrice'],
        discount,
        wanted,
    ]


def main():
    """Scrape marketplace listings page by page into an XLS workbook.

    Pages are requested until the API reports no further page, the row
    counter reaches ``MAX_ROWS``, or too many consecutive requests fail. The
    workbook is saved after every page so partial results survive a crash.
    """
    headers = _build_headers()

    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(SHEET_NAME, cell_overwrite_ok=True)
    for col_idx, title in enumerate(COLUMN_TITLES):
        sheet.write(0, col_idx, title)

    count = 1          # next row to write; also the running item counter
    next_id = 'null'   # cursor for the first request, then taken from each response
    failures = 0       # consecutive failed page requests

    while count < MAX_ROWS:
        print(count)

        page = None
        try:
            page = _fetch_page(headers, next_id)
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP 错误发生: {http_err}")
        except requests.exceptions.ConnectionError as conn_err:
            print(f"连接错误发生: {conn_err}")
        except requests.exceptions.Timeout as timeout_err:
            print(f"请求超时发生: {timeout_err}")
        except requests.exceptions.RequestException as req_err:
            print(f"请求错误发生: {req_err}")
        except ValueError as err:
            print(f"发生其他错误: {err}")

        if page is None:
            # Retry the same cursor instead of re-processing stale data.
            failures += 1
            if failures >= MAX_CONSECUTIVE_FAILURES:
                print("连续失败次数过多,停止爬取")
                break
            sleep(RETRY_DELAY)
            continue
        failures = 0

        items, next_id = page
        for item in items:
            if not isinstance(item, dict):
                print('列表错误', item)
                break
            for col_idx, value in enumerate(_build_row(item)):
                sheet.write(count, col_idx, value)
            count += 1

        book.save(SAVE_PATH)

        # Checked only after the page was written so the last page is kept.
        if next_id is None:
            break


def contains_keywords(text):
    """Return True if ``text`` contains any of the wanted keywords."""
    return any(keyword in text for keyword in KEYWORDS)


if __name__ == "__main__":
    main()
    print("爬取完毕!")

浙公网安备 33010602011771号