64爬取b站,微博,ai问答等数据写入excel
"""Interactive console tool that scrapes a handful of public web pages.

Features offered by the menu in :func:`main`:

1. Look up the home location of a mobile number.
2. Look up the weather for a region.
3. Show the Baidu hot-search boards.
4. Show the Weibo hot-search boards.
5. Browse Bilibili boards (some of them can be exported to Excel).
6. AI Q&A -- implemented in the separate ``re_Ai`` module, not in this file.
"""
import datetime
import io
import os
import re
import sys
import time

import pandas as pd
import requests
from lxml import etree

# NOTE(review): NO_PROXY is normally a list of host names rather than a full
# URL, so this value may have no effect -- confirm before relying on it.
os.environ['NO_PROXY'] = 'https://cc-api.sbaliyun.com/v1/completions'

# Default request headers shared by the non-Bilibili, non-Weibo lookups.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT = 10

# (table row, tag holding the value, arrow printed between label and value)
_PHONE_ROWS = (
    (2, 'span', '-->'),     # home location of the card
    (3, 'a', '--->'),       # carrier
    (5, 'a', '------>'),    # area code
    (6, 'a', '------>'),    # postal code
)

# (value of the ``tab`` query parameter, label shown to the user)
_BAIDU_BOARDS = (
    ('realtime', '热搜榜'),
    ('novel', '小说'),
    ('movie', '电影'),
    ('teleplay', '电视剧'),
    ('car', '汽车'),
    ('game', '游戏'),
)

# Values of the ``cate`` query parameter, in menu order.
_WEIBO_CATES = ('realtimehot', 'socialevent', 'entrank')

# NOTE(review): the cookie below is hard-coded in the source; it looks like a
# visitor session and will presumably expire -- confirm and refresh as needed.
_WEIBO_HEADERS = {
    'cookie': 'SINAGLOBAL=690519784757.2731.1671192419517; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WFB2MFg.53.mACIaAgd8wTi5JpVF020e05Neh5XSoMp; SUB=_2AkMUwi8HdcPxrAZZnPoTymngb49H-jynF0bxAn7uJhMyAxh87nwzqSVutBF-XMKjNdhFviACxIXacTNM_j5vca_y; _s_tentry=www.google.com; UOR=,,www.google.com; Apache=8260187671478.501.1675384443714; ULV=1675384443775:3:1:1:8260187671478.501.1675384443714:1671340035730',
    'referer': 'https://www.google.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
}

# Horizontal rule drawn above and below the main menu.
_MENU_RULE = "----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------"


def _now():
    """Return the local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def _read_choice(prompt, count, exit_code):
    """Read a 1-based menu choice from stdin.

    Args:
        prompt: Text shown to the user.
        count: Number of selectable entries (valid choices are 1..count).
        exit_code: Number that means "leave this menu".

    Returns:
        The zero-based index of the chosen entry, or ``None`` when the user
        entered ``exit_code``.

    Raises:
        ValueError: The input is not an integer or is outside 1..count.
            Without the range check, ``0`` or a negative number would index
            from the end of the table and silently pick the wrong board.
    """
    choice = int(input(prompt).strip())
    if choice == exit_code:
        return None
    if not 1 <= choice <= count:
        raise ValueError(f'menu choice out of range: {choice}')
    return choice - 1


# Mobile number lookup
def phone_number():
    """Prompt for mobile numbers and print what ip138.com reports for each."""
    while True:
        phoneNumber = input("查询的手机号(按q退出手机号查询):").strip()
        if phoneNumber == 'q':
            break
        try:
            res = requests.get(
                url='https://www.ip138.com/mobile.asp',
                headers=headers,
                params={'mobile': phoneNumber, 'action': 'mobile'},
                timeout=REQUEST_TIMEOUT,
            )
            page = etree.HTML(res.text)
            lines = []
            for row, tag, arrow in _PHONE_ROWS:
                cell = f'//div[@class="table"]/table/tbody/tr[{row}]/td'
                label = page.xpath(f'{cell}/text()')[0]
                value = page.xpath(f'{cell}/{tag}/text()')[0]
                lines.append(f'{label}{arrow}{value}')
            # Everything is extracted before anything is printed, so a failed
            # lookup never shows a partial result.
            print("查询结果如下:")
            print('\n'.join(lines))
        except Exception:
            # Interactive boundary: any network or parsing failure is reported
            # and the prompt is shown again.
            print("输入的手机号格式不正确,请重新输入!")


# Weather lookup
def get_weather():
    """Prompt for regions and print the matching rows from wentian123.com."""
    table = '//div[@class="table-inner"]/table'
    while True:
        location = input("输入查询的地区(按q退出天气查询):").strip()
        if location == 'q':
            break
        try:
            res = requests.get(
                url='https://www.wentian123.com/search/',
                headers=headers,
                params={'location': location},
                timeout=REQUEST_TIMEOUT,
            )
            page = etree.HTML(res.text)
            places = page.xpath(f'{table}//td/a/text()')  # place names
            weekdays = page.xpath(f'{table}/tbody/tr/td[1]/p[1]/text()')  # day of week
            conditions = page.xpath(f'{table}/tbody/tr/td[2]/p[2]/span/text()')  # weather
            degrees = page.xpath(f'{table}/tbody/tr/td[3]/p/text()')  # temperature
            winds = page.xpath(f'{table}/tbody/tr/td[4]/p/text()')  # wind direction/force
            for place, condition, degree, wind in zip(places, conditions, degrees, winds):
                if location in place:
                    print(place, condition, degree, wind)
            print(f'现在是:\t{weekdays[0]}\t{_now()}')
        except Exception:
            print("输入的地区有误,请重写输入!")


# Baidu hot search
def bai_du_rei_sou():
    """Let the user pick a Baidu board and print its titles and heat index."""
    while True:
        print('1.热搜榜\t 2.小说\t 3.电影\t 4.电视剧\t 5.汽车\t 6.游戏\t 7.退出百度热榜单查询')
        try:
            index = _read_choice("输入你要查询的榜单:", len(_BAIDU_BOARDS), 7)
            if index is None:
                break
            tab, label = _BAIDU_BOARDS[index]
            print(f'正在查询{label}的榜单:')
            res = requests.get(
                url='https://top.baidu.com/board',
                headers=headers,
                params={'tab': tab},
                timeout=REQUEST_TIMEOUT,
            )
            page = etree.HTML(res.text)
            titles = page.xpath('//div[@class="c-single-text-ellipsis"]/text()')
            hot_nums = page.xpath('//div[@class="hot-index_1Bl1a"]/text()')
            for title, hot in zip(titles, hot_nums):
                print(f'标题:{title}------>热度:{hot}')
            print(f'北京时间:{_now()}')
        except Exception:
            print("输入格式错误,请重新输入!")


# Weibo hot search
def weibo_search():
    """Let the user pick a Weibo board and print its entries."""
    table = '//div[@id="pl_top_realtimehot"]'
    while True:
        print('1.热搜榜\t 2.要闻榜\t 3.文娱榜\t 4.退出微博热榜单查询')
        try:
            # Reading the choice inside the try keeps a non-numeric answer
            # from terminating the whole program.
            index = _read_choice("输入你要查询的榜单:", len(_WEIBO_CATES), 4)
            if index is None:
                break
            cate = _WEIBO_CATES[index]
            res = requests.get(
                url='https://s.weibo.com/top/summary',
                headers=_WEIBO_HEADERS,
                params={'cate': cate},
                timeout=REQUEST_TIMEOUT,
            )
            page = etree.HTML(res.text)
            if cate != 'socialevent':
                ranks = page.xpath(f'{table}//tbody/tr/td[1]/text()')
                # The first link is the pinned entry; the rest are ranked.
                pinned, *titles = page.xpath(f'{table}//tbody/tr/td[2]/a/text()')
                print(f"置顶:------->{pinned}")
                for rank, title in zip(ranks, titles):
                    print(f"{rank}-------->{title}")
            else:
                for title in page.xpath(f'{table}/table/tbody/tr/td[2]/a/text()'):
                    # Drop the first and last character of each title.
                    print(f'o------>{title[1:-1]}')
            print(f'北京时间:{_now()}')
        except Exception:
            print("存在响应或输入问题!重新查询!")


# Bilibili
class Bili:
    """Console views over several Bilibili boards."""

    # NOTE(review): this re-wraps stdout as gb18030 as soon as the class body
    # runs. Presumably meant for a GBK Windows console; on a UTF-8 terminal it
    # will likely garble the output -- confirm before changing.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

    # Headers sent with every Bilibili request made by this class.
    headers = {
        'accept': 'application/json, text/plain, */*',
        'origin': 'https://www.bilibili.com',
        'referer': 'https://www.bilibili.com/',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }

    # (URL slug of the ranking page, label shown to the user), in menu order.
    _RANK_BOARDS = (
        ('all', '全站'), ('bangumi', '番剧'), ('guochan', '国产动画'),
        ('guochuang', '国创相关'), ('documentary', '纪录片'), ('douga', '动画'),
        ('music', '音乐'), ('dance', '舞蹈'), ('game', '游戏'),
        ('knowledge', '知识'), ('tech', '科技'), ('sports', '运动'),
        ('car', '汽车'), ('life', '生活'), ('food', '美食'),
        ('animal', '动物圈'), ('kichiku', '鬼畜'), ('fashion', '时尚'),
        ('ent', '娱乐'), ('cinephile', '影视'), ('movie', '电影'),
        ('tv', '电视剧'), ('variety', '综艺'), ('origin', '原创'),
        ('rookie', '新人'),
    )

    def _fetch_list(self, url, params=None):
        """Return ``data.list`` from a Bilibili JSON endpoint.

        Prints a message and returns an empty list when the request fails or
        the response does not have the expected shape, so one bad response
        does not end the interactive session.
        """
        try:
            res = requests.get(url, headers=self.headers, params=params,
                               timeout=REQUEST_TIMEOUT)
            return res.json()['data']['list'] or []
        except (requests.RequestException, ValueError, KeyError, TypeError):
            print("存在响应或输入问题!重新查询!")
            return []

    @staticmethod
    def _print_videos(datas):
        """Print one line per video entry, each followed by a blank line."""
        for data in datas:
            print(
                f'{data["tname"]}-------标题:{data["title"]}-----up:{data["owner"]["name"]}------播放量:{data["stat"]["view"]}-----评论数:{data["stat"]["reply"]}------投币数:{data["stat"]["coin"]}-------点赞数{data["stat"]["like"]}')
            print()

    @staticmethod
    def _save_excel(rows, name):
        """Write ``rows`` to ``<YYYY_MM_DD><name>.xlsx`` in the working directory.

        ``DataFrame.to_excel`` is called with the file name directly:
        ``ExcelWriter.save()`` and the ``encoding`` argument of ``to_excel``
        no longer exist in pandas 2.x.
        """
        stamp = datetime.date.today().strftime('%Y_%m_%d')
        pd.DataFrame(rows).to_excel(f'{stamp}{name}.xlsx', index=False)
        print(f'{name}已保存!')

    # Ranking boards
    def get_bili(self):
        """Let the user pick a ranking board, print it and optionally export it."""
        boards = self._RANK_BOARDS
        menu = ' '.join(
            f'{number}.{label}' for number, (_, label) in enumerate(boards, 1)
        ) + ' 100.退出当前查询'
        while True:
            print(menu)
            try:
                index = _read_choice("你要查询的类型榜单:", len(boards), 100)
                if index is None:
                    break
                slug, label = boards[index]
                print(f'正在查询{label}的榜单:')
                # ``headers`` must be passed by keyword: the second positional
                # argument of ``requests.get`` is ``params``.
                res = requests.get(
                    f'https://www.bilibili.com/v/popular/rank/{slug}',
                    headers=self.headers,
                    timeout=REQUEST_TIMEOUT,
                )
                page = etree.HTML(res.text)
                # Flat list of text nodes: uploader, plays, comments, repeated.
                fields = [
                    text.strip().replace('\n', '')
                    for text in page.xpath('//div[@class="detail"]//span/text()')
                ]
                titles = page.xpath('//div[@class="info"]/a/text()')
                # Group into complete triples only; a trailing partial group
                # is ignored.
                triples = [fields[i:i + 3] for i in range(0, len(fields) - 2, 3)]
                print(page.xpath('//div[@id="app"]//ul[@class="rank-tab"]/li/text()'))
                rows = [
                    {"标题": title, "up": up, "播放量": plays, "评论": comments}
                    for (up, plays, comments), title in zip(triples, titles)
                ]
                for row in rows:
                    print(f'标题:{row["标题"]}------------up:{row["up"]}------------播放:{row["播放量"]}------------评论:{row["评论"]}')
                print(f'查询{label}榜单完毕!')
                if input("是否保存到本地? 'y/n':").strip() == 'y':
                    self._save_excel(rows, label)
            except Exception:
                print("输入格式有误或响应错误,重新输入")

    # Overall popular videos
    def zong_he_rm(self):
        """Print the first 50 popular videos and optionally export them."""
        datas = self._fetch_list(
            'https://api.bilibili.com/x/web-interface/popular',
            {'ps': '50', 'pn': '1'},  # page size, page number
        )
        if not datas:
            return
        self._print_videos(datas)
        if input("是否保存到本地? 'y/n':").strip() == 'y':
            rows = [
                {
                    "板块": data["tname"],
                    "标题": data["title"],
                    "up": data["owner"]["name"],
                    "播放量": data["stat"]["view"],
                    "评论数": data["stat"]["reply"],
                    "投币数": data["stat"]["coin"],
                    "点赞数": data["stat"]["like"],
                }
                for data in datas
            ]
            self._save_excel(rows, 'b站综合热门')

    # Weekly must-watch, one issue
    def weekly(self, num_page):
        """Print the videos of weekly issue number ``num_page``."""
        datas = self._fetch_list(
            'https://api.bilibili.com/x/web-interface/popular/series/one',
            {'number': num_page},
        )
        self._print_videos(datas)

    # Weekly must-watch, list of issues
    def week2(self):
        """Print every weekly issue and return the list of issue numbers."""
        issues = self._fetch_list(
            'https://api.bilibili.com/x/web-interface/popular/series/list')
        for issue in issues:
            print(f'期数:{issue["number"]}----------{issue["subject"]}-----------{issue["name"]}')
        return [issue["number"] for issue in issues]

    # Must-watch classics
    def r_z(self):
        """Print the first 100 entries of the "precious" board."""
        datas = self._fetch_list(
            'https://api.bilibili.com/x/web-interface/popular/precious',
            {'page_size': '100', 'page': '1'},
        )
        self._print_videos(datas)

    # Site-wide music chart
    def music_(self, num):
        """Print the music chart whose ``list_id`` is ``num``."""
        url = 'https://api.bilibili.com/x/copyright-music-publicity/toplist/music_list'
        try:
            # The request is inside the try so that a timeout is reported by
            # the message below instead of propagating.
            res = requests.get(url, headers=self.headers,
                               params={'list_id': num}, timeout=REQUEST_TIMEOUT)
            datas = res.json()['data']['list']
            for data in datas:
                print(
                    f'歌名:{data["music_title"]}-----歌手:{data["singer"]}-----热度:{data["heat"]}-----播放量:{data["creation_play"]}------up:{data["creation_nickname"]}------成就:{data["achievements"]}')
                print()
        except Exception:
            print("响应超时或改期数不存在! 请查询输入!")


def _bili_menu():
    """Run the Bilibili sub-menu until the user chooses to leave it."""
    bl = Bili()
    print("----进入查询b站界面----")
    while True:
        print("1.综合热门 2.每周必看 3.入站必刷 4.排行榜 5.全站音乐榜 6.退出当前查询")
        n5 = input("输入你要查询的板块:").strip()
        if n5 == '1':
            bl.zong_he_rm()
        elif n5 == '2':
            qi_shu = bl.week2()
            while True:
                print(qi_shu)
                num = input("输入你要查看的期数(按q退出):").strip()
                if num == 'q':
                    break
                bl.weekly(num)
        elif n5 == '3':
            bl.r_z()
        elif n5 == '4':
            bl.get_bili()
        elif n5 == '5':
            while True:
                num1 = input('输入查询期号(按q退出!):').strip()
                if num1 == 'q':
                    break
                bl.music_(num=num1)
        elif n5 == '6':
            break
        else:
            print("输入的格式有误!请重新输入")


def main():
    """Show the top-level menu and dispatch to the chosen feature until 'q'."""
    while True:
        print(_MENU_RULE)
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t ???Welcome to into???')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 1:查询手机号')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 2:查询天气')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 3:查询百度热搜')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 4:查询微博热搜')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 5:查询b站页面')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t 6:ai问答区')
        print('->\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t q:退出查询')
        print(_MENU_RULE)
        order = input("输入查询的序号:").strip()
        if order == '1':
            print("----进入查询手机号界面----")
            phone_number()
        elif order == '2':
            print("----进入查询天气界面----")
            get_weather()
        elif order == '3':
            print("----进入查询百度热搜界面----")
            bai_du_rei_sou()
        elif order == '4':
            print("----进入查询微博热搜界面----")
            weibo_search()
        elif order == '5':
            _bili_menu()
        elif order == '6':
            # Imported lazily: the AI module lives in a separate file and is
            # only needed for this option.
            from re_Ai import input_get_info
            input_get_info()
        elif order == 'q':
            # ``sys.exit`` is used because the bare ``exit`` helper is only
            # installed by the ``site`` module.
            sys.exit()
        else:
            print("输入的格式有误!请重新输入")


if __name__ == '__main__':
    main()
本文来自博客园,作者:__username,转载请注明原文链接:https://www.cnblogs.com/code3/p/17091119.html
【推荐】2025 HarmonyOS 鸿蒙创新赛正式启动,百万大奖等你挑战
【推荐】博客园的心动:当一群程序员决定开源共建一个真诚相亲平台
【推荐】开源 Linux 服务器运维管理面板 1Panel V2 版本正式发布
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步