背景
之前可以通过直接调用 https://m.weibo.cn/api/container/getIndex?containerid={containerid}&page={page} 的方式获取对应用户的微博,但最近爬取时发现这个接口已经停用,所以需要一种新的方式来进行爬取。
新方法
通过利用https://weibo.com/ajax/statuses/mymblog?uid=××× 方式去爬
- 电脑浏览器登录 weibo.com → 点开目标用户主页 → F12 → Network 里任意刷新一下。
- 找一条 https://weibo.com/ajax/statuses/mymblog?uid=××× 的请求,把 Request Headers 里的
a. Cookie 整段复制下来(只需要 SUB= 那一小段也可以,但整段最稳)。
b. x-xsrf-token
c. sec-fetch-* 系列(共 3 行)
d. x-requested-with
整段复制下来备用(x-xsrf-token 每次登录会变,但 24 小时内有效)。
- 在代码里补上以下字段:
# --- user configuration: fill these three in before running ----------------
cookie = '把你刚才复制的 Cookie 整段粘进来'  # full Cookie header copied from the browser's Network tab
uid = '目标用户的 uid(网址里那串数字)'  # numeric uid from the profile URL
# NOTE(review): name ends in .csv but saveToExcel() writes xlsx format — confirm extension.
save = f'{uid}_weibo.csv'
# Request headers mirroring a logged-in browser session; the Sec-Fetch-* and
# X-Xsrf-Token entries are required by the new weibo.com ajax endpoint.
HEAD = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
'Cookie': cookie,
'Referer': f'https://weibo.com/u/{uid}',
'X-Requested-With': 'XMLHttpRequest',
'X-Xsrf-Token': '刚才复制的值',  # newly added: anti-CSRF token from DevTools; changes on each login
'Sec-Fetch-Site': 'same-origin',  # newly added
'Sec-Fetch-Mode': 'cors',  # newly added
'Sec-Fetch-Dest': 'empty'  # newly added
}
import requests
import pandas as pd
from datetime import datetime
import re
from zoneinfo import ZoneInfo
import random
import time
import logging
def fetch_one(page: int):
    """Fetch one page of the target user's timeline from the ajax endpoint.

    Reads module globals ``uid`` and ``HEAD``.

    Args:
        page: 1-based page number to request.

    Returns:
        The list of post dicts from ``data.list``, or ``[]`` when the request
        is blocked, the cookie has expired, or the payload is not the
        expected JSON structure.
    """
    # Plain string: the original used an f-string with no placeholders.
    url = 'https://weibo.com/ajax/statuses/mymblog'
    params = {'uid': uid, 'page': page, 'feature': 0}
    r = requests.get(url, params=params, headers=HEAD, timeout=10)
    if r.status_code != 200:
        logging.warning('!!! 被拦 / cookie 失效,先停 30s')
        time.sleep(30)
        return []
    try:
        return r.json()['data']['list']
    except (ValueError, KeyError, TypeError):
        # Blocked sessions sometimes return HTML or an error JSON with HTTP 200;
        # the original would crash here instead of skipping the page.
        logging.warning('响应不是预期的 JSON 结构,跳过本页')
        return []
def saveToExcel(reposts, filename):
    """Write the collected post dicts to an Excel workbook at *filename*.

    NOTE(review): callers pass a name ending in '.csv' while this writes
    xlsx format via ``to_excel`` — confirm which was intended.
    """
    logging.info('开始保存数据')
    frame = pd.DataFrame(reposts)
    frame.to_excel(filename, index=False)
def main(max_pages: int = 5):
    """Crawl pages 1..max_pages-1 of the target user's timeline, keep posts
    whose timestamp falls inside [start_date, end_date], and save them.

    Reads module globals ``uid``, ``save``, ``start_date``, ``end_date``.

    Args:
        max_pages: exclusive upper bound on the page counter (default 5,
            i.e. pages 1-4, matching the original hard-coded loop).
    """
    reposts = []
    page = 1
    while page < max_pages:
        posts = fetch_one(page)
        if not posts:
            break
        kept = 0  # posts actually stored from this page (the old log counted fetched ones)
        for card in posts:
            created = card.get('created_at')
            if not created:
                # Defensive: a card without a timestamp cannot be filtered.
                continue
            # Timestamps look like 'Wed Oct 01 12:00:00 +0800 2025'; drop the
            # tzinfo so we can compare against the naive window bounds.
            repost_time = datetime.strptime(created, '%a %b %d %H:%M:%S %z %Y').replace(tzinfo=None)
            if not (start_date <= repost_time <= end_date):
                continue
            repost_content = card.get('text', '')
            repost_url = f"https://weibo.com/{uid}/{card.get('mid')}"
            cleaned_text = ''
            retweeted = card.get('retweeted_status')
            # BUGFIX: the original compared `card.get(...) != {}` — True when the
            # key is absent (None != {}) — and then subscripted None (TypeError).
            if retweeted:
                retweeted_content = retweeted.get('text_raw', '')
                # Keep only CJK characters and common Chinese punctuation.
                cleaned_text = re.sub(r'[^\u4e00-\u9fa5,。!?、;:“”‘’()…—~\n]', '', retweeted_content)
            row = {
                # No need for the old fromisoformat(isoformat()) round-trip.
                '时间': repost_time.strftime("%Y-%m-%d %H:%M:%S"),
                '文本': cleaned_text if cleaned_text else repost_content,
                'url': repost_url,
            }
            logging.info(row)
            reposts.append(row)
            kept += 1
        logging.info(f'第 {page} 页完成,已存 {kept} 条')
        page += 1
        time.sleep(1.5)  # be polite: ~1.5 s between pages rarely triggers 403
    saveToExcel(reposts, save)
    logging.info('全部搞定 →')
if __name__ == '__main__':
    # BUGFIX: main() reads the global `start_date`; the original bound the
    # lower window bound to `start_time`, causing a NameError at runtime.
    start_date = datetime.strptime('2025-10-01', '%Y-%m-%d')
    end_date = datetime.strptime('2025-11-01', '%Y-%m-%d')
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    main()
浙公网安备 33010602011771号