Python BeautifulSoup 爬虫实战--抓取 ACM 队员 AtCoder 和 Codeforces 比赛数据
首先需要安装 bs4 包,以及代码中用到的 requests 和 lxml(BeautifulSoup 在代码里使用 lxml 解析器),命令如下:pip install beautifulsoup4 requests lxml
- atcoder提供了单个用户的历史比赛信息网页:https://atcoder.jp/users/a2018040538/history
- codeforces提供了api,使用json数据格式,网址:https://codeforces.com/api/user.rating?handle=Fefer_Ivan
对上面网页进行分析,抓取历史参赛数据的代码如下:
# Dependencies: pip install beautifulsoup4 requests lxml
import json
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Pause between retries. The log messages in getUrlText hard-code the same
# value, so keep them in sync if this is ever changed.
RETRY_DELAY_SECONDS = 3
# Without a timeout a stalled connection would block forever and never reach
# the retry path.
REQUEST_TIMEOUT_SECONDS = 10


def getUrlText(url):
    """Fetch *url* with HTTP GET and return the response body as text.

    Request failures are retried indefinitely, sleeping
    ``RETRY_DELAY_SECONDS`` between attempts, so this function only returns
    once a response has been received. The HTTP status code is not checked;
    error pages are returned like any other body.

    Only ``requests`` exceptions are retried, so ``KeyboardInterrupt`` and
    programming errors propagate instead of being swallowed by the loop.
    """
    while True:
        try:
            return requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS).text
        except requests.exceptions.ConnectionError:
            print('ConnectionError -- please wait 3 seconds')
        except requests.exceptions.ChunkedEncodingError:
            print('ChunkedEncodingError -- please wait 3 seconds')
        except requests.exceptions.RequestException:
            print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
        time.sleep(RETRY_DELAY_SECONDS)


def getCFUserData(cfID):
    """Return the Codeforces rating history of the user with handle *cfID*.

    Calls the ``user.rating`` API endpoint and converts every entry of the
    ``result`` array into a dict with the keys ``date`` (the raw
    ``ratingUpdateTimeSeconds`` value), ``contest``, ``rank``, ``newRating``
    and ``diff`` (``newRating - oldRating`` as an int).

    Returns an empty list when the response is not valid JSON or has no
    ``result`` key.
    """
    # Percent-encode the handle so unusual characters cannot alter the query.
    url = "https://codeforces.com/api/user.rating?handle=" + quote(cfID, safe='')
    html = getUrlText(url)
    try:
        js = json.loads(html)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError; the body was not JSON
        # (for example an HTML error page).
        return []
    if not isinstance(js, dict) or 'result' not in js:
        return []
    return [
        {
            'date': d["ratingUpdateTimeSeconds"],
            'contest': d["contestName"],
            'rank': d["rank"],
            'newRating': d["newRating"],
            'diff': int(d["newRating"]) - int(d["oldRating"]),
        }
        for d in js['result']
    ]


def _firstText(cell, selector):
    """Return the text of the first *selector* match in *cell*, else the cell's own text."""
    matches = cell.select(selector)
    return matches[0].text if matches else cell.text


# atcoder
def getACUserData(acID):
    """Return the AtCoder contest history of user *acID*.

    Downloads ``https://atcoder.jp/users/<acID>/history`` and reads the rows
    of the element with id ``history``. The result is a list of dicts::

        {'date': ..., 'contest': ..., 'rank': ..., 'newRating': ..., 'diff': ...}

    All values are taken from the page as text and are not converted to
    numbers. Cells 0, 1, 2, 4 and 5 of each row are used; cell 3 is ignored.

    Returns an empty list when the page has no ``#history`` element. The
    first row (the table header) and rows with fewer than six ``<td>`` cells
    are skipped.
    """
    url = "https://atcoder.jp/users/" + quote(acID, safe='') + "/history"
    html = getUrlText(url)
    soup = BeautifulSoup(html, features="lxml")
    tables = soup.select('#history')
    if not tables:
        return []

    data_list = []
    # [1:] drops the header row.
    for tr in tables[0].select('tr')[1:]:
        tds = tr.select('td')
        if len(tds) < 6:
            # Not a regular data row; indexing below would fail.
            continue
        diffCell = tds[5]
        data_list.append({
            'date': _firstText(tds[0], 'time'),
            'contest': _firstText(tds[1], 'a'),
            'rank': _firstText(tds[2], 'a'),
            'newRating': _firstText(tds[4], 'span'),
            # First child node of the cell; an empty cell yields ''.
            'diff': diffCell.contents[0] if diffCell.contents else '',
        })
    return data_list


if __name__ == "__main__":
    acID = "Trebleb"  # "a2018040538"
    acData = getACUserData(acID)
    print(acData)

    cfID = "bhyyb"  # "Fefer_Ivan"
    cfData = getCFUserData(cfID)
    print(cfData)