Python BeautifulSoup 爬虫实战——抓取 ACM 队员的 AtCoder 和 Codeforces 比赛数据

首先需要安装 bs4 包,命令如下:pip install beautifulsoup4(下面的代码还用到了 requests 和 lxml 解析器,如未安装请一并执行 pip install requests lxml)
对上面网页进行分析,抓取历史参赛数据的代码如下:
# pip install BeautifulSoup4
from bs4 import BeautifulSoup
import requests
import json, time

def getUrlText(url, timeout=10):
    """Download *url* and return the response body as text.

    The request is retried indefinitely, sleeping 3 seconds between
    attempts, until one succeeds. The HTTP status code is not inspected:
    the body of any response that arrives is returned.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before the attempt is
            abandoned and retried. Without a timeout a stalled connection
            would block forever and the retry loop would never run.

    Returns:
        The decoded response body (``Response.text``).
    """
    while True:
        try:
            return requests.get(url, timeout=timeout).text
        except requests.exceptions.ConnectionError:
            print('ConnectionError -- please wait 3 seconds')
        except requests.exceptions.ChunkedEncodingError:
            print('ChunkedEncodingError -- please wait 3 seconds')
        except requests.exceptions.Timeout:
            print('Timeout -- please wait 3 seconds')
        except Exception:
            # Deliberately broad so the fetch stays best-effort, but not a
            # bare ``except:`` -- that would also swallow KeyboardInterrupt
            # and SystemExit, making this endless loop impossible to stop.
            print('Unfortunitely -- An Unknow Error Happened, Please wait 3 seconds')
        time.sleep(3)

def getCFUserData(cfID):
    """Return the rating history of a Codeforces user.

    Queries the ``user.rating`` API method for *cfID* and converts each
    entry of the ``result`` array into a flat dict.

    Args:
        cfID: Codeforces handle, appended verbatim to the query string.

    Returns:
        A list of dicts with the keys ``date`` (the entry's
        ``ratingUpdateTimeSeconds``), ``contest`` (``contestName``),
        ``rank``, ``newRating`` and ``diff`` (``newRating - oldRating``).
        An empty list is returned when the response is not valid JSON, is
        not a JSON object, or has no ``result`` key.
    """
    url = "https://codeforces.com/api/user.rating?handle=" + cfID
    html = getUrlText(url)
    try:
        js = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError; raised when the body is not
        # JSON at all (for example an HTML error page).
        return []
    if not isinstance(js, dict) or 'result' not in js:
        return []
    return [
        {
            'date': d["ratingUpdateTimeSeconds"],
            'contest': d["contestName"],
            'rank': d["rank"],
            'newRating': d["newRating"],
            'diff': int(d["newRating"]) - int(d["oldRating"]),
        }
        for d in js['result']
    ]

# atcoder 
def getACUserData(acID):
    """Return the contest history scraped from an AtCoder user's history page.

    Downloads ``https://atcoder.jp/users/<acID>/history``, locates the
    element with id ``history`` and reads one record per table row.

    Args:
        acID: AtCoder user name, inserted verbatim into the URL.

    Returns:
        A list of dicts with the keys ``date``, ``contest``, ``rank``,
        ``newRating`` and ``diff``; every value is the text found in the
        page (strings, not numbers). An empty list is returned when the
        page has no ``#history`` element.
    """
    url = "https://atcoder.jp/users/" + acID + "/history"
    html = getUrlText(url)
    soup = BeautifulSoup(html, features="lxml")
    table = soup.select_one('#history')
    if table is None:
        return []

    def child_text(cell, selector):
        # Text of the first descendant matching *selector*; falls back to
        # the cell's own text when no such descendant exists, so a cell
        # without the expected wrapper element does not raise IndexError.
        node = cell.select_one(selector)
        return cell.text if node is None else node.text

    data_list = []
    # The first row is skipped unconditionally (treated as the header).
    for tr in table.select('tr')[1:]:
        tds = tr.select('td')
        if len(tds) < 6:
            # Indices 0..5 are read below; a shorter row cannot be parsed.
            continue
        diff_nodes = tds[5].contents
        data_list.append({
            'date': child_text(tds[0], 'time'),
            'contest': child_text(tds[1], 'a'),
            'rank': child_text(tds[2], 'a'),
            'newRating': child_text(tds[4], 'span'),
            # First child node of the cell, converted to a plain str so the
            # result does not carry a bs4 node; empty cell -> ''.
            'diff': str(diff_nodes[0]) if diff_nodes else '',
        })

    return data_list

if __name__ == "__main__":
    # Demo run: print the AtCoder history first, then the Codeforces one.
    atcoder_handle = "Trebleb"  # alternative sample handle: "a2018040538"
    print(getACUserData(atcoder_handle))

    codeforces_handle = "bhyyb"  # alternative sample handle: "Fefer_Ivan"
    print(getCFUserData(codeforces_handle))

 

 

posted @ 2020-01-27 19:52  liuyong0076  阅读(835)  评论(0)  编辑  收藏  举报