爬虫xpath


def get(timeout=10):
    """Scrape a table on the bgp.he.net AS137695 page into a dict.

    Fetches the page, walks the rows of one table selected by a fixed
    absolute XPath, and maps the text of the link in each row's fourth cell
    to the text of its second cell. Rows missing either value are skipped.
    The resulting mapping is printed and also returned.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        dict[str, str]: fourth-cell link text -> second-cell text
        (presumably AS number -> description, judging by the variable
        names in the original code; verify against the live page).

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
    """
    # Imported locally so the function is self-contained: the surrounding
    # file shows no module-level imports for these names.
    import requests
    from lxml import etree

    # NOTE(review): the Cookie value looks like it was captured from a
    # browser session and is hard-coded; it presumably expires. Confirm it
    # is still required and consider loading it from configuration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
        "Cookie": "__utma=83743493.2124382466.1651801022.1651801022.1651801022.1; __utmc=83743493; __utmz=83743493.1651801022.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; c=BAgiEjExNC4yNTAuMTYuNzE%3D--24719bef0974c111f4021396aa1aab5d6c1a55aa; _bgp_session=BAh7BjoPc2Vzc2lvbl9pZEkiJTQyZDM4Y2QwZTEyZDA3NDY2N2ExZjYzODljNDA3MTA2BjoGRUY%3D--97bf7880ade319982a10f7fcb96a5b29c0329859; __utmb=83743493.3.10.1651801022",
    }
    # The "#_peers6" fragment is client-side only and is not sent to the
    # server; the table is selected by the XPath below, not by the fragment.
    url = 'https://bgp.he.net/AS137695#_peers6'

    # Without a timeout requests can block forever on an unresponsive host.
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into {}.
    resp.raise_for_status()

    html = etree.HTML(resp.text)
    # NOTE(review): this absolute path depends on the exact page layout
    # (div[8]) and on <tbody> being present in the served markup. lxml does
    # not synthesize <tbody> the way browser DOMs do - confirm against the
    # raw HTML if this returns no rows.
    rows = html.xpath('/html/body/div[2]/div[8]/table/tbody/tr') if html is not None else []

    asdic = {}
    for row in rows:
        des_nodes = row.xpath('./td[2]/text()')
        asnum_nodes = row.xpath('./td[4]/a/text()')
        # Skip rows that lack either cell rather than raising IndexError.
        if not des_nodes or not asnum_nodes:
            continue
        asdic[asnum_nodes[0].strip()] = des_nodes[0].strip()

    print(asdic)
    return asdic

posted @ 2022-05-06 13:48  lifei888  阅读(24)  评论(0)    收藏  举报