Python 爬取微博热搜

链接地址:https://www.cnblogs.com/JustNo/p/10726802.html

微博热搜地址:https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6

在微博热搜页面查看网页源代码,找到 class="td-02" 的单元格;如果要获取其中的 href 链接地址,则需要在 a 后面增加 //@href:

href=html.xpath('//td[@class="td-02"]/a//@href')
 
源码如下,其中我调用了钉钉机器人:
###导入模块
import requests
from lxml import etree
import requests,json

### Weibo hot-search page to scrape
url = "https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6"
### Request headers that imitate a desktop browser
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.103 Safari/537.36'
    ),
}

### Main function
def main():
    """Scrape the Weibo hot-search page and send it to DingTalk as markdown.

    Fetches ``url`` with the ``header`` request headers, extracts the rank,
    title, view-count and link columns with XPath, renders them as one
    markdown string and passes that string to ``sendinfo_ding``.

    If the page yields no titles or no links, a notice is printed and
    nothing is sent.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url, headers=header, timeout=10)
    html = etree.HTML(response.text)
    rank = html.xpath('//td[@class="td-01 ranktop"]/text()')
    affair = html.xpath('//td[@class="td-02"]/a/text()')
    view = html.xpath('//td[@class="td-02"]/span/text()')
    href = html.xpath('//td[@class="td-02"]/a//@href')
    if not affair or not href:
        # Without this guard the indexing below fails with a bare IndexError.
        print("No hot-search entries found on the page; nothing sent.")
        return
    sendinfo_ding(_build_markdown(rank, affair, view, href))


def _build_markdown(rank, affair, view, href):
    """Render the scraped columns as the markdown text of the message."""
    # The first title/link pair becomes the heading and is written without a
    # rank or view count.
    lines = [
        "### top:[{0}](https://s.weibo.com{1}) \n".format(affair[0], href[0])
    ]
    # The remaining titles and links are therefore offset by one relative to
    # the rank and view columns. zip() stops at the shortest column, so a
    # short column truncates the list instead of raising IndexError.
    for position, title, views, link in zip(rank, affair[1:], view, href[1:]):
        lines.append(
            ">- {0}\t [{1}](https://s.weibo.com{2})\t {3}\n".format(
                position, title, link, views
            )
        )
    return "".join(lines)


def sendinfo_ding(data):
    """Post ``data`` to a DingTalk robot webhook as a markdown message.

    Args:
        data: Markdown text used as the body of the message.

    The response object returned by the POST is printed; nothing is returned.
    """
    url = '你的机器人地址'  # placeholder: replace with your robot's webhook address
    program = {
        "msgtype": "markdown",
        "markdown": {
            "title": "微博热搜",
            "text": data,
        },
    }
    headers = {'Content-Type': 'application/json'}
    # Bound the request so a stalled webhook cannot hang the script forever.
    f = requests.post(
        url, data=json.dumps(program), headers=headers, timeout=10
    )
    print(f)

# Run the scrape-and-notify routine as soon as this file is loaded.
main()

 

posted @ 2019-09-18 16:11  Alex_Mercer  阅读(362)  评论(0)    收藏  举报