# 数之联官网新闻爬取

import requests
import pandas as pd
import random
from time import sleep
import json


def shuzhilian(keyword, *, csv_path=r'D:\桌面\shuzhilian.csv', pages=range(1, 20)):
    """Crawl the paginated news list and append one CSV row per article.

    For every page number in ``pages`` the article-list endpoint is
    requested and its JSON body is read at ``['data']['data']``. Each item
    in that list contributes one row with the columns
    ``keyword, title, href, retime, intro``, appended to ``csv_path``
    without a header line and without an index column.

    Args:
        keyword: Value written unchanged into the ``keyword`` column of
            every row. It is not sent to the server.
        csv_path: File the rows are appended to. Defaults to the location
            the script has always used.
        pages: Iterable of page numbers to request. Defaults to pages
            1 through 19.

    Raises:
        requests.exceptions.RequestException: If a request fails or does
            not answer within the timeout.
        KeyError: If the response JSON or an item lacks an expected key.
    """
    # Loop-invariant, so built once; kept as a list so more user agents
    # can be added for random.choice to rotate through.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]

    for page in pages:
        page_url = f'https://www.17hongtu.cn/third_Party/Build/getArticle?id=340&page={page}&cate_id=0'
        request_headers = {"User-Agent": random.choice(user_agents)}
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(url=page_url, headers=request_headers, timeout=10).json()

        for item in res['data']['data']:
            title = item['name']
            article_id = item['id']
            # The detail-page link embeds the article's own id (the original
            # interpolated the builtin ``id`` function by mistake).
            href = f'http://www.unionbigdata.com/news/detail/news-9682-{article_id}-1.html'
            retime = item['create_time']
            intro = item['description']
            print(f'{title}的网址为{href}')

            info = pd.DataFrame(
                {'keyword': keyword, 'title': title, 'href': href, 'retime': retime, 'intro': intro},
                index=[1],
            )
            # utf_8_sig writes a BOM so spreadsheet tools detect the encoding
            # of the Chinese text; mode='a' accumulates rows across calls.
            info.to_csv(csv_path, mode='a', header=False, index=False, encoding='utf_8_sig')
            # Short pause after each stored article.
            sleep(.3)
            
# Run the crawl; the company name is stored in the 'keyword' column of each row.
shuzhilian(keyword='成都数之联科技股份有限公司')

# 说明：代码仅供学习参考使用，请勿用于任何非法用途，否则自行承担法律责任。

# posted @ 2023-03-05 13:53  知识荒野  阅读(15)  评论(0)    收藏  举报