
Foreign Exchange News Scraper

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import os
from docx import Document

def get_url(i):
    if i == 1:
        url = 'https://www.safe.gov.cn/safe/whxw/index.html'
    else:
        # An f-string (a string literal prefixed with f) replaces each
        # brace-enclosed expression with its value at runtime; without the
        # f prefix the literal "{i}" is requested and the scrape returns
        # an empty list.
        url = f'https://www.safe.gov.cn/safe/whxw/index_{i}.html'
    return url
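
# A quick illustration of the f-string remark above (hypothetical REPL
# session, not part of the original script): without the f prefix the
# braces survive literally, so the crawler would request a page that
# does not exist.
#
#   >>> i = 2
#   >>> 'index_{i}.html'
#   'index_{i}.html'
#   >>> f'index_{i}.html'
#   'index_2.html'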

def get_html_text(url):
    """get text from html"""
    r = requests.get(url)
    r.encoding = r.apparent_encoding  # use the detected encoding to avoid garbled text (mojibake)
    text = r.text
    return text
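
# A more defensive fetch (sketch; get_html_text_robust is a hypothetical
# drop-in replacement, not part of the original script). A User-Agent
# header and a timeout guard against hanging connections, and
# raise_for_status() fails fast on HTTP errors instead of silently
# parsing an error page.
def get_html_text_robust(url, timeout=10):
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # detected encoding, as above
    return r.text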

def get_attr(li):
    """get attribute from the html"""
    title = li.find('a').text
    href = li.find('a').get('href')[1:]  # drop the leading character to get the article's relative path
    child_url = 'https://www.safe.gov.cn/' + href
    date = li.find('dd').text
    return (title, child_url, date)

def bs_content(soup):
    """get content by bs4"""
    title = soup.find('div', class_='detail_tit').text  # news title
    # remove '/' and '|' from the title before saving: neither character is
    # allowed in file names, and keeping them would crash the program
    title = title.replace('/', '')
    title = title.replace('|', '')
    content = soup.find('div', class_='detail_content').text  # news body
    return title, content
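
# A broader filename sanitizer (sketch; sanitize_filename is hypothetical
# and not wired into bs_content above). Windows forbids \ / : * ? " < > |
# in file names, so stripping the whole set is safer than removing only
# '/' and '|'.
import re

def sanitize_filename(name):
    return re.sub(r'[\\/:*?"<>|]', '', name)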

def check_path(path):
    """check subapth"""
    if not os.path.exists(path):
        os.makedirs(path)

def save_content(title, content, date, save_method):
    """save the article in one of 2 ways: txt or docx"""
    if save_method == 'txt':
        check_path('./text_file')

        title = title + '.txt'
        filepath = os.path.join('./text_file', title)
        with open(filepath, 'w', encoding='utf-8') as f:
            content = " ".join(content.split())
            f.write(content)

    elif save_method == 'doc':
        check_path('./text_doc')

        doc = Document()
        doc.add_heading(title)  # add the news title as a heading
        doc.add_paragraph(date)  # add the publication date
        doc.add_paragraph(content)  # add the news body
        doc.save(f'./text_doc/{title}.docx')  # save the document
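
# Note (an assumption, not from the original post): two articles with the
# same title would overwrite each other's file. Prefixing the date keeps
# the names unique in the common case, e.g.:
#
#   save_content(f'{date}_{title}', content, date, 'txt')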


if __name__ == '__main__':
    items = []  # collects one [title, date, url] row per article
    num_pages = 100
    for i in tqdm(range(1, num_pages + 1)):  # range is half-open, so +1 covers pages 1..num_pages
        url = get_url(i)
        text = get_html_text(url)

        soup = BeautifulSoup(text, 'html.parser')
        li_list = soup.find('div', class_='list_conr').find_all('li')
        for li in li_list:
            title, child_url, date = get_attr(li)
            items.append([title, date, child_url])  # append this article's fields as one row

    df = pd.DataFrame(items, columns=['标题', '发布时间', '链接'])  # columns: title, publication date, link
    df.to_csv(f'国家外汇管理局外汇新闻信息{num_pages}页.csv', index=False, encoding='utf_8_sig')  # utf_8_sig writes a BOM so Excel displays the Chinese text correctly instead of mojibake

    for i in range(len(df)):
        url = df.iloc[i]['链接']
        date = df.iloc[i]['发布时间']
        text = get_html_text(url)
        soup = BeautifulSoup(text, 'html.parser')
        title, content = bs_content(soup)

        save_method = 'txt'
        save_content(title, content, date, save_method)
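
The two loops above fire one request per page and per article with no pause between them. If the site starts throttling, a short delay between requests is a gentle fix (a minimal sketch, assuming it is placed inside each loop body):

    import time
    time.sleep(0.5)  # pause half a second between consecutive requests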
