import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import os
from docx import Document


def get_url(i):
    """Build the news-listing URL for page i."""
    if i == 1:
        url = 'https://www.safe.gov.cn/safe/whxw/index.html'
    else:
        # An f-string (the f prefix) substitutes the value of {i} at run time;
        # without it the literal '{i}' is requested, the page does not exist,
        # and the scrape comes back as an empty list.
        url = f'https://www.safe.gov.cn/safe/whxw/index_{i}.html'
    return url
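

# Illustrative pagination, matching the scheme above:
#   get_url(1) -> 'https://www.safe.gov.cn/safe/whxw/index.html'
#   get_url(2) -> 'https://www.safe.gov.cn/safe/whxw/index_2.html'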


def get_html_text(url):
    """Fetch a page and return its decoded HTML text."""
    r = requests.get(url, timeout=30)  # timeout so a stalled connection cannot hang the crawl
    r.encoding = r.apparent_encoding  # use the detected encoding to avoid mojibake
    text = r.text
    return text
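

# Optional hardening (a sketch, not part of the original script): reuse one
# HTTP session and retry transient failures so a single flaky response does
# not abort a long crawl. The retry count and backoff are assumptions.
def get_html_text_robust(url, session=None):
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    if session is None:
        session = requests.Session()
        session.mount('https://', HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))
    r = session.get(url, timeout=30)
    r.raise_for_status()  # fail loudly instead of parsing an HTTP error page
    r.encoding = r.apparent_encoding
    return r.text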


def get_attr(li):
    """Pull title, article URL, and date out of one <li> element."""
    title = li.find('a').text
    href = li.find('a').get('href')[1:]  # drop the leading '/' from the article link
    child_url = 'https://www.safe.gov.cn/' + href
    date = li.find('dd').text
    return (title, child_url, date)


def bs_content(soup):
    """Extract title and body text from an article page with bs4."""
    title = soup.find('div', class_='detail_tit').text  # news title
    # '/' and '|' are not allowed in filenames; strip them now, otherwise
    # saving the file later aborts the program.
    title = title.replace('/', '')
    title = title.replace('|', '')
    content = soup.find('div', class_='detail_content').text  # news body
    return title, content


def check_path(path):
    """Create the directory if it does not exist yet."""
    if not os.path.exists(path):
        os.makedirs(path)


def save_content(title, content, save_method, date=''):
    """Save one article in one of two formats: txt or doc."""
    if save_method == 'txt':
        check_path('./text_file')
        title = title + '.txt'
        filepath = os.path.join('./text_file', title)
        with open(filepath, 'w', encoding='utf-8') as f:
            content = " ".join(content.split())  # collapse runs of whitespace
            f.write(content)
    elif save_method == 'doc':
        check_path('./text_doc')
        doc = Document()
        doc.add_heading(title)      # title as the document heading
        doc.add_paragraph(date)     # publication date
        doc.add_paragraph(content)  # body text
        doc.save(f'./text_doc/{title}.docx')
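

# A broader filename sanitizer (a sketch: the original only strips '/' and
# '|'; this character class is an assumption that also covers Windows):
def sanitize_filename(name):
    import re
    return re.sub(r'[\\/:*?"<>|]', '', name)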


if __name__ == '__main__':
    items = []  # rows collected here feed the DataFrame below
    num_pages = 100
    # range() is half-open, so go to num_pages + 1 to fetch all num_pages pages
    for i in tqdm(range(1, num_pages + 1)):
        url = get_url(i)
        text = get_html_text(url)
        soup = BeautifulSoup(text, 'html.parser')
        li_list = soup.find('div', class_='list_conr').find_all('li')
        for li in li_list:
            title, child_url, date = get_attr(li)
            items.append([title, date, child_url])  # one row per news item
    # columns: 标题 = title, 发布时间 = publish date, 链接 = link
    df = pd.DataFrame(items, columns=['标题', '发布时间', '链接'])
    # encoding='utf_8_sig' writes a BOM so Excel opens the CSV without mojibake
    df.to_csv(f'国家外汇管理局外汇新闻信息{num_pages}页.csv', index=False, encoding='utf_8_sig')
    for i in range(len(df)):
        url = df.iloc[i]['链接']
        date = df.iloc[i]['发布时间']
        text = get_html_text(url)
        soup = BeautifulSoup(text, 'html.parser')
        title, content = bs_content(soup)
        save_method = 'txt'
        save_content(title, content, save_method, date)
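

# Crawl-etiquette note (an assumption, not in the original script): the loops
# above fire requests back-to-back. If the server throttles or drops
# connections, pausing briefly between fetches is a common remedy:
#
#     import time
#     time.sleep(0.5)  # wait half a second before the next request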