人民日报 自动下载脚本
功能:获取当天的报刊电子版(别的日期也行)。如果把此脚本设置为每日自动执行,效果就是报纸每天会自动呈现在你的电脑桌面上。
因为近期有某些原因需要下载这个报纸,于是写了这个脚本,正好熟悉一下python的pdf操作。当然期间也解决了一些小问题,例如正则表达式和中文乱码的问题。
代码放到github了,https://github.com/raddyfiy/The-Peoples-Daily-download
安装
安装所需模块:pip3 install PyPDF2
用法:
下载当天报纸: python3 peoples_daily_download.py
下载指定日期报纸:python3 peoples_daily_download.py -date 20221010
权限:目录下允许读写
最佳使用姿势:可以在windows建立一个计划任务,每天定时执行此脚本,报纸会自动出现在你的桌面。
注意:有时候官网会忘记上传某一页,默认重试五次,依然失败的页会丢弃,继续向后下载。
# -*- coding: UTF-8 -*-
import requests
import re
import PyPDF2
import os
import shutil
import datetime
import sys
import argparse
import warnings
warnings.filterwarnings("ignore")
fsock = open('./log', 'w')
sys.stderr = fsock
proxies = {
'http': '127.0.0.1:8080',
'https': '127.0.0.1:8080'
}
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"
}
def download(today,partpath,newspaperpatch):
today1=today
today2=today[0:4]+today[5:7]+today[8:10]
today3=today.replace("-","")
try:
os.mkdir(newspaperpatch)
except:
pass
try:
os.mkdir('./part')
except:
shutil.rmtree(partpath)
os.mkdir('./part')
filelist=os.listdir(newspaperpatch)
if "People's.Daily.{}.pdf".format(today2) in filelist:
print("该日期已经下载过了!")
print("You alreay download this newspaper!")
exit(0)
coverurl="http://paper.people.com.cn/rmrb/html/{}/nbs.D110000renmrb_01.htm".format(today1)
response=requests.get(coverurl,headers=headers)
pagenum=len(re.findall("nbs",response.text))#get page number
print(pagenum)
if pagenum!=0:#old date process
if response.status_code==403:
print("你选择的日期太久远,网站不提供。只有两年之内的。")
exit(0)
print("下载中……")
for page in range(1,pagenum+1):
for retry in range(5):
downtplurl="http://paper.people.com.cn/rmrb/images/{0}/{2}/rmrb{1}{2}.pdf"
formatpage="{0:0>2}".format(page)
downurl=downtplurl.format(today1,today2,formatpage)
filename='rmrb{}.pdf'.format(today2+formatpage)
response=requests.get(downurl,headers=headers)
file=response.content
# print(len(file))
if len(file)>1000:
break
else: #new rules after 2024.12.01
coverurl="http://paper.people.com.cn/rmrb/pc/layout/{0}/node_01.html".format(today3)
print(coverurl)
response=requests.get(coverurl,headers=headers)
pagenum=len(re.findall("pageLink",response.text))#get page number
for page in range(1,pagenum+1):
print("第{0}页下载中……".format(page))
for retry in range(5):
currentPageUrl="http://paper.people.com.cn/rmrb/pc/layout/{0}/node_{1:0>2}.html".format(today3,page)
print(currentPageUrl)
response=requests.get(currentPageUrl,headers=headers)
dumpUrl=re.findall(r'''attachement.*?\.pdf''',response.text)[0]
downloadUrl="http://paper.people.com.cn/rmrb/pc/"+dumpUrl
print(downloadUrl)
formatpage="{0:0>2}".format(page)
filename='rmrb{}.pdf'.format(today2+formatpage)
response=requests.get(url=downloadUrl,headers=headers,proxies=proxies)
file=response.content
print(len(file))
if len(file)>1000:
break
print(partpath+"/"+filename)
with open(partpath+"/"+filename,"wb") as fn:
fn.write(file)
def merge(partpath,newspaperpatch):
print("合并中……")
filelist=os.listdir(partpath)
filelist.sort()
try:
pdfFM=PyPDF2.PdfFileMerger(strict=False)
except:
pdfFM=PyPDF2.PdfMerger(strict=False)
for file in filelist:
fullpath=partpath+'/'+file
filesize=os.path.getsize(fullpath) #判断文件大小,有的页本身不支持下载,发现为空则合并
if filesize<10:
print("第{}页网站不支持下载,已跳过".format(file[-6:-4]))
continue
pdfFM.append(fullpath)
pdfFM.write(newspaperpatch+"/People's.Daily."+filelist[0][4:12]+".pdf") #保存新文件在newspaperpatch下
pdfFM.close()
def delete(partpath):
shutil.rmtree(partpath)
def menu():
pass
if __name__ == '__main__':
menu()
today=datetime.date.today().strftime("%Y-%m/%d")
argc=len(sys.argv)
if argc>1:
argv = sys.argv[1:]
parser = argparse.ArgumentParser(description='ArgUtils')
parser.add_argument('-date', type=str, default=today, help="data date")
parser.add_argument('--date', type=str, default=today, help="data date")
args = parser.parse_args()
today="{}-{}/{}".format(args.date[0:4],args.date[4:6],args.date[6:8])
print(today)
partpath="./part" #临时文件夹,存每一页的文件,每次运行会自动创建和删除
newspaperpatch='./newspaper' #报纸保存位置,没有就自动创建
# today="2024-12/05" #默认下载当天的,可以命令行-date传入,也可在此手动修改日期,去掉注释按格式设置日期
print("Date: "+today)
download(today,partpath,newspaperpatch) #分片下载
merge(partpath,newspaperpatch)#合并
delete(partpath)#删除临时文件夹partpath
print("下载成功! 文件在newspaper里。")


浙公网安备 33010602011771号