1 import requests
2 from urllib.request import urlretrieve,urljoin
3 import re,json,os,time
4 import glob,fitz
5
6 """爬取原创力文档内容,并转存为PDF格式"""
7
def get_params(url, headers, timeout=30):
    """Fetch a document page and extract the parameters of its preview.

    The values are scraped with regular expressions from the page text. The
    ``aid``, ``view_token`` and ``page`` patterns are anchored on the Chinese
    comment that trails each assignment in the page source.

    Args:
        url: Address of the document page.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A dict with the keys ``title``, ``aid``, ``view_token`` and ``page``.
        Every value is the raw string captured from the page text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If one of the expected fields is not present in the page.
    """
    res = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail on 4xx/5xx here instead of on a confusing regex miss further down.
    res.raise_for_status()
    html = res.text

    patterns = {
        "title": "<title>(.*)</title>",
        "aid": "aid: (.*), //解密后的id",
        "view_token": "view_token: (.*) //预览的token",
        "page": "actual_page: (.*), //真实页数",
    }
    params = {}
    for name, pattern in patterns.items():
        match = re.search(pattern, html)
        if match is None:
            raise ValueError(f"could not find {name!r} in the page at {url}")
        params[name] = match.group(1)
    return params
17
def get_imgs(headers, title, aid, view_token, page, img_path, timeout=30):
    """Download the preview page images of a document into ``img_path``.

    The preview endpoint is queried repeatedly; each request starts 6 pages
    after the previous one (presumably the endpoint returns up to 6 image
    URLs per call -- confirm against the API). Images are saved as
    ``1.png``, ``2.png``, ... in download order.

    Args:
        headers: HTTP headers sent with every preview request.
        title: Unused; kept so existing callers keep working.
        aid: Document id sent as the ``aid`` query parameter.
        view_token: Token sent as the ``view_token`` query parameter.
        page: Total number of pages (anything ``int()`` accepts).
        img_path: Existing directory that receives the image files.
        timeout: Seconds to wait for each preview request.

    Raises:
        requests.HTTPError: If a preview request returns an error status.
        ValueError: If a response does not contain a JSON object.
    """
    url = "https://openapi.book118.com/getPreview.html"
    data = {
        "project_id": 1,
        "aid": aid,
        "view_token": view_token,
        "page": page,
    }
    pages_per_request = 6
    # Ceiling division. Truncating and adding one (the previous approach)
    # issued an extra request past the last page whenever the page count was
    # a multiple of 6 (12, 18, ...). At least one request is always made.
    batches = max(1, -(-int(page) // pages_per_request))

    n = 1  # running file index across all batches
    for j in range(batches):
        data["page"] = j * pages_per_request + 1
        res = requests.get(url=url, headers=headers, params=data, timeout=timeout)
        res.raise_for_status()
        # Pause between requests to avoid hammering the server.
        time.sleep(2)

        # The JSON object is embedded in surrounding text, so cut it out first.
        found = re.search("{.*}", res.text)
        if found is None:
            raise ValueError(
                f"preview response for page {data['page']} contains no JSON object"
            )
        dic1 = json.loads(found.group(0))

        for i in dic1["data"].values():
            # The values are joined onto "https:" to form absolute URLs.
            img_url = urljoin("https:", i)
            # Windows-style separator, matching the path layout used by the
            # rest of this script.
            urlretrieve(img_url, img_path + rf"\{n}.png")
            n = n + 1
44
def img_pdf(img_path, pdf_name):
    """Merge the PNG files in ``img_path`` into a single PDF, in page order.

    Files named ``<number>.png`` are ordered by their numeric value, so
    ``2.png`` comes before ``10.png``; a plain string sort would interleave
    them. Any other ``.png`` file is appended afterwards in name order.

    Args:
        img_path: Directory that holds the page images; the PDF is written
            into the same directory.
        pdf_name: Base name of the PDF (without extension). Characters that
            are not allowed in Windows file names, including path separators,
            are replaced by ``_`` because the name comes from scraped text.
    """
    def page_key(path):
        # Numeric page files first (by number), everything else after (by name).
        found = re.search(r"(?:^|[\\/])(\d+)\.png$", path, re.IGNORECASE)
        if found:
            return (0, int(found.group(1)), path)
        return (1, 0, path)

    safe_name = re.sub(r'[\\/:*?"<>|]', "_", pdf_name)

    # Start from an empty document and append one page per image.
    doc = fitz.open()
    try:
        for img in sorted(glob.glob(img_path + r"\*.png"), key=page_key):
            imgdoc = fitz.open(img)
            try:
                # Newer PyMuPDF releases use snake_case names; fall back to
                # the legacy camelCase aliases on old installations.
                to_pdf = getattr(imgdoc, "convert_to_pdf", None) or imgdoc.convertToPDF
                pdfbytes = to_pdf()
            finally:
                imgdoc.close()

            # Re-open the converted bytes as a PDF and append it to the result.
            imgpdf = fitz.open("pdf", pdfbytes)
            try:
                append = getattr(doc, "insert_pdf", None) or doc.insertPDF
                append(imgpdf)
            finally:
                imgpdf.close()
        doc.save(img_path + rf"\{safe_name}.pdf")
    finally:
        doc.close()
56
def main(url, img_path):
    """Download a document preview and convert it into a PDF.

    Reads the preview parameters from the document page, downloads all page
    images into ``img_path`` and merges them into a PDF named after the page
    title.

    Args:
        url: Address of the document page.
        img_path: Directory for the images and the resulting PDF. It is
            created, together with any missing parent directories, if it
            does not exist yet.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    }
    params = get_params(url, headers)

    # makedirs also creates missing parents (a plain mkdir fails when the
    # parent directory is absent) and tolerates an existing directory.
    os.makedirs(img_path, exist_ok=True)

    get_imgs(
        headers,
        params["title"],
        params["aid"],
        params["view_token"],
        params["page"],
        img_path,
    )
    img_pdf(img_path, params["title"])
72
73
74
if __name__ == "__main__":
    # Every run works in its own directory, named after the current Unix time.
    run_stamp = int(time.time())
    target_url = "https://max.book118.com/html/2018/1027/5041323214001323.shtm"
    # Another document that can be used for a trial run:
    # "https://max.book118.com/html/2018/0706/6225232151001204.shtm"
    main(target_url, rf"statics\{run_stamp}")