1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Time : 2018/1/11 22:07
4 # @Author : lingxiangxiang
5 # @File : demon1.py
'''Crawl the Aming Linux tutorial and save each page as a local PDF file.'''
# The three pdfkit entry points worth knowing:
8 # pdfkit.from_string("hello world", "1.pdf")
9 # pdfkit.from_url("www.baidu.com", "2.pdf")
10 # pdfkit.from_file("hello.html", "3.pdf")
11 import re
12
13 import os
14
15 import pdfkit
16 import requests
17
18
# Scrape the tutorial's table of contents and render every top-level chapter
# to a PDF. All PDFs are written into ./aminglinux, which is created on the
# first run and becomes the working directory.
if not os.path.isdir("aminglinux"):
    os.mkdir("aminglinux")
os.chdir("aminglinux")

url = "http://www.apelearn.com/study_v2/"
s = requests.session()
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from scraping an HTTP error page as if it
# were the index.
response = s.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Top-level table-of-contents entries. The href capture stops at the closing
# quote and the link text is matched non-greedily, so several <li> items that
# share one line are matched individually instead of being merged.
reg = re.compile(
    r'<li class="toctree-l1"><a class="reference internal" href="([^"]+)">.*?</a></li>'
)
result = reg.findall(text)

# Strip any "#fragment" so a page linked under several anchors is rendered
# only once, then de-duplicate and sort for a deterministic processing order.
res = sorted(set(href.split("#", 1)[0] for href in result))
res = [href for href in res if href]

pdfUrl = "http://www.apelearn.com/study_v2/"
for i in res:
    page_url = "{0}{1}".format(pdfUrl, i)
    # Swap only the file extension; a plain str.replace("html", "pdf") would
    # also rewrite "html" appearing elsewhere in the page name.
    pdfFileName = os.path.splitext(i)[0] + ".pdf"
    print(pdfFileName)
    try:
        pdfkit.from_url(page_url, pdfFileName)
    except Exception as err:
        # Best effort: one page failing to render must not abort the rest,
        # but the failure is reported instead of being silently dropped.
        # Catching Exception (not a bare except) lets Ctrl-C still stop the run.
        print("failed to convert {0}: {1}".format(page_url, err))