1 #!/usr/bin/python
2 # -*- coding: UTF-8 -*-
3 # python 36
4 __author_ = ''
5 import requests
6 import re
7 import os
8
# Site root that the fetch helpers prepend to site-relative paths.
# NOTE: the name is a misspelling of "domain"; it is kept as-is because the
# helper functions in this file reference it by this exact name.
domian = 'http://www.sjtxt.la'
10
def get_novel_sort_list(timeout=30):
    """Fetch the category listing page and return the novels linked on it.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of ``(novel_url, novel_name)`` tuples in page order, where
        ``novel_url`` is the ``href`` captured from each cover link and
        ``novel_name`` is the text following the cover ``<img>`` tag. The
        list is empty when the page contains no matching links.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(
        '{}/soft/7/Soft_007_1.html'.format(domian), timeout=timeout)
    # Fail loudly on an error page instead of silently returning [].
    response.raise_for_status()
    # The other fetch helpers in this file force UTF-8; do the same here so
    # novel names (later used as directory names) are decoded consistently.
    response.encoding = 'UTF-8'
    reg = r'<a href="([^=]*?)"><img src=".*?">(.*?)</a>'
    return re.findall(reg, response.text)
19
def get_novel_content(url, timeout=30):
    """Return the link behind the "downButton" anchor on a novel's page.

    Args:
        url: Site-relative path of the novel page; it is appended to the
            module-level ``domian`` root.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` value of the first ``<a class="downButton" ...>``
        anchor found on the page.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        IndexError: If the page contains no matching anchor. The exception
            type matches what indexing an empty result list used to raise,
            but now carries a descriptive message.
    """
    response = requests.get('{}{}'.format(domian, url), timeout=timeout)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    reg = r'''<a class="downButton" href='(.*?)' title'''
    # Only the first occurrence is needed, so stop scanning at the first hit.
    match = re.search(reg, response.text)
    if match is None:
        raise IndexError(
            'no downButton link found on {}'.format(response.url))
    return match.group(1)
28
def get_chapter_list(url, timeout=30):
    """Fetch a chapter index page and return the chapters listed on it.

    Args:
        url: Site-relative path of the chapter index page; it is appended
            to the module-level ``domian`` root.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(chapter_url, chapter_name)`` tuples in page order,
        taken from every ``<li><a href="....html">name</a></li>`` entry.
        The list is empty when no entry matches.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get('{}{}'.format(domian, url), timeout=timeout)
    # Fail loudly on an error page instead of silently returning [].
    response.raise_for_status()
    response.encoding = 'UTF-8'
    reg = r'<li><a href="(.*?\.html)">(.*?)</a></li>'
    return re.findall(reg, response.text)
37
def get_chapter_content(url, timeout=30):
    """Fetch a chapter page and return the raw markup of its text body.

    Args:
        url: Site-relative path of the chapter page; it is appended to the
            module-level ``domian`` root.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Everything between ``id="content1">`` and the following
        ``<script type="text/javascript">read_bot`` marker, unmodified
        (any HTML tags or entities inside are left as they are).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        IndexError: If the content markers are not found on the page. The
            exception type matches what indexing an empty result list used
            to raise, but now carries a descriptive message.
    """
    response = requests.get('{}{}'.format(domian, url), timeout=timeout)
    response.raise_for_status()
    response.encoding = 'UTF-8'
    reg = r'id="content1">(.*?)<script type="text/javascript">read_bot'
    # re.S lets '.' span newlines, since the chapter body is multi-line.
    match = re.search(reg, response.text, re.S)
    if match is None:
        raise IndexError(
            'chapter content not found on {}'.format(response.url))
    return match.group(1)
46
# Download every novel on the listing page into novel/<novel name>/, one
# UTF-8 text file per chapter. Chapters whose file already exists are
# skipped, so an interrupted run can be resumed.
for novel_url, novel_name in get_novel_sort_list():
    path = os.path.join('novel', novel_name)
    # Check whether this novel's directory already exists.
    if not os.path.exists(path):
        # makedirs also creates the parent 'novel' directory on a first run;
        # plain mkdir would raise FileNotFoundError in that case.
        os.makedirs(path)
        print('创建目录成功---{}'.format(novel_name))
    else:
        print('{}---当前目录已经存在,跳过'.format(novel_name))

    chapter_url_content = get_novel_content(novel_url)
    for chapter_url, chapter_name in get_chapter_list(chapter_url_content):
        tmp_path = os.path.join(path, chapter_name + '.txt')
        # Test for the file before fetching, so chapters that are already
        # saved do not cost a network request.
        if os.path.exists(tmp_path):
            print('{}---章节存在,已跳过'.format(tmp_path))
            continue

        chapter_content = get_chapter_content(chapter_url_content + chapter_url)
        # Explicit encoding: the locale default may be unable to encode the
        # chapter text on some platforms.
        with open(tmp_path, 'w', encoding='utf-8') as fn:
            fn.write(chapter_content)
        print('{}---保存成功'.format(chapter_name))