'''
诗词名句网
1. 爬取固定书籍
2. 爬取书名
3. 爬取本部书的章回目录
4. 灵活处理,爬取任意书籍的章回目录
5. 加入异常处理
6. 爬取任意整本书
'''
import requests
import re
def bookSpider(oldurl, bookName):
    """Download a book's index page and extract title, chapter list and text.

    The index page at ``oldurl + ".html"`` is cached in ``demo.txt``.  That
    file is then handed to ``findTitle`` and ``findTileOfPages``, and the
    chapter count returned by the latter drives ``getWholeBook``.

    Args:
        oldurl: Base URL of the book, without the ``.html`` suffix.
        bookName: Book identifier as it appears in the site's chapter links.
    """
    html = loadPage(oldurl + ".html")
    if html is None:
        # loadPage prints its own error and yields None on failure; parsing
        # a stale or empty demo.txt would only produce misleading output.
        return

    try:
        with open("demo.txt", 'w', encoding='utf-8') as f:
            f.write(html)
    except OSError:
        print("FILE OPERATION ERROR")
        return

    findTitle("demo.txt", bookName)
    cnt = findTileOfPages("demo.txt", bookName)
    getWholeBook(oldurl, bookName, cnt)
def findTitle(filename, bookName):
    """Find the book title in a saved HTML page and write it to ``book.txt``.

    Every line of ``filename`` is searched for ``<title>《...》``.  Each title
    found is printed after the ``书名:`` label and written, followed by a
    newline, to ``book.txt``.  ``book.txt`` is opened in write mode, so any
    previous content is discarded.

    Args:
        filename: Path of the UTF-8 encoded HTML file to scan.
        bookName: Unused; kept so existing callers keep working.
    """
    # Capture the bracketed title directly instead of walking the characters
    # of the Match object's repr.
    title_pattern = re.compile(r'<title>(《[^》]{0,10}》)')
    try:
        with open(filename, encoding='utf-8') as page, \
                open("book.txt", 'w', encoding='utf-8') as book:
            for line in page:
                found = title_pattern.search(line)
                if found:
                    title = found.group(1)
                    print("书名:" + title)
                    book.write(title + '\n')
    except OSError:
        print("FILE OPERATION ERROR")
def findTileOfPages(filename, bookName):
    """Extract the chapter list from a saved index page into ``book.txt``.

    Appends a ``目录:`` header to ``book.txt``, then scans ``filename`` for
    ``<li><a href="/book/<bookName>/<n>.html">...</a></li>`` entries whose
    link text is 10 to 40 characters long.  For each entry, the text from
    ``第`` up to the next ``<`` is printed and appended on its own line.  An
    entry without ``第`` still counts and yields an empty line.

    Args:
        filename: Path of the UTF-8 encoded HTML file to scan.
        bookName: Book identifier expected inside the chapter links.

    Returns:
        The number of chapter links found, or 0 if a file could not be
        opened.
    """
    # bookName is matched literally, and both fragments are raw strings so
    # that ``\d`` reaches the regex engine instead of being an invalid string
    # escape.
    link_pattern = re.compile(
        r'<li><a href="/book/' + re.escape(bookName)
        + r'/\d+\.html">.{10,40}</a></li>'
    )
    heading_pattern = re.compile(r'第[^<]*')

    cnt = 0
    try:
        with open(filename, encoding='utf-8') as page, \
                open("book.txt", 'a', encoding='utf-8') as book:
            book.write("目录:\n")
            for line in page:
                for link in link_pattern.findall(line):
                    cnt += 1
                    heading = ''.join(heading_pattern.findall(link))
                    print(heading)
                    book.write(heading + '\n')
    except OSError:
        print("FILE OPERATION ERROR")
    return cnt
def getWholeBook(url, bookName, cnt):
    """Download chapters 1..cnt of a book and append their text to book.txt.

    Each chapter page ``<url>/<n>.html`` is fetched with ``loadPage``, cached
    in ``bookHtml.txt`` and scanned line by line.  For every line, the text
    of each ``<h1>...</h1>`` match is appended first, then the text of each
    ``<p> ...</p>`` match, one output line per match.

    Args:
        url: Base URL of the book, without a trailing slash.
        bookName: Unused; kept so existing callers keep working.
        cnt: Number of chapters to download.
    """
    heading_pattern = re.compile(r'<h1>.+</h1>')
    paragraph_pattern = re.compile(r'<p> .+</p>')

    print("正在下载全本书,请稍后...")
    for chapter in range(1, cnt + 1):
        newUrl = url + '/' + str(chapter) + ".html"
        print(newUrl)
        html = loadPage(newUrl)
        if html is None:
            # loadPage prints its own error and yields None on failure;
            # there is nothing to parse for this chapter.
            continue
        try:
            with open("bookHtml.txt", 'w', encoding='utf-8') as f:
                f.write(html)
            with open('bookHtml.txt', 'r', encoding='utf-8') as page, \
                    open('book.txt', 'a', encoding='utf-8') as bookContent:
                for line in page:
                    for heading in heading_pattern.findall(line):
                        bookContent.write(_heading_text(heading) + '\n')
                    for paragraph in paragraph_pattern.findall(line):
                        bookContent.write(_paragraph_text(paragraph) + '\n')
        except OSError:
            print("FILE OPERATION ERROR")


def _heading_text(fragment):
    """Return the characters of ``fragment`` that lie outside ``<...>`` tags."""
    # Text is kept from just after each '>' up to the next '<' or '>'.
    return ''.join(re.findall(r'>([^<>]*)', fragment))


def _paragraph_text(fragment):
    """Return the paragraph text that follows its leading indent marker.

    Copying starts after a ``;`` that is preceded by ``p`` and not followed
    by ``&`` (presumably the last ``&nbsp;`` of the indent; confirm against
    the site's markup) and stops at the next ``<``.
    """
    kept = []
    copying = False
    for pos, ch in enumerate(fragment):
        if ch == '<':
            copying = False
        elif (ch == ';' and fragment[pos - 1] == 'p'
                and fragment[pos + 1:pos + 2] != '&'):
            # Slicing avoids an IndexError when ';' is the final character.
            copying = True
        elif copying:
            kept.append(ch)
    return ''.join(kept)
def loadPage(url, timeout=10):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or ``None`` when the request fails, the server
        answers with an error status, or the body is not valid UTF-8.  In
        those cases "PAGE LOAD ERROR" is printed.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    try:
        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, headers=header, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        return response.content.decode('utf-8')
    except (requests.RequestException, UnicodeDecodeError):
        print("PAGE LOAD ERROR")
        return None
if __name__ == "__main__":
    # The name is spliced directly into the URL and the chapter-link regex,
    # so stray surrounding whitespace would break both.
    bookName = input("请输入想看的书名:(全拼)").strip()
    url = "http://www.shicimingju.com/book/" + bookName
    bookSpider(url, bookName)