爬虫练习
有个网站特别宝藏，整理收集了很多木原音濑的作品。因为原网站的字体太小了，手机看实在费眼，所以写了个爬虫导成了txt文件。
import  requests
import threading
from bs4 import BeautifulSoup
import re
import os
import time
import pdfkit
# Crawl an index page and save every linked book page as a UTF-8 text file
# in the current working directory.
#
# NOTE: the index URL is blank in the original post; fill it in before running.
r = requests.get("")  # home page of the original site
soup = BeautifulSoup(r.text, "html.parser")
contents = soup.find_all("p")

# Convert each linked page to a .txt file.
i = 0  # running number of index paragraphs that contain at least one link
err = []  # not used by the visible code; kept in case other code reads it
for c in contents:
    # Only paragraphs that contain a link describe a series.
    if not c.a:
        continue
    i += 1
    # '/' would be interpreted as a path separator inside the file name.
    series = c.strong.text.replace('/', ' ')
    j = 0  # running number of the book within the current series
    for b in c.find_all("a"):
        href = b.attrs['href']
        # Links whose last dot-separated part is "jpg" are images, not books.
        if href.split('.')[-1] == "jpg":
            continue
        j += 1
        # Sanitize the title the same way as the series name so that open()
        # does not fail on a title containing '/'.
        title = b.text.replace('/', ' ')
        filename = str(i) + "-" + series + '-' + str(j) + "-" + title + '.txt'
        res = requests.get(href)
        # Force GB18030 decoding; the original author notes the text came out
        # garbled without it.
        res.encoding = 'gb18030'
        # res.text is already decoded, so no from_encoding is passed here:
        # Beautiful Soup ignores that argument (with a warning) for str markup.
        page = BeautifulSoup(res.text, 'html.parser')
        for body in page.find_all('body'):
            s = body.get_text()
            print(s)
            # Line-buffered (buffering=1) UTF-8 output; the file is rewritten
            # for every <body> found, so the last one wins.
            with open(filename, 'w', 1, 'utf-8') as f:
                f.write(s)
 
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号