# 单线程从url读取txt采集越南小说 (single-threaded: read URLs from a txt file and scrape Vietnamese novels)

#-*-coding:utf-8-*-
import requests
import re
import os
import time
import random

#for i in range(3,28,1):
#url = "https://www.baidu.com/page/"+str(i)
#print(url)
#time.sleep(1)
def subdomain():
    """Crawl novel pages listed in ``xiaoshuourl.txt`` (one URL per line).

    For each URL: fetch the page, extract paragraph text with a regex,
    print it, and append it to ``xiaoshuo.txt``. A failure on one URL is
    reported and skipped so the remaining URLs are still processed.

    Reads:   xiaoshuourl.txt (UTF-8, one URL per line)
    Writes:  xiaoshuo.txt    (UTF-8, appended)
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
    # Compile patterns once, outside the per-URL loop.
    p1 = re.compile('<p> (.*?)</p>')  # first-pass rule: paragraph bodies
    # Second-pass rule kept from the original as an optional toggle (unused):
    # p2 = re.compile('inherit;">(.*?)</font></font></a>')

    with open('xiaoshuourl.txt', 'r', encoding='utf-8') as f:
        urls = f.read().splitlines()

    for url in urls:
        if not url.strip():
            continue  # skip blank lines instead of issuing a bad request
        try:
            res = requests.get(url, headers=headers, timeout=30)
            res.encoding = res.apparent_encoding  # guess real charset (many novel sites mislabel)
            paragraphs = p1.findall(res.text)
            # Join matches directly — the original stringified the list and
            # stripped "['", "']" with .replace(), which corrupts text
            # containing quotes and leaves "[]" for empty results.
            chapter = '\n'.join(paragraphs)
            print(chapter)
            with open('xiaoshuo.txt', 'a+', encoding='utf8') as out:
                # Trailing newline so consecutive URLs' text does not fuse.
                out.write(chapter + '\n')
            time.sleep(3)  # throttle: be polite to the target server
        except Exception as result:
            # Best-effort crawl: report the error and continue with the next URL.
            print("出错了", result)


# Script entry point: run the crawl over every URL in xiaoshuourl.txt,
# then announce completion.
if __name__ == '__main__':
    subdomain()
    print("恭喜你,爬取结束!")  # "Congratulations, crawling finished!"

 

# posted @ 2022-07-16 15:09 (blog footer residue from the original copy-paste)