# 单线程从url读取txt采集越南小说 (single-threaded: read URLs from a txt file and scrape Vietnamese novel text)
# -*- coding: utf-8 -*-
"""Single-threaded novel scraper.

Reads one URL per line from ``xiaoshuourl.txt``, fetches each page,
extracts paragraph text with a regex, and appends the result to
``xiaoshuo.txt``. Errors on individual URLs are logged and skipped.
"""
import requests
import re
import os
import time
import random

# Hoisted out of the per-URL loop: compile the pattern and build the
# headers dict once instead of on every iteration.
# First-pass extraction rule: text inside "<p> ...</p>".
_P_PARAGRAPH = re.compile('<p> (.*?)</p>')

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/71.0.3578.98 Safari/537.36'
}


def _extract_paragraphs(html):
    """Return all first-pass regex matches in *html*, joined by newlines.

    Replaces the original ``str(list)`` + chained ``.replace()`` trick,
    which wrote a literal "[]" for pages with no matches and mangled
    matches containing quote characters.
    """
    return '\n'.join(_P_PARAGRAPH.findall(html))


def subdomain():
    """Fetch every URL listed in xiaoshuourl.txt and append its extracted
    paragraph text to xiaoshuo.txt.

    Side effects: reads xiaoshuourl.txt, appends to xiaoshuo.txt, prints
    each page's extracted text, sleeps 3s per request to throttle.
    """
    # Open the output file once for the whole run instead of reopening
    # it (append mode) on every URL.
    with open('xiaoshuourl.txt', 'r', encoding='utf-8') as src, \
         open('xiaoshuo.txt', 'a+', encoding='utf8') as dst:
        for url in src.read().splitlines():
            try:
                res = requests.get(url, headers=_HEADERS)
                # Guess the real encoding from the body; many of these
                # pages mis-declare their charset.
                res.encoding = res.apparent_encoding
                time.sleep(3)  # throttle: be polite to the server
                text = _extract_paragraphs(res.text)
                print(text)
                dst.write(text)
            except Exception as result:
                # Best-effort crawl: log the failure and move on to the
                # next URL rather than aborting the whole run.
                print("出错了", result)
                continue


if __name__ == '__main__':
    subdomain()
    print("恭喜你,爬取结束!")