# -*- coding: utf-8 -*-
import re
import time

import requests
def subdomain():
    with open('xiaoshuourl.txt', 'r', encoding='utf-8') as f:
        for text in f.read().splitlines():
            try:
                url = text.strip()
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
                res = requests.get(url, headers=headers, timeout=10)  # timeout so a dead host can't hang the crawl
                res.encoding = res.apparent_encoding  # let requests guess the real page encoding
                body = res.text
                time.sleep(3)  # throttle between requests
                p1 = re.compile(r'<p> (.*?)</p>')  # first-pass pattern
                p2 = re.compile(r'inherit;">(.*?)</font></font></a>')  # second-pass pattern (optional)
                link1 = p1.findall(body)  # match the first-pass pattern against the page body
                # To switch to two-pass matching, filter the first-pass fragments
                # through p2 here (see the second_pass sketch below).
                newurl = '\n'.join(link1)  # one match per line
                print(newurl)
                with open('xiaoshuo.txt', 'a+', encoding='utf8') as out:
                    out.write(newurl + '\n')  # trailing newline so consecutive pages don't run together
            except Exception as e:
                print("Error:", e)
                continue
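
# A minimal sketch of the two-pass matching described in the comments inside
# subdomain(), assuming the same inherit;">...</font></font></a> markup the
# original pattern targeted; second_pass is a hypothetical helper and is not
# called anywhere by default.
def second_pass(fragments, pattern=re.compile(r'inherit;">(.*?)</font></font></a>')):
    # Flatten every second-level match found inside the first-pass fragments.
    return [m for frag in fragments for m in pattern.findall(frag)]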
if __name__ == '__main__':
    subdomain()
    print("Congratulations, the crawl is finished!")