#! python3
# encoding = utf-8
# 功能:网络爬虫
import requests
import re
import webbrowser
# Non-capturing alternation of keywords that a thread title must contain to match.
pram = "(?:爷|爺|爸|お父|伯|叔|おじ|女儿|侄女|孙|義父|息子の嫁|禁断|介护|老人|老头|山口葉瑠)"
# pram ='(?:童颜|萝莉|高中生|18岁|美少女|女学生|漂亮)' # non-capturing group (alternate keyword set)
# pram ='(?:山口葉瑠)' # non-capturing group (alternate keyword set)
url = 'http://38.103.161.228/forum/'  # forum base URL; relative hrefs are resolved against it
# Example of the anchor markup this script scrapes:
# <a href="thread-10834064-1-3.html">VEC-448 搭不上末班車二人獨處…與平常嚴厲喝醉酒卻可愛的女上司幹到早上 君島美緒 [中文字幕]</a>
count = 0  # matching threads opened so far; reset to 0 once the limit is reached
page = 1   # current forum page number appended to the URL
where = 'forum-230-'  # section slug; prompt choice 1 keeps this, choice 2 switches to 'forum-58-'
# Ask which forum section to search: 1 = originals ('forum-230-', default),
# 2 = reposts ('forum-58-'). Any non-numeric input (e.g. bare Enter) keeps the default.
temp = input('请输入搜索区,1:原创,2:转贴,默认1,按回车继续:')
try:
    temp = int(temp)
    if temp == 2:
        where = 'forum-58-'
except ValueError:
    # Non-numeric input: silently keep the default section.
    # (Narrowed from a bare `except:` that also swallowed KeyboardInterrupt/SystemExit.)
    pass
print(where)
def reg(text):
    """Scan one page of forum HTML for thread links whose title matches `pram`.

    Each matching thread URL is opened in the default web browser. After more
    than 10 links have been opened in total, `count` is reset and the crawl
    stops; otherwise the for-else falls through to load() to fetch the next
    page (mutual recursion between reg() and load()).
    """
    global count, page
    # Anchor tag whose visible title contains one of the keywords.
    p = '<a href="thread.+?' + pram + '.+?</a>'
    matchlist = re.findall(p, text)
    for item in matchlist:
        # Make the relative href absolute by prefixing the forum base URL.
        item = re.sub(r'href="', 'href="' + url, item)
        m = re.search('http:.+html', item)
        if m is None:
            # Defensive: the original called .group() unconditionally and
            # crashed with AttributeError on a non-matching anchor.
            continue
        webbrowser.open(m.group())
        count += 1
        if count > 10:
            print("over,当前页:", page)
            count = 0
            return
    else:
        # Loop finished without hitting the limit: fetch the next page.
        load()
# Download one forum page
def load():
    """Fetch forum page number `page`, advance the counter, and pass the HTML to reg()."""
    global page
    # GET request; all parameters are encoded in the URL path itself.
    ur = url + where + str(page) + '.html'
    # A timeout is mandatory: requests.get() without one can block forever
    # on a stalled server.
    with requests.get(ur, timeout=30) as response:
        data = response.text
        # The with-statement closes the response automatically.
    page += 1
    reg(data)
# Entry loop: optionally jump to a starting page number, then crawl from
# there. Runs until interrupted (Ctrl+C) or reg() stops after enough hits,
# after which it prompts again.
while True:
    info = input('请输入开始页,或直接按回车继续:')
    try:
        page = int(info)
    except ValueError:
        # Bare Enter / non-numeric input: keep the current page number.
        # (Narrowed from a bare `except:` that also swallowed KeyboardInterrupt/SystemExit.)
        pass
    load()