网络爬虫

 

#! python3
# encoding = utf-8
# 功能:网络爬虫
import requests
import re
import webbrowser
# Regex alternation (non-capturing group) of keywords a thread title must
# contain for its link to be opened.
pram = "(?:爷|爺|爸|お父|伯|叔|おじ|女儿|侄女|孙|義父|息子の嫁|禁断|介护|老人|老头|山口葉瑠)"

# Base URL of the forum; thread hrefs in the listing pages are relative
# (e.g. 'thread-NNN-1-1.html') and get this prepended.
url = 'http://38.103.161.228/forum/'
count = 0   # links opened so far in the current run
page = 1    # listing page number to fetch next

# Forum section prefix: 'forum-230-' = originals (原创), 'forum-58-' = reposts (转贴).
where = 'forum-230-'
temp = input('请输入搜索区,1:原创,2:转贴,默认1,按回车继续:')
try:
    # Only switch sections on an explicit '2'; anything else keeps the default.
    if int(temp) == 2:
        where = 'forum-58-'
except ValueError:
    # Empty or non-numeric input: keep the default section.
    # (Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit.)
    pass

print(where)
def reg(text):
    """Scan one page of listing HTML for thread anchors whose title matches
    `pram`, open each matching thread URL in the default web browser, and
    keep loading further pages until more than 10 links have been opened.

    Side effects: opens browser tabs, mutates the module-level `count`,
    and (indirectly, via load()) advances `page`.
    """
    global count, page
    # Anchor tags whose href starts with 'thread' and whose text contains
    # one of the keywords in `pram`.
    p = '<a href="thread.+?' + pram + '.+?</a>'
    for item in re.findall(p, text):
        # Make the relative href absolute by prepending the forum base URL.
        item = re.sub(r'href="', 'href="' + url, item)
        # Extract the URL non-greedily and anchored to the href attribute.
        # The original greedy 'http:.+html' could run past the closing quote
        # and swallow title text that happens to contain "html"; it also
        # called .group() on a possible None.
        m = re.search(r'href="(http:.+?\.html)"', item)
        if m is None:
            continue  # malformed anchor — skip instead of crashing
        webbrowser.open(m.group(1))
        count += 1
    if count > 10:
        print("over,当前页:", page)
        count = 0
        return
    else:
        # NOTE(review): reg() and load() call each other recursively, so a
        # long crawl with few matches can hit Python's recursion limit
        # (~1000 pages). Consider rewriting as a loop in load().
        load()


#  Download one listing page and pass its HTML to reg(); advances `page`.
def load():
    global page
    # Build the listing-page URL, e.g. <base>/forum-230-3.html
    ur = url + where + str(page) + '.html'
    # `timeout` added: without it a stalled server would hang the crawl
    # forever. The `with` block closes the response automatically.
    with requests.get(ur, timeout=30) as response:
        data = response.text
        page += 1
        reg(data)


# Interactive driver: repeatedly prompt for a starting page, then crawl.
while True:
    info = input('请输入开始页,或直接按回车继续:')
    try:
        page = int(info)
    except ValueError:
        # Blank or non-numeric input: resume from the current `page`.
        # (Narrowed from a bare `except:`.)
        pass
    load()

 

posted @ 2020-10-28 07:56  老谭爱blog  阅读(858)  评论(0)    收藏  举报