网络爬虫

 

#! python3
# encoding = utf-8
# 功能:网络爬虫
import requests
import re
import webbrowser
# Regex alternation (non-capturing group) of keywords a thread title must
# contain for its link to be opened.
pram = "(?:爷|爺|爸|お父|伯|叔|おじ|女儿|侄女|孙|義父|息子の嫁|禁断|介护|老人|老头|山口葉瑠)"

# Base URL of the forum; thread hrefs in the listing pages are relative
# (e.g. 'thread-NNN-1-1.html') and get this prepended.
url = 'http://38.103.161.228/forum/'
count = 0   # links opened so far in the current run
page = 1    # listing page number to fetch next

# Forum section prefix: 'forum-230-' = originals (原创), 'forum-58-' = reposts (转贴).
where = 'forum-230-'
temp = input('请输入搜索区,1:原创,2:转贴,默认1,按回车继续:')
try:
    # Only switch sections on an explicit '2'; anything else keeps the default.
    if int(temp) == 2:
        where = 'forum-58-'
except ValueError:
    # Empty or non-numeric input: keep the default section.
    # (Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit.)
    pass

print(where)
def reg(text):
    """Scan one page of listing HTML for thread anchors whose title matches
    `pram`, open each matching thread URL in the default web browser, and
    keep loading further pages until more than 10 links have been opened.

    Side effects: opens browser tabs, mutates the module-level `count`,
    and (indirectly, via load()) advances `page`.
    """
    global count, page
    # Anchor tags whose href starts with 'thread' and whose text contains
    # one of the keywords in `pram`.
    p = '<a href="thread.+?' + pram + '.+?</a>'
    for item in re.findall(p, text):
        # Make the relative href absolute by prepending the forum base URL.
        item = re.sub(r'href="', 'href="' + url, item)
        # Extract the URL non-greedily and anchored to the href attribute.
        # The original greedy 'http:.+html' could run past the closing quote
        # and swallow title text that happens to contain "html"; it also
        # called .group() on a possible None.
        m = re.search(r'href="(http:.+?\.html)"', item)
        if m is None:
            continue  # malformed anchor — skip instead of crashing
        webbrowser.open(m.group(1))
        count += 1
    if count > 10:
        print("over,当前页:", page)
        count = 0
        return
    else:
        # NOTE(review): reg() and load() call each other recursively, so a
        # long crawl with few matches can hit Python's recursion limit
        # (~1000 pages). Consider rewriting as a loop in load().
        load()


#  Download one listing page and pass its HTML to reg(); advances `page`.
def load():
    global page
    # Build the listing-page URL, e.g. <base>/forum-230-3.html
    ur = url + where + str(page) + '.html'
    # `timeout` added: without it a stalled server would hang the crawl
    # forever. The `with` block closes the response automatically.
    with requests.get(ur, timeout=30) as response:
        data = response.text
        page += 1
        reg(data)


# Interactive driver: repeatedly prompt for a starting page, then crawl.
while True:
    info = input('请输入开始页,或直接按回车继续:')
    try:
        page = int(info)
    except ValueError:
        # Blank or non-numeric input: resume from the current `page`.
        # (Narrowed from a bare `except:`.)
        pass
    load()

 

posted @ 2020-10-28 07:56  老谭爱blog  阅读(858)  评论(0)    收藏  举报