import re
from urllib.parse import urljoin

import requests

# Scrape the dytt89.com front page: locate the "2022新片精品" section, collect
# the links inside its <ul>, then request every linked child page.

# Ad-content API endpoint; not referenced anywhere in the visible part of
# this script.
url = 'https://ad-api.uogroup.com/bin/v1/getAdContent'
# Site root; also the base against which child-page links are resolved.
doumain = 'https://www.dytt89.com/'
# Browser-like headers, sent with every request below.
agent = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'
}
# Seconds to wait for the server before giving up; without a timeout
# requests can block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# verify=False skips TLS certificate verification for this site.
resp = requests.get(doumain, headers=agent, verify=False,
                    timeout=REQUEST_TIMEOUT)
# The page source declares charset=gb2312, so decode the body with that
# charset instead of whatever requests would guess.
resp.encoding = 'gb2312'

# Lazily match from the section title to the first following <ul>...</ul>;
# the named group "ul" captures the list's inner HTML.
obj = re.compile(r'2022新片精品.*?<ul>(?P<ul>.*?)</ul>', re.S)
# Capture the target of every single-quoted href inside that list.
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)

result = obj.finditer(resp.text)
# Absolute URLs of the child pages found in the section.
child_href_list = []

for it in result:
    ul = it.group('ul')
    result2 = obj2.finditer(ul)
    for itt in result2:
        # urljoin resolves relative paths against the site root and leaves
        # already-absolute links untouched (plain concatenation would
        # mangle them).
        child_href = urljoin(doumain, itt.group('href'))
        child_href_list.append(child_href)

# Request every child page.
for href in child_href_list:
    child_resp = requests.get(href, headers=agent, verify=False,
                              timeout=REQUEST_TIMEOUT)
    # NOTE(review): assumes child pages use the same gb2312 charset as the
    # index page -- confirm against a child page's <meta charset>.
    child_resp.encoding = 'gb2312'
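# Blog page footer (not part of the script):
# posted on 2022-04-07 23:24 by connie_tong | reads (159) | comments (0) | bookmark / report
# refresh page / back to top
# navigation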