re模块之电影地址下载
这次爬取的是电影天堂(dytt89.com)首页“2021必看热片”栏目中每部电影的下载地址

查看网页源代码,我们能搜索到“2021必看热片”

编码方式是:gb2312(页面声明的 charset),所以拿到响应后要把 encoding 指定为 gb2312

首先,我们要获取“2021必看热片”下面 <ul> 列表里每部电影的子页面地址;
拿到这些地址之后,再逐个请求子页面,在每个子页面里提取片名和下载地址

code:
# 通过域名请求2021必看片
# 获取子页面地址
# 请求子页面地址链接,获取 下载链接
import re
import csv
import requests
# Site root. Relative detail-page links found on the home page are joined to it.
domain = "https://dytt89.com/"
headers = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0.1; Moto G (4)) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Mobile Safari/537.36 Edg/95.0.1020.40'
}

# Fetch the home page. verify=False disables TLS certificate verification
# (kept from the original request). The `with` block releases the connection
# as soon as the body has been decoded.
with requests.get(domain, headers=headers, verify=False) as req:
    req.encoding = 'gb2312'  # decode the body as gb2312 instead of the guessed encoding
    home_html = req.text

# Regex extraction.
# obj1: the <ul> that follows the "2021必看热片" heading.
obj1 = re.compile(r"2021必看热片.*?<ul>(?P<ul>.*?)</ul>", re.S)
# obj2: every single-quoted href inside that <ul>.
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
# obj3: the movie name and the first download link on a detail page.
obj3 = re.compile(r'◎片 名 .*?(?P<name>.*?)<br />.*?'
                  r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<download>.*?)"', re.S)

# Collect the detail-page URLs.
child_href_list = []
for section in obj1.finditer(home_html):
    ul = section.group('ul')
    for link in obj2.finditer(ul):
        # Join root and path with exactly one slash. The original stripped '/'
        # from the already concatenated URL, which left "//" after the host.
        child_href = domain.rstrip('/') + '/' + link.group('href').lstrip('/')
        child_href_list.append(child_href)

# Visit every detail page and append (name, download link) to the CSV file.
for href in child_href_list:
    # Same TLS setting as the home-page request, and close each response.
    with requests.get(href, headers=headers, verify=False) as child_req:
        child_req.encoding = 'gb2312'
        child_html = child_req.text

    req3 = obj3.search(child_html)
    if req3 is None:
        # The page layout did not match obj3; skip it instead of crashing
        # on None.groupdict().
        print(f'no name/download link found, skipped: {href}')
        continue

    dic = req3.groupdict()
    with open('电影天堂.csv', 'a', encoding='utf-8', newline='') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(dic.values())

浙公网安备 33010602011771号