Python Web Scraping Notes (1): Scraping dytt download links with requests

When a site's SSL certificate cannot be verified, you can pass verify=False to skip certificate verification. dytt serves its pages in the gb2312 character encoding, so the response must be decoded as gb2312 as well.

import requests
domain = "https://dy.dytt8.net/index.htm"
resp = requests.get(domain, verify=False)
# verify=False skips SSL certificate verification
resp.encoding = 'gb2312'
print(resp.text)
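One side effect of verify=False is that requests emits an InsecureRequestWarning for every call. If those warnings clutter the output, they can be silenced with urllib3; a minimal sketch (urllib3 is installed together with requests):

import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)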

Use re to extract the relevant URLs from the page and collect the child-page links in a list.

import re

# Match the <ul> block that contains the "最新电影更新" list
obj = re.compile(r"最新电影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)
result = obj.finditer(resp.text)
child_href_list = []
for i in result:
    ul = i.group('ul')

    # Extract the child page links and build absolute URLs
    result1 = obj1.finditer(ul)
    for i1 in result1:
        child_href = "https://dy.dytt8.net/" + i1.group('link').strip("/")
        child_href_list.append(child_href)
print(child_href_list)

Fetch the content of each child page.

From each child page, extract the movie title and download address with another regular expression:

obj2 = re.compile(r'◎片  名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">',re.S)
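As a quick check of what obj2 captures, here is a minimal sketch that applies it to the first child page collected above (it assumes child_href_list is not empty):

# Fetch one child page and run the regex against it
child_resp = requests.get(child_href_list[0], verify=False)
child_resp.encoding = 'gb2312'
m = obj2.search(child_resp.text)
if m:
    print(m.group('movie').strip())   # movie title
    print(m.group('magnet'))          # download address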

Save the scraped records to a CSV file:

# Open the CSV file in append mode; newline="" prevents blank lines on Windows
f = open("dytt.csv", mode="a+", newline="", encoding='utf8')
csv_writer = csv.writer(f)
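csv_writer.writerow accepts any iterable, so each record becomes one row. A small illustration with made-up values (and remember to close the file so the rows are flushed to disk):

# Hypothetical record, just to show the row format
csv_writer.writerow(["Example Movie", "ftp://example.com/example-movie.mkv"])
f.close()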

Complete code

import requests
import re
import csv

header = {
    "User-Agent": "XXXXXXX"  # replace with a real browser User-Agent string
}
domain = "https://dy.dytt8.net/index.htm"
resp = requests.get(domain, verify=False, headers=header)
# verify=False skips SSL certificate verification
resp.encoding = 'gb2312'

# Match the <ul> block that contains the "最新电影更新" list
obj = re.compile(r"最新电影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)
obj2 = re.compile(r'◎片  名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">', re.S)

result = obj.finditer(resp.text)
# Open the CSV file in append mode; newline="" prevents blank lines on Windows
f = open("dytt.csv", mode="a+", newline="", encoding='utf8')
csv_writer = csv.writer(f)

child_href_list = []
for i in result:
    ul = i.group('ul')
    # Extract the child page links and build absolute URLs
    result1 = obj1.finditer(ul)
    for i1 in result1:
        child_href = "https://dy.dytt8.net/" + i1.group('link').strip("/")
        child_href_list.append(child_href)

for href in child_href_list:
    child_resp = requests.get(href, verify=False, headers=header)
    child_resp.encoding = 'gb2312'
    movies = obj2.finditer(child_resp.text)
    for i in movies:
        dic = i.groupdict()
        # strip() removes surrounding whitespace, including the full-width space \u3000
        dic['movie'] = dic['movie'].strip()
        print(dic)
        # Write the dict values (movie, magnet) to the CSV file
        csv_writer.writerow(dic.values())

f.close()

 

posted @ 2021-06-23 14:12  YuyuFishSmile