python爬虫学习(一)requests爬取dytt下载地址
当网站的 HTTPS 证书无法通过校验时,可以在请求中传入 verify=False 跳过证书验证(仅建议在学习、测试场景下使用)。另外,dytt 的页面使用 gb2312 字符编码,所以需要把 resp.encoding 设置为 gb2312,才能正确解码页面内容。
import requests

# Index page of the site being scraped.
domain = "https://dy.dytt8.net/index.htm"

# verify=False switches off TLS certificate verification for this request.
resp = requests.get(url=domain, verify=False)

# The article states the site is gb2312-encoded, so set the encoding
# explicitly before reading .text.
resp.encoding = 'gb2312'
print(resp.text)
通过 re 正则表达式从首页中提取各子页面的 URL,并把这些子页面地址保存到列表中
# Pattern for the <ul> that follows the "latest movies" heading; re.S lets
# "." span newlines so the whole list is captured.
obj = re.compile(r"最新电影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
# Pattern for each single-quoted href inside that list.
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)

# Collect the absolute URL of every child page linked from the list.
child_href_list = []
for section in obj.finditer(resp.text):
    anchors = obj1.finditer(section.group('ul'))
    child_href_list.extend(
        "https://dy.dytt8.net/" + anchor.group('link').strip("/")
        for anchor in anchors
    )
print(child_href_list)
获取子页面内容
用正则表达式从子页面中提取电影名和下载地址
# Child-page pattern, written as two adjacent raw strings that concatenate
# into a single pattern:
#   1. the text between the title label and the next "<br />"  -> "movie"
#   2. the href of the next target="_blank" anchor after that  -> "magnet"
# re.S lets ".*?" cross line breaks between the two parts.
obj2 = re.compile(
    r'◎片 名(?P<movie>.*?)<br />'
    r'.*?<a target="_blank" href="(?P<magnet>.*?)">',
    flags=re.S,
)
将爬取内容保存到csv文件中
# Open the output file in append mode ("a+"), so existing rows are kept.
# newline="" is what the csv module asks for so it controls line endings.
f = open(
    "dytt.csv",
    mode="a+",
    encoding='utf8',
    newline="",
)
# Writer that turns each row into one CSV line in the file.
csv_writer = csv.writer(f)
完整代码
import csv
import re

import requests

# Request headers; replace the placeholder with a real browser User-Agent.
header = {
    "User-Agent": "XXXXXXX"
}
domain = "https://dy.dytt8.net/index.htm"
# Seconds to wait for the server on each request. Without a timeout,
# requests waits indefinitely on a stalled connection.
timeout = 10

# verify=False switches off TLS certificate verification.
resp = requests.get(domain, verify=False, headers=header, timeout=timeout)
# The article states the site is gb2312-encoded.
resp.encoding = 'gb2312'

# <ul> that follows the "latest movies" heading on the index page.
obj = re.compile(r"最新电影更新:.*?<ul>(?P<ul>.*?)</ul>", re.S)
# Each single-quoted href inside that list.
obj1 = re.compile(r"<a href='(?P<link>.*?)'", re.S)
# Movie title and download link on a child page.
obj2 = re.compile(r'◎片 名(?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<magnet>.*?)">', re.S)

# Collect the absolute URL of every child page linked from the index list.
child_href_list = []
for i in obj.finditer(resp.text):
    ul = i.group('ul')
    for i1 in obj1.finditer(ul):
        child_href = "https://dy.dytt8.net/" + i1.group('link').strip("/")
        child_href_list.append(child_href)

# The with-block guarantees the CSV file is flushed and closed, even if a
# request fails part-way through the loop.
with open("dytt.csv", mode="a+", newline="", encoding='utf8') as f:
    csv_writer = csv.writer(f)
    for href in child_href_list:
        child_resp = requests.get(href, verify=False, headers=header, timeout=timeout)
        child_resp.encoding = 'gb2312'
        for movie in obj2.finditer(child_resp.text):
            dic = movie.groupdict()
            # str.strip() with no argument already removes the ideographic
            # space (U+3000) along with ordinary whitespace.
            dic['movie'] = dic['movie'].strip()
            print(dic)
            # One row per match: title, download link.
            csv_writer.writerow(dic.values())
 
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号