python爬虫--爬取cctv连续剧

  1 #encoding=utf-8
  2 import requests
  3 from bs4 import BeautifulSoup
  4 import re
  5 import os
  6 from aria2rpc import rpc_addUri
  7 class Cntv():
  8 
  9     def openUrl(self,url):
 10         """
 11         This method is used to open a web site
 12         :param url:Web site to request
 13         :return:Requested object
 14         """
 15         header = {
 16             "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36"
 17         }
 18         response = requests.get(url, header)
 19         return response
 20         # pass
 21     def getEachEpisodeUrl(self):
 22         """
 23         Get the address of each episode of the TV play
 24         :return:urls lists
 25         """
 26         urls = []
 27         # response = requests.get(self.url)
 28         url = "http://tv.cctv.com/2014/07/07/VIDA1404730290373811.shtml"
 29         response = self.openUrl(url)
 30         html = response.content.decode('utf-8')
 31         soup = BeautifulSoup(html,'html.parser')
 32         title = soup.select(".text_mod h3")
 33         print(title[0].text)
 34         episodes = soup.select('.img a')
 35         # print(episodes)
 36         for each in range(1,len(episodes),3):
 37             print(episodes[each]['title'],"link:"+episodes[each]['href'])
 38             urls.append(episodes[each]['href'])
 39         print("Get Each Episode Url Come Over !!!")
 40         return urls
 41     def getEachDLUrl(self):
 42         urls = self.getEachEpisodeUrl()
 43         links = []
 44         for num,url in enumerate(urls):
 45             response = self.openUrl(url)
 46             html = response.text
 47             # soup = BeautifulSoup(html, 'html.parser')
 48             match = re.search(r'guid = "(\w+?)";', html)
 49             pid = match.group(1)
 50             # print(pid)
 51             link = "http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=%s&tz=%s&from=%s&url=%s&idl=%s&idlr=%s&modifyed=%s" %(pid,'-8','000news',url,'32','32','false')
 52             links.append(link)
 53             print("获取第%d集" %(num))
 54             # print(urls)
 55         return links
 56     def getDLList(self):
 57         """
 58         Get the download address for each episode of the TV play
 59         :return:ownload address list
 60         """
 61         links = self.getEachDLUrl()
 62         # links = ["http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=59381a0e55404cf5b101f7d3bcad2da8&tz=-8&from=000news&url=http://tv.cctv.com/2014/07/15/VIDE1405435161521590.shtml&idl=32&idlr=32&modifyed=false"]
 63         dl_urls = []
 64         for link in links:
 65             dl_url = []
 66             response = self.openUrl(link)
 67             # html = response.content.decode('utf-8')
 68             dl_list = response.json()['video']['chapters4']
 69             for each in range(len(dl_list)):
 70                 downloadurl = dl_list[each]['url']
 71                 dl_url.append(downloadurl)
 72                 print(downloadurl)
 73             dl_urls.append(dl_url)
 74         return dl_urls
 75     def _add_aria2_task(self, url, name):
 76         """
 77         :param url:download url
 78         :param name:dowmload tv name
 79         :return:
 80         """
 81         try:
 82             result = rpc_addUri(url, {'out': name})
 83             return result
 84         except Exception as e:
 85             print(e)
 86             return None
 87 
 88 
 89 # response.json()['video']['lowChapters'][0]['url']
 90 # response.json()['video']['chapters4'][0]['url']
 91 """    
 92     def dlTv(self):
 93       
 94         dl_urls_list = self.getDLList()
 95         if os.path.exists("tv_list") == False:
 96             os.mkdir("tv_list")
 97         os.chdir("tv_list")
 98         for dl_urls in dl_urls_list:
 99             for dl_url in dl_urls:
100                 print("download" + dl_url)
101                 # response = self.openUrl(dl_url)
102                 # with open("first.mp4",'ab') as tl:
103                 #     tl.write(response.content)
104             print("-"*20)
105 """
if __name__ == "__main__":
    # Resolve every segment URL of the series, then queue each one in aria2.
    cm = Cntv()

    episodes = cm.getDLList()
    # Files are named "<episode>_<segment>.mp4", both numbered from 1.
    # Loop names avoid rebinding the builtin ``list``.
    for episode_no, segment_urls in enumerate(episodes, start=1):
        for segment_no, url in enumerate(segment_urls, start=1):
            cm._add_aria2_task(url, '%d_%d.mp4' % (episode_no, segment_no))

 

posted @ 2017-09-22 09:14  RoyFans  阅读(2205)  评论(1)  编辑  收藏  举报