python(17) 获取acfun弹幕,评论和视频信息

每天一点 Linux 命令:新建文件夹(mkdir 目录名)

一、使用 Python 获取 AcFun 所有番剧的信息、评论和弹幕

 1 #! /usr/bin/env python
 2 # -*- coding=utf-8 -*-
 3 import re
 4 import requests
 5 import sys
 6 import json
 7 reload(sys)
 8 sys.setdefaultencoding("utf-8")
 9 num = 1
10 head = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'} #防陷阱
def dm(ht, max_pages=4):
    """Print the danmaku (bullet comment) text for one video.

    Requests ``http://danmu.aixifan.com/V2/<ht>?pageSize=500&pageNo=<n>`` for
    n = 1..max_pages, printing each URL before fetching it, and prints the
    ``'m'`` field of every entry found at index 2 of the decoded JSON.

    The previous version indexed ``range(0, 501)`` into a page that holds at
    most 500 entries, so it always raised IndexError and the ``break`` in the
    handler left the page loop after page 1.  Paging now stops only when a
    page has no entries (or the response lacks the expected list).

    :param ht: video id string appended to the danmaku endpoint.
    :param max_pages: highest page number to request (default 4, matching the
        original ``range(1, 5)``).
    """
    base_url = 'http://danmu.aixifan.com/V2/' + ht + '?pageSize=500&pageNo='
    for page_no in range(1, max_pages + 1):
        page_url = base_url + str(page_no)
        print(page_url)
        response = requests.get(page_url, headers=head)
        payload = json.loads(response.text)
        try:
            # The original code reads the comment list from index 2;
            # NOTE(review): response schema is not verified here.
            entries = payload[2]
        except (IndexError, KeyError, TypeError):
            break
        if not entries:
            # An empty page means there is nothing further to fetch.
            break
        for entry in entries:
            try:
                text = entry['m']
            except (KeyError, TypeError):
                # Skip a malformed entry instead of aborting the whole crawl.
                continue
            print(text)
def PL(ht):
    """Fetch every comment page for one bangumi.

    The first request (without ``pageNo``) is used to read
    ``data.totalPage``; the URL and the page count are printed.  Pages
    1..totalPage are then requested and decoded.

    The previous version decoded each page and discarded it, and passed
    ``re.S`` to ``re.sub`` in the ``count`` position.  Page URLs are now built
    directly and the decoded pages are returned so callers can use them.

    :param ht: bangumi id string.
    :returns: list with one decoded JSON object per comment page.
    """
    # First comment URL: its response carries the page count.
    list_url = 'http://www.acfun.tv/comment/bangumi/web/list?bangumiId=' + ht
    print(list_url)
    first_page = json.loads(requests.get(list_url, headers=head).content)
    total_pages = first_page['data']['totalPage']
    print(total_pages)

    pages = []
    for page_no in range(1, total_pages + 1):
        page_url = list_url + '&pageNo=%d' % page_no
        raw = requests.get(page_url, headers=head).content
        pages.append(json.loads(raw))
    return pages
38 
def geturl(max_pages=7, limit=1):
    """Walk the bangumi listing and call ``info`` for the entries found.

    Requests the listing endpoint for pages 1..max_pages (42 entries per page
    according to the ``pageSize`` query parameter), prints each listing URL,
    and passes every entry's ``id`` (as a string) to ``info``.

    The previous version looped over ``range(1, 42)``, which skipped the first
    entry and assumed a full page, and both loops ended in an unconditional
    ``break`` so only a single entry was ever processed.  That single-entry
    behaviour is kept as the default through ``limit=1``.

    :param max_pages: highest listing page to request (default 7, matching the
        original ``range(1, 8)``).
    :param limit: maximum number of bangumi to process; ``None`` processes
        every entry on every page.
    """
    listing_url = ('http://www.acfun.tv/bangumi/bangumi/page'
                   '?pageSize=42&isWeb=1&pageNo=%d&sort=1')
    handled = 0
    for page_no in range(1, max_pages + 1):
        if limit is not None and handled >= limit:
            return
        page_url = listing_url % page_no
        print(page_url)
        listing = json.loads(requests.get(page_url, headers=head).content)
        # Iterate the returned list itself so a short last page cannot raise.
        for item in listing['data']['list']:
            if limit is not None and handled >= limit:
                return
            info(str(item['id']))
            handled += 1
def info(ht):
    """Print the details of one bangumi and crawl its episodes.

    Downloads the bangumi page, the comment-count endpoint and the
    favourite-count endpoint, then prints title, update status, comment
    total, favourite total and description.  For every episode it prints the
    episode URL, extracts the episode's ``data-vid`` and calls ``PL``.

    Changes from the previous version: every request now sends the shared
    ``head`` headers (two of them did not), episode URLs are built directly
    instead of via ``re.sub`` with ``re.S`` in the ``count`` position, regex
    patterns are raw strings, and the builtin ``id`` is no longer shadowed.

    :param ht: bangumi id string.
    :raises IndexError: if one of the expected HTML fragments is missing.
    :raises AttributeError: if the comment-count response has no ``[...]``.
    """
    url = "http://www.acfun.tv/v/ab" + ht
    # Favourite (stow) count endpoint.
    stow_url = "http://www.acfun.tv/bangumi/stow/isStowed?bangumiId=" + ht
    # Comment count endpoint.
    count_url = "http://www.acfun.tv/bangumi/count/bangumi_view.aspx?bangumiId=" + ht

    page_html = requests.get(url, headers=head).text
    count_text = requests.get(count_url, headers=head).text

    title = re.findall(r'h3 class="title">(.*?)</h3><span', page_html, re.S)[0]
    print('名称:' + title)
    up = re.findall(r'</h3><span class="last">(.*?)</span>', page_html, re.S)[0]
    print('更新:' + up)
    comment_total = re.search(r'\[(.*?)\]', count_text, re.S).group(1)
    print('评论总数:' + comment_total)

    stow_info = json.loads(requests.get(stow_url, headers=head).content)
    print('收藏总数:' + str(stow_info['data']['stowCount']))

    jianjie = re.findall(r'pan class="desc">(.*?)</span>', page_html, re.S)[0]
    print('简介:' + jianjie)

    # data-count gives the number of episodes, i.e. how many pages to visit.
    episode_count = int(
        re.findall(r'" data-count="(.*?)" data-index="', page_html, re.S)[0])

    for episode in range(1, episode_count + 1):
        # URL of this episode.
        episode_url = url + '_%d' % episode
        print(episode_url)
        print(str(episode) + '话弹幕:')
        episode_html = requests.get(episode_url, headers=head).text
        # Video id under which this episode's danmaku are stored.
        vid = re.findall(r'data-vid="(.*?)" data-sid', episode_html, re.S)[0]
        # dm(vid)  -- the danmaku crawl was disabled in the original script.
        print(str(episode) + '话评论:')
        # NOTE(review): PL receives the bangumi id, so every episode repeats
        # the same comment crawl; confirm whether this was intended.
        PL(ht)
# Script entry point: start the crawl only when run directly, not on import.
if __name__ == '__main__':
    geturl()



posted on 2016-03-15 10:44  细雨微光  阅读(1040)  评论(0)  编辑  收藏  举报