BiliBili视频信息爬虫可视化

BiliBili视频信息爬虫可视化

说明

该可视化一共使用了四个数据源,分别是2017,2018,2019三年各五千条连续数据,以及总十万余条视频数据,用于总体与对比分析。

可视化主要手段包括matplotlib以及pyecharts的两个版本(新版本1.7.1,旧版本0.1.9.4,两者语法区别很大)

本文包含两段代码,部分分析内容相同,仅供参考用法。第一段代码使用matplotlib以及旧版本的pyecharts,分为很多块;写的时候没有考虑各块之间的变量冲突,所以不要一次性取消全部注释后运行,否则会因为变量问题报错,一块一块取消注释运行即可。

第二段代码使用pyecharts1.7.1版本,为jupyter文件

部分结果展示

第一段代码

import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import datetime
import jieba
import time
from time import strftime
from pyecharts import Line, Page, Bar, WordCloud, Scatter, Pie,Scatter3D
from pyecharts import option as opt

# Work inside the directory that holds the crawled data files.
os.chdir(r"D:\Big_data\PycharmWorkSpace\MostUse\BigWork\creeper")
# Use a font with CJK glyphs and keep the minus sign renderable.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.sans-serif'] = ['SimHei']

# Three consecutive years, about five thousand consecutive records each,
# used for the year-over-year comparison: (year, pickle file, cut-off date).
yearly_sources = (
    (2017, "final-2017.pkl", '2017-04-23'),
    (2018, "final-2018.pkl", '2018-02-25'),
    (2019, "final-2019.pkl", '2019-01-06'),
)

yearly_frames = {}
for year, pickle_name, _ in yearly_sources:
    with open(pickle_name, 'rb') as fin:
        data = pickle.load(fin)
    # The pickled object is transposed after being wrapped in a DataFrame.
    yearly_frames[year] = pd.DataFrame(data).T

all_data = pd.read_csv("all_data.csv")
all_data['times'] = pd.to_datetime(all_data['times'])

# Parse the timestamp column of every yearly frame.
for year, _, _ in yearly_sources:
    yearly_frames[year]['times'] = pd.to_datetime(yearly_frames[year]['times'])

# Keep only the rows dated before each year's cut-off.
blbl_inf_2017, blbl_inf_2018, blbl_inf_2019 = (
    yearly_frames[year].loc[yearly_frames[year]['times'] < cutoff]
    for year, _, cutoff in yearly_sources
)

# TODO 播放量、弹幕量可视化 分别以散点图、折线图呈现
# bif_2017 = blbl_inf_2017.sort_values(by=['times'], ascending=True)
# bif_2018 = blbl_inf_2018.sort_values(by=['times'], ascending=True)
# bif_2019 = blbl_inf_2019.sort_values(by=['times'], ascending=True)
#
# ax1 = plt.subplot(1,3,1)
# ax2 = plt.subplot(1,3,2)
# ax3 = plt.subplot(1,3,3)
#
# plt.sca(ax1)
# plt.scatter(blbl_inf_2017['times'], blbl_inf_2017["view"], color='blue', label="播放量", s=2)
# plt.scatter(blbl_inf_2017['times'], blbl_inf_2017["danmaku"], color='red', label="弹幕量", s=2)
# # plt.scatter(blbl_inf_2017['times'],blbl_inf_2017["coin"],color='green',label="硬币量",s=2)
# plt.legend(loc='upper right',prop={'size':6})
# plt.title("2017")
#
#
# plt.sca(ax2)
# plt.scatter(blbl_inf_2018['times'], blbl_inf_2018["view"], color='blue', label="播放量", s=2)
# plt.scatter(blbl_inf_2018['times'], blbl_inf_2018["danmaku"], color='red', label="弹幕量", s=2)
# #plt.scatter(blbl_inf_2018['times'],blbl_inf_2018["coin"],color='green',label="硬币量",s=2)
# plt.legend(loc='upper right',prop={'size':6})
# plt.title("2018")
#
# plt.sca(ax3)
# plt.scatter(blbl_inf_2019['times'], blbl_inf_2019["view"], color='blue', label="播放量", s=2)
# plt.scatter(blbl_inf_2019['times'], blbl_inf_2019["danmaku"], color='red', label="弹幕量", s=2)
# #plt.scatter(blbl_inf_2019['times'],blbl_inf_2019["coin"],color='green',label="硬币量",s=2)
# plt.legend(loc='upper right',prop={'size':6})
# plt.title("2019")
#
# plt.subplots_adjust(left=0.5, bottom=0.5, right=1, top=1,
#                 wspace=10, hspace=100)
# plt.gcf().autofmt_xdate()
# plt.savefig("播放量、弹幕量可视化1.jpg")#图片存在变形 未解决
# plt.show()
#
#
# ax1 = plt.subplot(1,3,1)
# ax2 = plt.subplot(1,3,2)
# ax3 = plt.subplot(1,3,3)
# plt.sca(ax1)
# plt.plot(bif_2017['times'], blbl_inf_2017["view"], color='blue', label="播放量")
# plt.plot(bif_2017['times'], blbl_inf_2017["danmaku"], color='red', label="弹幕量")
# # plt.plot(bif['times'],bilibili_inf["coin"],color='green',label="硬币量")
# plt.legend(loc='upper right',prop={'size':6})
#
# plt.sca(ax2)
# plt.plot(bif_2018['times'], blbl_inf_2018["view"], color='blue', label="播放量")
# plt.plot(bif_2018['times'], blbl_inf_2018["danmaku"], color='red', label="弹幕量")
# #plt.plot(bif['times'],bilibili_inf["coin"],color='green',label="硬币量")
# plt.legend(loc='upper right',prop={'size':6})
#
# plt.sca(ax3)
# plt.plot(bif_2019['times'], blbl_inf_2019["view"], color='blue', label="播放量")
# plt.plot(bif_2019['times'], blbl_inf_2019["danmaku"], color='red', label="弹幕量")
# #plt.plot(bif_2019['times'],bif_2019["coin"],color='green',label="硬币量")
# plt.legend(loc='upper right',prop={'size':6})
# plt.subplots_adjust(left=0.5, bottom=0.5, right=1, top=1,
#                 wspace=10, hspace=100)
# plt.gcf().autofmt_xdate()
# plt.savefig("播放量、弹幕量可视化2.jpg")#图片存在变形 未解决
# plt.show()

#
#
# TODO 统计视频大类与小类数量并绘制图表 年份构成变化分析
#
# ax1 = plt.subplot(1,3,1)
# ax2 = plt.subplot(1,3,2)
# ax3 = plt.subplot(1,3,3)
# tag1_2017 = [i[0] for i in blbl_inf_2017['tags'].values.tolist()]
# tag1_2018 = [i[0] for i in blbl_inf_2018['tags'].values.tolist()]
# tag1_2019 = [i[0] for i in blbl_inf_2019['tags'].values.tolist()]
# dic_tag_2017 = {}
# dic_tag_2018 = {}
# dic_tag_2019 = {}
#
# for item in tag1_2017:
#     dic_tag_2017[item]= dic_tag_2017.get(item,0)+1
# for item in tag1_2018:
#     dic_tag_2018[item]= dic_tag_2018.get(item,0)+1
# for item in tag1_2019:
#     dic_tag_2019[item]= dic_tag_2019.get(item,0)+1
# dic_tag_2017 = {k:v for k,v in dic_tag_2017.items() if v >100}
# dic_tag_2018 = {k:v for k,v in dic_tag_2018.items() if v >100}
# dic_tag_2019 = {k:v for k,v in dic_tag_2019.items() if v >100}
#
# plt.sca(ax1)
# plt.pie(dic_tag_2017.values(),labels = dic_tag_2017.keys(),autopct='%.1f %%',shadow=True)
# plt.title("2017热门视频主题")
# plt.sca(ax2)
# plt.pie(dic_tag_2018.values(),labels = dic_tag_2018.keys(),autopct='%.1f %%',shadow=True)
# plt.title("2018热门视频主题")
# plt.sca(ax3)
# plt.pie(dic_tag_2019.values(),labels = dic_tag_2019.keys(),autopct='%.1f %%',shadow=True)
# plt.title("2019热门视频主题")
# #plt.savefig("统计视频大类与小类数量并绘制图表 年份构成变化分析.jpg")#图片存在变形 未解决
# plt.show()

# TODO 平均播放量对比
# page = Page()
# # 年平均(数据量有限 只能做短期)
# bar = Bar("每年短时间段内平均播放量对比")
# views_2017 = sum(blbl_inf_2017['view']) // len(blbl_inf_2017['view'])
# views_2018 = sum(blbl_inf_2018['view']) // len(blbl_inf_2018['view'])
# views_2019 = sum(blbl_inf_2019['view']) // len(blbl_inf_2019['view'])
# bar.add("每年平均视频播放量对比", ['2017', '2018', '2019'], [views_2017, views_2018, views_2019], is_stack=True)
# bar.render("每年短时间段内平均播放量对比.html")
# page.add(bar)
# #年平均硬币
# bar = Bar("每年短时间段内平均硬币量对比")
# coin_2017 = sum(blbl_inf_2017['coin']) // len(blbl_inf_2017['coin'])
# coin_2018 = sum(blbl_inf_2018['coin']) // len(blbl_inf_2018['coin'])
# coin_2019 = sum(blbl_inf_2019['coin']) // len(blbl_inf_2019['coin'])
# bar.add("每年平均视频硬币量对比", ['2017', '2018', '2019'], [coin_2017, coin_2018, coin_2019], is_stack=True)
# bar.render("每年短时间段内平均硬币量对比.html")
# page.add(bar)
# #年平均弹幕
# bar = Bar("每年短时间段内平均弹幕量对比")
# danmaku_2017 = sum(blbl_inf_2017['danmaku']) // len(blbl_inf_2017['danmaku'])
# danmaku_2018 = sum(blbl_inf_2018['danmaku']) // len(blbl_inf_2018['danmaku'])
# danmaku_2019 = sum(blbl_inf_2019['danmaku']) // len(blbl_inf_2019['danmaku'])
# bar.add("每年平均视频弹幕量对比", ['2017', '2018', '2019'], [danmaku_2017, danmaku_2018, danmaku_2019], is_stack=True)
# bar.render("每年短时间段内平均弹幕量对比.html")
# page.add(bar)
#
# page.render("年平均数据对比.html")

# TODO 年份间视频投稿速度分析 /第一个视频投稿后时间段内投稿视频数量
# blbl_num_2017_inhalfhour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=0.5))])
# blbl_num_2018_inhalfhour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=0.5))])
# blbl_num_2019_inhalfhour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=0.5))])
#
# blbl_num_2017_in1hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=1))])
# blbl_num_2018_in1hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=1))])
# blbl_num_2019_in1hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=1))])
#
# blbl_num_2017_in2hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=2))])
# blbl_num_2018_in2hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=2))])
# blbl_num_2019_in2hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=2))])
#
# blbl_num_2017_in4hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=4))])
# blbl_num_2018_in4hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=4))])
# blbl_num_2019_in4hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=4))])
#
# blbl_num_2017_in6hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=6))])
# blbl_num_2018_in6hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=6))])
# blbl_num_2019_in6hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=6))])
#
# blbl_num_2017_in8hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=8))])
# blbl_num_2018_in8hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=8))])
# blbl_num_2019_in8hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=8))])
#
# blbl_num_2017_in12hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=12))])
# blbl_num_2018_in12hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=12))])
# blbl_num_2019_in12hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=12))])
#
# blbl_num_2017_in24hour = len(blbl_inf_2017.loc[blbl_inf_2017['times'] < (blbl_inf_2017['times'].min()+datetime.timedelta(hours=24))])
# blbl_num_2018_in24hour = len(blbl_inf_2018.loc[blbl_inf_2018['times'] < (blbl_inf_2018['times'].min()+datetime.timedelta(hours=24))])
# blbl_num_2019_in24hour = len(blbl_inf_2019.loc[blbl_inf_2019['times'] < (blbl_inf_2019['times'].min()+datetime.timedelta(hours=24))])
#
# line = Line("三年间视频投稿速度分析")
# timeline = [0.5,1,2,4,6,8,12,24]
# nums_2017 = [blbl_num_2017_inhalfhour,blbl_num_2017_in1hour,blbl_num_2017_in2hour,blbl_num_2017_in4hour,blbl_num_2017_in6hour,blbl_num_2017_in8hour,blbl_num_2017_in12hour,blbl_num_2017_in24hour]
# nums_2018 = [blbl_num_2018_inhalfhour,blbl_num_2018_in1hour,blbl_num_2018_in2hour,blbl_num_2018_in4hour,blbl_num_2018_in6hour,blbl_num_2018_in8hour,blbl_num_2018_in12hour,blbl_num_2018_in24hour]
# nums_2019 = [blbl_num_2019_inhalfhour,blbl_num_2019_in1hour,blbl_num_2019_in2hour,blbl_num_2019_in4hour,blbl_num_2019_in6hour,blbl_num_2019_in8hour,blbl_num_2019_in12hour,blbl_num_2019_in24hour]
# line.add('2017年',timeline,nums_2017)
# line.add('2018年',timeline,nums_2018)
# line.add('2019年',timeline,nums_2019)
# line.render("三年间视频投稿速度分析.html")


# TODO 发视频时间段统计 use all_data
# all_inf = all_data
# all_inf['times'] = [i.strftime("%H") for i in all_inf['times'].tolist()]
# timedata = all_inf['times'].value_counts().sort_index()
# line = Line("用户视频投稿时间分布")
# line.add("投稿数/小时", timedata.index, timedata.values, mark_point=['max','average','min'])
# line.render('用户视频投稿时间分布.html')


# TODO 播放/硬币 或者 播放/收藏 比值高的视频通常为什么主题 use all_data
# pie = Pie("硬币/播放比值较高的视频主题分析",title_pos='center',width=800,height=800)
# all_inf = all_data
# idxlst1 = [all_inf['coin']/(1+all_inf['view']) > i for i in [j*0.03 for j in range(0,5)]]
# idx2 = all_inf['view'] > 2000
# high_v2c_leveldata_lst = [all_inf.loc[idx1&idx2] for idx1 in idxlst1]
# high_v2c_leveldata = high_v2c_leveldata_lst[0]
# pie.add("rate = 0.03",high_v2c_leveldata['tags'].value_counts().head(5).index,high_v2c_leveldata['tags'].value_counts().head(5).values,radius=[0,10],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2c_leveldata = high_v2c_leveldata_lst[1]
# pie.add("rate = 0.06",high_v2c_leveldata['tags'].value_counts().head(5).index,high_v2c_leveldata['tags'].value_counts().head(5).values,radius=[20,30],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2c_leveldata = high_v2c_leveldata_lst[2]
# pie.add("rate = 0.09",high_v2c_leveldata['tags'].value_counts().head(5).index,high_v2c_leveldata['tags'].value_counts().head(5).values,radius=[40,50],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2c_leveldata = high_v2c_leveldata_lst[3]
# pie.add("rate = 0.12",high_v2c_leveldata['tags'].value_counts().head(5).index,high_v2c_leveldata['tags'].value_counts().head(5).values,radius=[60,70],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2c_leveldata = high_v2c_leveldata_lst[4]
# pie.add("rate = 0.15",high_v2c_leveldata['tags'].value_counts().head(5).index,high_v2c_leveldata['tags'].value_counts().head(5).values,radius=[80,90],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# pie.render('播放-硬币比值较高的视频主题分析.html')
#
#
# pie = Pie("收藏/播放比值较高的视频主题分析",title_pos='center',width=800,height=800)
# idxlst3 = [all_inf['favorite']/(1+all_inf['view']) > i for i in [j*0.03 for j in range(0,5)]]
# idx4 = all_inf['view'] > 2000
# high_v2f_leveldata_lst = [all_inf.loc[idx3&idx2] for idx3 in idxlst3]
# high_v2f_leveldata = high_v2f_leveldata_lst[0]
# pie.add("rate = 0.03",high_v2f_leveldata['tags'].value_counts().head(5).index,high_v2f_leveldata['tags'].value_counts().head(5).values,radius=[0,10],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2f_leveldata = high_v2f_leveldata_lst[1]
# pie.add("rate = 0.06",high_v2f_leveldata['tags'].value_counts().head(5).index,high_v2f_leveldata['tags'].value_counts().head(5).values,radius=[20,30],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2f_leveldata = high_v2f_leveldata_lst[2]
# pie.add("rate = 0.09",high_v2f_leveldata['tags'].value_counts().head(5).index,high_v2f_leveldata['tags'].value_counts().head(5).values,radius=[40,50],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2f_leveldata = high_v2f_leveldata_lst[3]
# pie.add("rate = 0.12",high_v2f_leveldata['tags'].value_counts().head(5).index,high_v2f_leveldata['tags'].value_counts().head(5).values,radius=[60,70],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# high_v2f_leveldata = high_v2f_leveldata_lst[4]
# pie.add("rate = 0.15",high_v2f_leveldata['tags'].value_counts().head(5).index,high_v2f_leveldata['tags'].value_counts().head(5).values,radius=[80,90],legend_orient='vertical',legend_pos='left',center=["50%", "50%"])
# pie.render('播放-收藏比值较高的视频主题分析.html')

# TODO 视频播放量 视频硬币 视频弹幕 三维图 use all_data
# all_inf = all_data
# data = [[val['view'], val['danmaku'], val['coin']] for index, val in all_inf.iterrows()]
# range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf',
#                '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026']
# page = Page()
# scatter3D = Scatter3D("视频播放量 视频硬币 视频弹幕 三维图") #设置图表的高和宽
# scatter3D.add("", data, visual_range_color=range_color,xaxis3d_name="播放量",yaxis3d_name="硬币",zaxis3d_name="弹幕") #视觉映射和颜色选择 ["播放量","硬币","弹幕"]
# #scatter3D.render("视频播放量 视频硬币 视频弹幕 三维图.html")
# page.add(scatter3D)
# page.render("视频播放量 视频硬币 视频弹幕 三维图.html")

# TODO 视频关键词图云 use all_data
# all_inf = all_data
# words = jieba.cut_for_search(" ".join(all_inf['titles'].tolist()))
# counts={}
# for i in words:
#     counts[i] = counts.get(i, 0) + 1
# counts = {k:v for k,v in counts.items() if (v >1000 and len(k)>=2 and not k.isdigit())}
# wordcould = WordCloud()
# wordcould.add("",counts.keys(),counts.values(),word_size_range=[20, 100])
# wordcould.render("wordcloud.html")



# TODO 十万条数据汇总分析 
# all_data.describe().to_csv('汇总分析.csv')

第二段代码

import os
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import datetime
import jieba
import time
from time import strftime
from pyecharts.charts import Line, Page, Bar, WordCloud, Scatter, Pie,Scatter3D
from pyecharts import options as opts
from pyecharts.globals import CurrentConfig,NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB
from pyecharts.commons.utils import JsCode


# Work inside the directory that holds the crawled data files.
os.chdir(r"D:\Big_data\PycharmWorkSpace\dataVisualization")
# Use a font with CJK glyphs and keep the minus sign renderable.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Three consecutive years, about five thousand consecutive records each,
# used for the year-over-year comparison.
raw_frames = []
for pickle_name in ("final-2017.pkl", "final-2018.pkl", "final-2019.pkl"):
    with open(pickle_name, 'rb') as fin:
        data = pickle.load(fin)
    # The pickled object is transposed after being wrapped in a DataFrame.
    raw_frames.append(pd.DataFrame(data).T)

all_data = pd.read_csv("all_data.csv")
all_data['times'] = pd.to_datetime(all_data['times'])

# Convert the timestamp column of each yearly frame to datetimes.
for frame in raw_frames:
    frame['times'] = pd.to_datetime(frame['times'])

# Keep only the rows dated before each year's cut-off.
blbl_inf_2017, blbl_inf_2018, blbl_inf_2019 = (
    frame.loc[frame['times'] < cutoff]
    for frame, cutoff in zip(raw_frames, ('2017-04-23', '2018-02-25', '2019-01-06'))
)

# Chronologically ordered copies of the filtered frames.
bif_2017 = blbl_inf_2017.sort_values(by='times')
bif_2018 = blbl_inf_2018.sort_values(by='times')
bif_2019 = blbl_inf_2019.sort_values(by='times')
# TODO: visualise view counts and danmaku counts (scatter chart / line chart)


def _views_danmaku_scatter(frame, title):
    """Build one scatter chart plotting `view` and `danmaku` against upload time."""
    chart = Scatter(init_opts=opts.InitOpts(width="1200px", height="750px"))
    chart.add_xaxis([stamp.strftime('%Y-%m-%d %H:%M:%S') for stamp in frame['times']])
    for series_name, column in (('播放量', "view"), ('弹幕量', "danmaku")):
        chart.add_yaxis(series_name, frame[column], label_opts=opts.LabelOpts(is_show=True))
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title=title),
        visualmap_opts=opts.VisualMapOpts(type_="size", max_=2000000, min_=1))
    return chart


# Keep only videos with more than 50000 views AND more than 1000 danmaku.
# NOTE(review): the filtered frames replace blbl_inf_2017/2018/2019, so every
# later section that reads these names only sees this subset -- confirm that
# this is intended.
idx1, idx2 = blbl_inf_2017['view'] > 50000, blbl_inf_2017['danmaku'] > 1000
blbl_inf_2017 = blbl_inf_2017.loc[idx1 & idx2]
idx3, idx4 = blbl_inf_2018['view'] > 50000, blbl_inf_2018['danmaku'] > 1000
blbl_inf_2018 = blbl_inf_2018.loc[idx3 & idx4]
idx5, idx6 = blbl_inf_2019['view'] > 50000, blbl_inf_2019['danmaku'] > 1000
blbl_inf_2019 = blbl_inf_2019.loc[idx5 & idx6]

page = Page("播放量、弹幕量可视化")
scatter1 = _views_danmaku_scatter(blbl_inf_2017, "2017年播放量,弹幕量可视化")
scatter2 = _views_danmaku_scatter(blbl_inf_2018, "2018年播放量,弹幕量可视化")
scatter3 = _views_danmaku_scatter(blbl_inf_2019, "2019年播放量,弹幕量可视化")

# One page, one chart per year, in chronological order.
for chart in (scatter1, scatter2, scatter3):
    page.add(chart)
page.render('热门视频播放量、弹幕量可视化.html')

# Per-category upload counts and total view counts over the full data set.
themedic = {}
# NOTE(review): characters 2-3 of the raw `tags` text are used as the category
# key. This presumably assumes the column holds a list repr such as
# "['动画', ...]" whose first tag is two characters long -- confirm against
# all_data.csv (longer category names would be truncated).
all_data['tags_tmp'] = [i[2:4] for i in all_data['tags']]
for i in all_data['tags_tmp']:
    themedic[i] = themedic.get(i, 0) + 1

# Sum only the `view` column: summing every column of the group would also try
# to add up the datetime `times` column, which recent pandas versions reject.
datatmp = all_data.groupby('tags_tmp')
view_totals = datatmp['view'].sum()
# pyecharts expects native Python numbers, so cast the numpy totals to int.
viewdic = {category: int(total) for category, total in view_totals.items()}

page = Page()

# Bar 1: number of uploads per category, with a reference line at 10000.
bar1 = Bar(init_opts=opts.InitOpts(width="1200px", height="600px", bg_color=None))
bar1.add_xaxis(list(themedic.keys()))\
    .add_yaxis('视频种类/数量', list(themedic.values()))\
    .set_global_opts(xaxis_opts=opts.AxisOpts(type_='category'),
                     title_opts={"text": "所有视频主题大类投稿量对比"})\
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(y=10000, name="yAxis=10000")]))

# Bar 2: total views per category, with a reference line at 100000000.
# The series name and mark-line name now describe this chart's data (views,
# y=100000000) instead of repeating the labels copied from bar1.
bar2 = Bar(init_opts=opts.InitOpts(width="1200px", height="600px", bg_color=None))
bar2.add_xaxis(list(viewdic.keys()))\
    .add_yaxis('视频种类/播放量', list(viewdic.values()))\
    .set_global_opts(xaxis_opts=opts.AxisOpts(type_='category'),
                     title_opts={"text": "所有视频主题大类播放量对比"})\
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(y=100000000, name="yAxis=100000000")]))

page.add(bar1)
page.add(bar2)
page.render('所有视频主题大类投稿量、播放量对比.html')
# TODO: count videos per category and chart how the mix changes between years


def _frequent_pairs(first_tags):
    """Tally the tags and return [tag, count] pairs for tags seen more than 100 times."""
    tally = {}
    for tag in first_tags:
        tally[tag] = tally.get(tag, 0) + 1
    return [[tag, count] for tag, count in tally.items() if count > 100]


def _theme_rose_pie(title, pairs):
    """Build a dark-themed rose pie; `title` is used as both chart title and series name."""
    chart = Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
    chart.add(series_name=title,
              data_pair=pairs,
              rosetype="radius",
              radius="55%",
              center=["50%", "50%"],
              label_opts=opts.LabelOpts(is_show=False, position="center"))
    chart.set_global_opts(
        title_opts=opts.TitleOpts(
            title=title,
            pos_left="center",
            pos_top="20",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
        ),
        legend_opts=opts.LegendOpts(is_show=False))
    # Series options are applied after add(), as in the original sequence.
    chart.set_series_opts(
        tooltip_opts=opts.TooltipOpts(
            trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
        ),
        label_opts=opts.LabelOpts(color="rgba(255, 255, 255, 0.3)"))
    return chart


# This Pie instance is created but not added to the page below.
pie = Pie(init_opts=opts.InitOpts(width="1600px", height="1000px"))
page = Page()

# First element of every video's `tags` entry, per year.
tag1_2017 = [tags[0] for tags in blbl_inf_2017['tags'].values.tolist()]
tag1_2018 = [tags[0] for tags in blbl_inf_2018['tags'].values.tolist()]
tag1_2019 = [tags[0] for tags in blbl_inf_2019['tags'].values.tolist()]

# [tag, count] pairs restricted to tags that occur more than 100 times.
dic_tag_2017 = _frequent_pairs(tag1_2017)
dic_tag_2018 = _frequent_pairs(tag1_2018)
dic_tag_2019 = _frequent_pairs(tag1_2019)

pie1 = _theme_rose_pie("2017热门视频主题", dic_tag_2017)
page.add(pie1)

pie2 = _theme_rose_pie("2018热门视频主题", dic_tag_2018)
page.add(pie2)

pie3 = _theme_rose_pie("2019热门视频主题", dic_tag_2019)
page.add(pie3)

page.render("统计视频大类与小类数量并绘制图表 年份构成变化分析.html")
# TODO: compare average views / coins / danmaku between the three years
from math import log2


def _floor_average(column):
    """Return the floored mean of `column`, or 0 when the column is empty."""
    count = len(column)
    return sum(column) // count if count else 0


def _log2_or_zero(value):
    """Return log2(value); averages that are not positive are plotted as 0."""
    # log2 raises ValueError for 0, which happens when an average floors to 0.
    return log2(value) if value > 0 else 0.0


page = Page()
# Yearly averages (the samples are short windows, so this is a short-term view).
bar = Bar(init_opts=opts.InitOpts(width="1200px", height="600px", bg_color=None))

views_2017 = _floor_average(blbl_inf_2017['view'])
views_2018 = _floor_average(blbl_inf_2018['view'])
views_2019 = _floor_average(blbl_inf_2019['view'])
coin_2017 = _floor_average(blbl_inf_2017['coin'])
coin_2018 = _floor_average(blbl_inf_2018['coin'])
coin_2019 = _floor_average(blbl_inf_2019['coin'])
danmaku_2017 = _floor_average(blbl_inf_2017['danmaku'])
danmaku_2018 = _floor_average(blbl_inf_2018['danmaku'])
danmaku_2019 = _floor_average(blbl_inf_2019['danmaku'])

# Each series spans all three years (the x axis is the year), so the series
# names no longer carry a "2017" prefix.
# NOTE(review): the mark line sits at y=3 but is named "yAxis=50"; log2(50) is
# about 5.64, so the name and position disagree -- confirm which was intended.
bar.add_xaxis(['2017', '2018', '2019'])\
    .add_yaxis('平均播放量(log2)', [_log2_or_zero(x) for x in [views_2017, views_2018, views_2019]], category_gap=40)\
    .add_yaxis('平均硬币量(log2)', [_log2_or_zero(x) for x in [coin_2017, coin_2018, coin_2019]], category_gap=40)\
    .add_yaxis('平均弹幕量(log2)', [_log2_or_zero(x) for x in [danmaku_2017, danmaku_2018, danmaku_2019]], category_gap=40)\
    .set_global_opts(yaxis_opts=opts.AxisOpts(type_='value', max_=15),
                     title_opts={"text": "三年平均数据对比", "subtext": "所有数据经过log2处理"})\
    .set_series_opts(
        label_opts=opts.LabelOpts(is_show=True),
        markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(y=3, name="yAxis=50")]))

page.add(bar)
page.render('年平均数据对比.html')
# TODO: upload-speed comparison between years -- number of videos uploaded
# within N hours of the earliest upload in each year's frame.

# Window lengths, in hours, measured from the earliest `times` value.
window_hours = [0.5, 1, 2, 4, 6, 8, 12, 24]


def _uploads_within(frame, hours):
    """Count, for each window length, the rows whose `times` precede earliest + window.

    Returns a list of native ints, one per entry of `hours`.
    """
    first_upload = frame['times'].min()  # computed once instead of once per window
    return [
        int((frame['times'] < first_upload + datetime.timedelta(hours=h)).sum())
        for h in hours
    ]


# Category labels for the x axis: '0.5', '1', '2', '4', '6', '8', '12', '24'.
timeline = [str(h) for h in window_hours]
nums_2017 = _uploads_within(blbl_inf_2017, window_hours)
nums_2018 = _uploads_within(blbl_inf_2018, window_hours)
nums_2019 = _uploads_within(blbl_inf_2019, window_hours)

# JavaScript snippets evaluated by ECharts for the gradient fills.
background_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#FFEDBF'}, {offset: 1, color: '#C3EFFE'}], false)"
)
area_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#C64000'}, {offset: 1, color: '#3fbbff0d'}], false)"
)
line = Line(init_opts=opts.InitOpts(width="1200px", height="600px", bg_color=JsCode(background_color_js)))
line.add_xaxis(timeline)\
    .add_yaxis("2017年视频投稿速度", nums_2017, is_smooth=True)\
    .add_yaxis("2018年视频投稿速度", nums_2018, is_smooth=True)\
    .add_yaxis("2019年视频投稿速度", nums_2019, is_smooth=True)\
    .set_series_opts(areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=0.6),
                     label_opts=opts.LabelOpts(is_show=False))\
    .set_global_opts(
        tooltip_opts=opts.TooltipOpts(trigger="axis"),
        toolbox_opts=opts.ToolboxOpts(is_show=True),
        title_opts=opts.TitleOpts(title="三年间视频同时段投稿数量分析"),
        xaxis_opts=opts.AxisOpts(
            axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
            type_='category',
            is_scale=False,
            boundary_gap=False))
line.render('三年间视频同时段投稿数量分析.html')
    
# TODO: distribution of upload times (hour of day and day of week) -- uses all_data
all_inf = all_data  # alias: the helper columns below are added to all_data itself
all_inf['times1'] = [i.strftime("%H") for i in all_inf['times'].tolist()]
all_inf['times2'] = [i.strftime("%A") for i in all_inf['times'].tolist()]
print(all_inf['times2'])

# Uploads per hour, aligned to all 24 hours so a missing hour is plotted as 0
# rather than shifting the remaining points along the axis.
hour_labels = ["%02d" % hour for hour in range(24)]
timedata1 = all_inf['times1'].value_counts().reindex(hour_labels, fill_value=0)

# Uploads per weekday. This was previously computed from `times1` (hours).
# Counting by dayofweek (Monday=0 .. Sunday=6) keeps the values in the same
# order as the axis labels and does not depend on locale-specific day names.
weekday_labels = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
timedata2 = all_inf['times'].dt.dayofweek.value_counts().reindex(list(range(7)), fill_value=0)

# JavaScript snippets evaluated by ECharts for the gradient fills.
background_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#FFEDBF'}, {offset: 1, color: '#C3EFFE'}], false)"
)
area_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#C64000'}, {offset: 1, color: '#3fbbff0d'}], false)"
)
page = Page()
line1 = Line(init_opts=opts.InitOpts(width="1200px", height="600px", bg_color=JsCode(background_color_js)))
line1.add_xaxis(list(range(24)))\
    .add_yaxis("投稿数/小时", timedata1.values.tolist(), is_smooth=True, is_hover_animation=True)\
    .set_series_opts(areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=0.6))\
    .set_global_opts(title_opts=opts.TitleOpts(title="用户视频投稿时间分布"),
                     xaxis_opts=opts.AxisOpts(axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
                                              type_='category',
                                              boundary_gap=False))
line2 = Line(init_opts=opts.InitOpts(width="1200px", height="600px", bg_color=JsCode(background_color_js)))
line2.add_xaxis(weekday_labels)\
    .add_yaxis("投稿数/天", timedata2.values.tolist(), is_smooth=True, is_hover_animation=True)\
    .set_series_opts(areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=0.6))\
    .set_global_opts(title_opts=opts.TitleOpts(title="用户视频投稿时间分布"),
                     xaxis_opts=opts.AxisOpts(axistick_opts=opts.AxisTickOpts(is_align_with_label=True),
                                              type_='category',
                                              boundary_gap=False))
page.add(line1)
page.add(line2)
page.render('用户视频投稿时间分布.html')

# TODO: which themes have a high coin-to-view (or favourite-to-view) ratio -- uses all_data

all_inf = all_data
# Ratio thresholds: 0, 0.03, 0.06, 0.09 and 0.12.
rate_levels = [j * 0.03 for j in range(0, 5)]
# Coins per view; the +1 keeps the denominator non-zero when `view` is 0.
coin_rate = all_inf['coin'] / (1 + all_inf['view'])
idxlst1 = [coin_rate > level for level in rate_levels]
# Only consider videos with more than 2000 views.
idx2 = all_inf['view'] > 2000
high_v2c_leveldata_lst = [all_inf.loc[mask & idx2] for mask in idxlst1]

# The chart shows the strictest level (the last one), as before. The series
# name is derived from that threshold so the label always matches the data.
plotted_level = rate_levels[-1]
high_v2c_leveldata = high_v2c_leveldata_lst[-1]

# Pie.add needs (name, value) pairs of native Python types; iterating a pandas
# Series yields only the values, so build the pairs explicitly.
top_tags = high_v2c_leveldata['tags'].value_counts().head(5)
top_tag_pairs = [[tag, int(count)] for tag, count in top_tags.items()]

pie1 = Pie(init_opts=opts.InitOpts(width="1600px", height="800px", bg_color="#2c343c"))
pie1.add(series_name="rate = {:.2f}".format(plotted_level),
         data_pair=top_tag_pairs,
         rosetype="radius",
         radius="55%",
         center=["50%", "50%"])
pie1.set_global_opts(
    title_opts=opts.TitleOpts(
        # The ratio computed above is coin / view, so the title states it that way.
        title="硬币/播放比值高的视频主题分析",
        pos_left="center",
        pos_top="20",
        title_textstyle_opts=opts.TextStyleOpts(color="#fff"),
    ),
    legend_opts=opts.LegendOpts(is_show=False))
# A '/' in the file name is a path separator (it pointed into a directory
# named "播放"), so the output name uses '-' instead.
pie1.render('播放-硬币比值高的视频主题分析.html')

# TODO 视频关键词图云 use all_data
# all_inf = all_data
# words = jieba.cut_for_search(" ".join(all_inf['titles'].tolist()))
# counts={}
# for i in words:
#     counts[i] = counts.get(i, 0) + 1
# counts = {k:v for k,v in counts.items() if (v >1000 and len(k)>=2 and not k.isdigit())}
# wordcould = WordCloud()
# wordcould.add("",counts.keys(),counts.values(),word_size_range=[20, 100])
# wordcould.render("wordcloud.html")


下载链接

https://files.cnblogs.com/files/Do-n/bilibili_data1.zip
https://files.cnblogs.com/files/Do-n/bilibili_data2.zip
https://files.cnblogs.com/files/Do-n/BiliBili视频信息爬虫及可视化.zip

posted @ 2020-07-27 16:12  -拂石-  阅读(356)  评论(0)    收藏  举报