python大作业

数据分析大作业,这里做个简单的记录,数据集下载https://gitee.com/Arno_vc/python.git

#通用函数
import pandas as pd;
import re
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib font setup: SimHei is selected so the Chinese plot titles render.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.size'] = '16'
# With SimHei active the Unicode minus sign (U+2212, glyph 8722) is reported
# as missing on negative tick labels; draw an ASCII hyphen-minus instead.
plt.rcParams['axes.unicode_minus'] = False

def drawBox(data, title, figsize=(10, 10)):
    """Draw a box plot of ``data`` and display it.

    Args:
        data: pandas Series or DataFrame (anything exposing ``.plot.box``).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.box(title=title, figsize=figsize)
    # A faint dashed grid makes the quartile positions easier to read.
    plt.grid(linestyle="--", alpha=0.3)
    plt.show()
    
def drawBar(data, title, figsize=(10, 10)):
    """Draw a stacked bar chart of ``data`` and display it.

    Args:
        data: pandas Series or DataFrame (anything exposing ``.plot.bar``).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.bar(stacked=True, title=title, figsize=figsize)
    plt.show()

def drawPie(data, title, figsize=(10, 10)):
    """Draw a pie chart of ``data`` and display it.

    Args:
        data: pandas object exposing ``.plot.pie`` (the callers in this file
            pass a Series of counts).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.pie(title=title, figsize=figsize)
    plt.show()
    
def drawScatter(data, x, y, figsize=(10, 10)):
    """Draw a scatter plot of two columns of ``data`` and display it.

    Args:
        data: pandas DataFrame holding both columns.
        x: Column used for the horizontal axis.
        y: Column used for the vertical axis.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.scatter(x=x, y=y, figsize=figsize)
    plt.show()

def drawLine(data, title, figsize=(10, 10)):
    """Draw a line chart of ``data`` and display it.

    Args:
        data: pandas Series or DataFrame (anything exposing ``.plot.line``).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded. Pass a larger value for dense
            charts instead of redefining the function.
    """
    data.plot.line(title=title, figsize=figsize)
    plt.show()
    
def count(data):
    """Return the number of items in ``data`` (its ``len``)."""
    size = len(data)
    return size

# Extract the release year from a movie title.
def getYear(data):
    """Return the text inside the last pair of parentheses in ``data``.

    For a title such as ``"Toy Story (1995)"`` this is the string ``"1995"``.
    Raises IndexError when ``data`` contains no parenthesized group.
    """
    parenthesized = re.findall(r'[(](.*?)[)]', data)
    return parenthesized[-1]

综合实验 MovieLens 1M数据分析

  1. 数据集说明:

    MovieLens 1M Dataset:该数据集采集了一组从20世纪90年代末到21世纪初由MovieLens用户提供的电影评分数据。这些数据中包括电影评分、电影数据(风格类型和年代)以及关于用户的人口统计学数据(年龄、邮编、性别和职业等)。

    来源:https://grouplens.org/datasets/movielens/

    3个数据文件:

    • 电影:movies.dat,列:movie_id, title, genres
    • 用户:users.dat,列:user_id, gender, age, occupation, zip
    • 评分:ratings.dat,列:user_id, movie_id, rating, timestamp
  2. 概要统计

    电影数量:总数、按年代、风格统计

    用户数量:总数、按性别、年龄、职业统计

    评分条数:总数、按电影、性别、职业统计

  3. 分析目标

    (1) 每部电影的得分情况分析

    (2) 每种风格电影的得分情况分析

    (3) 不同性别用户偏爱的电影分析

    (4) 不同年龄段用户偏爱的电影分析

    (5) 不同性别用户偏爱的电影,随着年代的变化情况分析

#电影数量:总数、按年代、风格统计
import pandas as pd;
import re
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib font setup: SimHei is selected so the Chinese plot titles render.
plt.rcParams['font.sans-serif'] = ['SimHei']
# With SimHei active the Unicode minus sign (U+2212, glyph 8722) is reported
# as missing on negative tick labels; draw an ASCII hyphen-minus instead.
plt.rcParams['axes.unicode_minus'] = False

# sep: field separator, names: column names. The multi-character "::"
# separator is why the python parsing engine is requested.
movies = pd.read_table(
    "../大作业2020/大作业题目1/movielens/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    engine='python',
)
# Derive the release year (kept as a string, e.g. "1995") from each title.
# The original called DataFrame.agg with a misspelled `axios=1` keyword;
# applying getYear element-wise to the title column states the intent directly
# and does not depend on how agg forwards unknown keywords.
movies["year"] = movies["title"].apply(getYear)
movies.head(1000)
movie_id title genres year
0 1 Toy Story (1995) Animation|Children's|Comedy 1995
1 2 Jumanji (1995) Adventure|Children's|Fantasy 1995
2 3 Grumpier Old Men (1995) Comedy|Romance 1995
3 4 Waiting to Exhale (1995) Comedy|Drama 1995
4 5 Father of the Bride Part II (1995) Comedy 1995
... ... ... ... ...
995 1008 Davy Crockett, King of the Wild Frontier (1955) Western 1955
996 1009 Escape to Witch Mountain (1975) Adventure|Children's|Fantasy 1975
997 1010 Love Bug, The (1969) Children's|Comedy 1969
998 1011 Herbie Rides Again (1974) Adventure|Children's|Comedy 1974
999 1012 Old Yeller (1957) Children's|Drama 1957

1000 rows × 4 columns

检查是否有缺失值

print("是否有缺失值:")
# Show every row that has at least one missing value. A row-wise any() gives
# one boolean per row, so a row with several missing cells is listed once
# (the original indexed the frame with a 2-D boolean array).
movies[movies.isnull().any(axis=1)]
是否有缺失值:
movie_id title genres year

查看电影总数:共3883条

# Total number of rows in the movies table.
movie_total = len(movies)
print("电影总数:{}\n".format(movie_total))
电影总数:3883

电影年份相关统计:分析:该数据集涵盖了1919年到2000年的电影,总数共3883条,统计不同年份的电影发行数并按升序排列,如图可知,发行最少的年份是1921年,只有1部;最多的年份是1996年,多达345部,且平均每年电影发行数为48部.通过变化的折线图可以看出,自1919年起,电影发行数目一直在逐年缓慢攀升,并从90年代开始有了剧烈的增长,1996到达顶峰(结合上一张图),之后到2000急剧下降.同时根据该箱线图,可以看出超过一半的年份的电影发行数在50以下,只有10个年份的电影发行数超过100.由以上电影类别统计的柱形图可知,Film-Noir(黑色电影)风格的电影最少,在这近100年来仅44部;最多的是Drama(戏剧)和Comedy(喜剧)

print("年份统计:\n")
# Number of movie titles per release year, ordered by year.
grouped_by_year = movies[["title", "year"]].groupby(by="year")
ageCount = grouped_by_year.count().sort_values(by="year")
titles_per_year = ageCount["title"]
drawLine(titles_per_year, "不同年份的电影发行统计折线图")
drawBox(titles_per_year, "不同年份的电影发行数统计箱线图")
ageCount
年份统计:

png

png

title
year
1919 3
1920 2
1921 1
1922 2
1923 3
... ...
1996 345
1997 315
1998 337
1999 283
2000 156

81 rows × 1 columns

电影类别统计:可以看出电影最多的是Drama类型的电影,有1603部;最少的是Film-Noir类型的电影,仅44部

# Flatten the "|"-separated genre string of each movie into one
# [movie_id, genre] row per pair. Iterating the two columns directly avoids
# the per-cell `.loc[i, ...]` lookups of an index loop, which were slow and
# only worked while the frame's index was exactly 0..n-1.
genres = [
    [movie_id, genre]
    for movie_id, genre_list in zip(movies["movie_id"], movies["genres"])
    for genre in genre_list.split("|")
]

genres = pd.DataFrame(genres, columns=["movie_id", "genres"])
# Number of movies per genre, ascending.
genresCount = genres.groupby(by="genres").count().sort_values(by="movie_id")
# Box plot and bar chart of the per-genre counts.
drawBox(genresCount.loc[:, ["movie_id"]], "电影类别统计箱线图")
drawBar(genresCount, "电影类别统计柱形图")
print("电影类别统计\n")
genresCount

png

电影类别统计
movie_id
genres
Film-Noir 44
Fantasy 68
Western 68
Animation 105
Mystery 106
Musical 114
Documentary 127
War 143
Crime 211
Children's 251
Sci-Fi 276
Adventure 283
Horror 343
Romance 471
Thriller 492
Action 503
Comedy 1200
Drama 1603

导入用户数据,同时检查用户数据

#用户数量:总数、按性别、年龄、职业统计
import pandas as pd;
import re
import numpy as np

# sep: field separator, names: column names; "occupation" is the job code
# and "zip" the postal code.
user_columns = ["user_id", "gender", "age", "occupation", "zip"]
users = pd.read_table(
    "../大作业2020/大作业题目1/movielens/users.dat",
    sep="::",
    names=user_columns,
    engine='python',
)
users.head(1000);

检查是否有缺失值

print("是否有缺失值")
# Show every row that has at least one missing value. A row-wise any() gives
# one boolean per row, so a row with several missing cells is listed once
# (the original indexed the frame with a 2-D boolean array).
users[users.isnull().any(axis=1)]
是否有缺失值
user_id gender age occupation zip

用户总数

# Total number of rows in the users table.
user_total = len(users)
print("用户总数:{}\n".format(user_total))
用户总数:6040

性别统计:用户总数多达6040人,其中女性1709人,男性4331人.由对应的饼图可以看出,男性占比接近3/4.

# Number of users per gender.
users_by_gender = users[["user_id", "gender"]].groupby(by="gender")
genderCount = users_by_gender.count()
print("性别统计")
drawPie(genderCount["user_id"], "用户分布")
genderCount
性别统计

png

user_id
gender
F 1709
M 4331
观众年龄统计:由以上关于用户及其年龄分布的图表可知,用户年龄近似于正态分布,用户主要为25岁的**年轻人**,主要区间也聚集在15~35岁的青少年及成年人.
# Number of users per age code.
users_by_age = users[["user_id", "age"]].groupby(by="age")
ageCount = users_by_age.count()
drawBar(ageCount["user_id"], "观众年龄统计条形图")
print("年龄统计")
ageCount

png

年龄统计
user_id
age
1 222
18 1103
25 2096
35 1193
45 550
50 496
56 380

观众职业统计:由于无法得知具体的职业名称,这里无法做过多的评判.

# Number of users per occupation code.
occupationCount = users.loc[:, ["user_id", "occupation"]].groupby(by="occupation").count()
drawBar(occupationCount.loc[:, "user_id"], "观众职业分布图")
# The original message contained a "{}" placeholder that was never formatted,
# so a literal "{}" was printed; the heading is printed on its own instead.
print("职业统计\n")
occupationCount

png

user_id
occupation
0 711
1 528
2 267
3 173
4 759
5 112
6 236
7 679
8 17
9 92
10 195
11 129
12 388
13 142
14 302
15 144
16 241
17 502
18 70
19 72
20 281

评分统计

#评分条数:总数、按电影、性别、职业统计
import pandas as pd;
import re
import numpy as np

# Load the three "::"-separated tables with explicit column names
# ("occupation" is the job code, "zip" the postal code) ...
rating = pd.read_table(
    "../大作业2020/大作业题目1/movielens/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "timestamp"],
    engine='python',
)
movies = pd.read_table(
    "../大作业2020/大作业题目1/movielens/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    engine='python',
)
users = pd.read_table(
    "../大作业2020/大作业题目1/movielens/users.dat",
    sep="::",
    names=["user_id", "gender", "age", "occupation", "zip"],
    engine='python',
)
# ... then attach the movie and the user attributes to every rating row.
movie_ratings = movies.merge(rating, how="inner", on="movie_id")
data = movie_ratings.merge(users, how="inner", on="user_id")
rating.head(1000)
user_id movie_id rating timestamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
... ... ... ... ...
995 10 3704 2 978228364
996 10 1020 3 978228726
997 10 784 3 978230946
998 10 858 3 978224375
999 10 1022 5 979775689

1000 rows × 4 columns

评分总条数统计

# Total number of rating rows. The original label said "用户总数" (user
# total) although the value printed is the number of ratings.
print("评分总数:{}\n".format(len(rating)))
用户总数:1000209

按电影统计评分:由以下相关的统计数据可知,总评条数为1000209条,评论最少的电影有多部,都为1条;评论最多的电影为American Beauty,有3428条.平均每部电影约有260条(1000209条评分÷3883部电影≈258),然而超过50%的电影甚至连500条都远远没有达到.

# Number of ratings per movie title, ascending.
ratings_by_title = data[["movie_id", "title"]].groupby(by="title")
titleCount = ratings_by_title.count().sort_values(by="movie_id")
print("按电影统计评分")
drawBox(titleCount["movie_id"], "电影评分条数统计")
titleCount.head(1000)
按电影统计评分

png

movie_id
title
Another Man's Poison (1952) 1
Night Tide (1961) 1
Shadows (Cienie) (1988) 1
McCullochs, The (1975) 1
Anna (1996) 1
... ...
Cats Don't Dance (1997) 37
How I Won the War (1967) 37
Idiots, The (Idioterne) (1998) 37
Isn't She Great? (2000) 37
Boys of St. Vincent, The (1993) 37

1000 rows × 1 columns

职业统计:可以看出0号和4号职业的电影评论较多,8号职业的评论格外少

# Number of ratings per occupation code of the rating user.
ratings_by_occupation = data[["movie_id", "occupation"]].groupby(by="occupation")
occupationCount = ratings_by_occupation.count()
drawBar(occupationCount["movie_id"], "电影评分与职业统计")
print("按职业统计评分")
occupationCount

png

按职业统计评分
movie_id
occupation
0 130499
1 85351
2 50068
3 31623
4 131032
5 21850
6 37205
7 105425
8 2706
9 11345
10 23290
11 20563
12 57214
13 13754
14 49109
15 22951
16 46021
17 72816
18 12086
19 14904
20 60397

电影评分与性别统计:这里,尽管男性用户整体占比没有超过3/4,但男性评论的条数占比超过了3/4,说明了看电影的男性中有不少也是很感性的.

# Number of ratings per gender of the rating user.
ratings_by_gender = data[["movie_id", "gender"]].groupby(by="gender")
genderCount = ratings_by_gender.count()
drawPie(genderCount["movie_id"], "电影评分与性别统计")
print("按性别统计评分")
genderCount

png

按性别统计评分
movie_id
gender
F 246440
M 753769
1. 每部电影的得分情况分析:由对应的箱线图数据可知,超过50%的电影得分均在2.7~3.7区间内,只有极少数异常的电影评分低于或等于1.5.
# Mean rating per movie title, ascending.
scores_by_title = data[["title", "rating"]].groupby(by="title")
titleCount = scores_by_title.mean().sort_values(by="rating")
drawBox(titleCount, "各电影评价得分箱线图")
print("各电影评价得分")
titleCount.head(1000)

png

各电影评价得分
rating
title
Elstree Calling (1930) 1.000000
Get Over It (1996) 1.000000
Venice/Venice (1992) 1.000000
Windows (1980) 1.000000
Kestrel's Eye (Falkens 鰃a) (1998) 1.000000
... ...
Net, The (1995) 2.869947
End of Violence, The (1997) 2.870370
Renaissance Man (1994) 2.870968
Funeral, The (1996) 2.870968
Robert A. Heinlein's The Puppet Masters (1994) 2.871508

1000 rows × 1 columns

  1. 每种风格电影的得分情况分析:由上图可知,即便不同电影的评价得分有较大的变化区间,不同类别电影之间的评分差距却显得较为平和,可以认为无论是哪一类电影,都是既有糟糕的作品,也有优秀的作品.其中,平均得分最高的是Film-Noir电影,约为4.08,最低的是Horror类型的电影,约为3.22
#分隔信息表:movie_id与genres不是一一对应的关系
import pandas  as pd

# `genres` holds one row per (movie_id, genre) pair, so the "|"-joined genres
# column of `data` is removed before the merge to avoid a duplicate column
# name. errors="ignore" tolerates the column already being absent, where the
# original `del` raised KeyError.
data.drop(columns="genres", inplace=True, errors="ignore")
genres = pd.merge(genres, data, how="inner", on="movie_id")
# Mean rating per genre, ordered by genre name.
ratingCount = genres.loc[:, ["genres", "rating"]].groupby(by="genres").mean().sort_values(by="genres")
drawBar(ratingCount.loc[:, "rating"], "不同电影风格得分统计")
print("不同风格电影得分情况比较")
ratingCount

png

不同风格电影得分情况比较
rating
genres
Action 3.491185
Adventure 3.477257
Animation 3.684868
Children's 3.422035
Comedy 3.522099
Crime 3.708679
Documentary 3.933123
Drama 3.766332
Fantasy 3.447371
Film-Noir 4.075188
Horror 3.215013
Musical 3.665519
Mystery 3.668102
Romance 3.607465
Sci-Fi 3.466521
Thriller 3.570466
War 3.893327
Western 3.637770
  1. 不同性别用户偏爱的电影分析:这里分析了不同类型电影的男女观影人数比较,可以看出男性占据绝对的人数优势,在所有电影的观看人次上都超过了女性,其中差距最大的是Action类型电影,最小的是Documentary类型电影,当然这在一定程度上也和该电影的总体观影人数较少有关.
# Cross-tabulate genre against gender; each cell is the number of ratings.
crosstab = pd.crosstab(
    genres["genres"],
    genres["gender"],
    values=genres["rating"],
    aggfunc=count,
)
# Female minus male count per genre.
crosstab["dis"] = crosstab["F"] - crosstab["M"]
# Grouped bar chart of the F, M and difference columns.
crosstab.plot.bar(title="不同性别用户偏爱的电影类别分析", figsize=(10, 10))
plt.show()
print("不同性别用户偏爱的电影类别分析")
crosstab
c:\users\gcl\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\backends\backend_agg.py:238: RuntimeWarning: Glyph 8722 missing from current font.
  font.set_text(s, 0.0, flags=flags)
c:\users\gcl\appdata\local\programs\python\python37-32\lib\site-packages\matplotlib\backends\backend_agg.py:201: RuntimeWarning: Glyph 8722 missing from current font.
  font.set_text(s, 0, flags=flags)

png

不同性别用户偏爱的电影类别分析
gender F M dis
genres
Action 45650 211807 -166157
Adventure 27332 106621 -79289
Animation 12221 31072 -18851
Children's 21317 50869 -29552
Comedy 96271 260309 -164038
Crime 16442 63099 -46657
Documentary 1940 5970 -4030
Drama 98153 256376 -158223
Fantasy 8718 27583 -18865
Film-Noir 4202 14059 -9857
Horror 14635 61751 -47116
Musical 13505 28028 -14523
Mystery 9976 30202 -20226
Romance 50297 97226 -46929
Sci-Fi 27400 129894 -102494
Thriller 40308 149372 -109064
War 14093 54434 -40341
Western 3477 17206 -13729
  1. 不同年龄段用户偏爱的电影分析:各个年龄段的用户都喜欢看Action,Comedy,Drama;年龄为1的用户最喜欢看Comedy;25岁,35岁的青壮年用户对于Romance,Sci-Fi(科幻),Thriller也有不错的偏好.到45岁后用户的电影观看数量都比较少
# Cross-tabulate genre against age code; each cell is the number of ratings.
crosstab = pd.crosstab(
    genres["genres"],
    genres["age"],
    values=genres["rating"],
    aggfunc=count,
)
drawBar(crosstab, "不同年龄段用户偏爱的电影类别分析")
print("不同年龄段用户偏爱的电影分析")
crosstab

png

age 1 18 25 35 45 50 56
genres
Action 6578 50186 105678 50503 19357 17012 8143
Adventure 3998 26324 52633 26682 10738 9090 4488
Animation 2449 10269 16454 8117 2889 2032 1083
Children's 4337 16924 25743 14004 5400 3890 1888
Comedy 11162 69980 143210 69244 27890 23133 11961
Crime 1701 15373 33030 14895 6048 5520 2974
Documentary 130 1081 3489 1708 687 555 260
Drama 7483 58104 138695 71590 32141 29247 17269
Fantasy 1360 7875 14290 7006 2695 2127 948
Film-Noir 330 2280 6539 4175 1860 1870 1207
Horror 2211 15184 31235 15122 6192 4681 1761
Musical 1647 7555 14705 8746 3898 3093 1889
Mystery 920 6401 15160 8179 3851 3520 2147
Romance 3599 25656 58003 29330 13283 11373 6279
Sci-Fi 4178 29033 63156 32333 13040 10674 4880
Thriller 4824 35877 77429 36840 14933 13240 6537
War 1578 10874 24830 14514 6642 6314 3775
Western 335 2863 7053 4546 2133 2420 1333
  1. 不同性别用户偏爱的电影,随着年代的变化情况分析:
#通用函数
import pandas as pd;
import re
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib font setup: SimHei is selected so the Chinese plot titles render.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.size'] = '16'
# With SimHei active the Unicode minus sign (U+2212, glyph 8722) is reported
# as missing on negative tick labels; draw an ASCII hyphen-minus instead.
plt.rcParams['axes.unicode_minus'] = False

def drawBox(data, title, figsize=(10, 10)):
    """Draw a box plot of ``data`` and display it.

    Args:
        data: pandas Series or DataFrame (anything exposing ``.plot.box``).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.box(title=title, figsize=figsize)
    # A faint dashed grid makes the quartile positions easier to read.
    plt.grid(linestyle="--", alpha=0.3)
    plt.show()
    
def drawBar(data, title, figsize=(10, 10)):
    """Draw a stacked bar chart of ``data`` and display it.

    Args:
        data: pandas Series or DataFrame (anything exposing ``.plot.bar``).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.bar(stacked=True, title=title, figsize=figsize)
    plt.show()

def drawPie(data, title, figsize=(10, 10)):
    """Draw a pie chart of ``data`` and display it.

    Args:
        data: pandas object exposing ``.plot.pie`` (the callers in this file
            pass a Series of counts).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.pie(title=title, figsize=figsize)
    plt.show()
    
def drawScatter(data, x, y, figsize=(10, 10)):
    """Draw a scatter plot of two columns of ``data`` and display it.

    Args:
        data: pandas DataFrame holding both columns.
        x: Column used for the horizontal axis.
        y: Column used for the vertical axis.
        figsize: ``(width, height)`` of the figure; defaults to the 10x10
            size that used to be hard-coded.
    """
    data.plot.scatter(x=x, y=y, figsize=figsize)
    plt.show()

def drawLine(data, title, figsize=(20, 20)):
    """Draw a line chart of ``data`` and display it.

    Args:
        data: pandas Series or DataFrame (anything exposing ``.plot.line``).
        title: Title drawn above the plot.
        figsize: ``(width, height)`` of the figure; defaults to the 20x20
            size that used to be hard-coded in this definition.
    """
    data.plot.line(title=title, figsize=figsize)
    plt.show()
    
def count(data):
    """Return the number of items in ``data`` (its ``len``)."""
    size = len(data)
    return size

# Extract the release year from a movie title.
def getYear(data):
    """Return the text inside the last pair of parentheses in ``data``.

    For a title such as ``"Toy Story (1995)"`` this is the string ``"1995"``.
    Raises IndexError when ``data`` contains no parenthesized group.
    """
    parenthesized = re.findall(r'[(](.*?)[)]', data)
    return parenthesized[-1]
        
# sep: field separator, names: column names; "occupation" is the job code
# and "zip" the postal code.
rating = pd.read_table("../大作业2020/大作业题目1/movielens/ratings.dat", sep="::", names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
movies = pd.read_table("../大作业2020/大作业题目1/movielens/movies.dat", sep="::", names=["movie_id", "title", "genres"], engine='python')
# Derive the release year (kept as a string, e.g. "1995") from each title.
# The original called DataFrame.agg with a misspelled `axios=1` keyword;
# applying getYear element-wise to the title column states the intent directly
# and does not depend on how agg forwards unknown keywords.
movies["year"] = movies["title"].apply(getYear)
# Attach movie attributes, then user attributes, to every rating row.
data = pd.merge(movies, rating, how="inner", on="movie_id")
users = pd.read_table("../大作业2020/大作业题目1/movielens/users.dat", sep="::", names=["user_id", "gender", "age", "occupation", "zip"], engine='python')
data = pd.merge(data, users, how="inner", on="user_id")
rating

# Flatten the "|"-separated genre string of each movie into one
# [movie_id, genre] row per pair. Iterating the two columns directly avoids
# the per-cell `.loc[i, ...]` lookups of an index loop, which were slow and
# only worked while the frame's index was exactly 0..n-1.
genres = [
    [movie_id, genre]
    for movie_id, genre_list in zip(movies["movie_id"], movies["genres"])
    for genre in genre_list.split("|")
]

genres = pd.DataFrame(genres, columns=["movie_id", "genres"])
# Number of movies per genre, ascending.
genresCount = genres.groupby(by="genres").count().sort_values(by="movie_id")

# Replace the "|"-joined genres column of `data` with one row per
# (rating, genre) pair.
del data["genres"]
data = pd.merge(data, genres, how="inner", on="movie_id")
data.head(1000)
movie_id title year user_id rating timestamp gender age occupation zip genres
0 1 Toy Story (1995) 1995 1 5 978824268 F 1 10 48067 Animation
1 1 Toy Story (1995) 1995 1 5 978824268 F 1 10 48067 Children's
2 1 Toy Story (1995) 1995 1 5 978824268 F 1 10 48067 Comedy
3 1 Toy Story (1995) 1995 6 4 978237008 F 50 9 55117 Animation
4 1 Toy Story (1995) 1995 6 4 978237008 F 50 9 55117 Children's
... ... ... ... ... ... ... ... ... ... ... ...
995 1 Toy Story (1995) 1995 973 4 975860648 F 25 1 80026 Comedy
996 1 Toy Story (1995) 1995 977 3 975106685 M 25 2 80110 Animation
997 1 Toy Story (1995) 1995 977 3 975106685 M 25 2 80110 Children's
998 1 Toy Story (1995) 1995 977 3 975106685 M 25 2 80110 Comedy
999 1 Toy Story (1995) 1995 979 4 988055769 M 1 10 48073 Animation

1000 rows × 11 columns

import pandas as pd;
import re
import numpy as np
import matplotlib.pyplot as plt

# Count rows per genre (index) for every (gender, release year) pair. No
# `values` is given, so each remaining column of `data` is aggregated and the
# resulting columns are keyed by (column name, gender, year).
pivot_table = pd.pivot_table(data, index="genres", columns=["gender", "year"], aggfunc=count)
# Combinations without any rating come out as NaN; treat them as zero.
pivot_table.fillna(value=0, inplace=True)
# Pick the "age" sub-table for one gender and transpose it so that years
# become the rows (x axis) and genres the plotted lines. The titles now state
# 1919~2000, the range the data covers; the original said 1919~2020.
drawLine(pivot_table[("age", "M")].transpose(), "1919~2000年男性电影观看变化")
drawLine(pivot_table[("age", "F")].transpose(), "1919~2000年女性电影观看变化")
pivot_table.head(1000)

png

png

age ... zip
gender F ... M
year 1919 1920 1921 1922 1923 1925 1926 1927 1928 1929 ... 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000
genres
Action 2.0 0.0 11.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 5557.0 8286.0 10967.0 9739.0 13057.0 12812.0 16087.0 16399.0 12329.0 9797.0
Adventure 0.0 0.0 0.0 0.0 0.0 0.0 7.0 0.0 0.0 0.0 ... 2094.0 1346.0 4810.0 4366.0 6120.0 7453.0 7446.0 4133.0 4986.0 1848.0
Animation 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 824.0 1280.0 984.0 964.0 2694.0 1965.0 851.0 3004.0 3354.0 2159.0
Children's 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1029.0 1964.0 2208.0 2746.0 4539.0 2637.0 1924.0 3562.0 3231.0 1826.0
Comedy 24.0 4.0 0.0 0.0 1.0 76.0 1.0 33.0 3.0 0.0 ... 4970.0 11283.0 12658.0 16739.0 14666.0 14462.0 15431.0 17832.0 26541.0 11129.0
Crime 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 248.0 5468.0 2075.0 3737.0 6011.0 4659.0 6912.0 6971.0 2364.0 1509.0
Documentary 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 148.0 33.0 260.0 1048.0 297.0 693.0 437.0 415.0 664.0 334.0
Drama 2.0 0.0 0.0 0.0 1.0 51.0 16.0 13.0 0.0 0.0 ... 7834.0 9497.0 11740.0 13705.0 18379.0 15226.0 18115.0 16693.0 21320.0 9881.0
Fantasy 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 983.0 352.0 348.0 1816.0 888.0 1787.0 494.0 343.0 2039.0 110.0
Film-Noir 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 23.0 0.0 272.0 242.0 1787.0 932.0 0.0 0.0
Horror 0.0 0.0 0.0 51.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 309.0 2389.0 1495.0 1868.0 1809.0 2609.0 2646.0 2790.0 5189.0 1846.0
Musical 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15.0 ... 1201.0 1158.0 934.0 844.0 666.0 1277.0 742.0 660.0 324.0 147.0
Mystery 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1120.0 713.0 980.0 475.0 975.0 1831.0 6457.0 3965.0 1200.0 445.0
Romance 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0 0.0 0.0 ... 2454.0 4957.0 5728.0 9492.0 7489.0 7146.0 6180.0 8539.0 5597.0 1382.0
Sci-Fi 0.0 0.0 0.0 0.0 0.0 0.0 65.0 0.0 0.0 0.0 ... 4136.0 3479.0 4782.0 2849.0 5958.0 6270.0 10448.0 7026.0 8233.0 4608.0
Thriller 0.0 0.0 0.0 0.0 0.0 0.0 8.0 0.0 0.0 2.0 ... 5227.0 5824.0 5594.0 6460.0 10813.0 12358.0 16133.0 14574.0 16534.0 7839.0
War 0.0 0.0 0.0 0.0 0.0 50.0 0.0 12.0 0.0 0.0 ... 66.0 1696.0 2404.0 2531.0 3463.0 3478.0 2523.0 3348.0 1161.0 1021.0
Western 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 838.0 547.0 1808.0 538.0 238.0 0.0 20.0 735.0 0.0

18 rows × 1296 columns

# Male rating counts with years as rows and genres as columns.
male_counts = pivot_table[("age", "M")]
temp = male_counts.T
temp.head(50)
genres Action Adventure Animation Children's Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
year
1919 2.0 3.0 0.0 0.0 14.0 0.0 0.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1920 0.0 0.0 0.0 0.0 20.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1921 51.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1922 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 187.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1923 0.0 0.0 0.0 0.0 8.0 0.0 0.0 7.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1925 0.0 0.0 0.0 0.0 246.0 0.0 0.0 188.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 184.0 0.0
1926 0.0 7.0 0.0 0.0 9.0 1.0 0.0 23.0 0.0 0.0 0.0 0.0 0.0 0.0 323.0 13.0 0.0 0.0
1927 0.0 0.0 0.0 0.0 173.0 0.0 0.0 33.0 0.0 0.0 0.0 0.0 0.0 33.0 0.0 0.0 33.0 0.0
1928 0.0 0.0 0.0 0.0 24.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1929 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 17.0 0.0 0.0 0.0 9.0 0.0 0.0
1930 0.0 0.0 0.0 0.0 1.0 1.0 1.0 72.0 0.0 0.0 0.0 1.0 8.0 0.0 0.0 7.0 221.0 0.0
1931 0.0 0.0 0.0 0.0 216.0 248.0 0.0 198.0 0.0 248.0 527.0 0.0 0.0 198.0 0.0 248.0 0.0 22.0
1932 0.0 0.0 0.0 0.0 37.0 0.0 0.0 76.0 0.0 0.0 127.0 0.0 0.0 203.0 0.0 1.0 39.0 0.0
1933 649.0 649.0 0.0 0.0 474.0 0.0 0.0 3.0 0.0 0.0 822.0 1.0 0.0 0.0 188.0 0.0 473.0 0.0
1934 0.0 0.0 0.0 0.0 370.0 0.0 0.0 27.0 0.0 0.0 0.0 54.0 196.0 54.0 0.0 96.0 0.0 0.0
1935 0.0 176.0 0.0 0.0 133.0 0.0 0.0 0.0 0.0 0.0 172.0 133.0 0.0 133.0 0.0 195.0 0.0 0.0
1936 0.0 0.0 0.0 0.0 364.0 0.0 0.0 15.0 0.0 0.0 0.0 24.0 2.0 0.0 0.0 61.0 0.0 0.0
1937 0.0 0.0 512.0 512.0 109.0 8.0 0.0 315.0 0.0 0.0 0.0 621.0 0.0 107.0 0.0 8.0 124.0 0.0
1938 294.0 294.0 0.0 0.0 431.0 0.0 0.0 137.0 0.0 0.0 0.0 0.0 129.0 138.0 0.0 129.0 0.0 0.0
1939 0.0 1294.0 83.0 1320.0 131.0 9.0 0.0 2278.0 0.0 0.0 53.0 1211.0 0.0 801.0 0.0 0.0 715.0 0.0
1940 0.0 81.0 1150.0 1150.0 1137.0 0.0 0.0 432.0 0.0 0.0 37.0 802.0 0.0 678.0 9.0 283.0 0.0 34.0
1941 0.0 0.0 368.0 368.0 119.0 0.0 0.0 982.0 0.0 808.0 115.0 375.0 913.0 111.0 0.0 105.0 0.0 0.0
1942 93.0 0.0 395.0 395.0 462.0 0.0 0.0 1426.0 0.0 0.0 60.0 162.0 0.0 1164.0 0.0 71.0 1426.0 8.0
1943 0.0 83.0 12.0 12.0 21.0 0.0 0.0 81.0 0.0 162.0 105.0 0.0 9.0 0.0 0.0 237.0 85.0 20.0
1944 89.0 0.0 0.0 0.0 485.0 579.0 0.0 259.0 0.0 656.0 78.0 101.0 726.0 0.0 0.0 788.0 259.0 0.0
1945 0.0 0.0 94.0 94.0 56.0 0.0 0.0 234.0 0.0 0.0 82.0 150.0 132.0 196.0 0.0 132.0 26.0 0.0
1946 0.0 153.0 172.0 172.0 0.0 16.0 0.0 787.0 0.0 769.0 0.0 172.0 416.0 357.0 0.0 317.0 174.0 60.0
1947 0.0 9.0 27.0 27.0 58.0 25.0 0.0 426.0 0.0 118.0 0.0 27.0 0.0 98.0 0.0 0.0 0.0 43.0
1948 0.0 364.0 19.0 19.0 171.0 361.0 0.0 621.0 0.0 398.0 165.0 19.0 0.0 0.0 0.0 508.0 0.0 0.0
1949 0.0 111.0 0.0 114.0 0.0 1.0 0.0 217.0 0.0 0.0 0.0 136.0 377.0 1.0 0.0 377.0 0.0 0.0
1950 0.0 0.0 353.0 362.0 546.0 123.0 0.0 255.0 0.0 476.0 0.0 353.0 19.0 0.0 105.0 19.0 0.0 0.0
1951 853.0 853.0 338.0 338.0 0.0 0.0 0.0 915.0 0.0 347.0 0.0 534.0 0.0 1046.0 588.0 347.0 853.0 0.0
1952 4.0 65.0 0.0 0.0 219.0 1.0 0.0 94.0 0.0 0.0 0.0 454.0 0.0 613.0 60.0 0.0 0.0 340.0
1953 668.0 0.0 396.0 396.0 245.0 0.0 0.0 1074.0 396.0 15.0 0.0 434.0 0.0 462.0 585.0 50.0 1086.0 249.0
1954 784.0 479.0 0.0 478.0 201.0 404.0 0.0 956.0 478.0 0.0 182.0 169.0 1010.0 172.0 986.0 1250.0 240.0 0.0
1955 0.0 0.0 545.0 545.0 1154.0 0.0 0.0 1063.0 0.0 33.0 56.0 689.0 108.0 977.0 103.0 384.0 333.0 114.0
1956 0.0 235.0 0.0 0.0 217.0 112.0 0.0 269.0 0.0 183.0 556.0 217.0 0.0 0.0 1020.0 280.0 18.0 210.0
1957 0.0 0.0 0.0 210.0 257.0 0.0 1.0 2062.0 0.0 0.0 56.0 128.0 0.0 139.0 67.0 32.0 999.0 0.0
1958 319.0 318.0 0.0 0.0 289.0 405.0 0.0 554.0 214.0 405.0 1036.0 292.0 671.0 369.0 879.0 1076.0 369.0 53.0
1959 576.0 683.0 319.0 544.0 697.0 577.0 0.0 2047.0 107.0 0.0 286.0 319.0 150.0 0.0 133.0 983.0 100.0 0.0
1960 0.0 203.0 0.0 281.0 617.0 19.0 0.0 988.0 0.0 0.0 1472.0 30.0 0.0 0.0 177.0 1228.0 0.0 0.0
1961 453.0 146.0 378.0 909.0 544.0 0.0 0.0 1806.0 381.0 0.0 81.0 644.0 0.0 835.0 146.0 16.0 484.0 189.0
1962 1042.0 746.0 0.0 12.0 0.0 0.0 0.0 1306.0 0.0 859.0 157.0 53.0 0.0 0.0 158.0 1082.0 991.0 43.0
1963 670.0 917.0 207.0 320.0 721.0 0.0 0.0 721.0 0.0 0.0 889.0 0.0 185.0 185.0 1245.0 683.0 1740.0 211.0
1964 1236.0 0.0 0.0 657.0 1081.0 0.0 0.0 157.0 0.0 0.0 59.0 1466.0 0.0 336.0 0.0 221.0 0.0 477.0
1965 407.0 0.0 0.0 72.0 689.0 0.0 0.0 595.0 0.0 0.0 12.0 918.0 72.0 408.0 57.0 186.0 385.0 518.0
1966 723.0 292.0 0.0 0.0 30.0 0.0 120.0 601.0 0.0 0.0 0.0 0.0 190.0 83.0 292.0 42.0 0.0 749.0
1967 636.0 180.0 445.0 474.0 1703.0 515.0 0.0 2914.0 0.0 0.0 0.0 630.0 258.0 886.0 59.0 0.0 668.0 303.0
1968 973.0 317.0 428.0 187.0 840.0 289.0 0.0 1907.0 0.0 0.0 1112.0 912.0 1372.0 202.0 3254.0 2100.0 75.0 0.0
1969 1735.0 917.0 0.0 172.0 1512.0 11.0 0.0 1001.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 42.0 304.0 1858.0
# Female rating counts with years as rows and genres as columns.
female_counts = pivot_table[("age", "F")]
temp = female_counts.T
temp.head(50)
genres Action Adventure Animation Children's Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
year
1919 2.0 0.0 0.0 0.0 24.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1920 0.0 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1921 11.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1922 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 51.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1923 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1925 0.0 0.0 0.0 0.0 76.0 0.0 0.0 51.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 50.0 0.0
1926 0.0 7.0 0.0 0.0 1.0 1.0 0.0 16.0 0.0 0.0 0.0 0.0 0.0 0.0 65.0 8.0 0.0 0.0
1927 0.0 0.0 0.0 0.0 33.0 0.0 0.0 13.0 0.0 0.0 0.0 0.0 0.0 12.0 0.0 0.0 12.0 0.0
1928 0.0 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1929 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 15.0 0.0 0.0 0.0 2.0 0.0 0.0
1930 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30.0 0.0 0.0 0.0 0.0 3.0 0.0 0.0 3.0 47.0 0.0
1931 0.0 0.0 0.0 0.0 76.0 60.0 0.0 73.0 0.0 60.0 117.0 0.0 0.0 73.0 0.0 60.0 0.0 7.0
1932 0.0 0.0 0.0 0.0 15.0 0.0 0.0 25.0 0.0 0.0 35.0 0.0 0.0 67.0 0.0 0.0 17.0 0.0
1933 124.0 124.0 0.0 0.0 121.0 0.0 0.0 2.0 0.0 0.0 166.0 0.0 0.0 0.0 44.0 0.0 121.0 0.0
1934 0.0 0.0 0.0 0.0 196.0 0.0 0.0 31.0 0.0 0.0 0.0 51.0 96.0 51.0 0.0 32.0 0.0 0.0
1935 0.0 49.0 0.0 0.0 118.0 0.0 0.0 0.0 0.0 0.0 44.0 118.0 0.0 118.0 0.0 58.0 0.0 0.0
1936 0.0 0.0 0.0 0.0 129.0 0.0 0.0 9.0 0.0 0.0 0.0 12.0 0.0 0.0 0.0 12.0 0.0 0.0
1937 0.0 0.0 251.0 251.0 91.0 2.0 0.0 131.0 0.0 0.0 0.0 342.0 0.0 91.0 0.0 2.0 41.0 0.0
1938 84.0 84.0 0.0 0.0 258.0 0.0 0.0 73.0 0.0 0.0 0.0 0.0 70.0 77.0 0.0 70.0 0.0 0.0
1939 0.0 541.0 34.0 592.0 85.0 2.0 0.0 1108.0 0.0 0.0 12.0 507.0 0.0 487.0 0.0 0.0 441.0 0.0
1940 0.0 24.0 488.0 488.0 608.0 0.0 0.0 167.0 0.0 0.0 8.0 338.0 0.0 525.0 3.0 196.0 0.0 12.0
1941 0.0 0.0 200.0 200.0 66.0 0.0 0.0 363.0 0.0 235.0 19.0 200.0 297.0 74.0 0.0 62.0 0.0 0.0
1942 8.0 0.0 194.0 194.0 239.0 0.0 0.0 587.0 0.0 0.0 9.0 113.0 0.0 505.0 0.0 18.0 587.0 5.0
1943 0.0 15.0 3.0 3.0 7.0 0.0 0.0 37.0 0.0 71.0 19.0 0.0 4.0 0.0 0.0 108.0 15.0 3.0
1944 12.0 0.0 0.0 0.0 250.0 264.0 0.0 55.0 0.0 299.0 20.0 88.0 425.0 0.0 0.0 394.0 55.0 0.0
1945 0.0 0.0 32.0 32.0 47.0 0.0 0.0 138.0 0.0 0.0 21.0 79.0 74.0 110.0 0.0 74.0 3.0 0.0
1946 0.0 68.0 69.0 69.0 0.0 4.0 0.0 297.0 0.0 324.0 0.0 69.0 125.0 187.0 0.0 171.0 62.0 12.0
1947 0.0 3.0 8.0 8.0 7.0 11.0 0.0 236.0 0.0 41.0 0.0 8.0 0.0 85.0 0.0 0.0 0.0 8.0
1948 0.0 89.0 5.0 5.0 41.0 129.0 0.0 212.0 0.0 137.0 41.0 5.0 0.0 0.0 0.0 173.0 0.0 0.0
1949 0.0 33.0 0.0 38.0 0.0 1.0 0.0 64.0 0.0 0.0 0.0 87.0 103.0 0.0 0.0 103.0 0.0 0.0
1950 0.0 0.0 224.0 224.0 291.0 28.0 0.0 157.0 0.0 145.0 0.0 224.0 7.0 0.0 12.0 7.0 0.0 0.0
1951 309.0 309.0 187.0 187.0 0.0 0.0 0.0 306.0 0.0 135.0 0.0 335.0 0.0 488.0 106.0 135.0 309.0 0.0
1952 0.0 15.0 0.0 0.0 80.0 0.0 0.0 43.0 0.0 0.0 0.0 297.0 0.0 364.0 13.0 0.0 0.0 63.0
1953 116.0 0.0 198.0 198.0 251.0 0.0 0.0 313.0 198.0 3.0 0.0 231.0 0.0 327.0 75.0 19.0 237.0 56.0
1954 145.0 97.0 0.0 97.0 201.0 127.0 0.0 246.0 97.0 0.0 41.0 102.0 419.0 193.0 180.0 459.0 40.0 0.0
1955 0.0 0.0 319.0 319.0 574.0 0.0 0.0 401.0 0.0 5.0 6.0 435.0 39.0 585.0 12.0 206.0 88.0 16.0
1956 0.0 53.0 0.0 0.0 52.0 18.0 0.0 141.0 0.0 45.0 143.0 155.0 0.0 0.0 221.0 124.0 1.0 35.0
1957 0.0 0.0 0.0 91.0 205.0 0.0 2.0 519.0 0.0 0.0 7.0 118.0 0.0 166.0 8.0 6.0 169.0 0.0
1958 83.0 71.0 0.0 0.0 223.0 109.0 0.0 336.0 44.0 109.0 229.0 226.0 234.0 201.0 188.0 343.0 131.0 11.0
1959 128.0 179.0 192.0 281.0 295.0 255.0 0.0 636.0 51.0 0.0 72.0 192.0 49.0 0.0 36.0 332.0 7.0 0.0
1960 0.0 73.0 0.0 145.0 292.0 6.0 0.0 358.0 0.0 0.0 459.0 19.0 0.0 0.0 42.0 364.0 0.0 0.0
1961 49.0 34.0 187.0 456.0 172.0 0.0 0.0 741.0 161.0 0.0 26.0 400.0 0.0 602.0 34.0 7.0 62.0 26.0
1962 161.0 172.0 0.0 6.0 0.0 0.0 0.0 488.0 0.0 267.0 31.0 69.0 0.0 0.0 34.0 374.0 192.0 7.0
1963 101.0 223.0 86.0 140.0 303.0 0.0 0.0 232.0 0.0 0.0 286.0 0.0 121.0 121.0 258.0 295.0 323.0 68.0
1964 183.0 0.0 0.0 354.0 513.0 0.0 0.0 95.0 0.0 0.0 19.0 850.0 0.0 300.0 0.0 62.0 0.0 45.0
1965 56.0 0.0 0.0 51.0 260.0 0.0 0.0 234.0 0.0 0.0 3.0 419.0 51.0 189.0 7.0 54.0 179.0 87.0
1966 99.0 56.0 0.0 0.0 19.0 0.0 29.0 257.0 0.0 0.0 0.0 0.0 56.0 57.0 56.0 16.0 0.0 101.0
1967 71.0 81.0 219.0 235.0 652.0 171.0 0.0 1038.0 0.0 0.0 0.0 309.0 90.0 375.0 10.0 0.0 76.0 20.0
1968 189.0 69.0 192.0 91.0 264.0 65.0 0.0 576.0 0.0 0.0 329.0 358.0 344.0 154.0 725.0 608.0 1.0 0.0
1969 358.0 153.0 0.0 70.0 404.0 2.0 0.0 298.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 10.0 17.0 376.0
* 对比图表和相关的统计数据,可以看出男女性观看电影的**第一波高潮**都是在1939年的Drama,最一开始使人误以为是戏剧,但这里通过对比具体的电影可以发现应该是指剧情片.将1939年的剧情类电影进行统计,可以进一步看出最受关注的是Gone with the Wind(飘)和Wizard of Oz, The(绿野仙踪)
# Rating counts of the Drama titles released in 1939 (year is a string).
is_1939 = data["year"] == "1939"
is_drama = data["genres"] == "Drama"
tempData = data[is_1939 & is_drama]
dramaCount = tempData[["title", "movie_id"]].groupby(by=["title"]).count()
dramaCount
movie_id
title
Gone with the Wind (1939) 1156
Jamaica Inn (1939) 8
Little Princess, The (1939) 77
Mr. Smith Goes to Washington (1939) 383
Only Angels Have Wings (1939) 33
They Made Me a Criminal (1939) 11
Wizard of Oz, The (1939) 1718
* 80年代前期,Sci-Fi一度成为了最受欢迎的电影.但是到了中期,最受欢迎的电影类型是Comedy,对应的电影是Terminator, The(终结者) * 90年代中期,依然还是Drama类型的电影,最受欢迎的此类电影是Braveheart(勇敢的心) * 男性观影的最高峰是在1999年,对应的电影类型又成了Comedy,对应的电影是American Beauty(美国丽人)
# Male rating counts per year and genre; show the last 20 years.
male_counts = pivot_table[("age", "M")]
temp = male_counts.T
temp.tail(20)
genres Action Adventure Animation Children's Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
year
1981 6489.0 4396.0 835.0 558.0 2086.0 534.0 43.0 2973.0 1431.0 0.0 1435.0 97.0 172.0 1151.0 3059.0 1495.0 909.0 0.0
1982 3993.0 2635.0 293.0 2529.0 3118.0 285.0 42.0 4758.0 3072.0 1488.0 2292.0 776.0 0.0 456.0 6707.0 1550.0 323.0 0.0
1983 2944.0 2661.0 0.0 158.0 5111.0 0.0 138.0 3870.0 0.0 0.0 1489.0 0.0 0.0 2552.0 2661.0 737.0 2368.0 0.0
1984 5590.0 4834.0 0.0 752.0 8039.0 512.0 182.0 6486.0 2125.0 514.0 3385.0 868.0 443.0 2746.0 6614.0 2747.0 1352.0 0.0
1985 3033.0 3218.0 147.0 1216.0 9588.0 0.0 0.0 5084.0 1623.0 0.0 1317.0 0.0 775.0 3926.0 4736.0 1066.0 755.0 289.0
1986 7279.0 5710.0 571.0 1889.0 9993.0 739.0 0.0 7372.0 518.0 0.0 2681.0 674.0 930.0 2470.0 4456.0 3195.0 3219.0 470.0
1987 9240.0 3650.0 0.0 41.0 9542.0 3459.0 0.0 6717.0 30.0 352.0 1817.0 396.0 484.0 3121.0 4120.0 2568.0 2435.0 0.0
1988 4468.0 3713.0 1884.0 96.0 8866.0 1098.0 357.0 4010.0 2950.0 1381.0 1126.0 105.0 0.0 1210.0 1278.0 2433.0 483.0 484.0
1989 6877.0 5392.0 800.0 1618.0 9875.0 2435.0 642.0 8763.0 791.0 0.0 1608.0 661.0 0.0 2547.0 4105.0 2251.0 1212.0 0.0
1990 9797.0 3362.0 364.0 1186.0 6294.0 3882.0 93.0 7150.0 557.0 661.0 2353.0 0.0 533.0 3316.0 4819.0 8600.0 0.0 2342.0
1991 5557.0 2094.0 824.0 1029.0 4970.0 248.0 148.0 7834.0 983.0 0.0 309.0 1201.0 1120.0 2454.0 4136.0 5227.0 66.0 0.0
1992 8286.0 1346.0 1280.0 1964.0 11283.0 5468.0 33.0 9497.0 352.0 0.0 2389.0 1158.0 713.0 4957.0 3479.0 5824.0 1696.0 838.0
1993 10967.0 4810.0 984.0 2208.0 12658.0 2075.0 260.0 11740.0 348.0 23.0 1495.0 934.0 980.0 5728.0 4782.0 5594.0 2404.0 547.0
1994 9739.0 4366.0 964.0 2746.0 16739.0 3737.0 1048.0 13705.0 1816.0 0.0 1868.0 844.0 475.0 9492.0 2849.0 6460.0 2531.0 1808.0
1995 13057.0 6120.0 2694.0 4539.0 14666.0 6011.0 297.0 18379.0 888.0 272.0 1809.0 666.0 975.0 7489.0 5958.0 10813.0 3463.0 538.0
1996 12812.0 7453.0 1965.0 2637.0 14462.0 4659.0 693.0 15226.0 1787.0 242.0 2609.0 1277.0 1831.0 7146.0 6270.0 12358.0 3478.0 238.0
1997 16087.0 7446.0 851.0 1924.0 15431.0 6912.0 437.0 18115.0 494.0 1787.0 2646.0 742.0 6457.0 6180.0 10448.0 16133.0 2523.0 0.0
1998 16399.0 4133.0 3004.0 3562.0 17832.0 6971.0 415.0 16693.0 343.0 932.0 2790.0 660.0 3965.0 8539.0 7026.0 14574.0 3348.0 20.0
1999 12329.0 4986.0 3354.0 3231.0 26541.0 2364.0 664.0 21320.0 2039.0 0.0 5189.0 324.0 1200.0 5597.0 8233.0 16534.0 1161.0 735.0
2000 9797.0 1848.0 2159.0 1826.0 11129.0 1509.0 334.0 9881.0 110.0 0.0 1846.0 147.0 445.0 1382.0 4608.0 7839.0 1021.0 0.0
# Rating counts by male users for the 1995 Drama titles, ascending.
is_1995 = data["year"] == "1995"
is_drama = data["genres"] == "Drama"
is_male = data["gender"] == "M"
tempData = data[is_1995 & is_drama & is_male]
per_title = tempData[["title", "movie_id"]].groupby(by=["title"]).count()
dramaCount = per_title.sort_values(by="movie_id")
dramaCount
movie_id
title
Diebinnen (1995) 1
To Have, or Not (1995) 1
Sleepover (1995) 1
Boy Called Hate, A (1995) 1
Billy's Holiday (1995) 1
... ...
Apollo 13 (1995) 923
Get Shorty (1995) 1070
Babe (1995) 1172
Twelve Monkeys (1995) 1233
Braveheart (1995) 1897

135 rows × 1 columns

  • 与男性不同,80年代中期的终结者并没有吸引太多的女性用户,同期的Drama更为卖座
  • 女性在2000年前的一个观影小高峰在1995,还是Drama类型的电影,最受关注的该类型电影是Braveheart(勇敢的心)和Babe(小猪宝贝)
  • 女性电影观看的最高峰是在1999年,同样是Comedy类型的电影,最受关注的电影是
# Female rating counts per year and genre; show the last 20 years.
female_counts = pivot_table[("age", "F")]
temp = female_counts.T
temp.tail(20)
genres Action Adventure Animation Children's Comedy Crime Documentary Drama Fantasy Film-Noir Horror Musical Mystery Romance Sci-Fi Thriller War Western
year
1981 1376.0 983.0 182.0 232.0 573.0 140.0 19.0 853.0 321.0 0.0 253.0 15.0 85.0 382.0 487.0 348.0 173.0 0.0
1982 621.0 453.0 84.0 819.0 1010.0 82.0 4.0 1653.0 877.0 312.0 446.0 348.0 0.0 238.0 1536.0 353.0 64.0 0.0
1983 789.0 733.0 0.0 55.0 1686.0 0.0 44.0 1486.0 0.0 0.0 327.0 0.0 0.0 822.0 733.0 164.0 697.0 0.0
1984 1254.0 1260.0 0.0 288.0 2468.0 67.0 57.0 1981.0 708.0 114.0 973.0 250.0 55.0 978.0 1296.0 565.0 289.0 0.0
1985 550.0 934.0 53.0 440.0 3187.0 0.0 0.0 2113.0 518.0 0.0 199.0 0.0 323.0 1801.0 1249.0 425.0 107.0 23.0
1986 1508.0 1485.0 195.0 567.0 3351.0 141.0 0.0 2783.0 212.0 0.0 684.0 267.0 275.0 1224.0 871.0 684.0 546.0 99.0
1987 1959.0 1019.0 0.0 22.0 3189.0 732.0 0.0 2007.0 22.0 96.0 294.0 291.0 149.0 1463.0 651.0 634.0 602.0 0.0
1988 864.0 941.0 511.0 48.0 2849.0 171.0 89.0 1785.0 948.0 418.0 223.0 25.0 0.0 611.0 210.0 561.0 47.0 78.0
1989 1548.0 1333.0 450.0 721.0 3416.0 611.0 183.0 3124.0 254.0 0.0 322.0 374.0 0.0 1329.0 987.0 482.0 326.0 0.0
1990 2076.0 878.0 149.0 439.0 2088.0 950.0 48.0 2542.0 154.0 224.0 562.0 0.0 134.0 1656.0 910.0 2156.0 0.0 626.0
1991 1366.0 448.0 437.0 483.0 1926.0 98.0 61.0 3298.0 307.0 0.0 53.0 539.0 441.0 1245.0 846.0 1584.0 25.0 0.0
1992 1693.0 425.0 566.0 847.0 4154.0 1339.0 12.0 3788.0 88.0 0.0 574.0 548.0 201.0 2461.0 712.0 1341.0 633.0 159.0
1993 2336.0 1187.0 322.0 870.0 4804.0 344.0 91.0 4896.0 125.0 9.0 273.0 345.0 404.0 2984.0 1017.0 1671.0 796.0 93.0
1994 2191.0 1138.0 420.0 1179.0 6433.0 1191.0 318.0 6098.0 715.0 0.0 507.0 357.0 222.0 4437.0 525.0 1922.0 979.0 512.0
1995 2685.0 1474.0 1061.0 2087.0 6578.0 1503.0 159.0 8011.0 323.0 107.0 333.0 316.0 282.0 4783.0 1177.0 2680.0 961.0 75.0
1996 2639.0 1908.0 710.0 1222.0 5680.0 1328.0 204.0 6962.0 499.0 48.0 605.0 656.0 550.0 4118.0 1247.0 3274.0 1005.0 18.0
1997 3613.0 1818.0 366.0 883.0 6021.0 2068.0 144.0 7172.0 206.0 576.0 652.0 420.0 1886.0 3095.0 2248.0 4418.0 556.0 0.0
1998 3705.0 1071.0 1122.0 1462.0 7079.0 1752.0 165.0 7037.0 89.0 167.0 623.0 221.0 1158.0 4748.0 1344.0 3297.0 776.0 6.0
1999 3039.0 1368.0 1136.0 1269.0 10328.0 602.0 221.0 8282.0 626.0 0.0 1411.0 129.0 535.0 3064.0 2074.0 4885.0 275.0 167.0
2000 2327.0 462.0 696.0 659.0 4386.0 411.0 88.0 3781.0 25.0 0.0 519.0 53.0 132.0 754.0 997.0 2247.0 288.0 0.0
# Rating counts by female users for the 1995 Drama titles, ascending.
is_1995 = data["year"] == "1995"
is_drama = data["genres"] == "Drama"
is_female = data["gender"] == "F"
tempData = data[is_1995 & is_drama & is_female]
per_title = tempData[["title", "movie_id"]].groupby(by=["title"]).count()
dramaCount = per_title.sort_values(by="movie_id")
dramaCount
movie_id
title
Killer: A Journal of Murder (1995) 1
Confessional, The (Le Confessionnal) (1995) 1
Fall Time (1995) 1
Midaq Alley (Callej髇 de los milagros, El) (1995) 1
Neon Bible, The (1995) 1
... ...
Apollo 13 (1995) 328
American President, The (1995) 379
Sense and Sensibility (1995) 420
Braveheart (1995) 546
Babe (1995) 579

119 rows × 1 columns

综上:可以看出在电影历史上,男女生对电影的品味有着较高的相似度——Comedy和Drama长期占据着票房的头把交椅,但是对于科幻电影,男生显然更为喜欢一点.

posted @ 2021-01-18 10:10  Arno_vc  阅读(886)  评论(0编辑  收藏  举报