import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# 读取用户表
users = pd.read_table('./users.dat', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], sep='::',
engine='python')
# print(users.head())
# 读取评分表
ratings = pd.read_table('./ratings.dat', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], sep='::',
engine='python')
# print(ratings.head())
# 读取电影详情表
movies = pd.read_table('./movies.dat', header=None, names=['MovieID', 'Title', 'Genres'], sep='::', engine='python')
# print(movies.head())
# 将表进行合并
data = pd.merge(pd.merge(ratings, users), movies)
print(data.head())
# 使用pivot_table方法查看,每一部电影不同性别的平均评分
data_gender = pd.pivot_table(data, index='Title', columns='Gender', values='Rating', margins=True)
# data_gender = data.pivot_table(index='Title', columns='Gender', values='Rating')
print(data_gender.head())
# 使用groupby方法
data_gender = data.groupby(['Title', 'Gender']).agg({'Rating': 'mean'})
print(data_gender.head())
# 使用crosstab方法查看每一部电影不同性别的平均评分
data_gender = pd.crosstab(data.Title, data.Gender, data.Rating, aggfunc='mean')
print(data_gender.head())
输出结果:
UserID MovieID ... Title Genres
0 1 1193 ... One Flew Over the Cuckoo's Nest (1975) Drama
1 2 1193 ... One Flew Over the Cuckoo's Nest (1975) Drama
2 12 1193 ... One Flew Over the Cuckoo's Nest (1975) Drama
3 15 1193 ... One Flew Over the Cuckoo's Nest (1975) Drama
4 17 1193 ... One Flew Over the Cuckoo's Nest (1975) Drama
[5 rows x 10 columns]
Gender F M All
Title
$1,000,000 Duck (1971) 3.375000 2.761905 3.027027
'Night Mother (1986) 3.388889 3.352941 3.371429
'Til There Was You (1997) 2.675676 2.733333 2.692308
'burbs, The (1989) 2.793478 2.962085 2.910891
...And Justice for All (1979) 3.828571 3.689024 3.713568
Rating
Title Gender
$1,000,000 Duck (1971) F 3.375000
M 2.761905
'Night Mother (1986) F 3.388889
M 3.352941
'Til There Was You (1997) F 2.675676
Gender F M
Title
$1,000,000 Duck (1971) 3.375000 2.761905
'Night Mother (1986) 3.388889 3.352941
'Til There Was You (1997) 2.675676 2.733333
'burbs, The (1989) 2.793478 2.962085
...And Justice for All (1979) 3.828571 3.689024
import pandas as pd
data = pd.DataFrame({'Sample': range(1, 11),
'Gender': ['Female', 'Male', 'Female', 'Male', 'Male', 'Male', 'Female', 'Female', 'Male',
'Female'],
'Handedness': ['Right-handed', 'Left-handed', 'Right-handed', 'Right-handed', 'Left-handed',
'Right-handed', 'Right-handed', 'Left-handed', 'Right-handed', 'Right-handed']})
print(data)
# 方法1 :使用pivot_table
data1 = pd.pivot_table(data, index='Gender', columns='Handedness', aggfunc=len, margins=True)
print(data1)
# 方法2:使用crosstab
data2 = pd.crosstab(data.Gender, data.Handedness, data.Sample, aggfunc=len, margins=True)
print(data2)
输出结果:
Sample Gender Handedness
0 1 Female Right-handed
1 2 Male Left-handed
2 3 Female Right-handed
3 4 Male Right-handed
4 5 Male Left-handed
5 6 Male Right-handed
6 7 Female Right-handed
7 8 Female Left-handed
8 9 Male Right-handed
9 10 Female Right-handed
Sample
Handedness Left-handed Right-handed All
Gender
Female 1 4 5
Male 2 3 5
All 3 7 10
Handedness Left-handed Right-handed All
Gender
Female 1 4 5
Male 2 3 5
All 3 7 10