ml pandas(数据分析处理库)学习笔记
pandas
数据读取
import pandas

# Load the food nutrition table into a DataFrame.
csv_info = pandas.read_csv('food_info.csv')

print(type(csv_info))    # <class 'pandas.core.frame.DataFrame'>
print(csv_info.dtypes)   # dtype of every column; string columns are reported as object
print(csv_info.head())   # first rows, rendered as a table
print(csv_info.head(3))  # first 3 rows only
print(csv_info.tail(3))  # last 3 rows only
print(csv_info.columns)  # the column names (a pandas Index)
print(csv_info.shape)    # (rows, columns); the original note reports (8618, 36)
索引,提取所需数据
import pandas

csv_info = pandas.read_csv('food_info.csv')

# A single label returns that row as a Series (all attributes of the first food).
print(csv_info.loc[0])
# .loc slices are label-based and include both endpoints: rows 3, 4, 5 and 6.
print(csv_info.loc[3:6])
# Picking several specific rows requires a *list* of labels. Writing
# loc[1, 3, 6] is interpreted as a multi-axis key and raises an error.
print(csv_info.loc[[1, 3, 6]])

# Passing a list of column names selects those columns (index is kept).
columns = ['Lipid_Tot_(g)', 'Fiber_TD_(g)']
print(csv_info[columns])

# Names of all columns measured in grams, i.e. ending with "(g)".
a = [name for name in csv_info.columns.tolist() if name.endswith("(g)")]
# DataFrame restricted to the gram-unit columns.
b = csv_info[a]
进行加减乘除运算
import pandas

csv_info = pandas.read_csv('food_info.csv')

# Arithmetic between a column and a scalar is applied to every element.
div_1000 = csv_info["Iron_(mg)"] / 1000  # unit conversion: mg -> g
add_100 = csv_info["Iron_(mg)"] + 100
sub_100 = csv_info["Iron_(mg)"] - 100
mult_2 = csv_info["Iron_(mg)"] * 2
添加一列的属性:(这个属性由已知属性计算得出)
import pandas

csv_info = pandas.read_csv('food_info.csv')

# A new attribute computed as the element-wise product of two existing columns.
water_energy = csv_info["Water_(g)"] * csv_info["Energ_Kcal"]

# Convert iron from mg to g ...
iron_grams = csv_info["Iron_(mg)"] / 1000
# ... and attach the result to the table as a new column.
csv_info["Iron_(g)"] = iron_grams
找出某一列的最大值
import pandas

csv_info = pandas.read_csv('food_info.csv')

# Largest value found in the total-lipid column.
c = csv_info['Lipid_Tot_(g)'].max()
print(c)
对某一属性进行升序或者降序排序
import pandas

csv_info = pandas.read_csv('food_info.csv')

# Ascending order is the default. inplace=True reorders csv_info itself
# rather than returning a new, sorted DataFrame.
csv_info.sort_values("Sodium_(mg)", inplace=True)

# ascending=False turns the ordering into descending.
csv_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
数据预处理经典案例:泰坦尼克号登船人员信息
q:年龄缺失的成员有多少人?
import pandas as pd
import numpy as np

# Load the Titanic passenger table.
survival = pd.read_csv('titanic_train.csv')

age = survival['Age']
# Boolean mask: True where the age is missing, False where it is present.
age_null = age.isnull()
# Boolean indexing keeps only the entries whose age is missing.
age_nulltrue = age[age_null]
# Its length is the number of passengers without a recorded age.
print(len(age_nulltrue))
q:为什么要提取缺失成员并去掉?
a:如果某一列含有缺失值(nan),直接对这一列做运算得到的结果也是nan。例如用 mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"]) 求平均年龄时,只要其中有一个nan,结果就是nan,所以要先把缺失值筛掉。
q:如何筛选出不是nan的值?
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

age = survival['Age']
# Boolean mask: True where the age is missing, False where it is present.
age_null = age.isnull()
# Invert the mask with ~ (instead of comparing to False) to keep only the
# entries that do have an age.
good_age = survival['Age'][~age_null]
print(good_age)
如果想求平均值,还有一个方法(内置方法,忽略nan值):
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# Series.mean() skips NaN entries, so no manual filtering is needed.
age = survival['Age'].mean()
print(age)
q:对一二三等舱求对应的平均价格怎么求:(*****)
正常思路:
import pandas as pd

# Load the data here so the snippet runs on its own, like the other snippets.
titanic_survival = pd.read_csv('titanic_train.csv')

passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to the current passenger class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    # Mean fare paid within that class.
    fares_by_class[this_class] = pclass_rows["Fare"].mean()

# Mapping of passenger class -> average fare.
print(fares_by_class)
pandas方法:
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# Group the rows by Pclass and average Fare within each group.
# The aggregation is given by name ("mean"); recent pandas versions
# deprecate passing np.mean here.
passenger_survival = survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
print(passenger_survival)
各个舱位的获救概率也可以利用此方法:
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# Averaging the Survived column per Pclass gives the mean of that column
# for each passenger class. The aggregation is given by name ("mean");
# recent pandas versions deprecate passing np.mean here.
passenger_survival = survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
三个码头与获救人数和价格的关系(此时求的是和,不是均值)
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# Group by port of embarkation and *sum* (not average) both Fare and Survived.
# The aggregation is given by name ("sum"); recent pandas versions
# deprecate passing np.sum here.
passenger_survival = survival.pivot_table(
    index="Embarked",
    values=["Fare", "Survived"],
    aggfunc="sum",
)
print(passenger_survival)
在pivot_table中不写aggfunc= 默认输出平均值
删除一些有nan的行,让数据都是可处理的
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# axis=1: drop every column that contains at least one missing value.
drop_na_columns = survival.dropna(axis=1)

# axis=0 with subset: drop the rows whose Age or Sex is missing.
new_titanic_survival = survival.dropna(axis=0, subset=["Age", "Sex"])
print(new_titanic_survival)
通过索引的方法找到具体需要的数据
import pandas as pd

# Load the data here so the snippet runs on its own, like the other snippets.
titanic_survival = pd.read_csv('titanic_train.csv')

# .loc[row_label, column_name] returns a single cell.
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE(review): the variable name says 1000 but the row label used is 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
如果希望排序之后index(索引)也按新的顺序重新从0开始编号,可以使用reset_index(drop=True)
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# Sorting reorders the rows but every row keeps its original index label.
new_titanic_survival = survival.sort_values("Age", ascending=False)
print(new_titanic_survival[0:10])

# reset_index(drop=True) renumbers the sorted rows from 0 and discards the
# old labels instead of keeping them as an extra column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
# Print the reindexed frame (not the original, unsorted one) to see the effect.
print(titanic_reindexed.iloc[0:10])
自定义函数(将自己编好的函数在对象中实现调用)
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')


def a(column):
    """Placeholder for a user-defined function passed to DataFrame.apply.

    apply() calls the function with one column (a Series) at a time by
    default, so the function must accept that argument.
    """
    pass


b = survival.apply(a)
import pandas as pd

# Load the data here so the snippet runs on its own, like the other snippets.
titanic_survival = pd.read_csv('titanic_train.csv')


def not_null_count(column):
    """Return the number of missing (null) entries in ``column``.

    Despite its name, this counts the null values, not the non-null ones.
    """
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)


# apply() runs the function once per column by default.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
import pandas as pd

# Load the data here so the snippet runs on its own, like the other snippets.
titanic_survival = pd.read_csv('titanic_train.csv')


def which_class(row):
    """Map a row's ``Pclass`` value to a readable class name.

    Returns "Unknown" when Pclass is missing. Any value other than
    1, 2 or 3 falls through every branch and yields None.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"


# axis=1 makes apply() call the function once per row instead of per column.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
import pandas as pd

# Load the data here so the snippet runs on its own, like the other snippets.
titanic_survival = pd.read_csv('titanic_train.csv')


def generate_age_label(row):
    """Label a row as "unknown", "minor" (age < 18) or "adult" from its ``Age``."""
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"


# axis=1 makes apply() call the function once per row.
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)

# Store the labels as a new column, then average Survived per label
# (pivot_table aggregates with the mean when aggfunc is not given).
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
dataframe 是由许多的series组成的,series也就是数据的其中一行或者其中一列
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

# Selecting a single column of a DataFrame yields a Series.
series = survival['Name']
print(type(series))  # <class 'pandas.core.series.Series'>
import pandas as pd
import numpy as np

survival = pd.read_csv('titanic_train.csv')

series = survival['Name']
# .values exposes the data held by the Series as a NumPy array.
a = series.values
print(type(a))  # <class 'numpy.ndarray'>
会发现Series底层的数据就是numpy的ndarray,也就是说pandas其实是在numpy的基础上封装的
用匿名函数求标准差
# NOTE(review): float_df and np are not defined in this snippet; they are
# presumably set up earlier in the original notebook -- confirm before running.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
# axis=1 applies the lambda to each row, i.e. the standard deviation of the
# two selected columns for every row.
rt_mt_user.apply(lambda row: np.std(row), axis=1)

浙公网安备 33010602011771号