数据预处理--数据选择
筛选空值
# pandas marks a missing value as NaN ("not a number"). Series.isnull()
# returns a boolean Series of the same length: True where the value is
# missing, False everywhere else.
age = titanic_survival["Age"]

# Boolean mask over the "Age" column (True = value is NaN).
age_is_null = age.isnull()

# Selecting with the mask keeps only the rows whose "Age" is NaN.
age_null_true = age.loc[age_is_null]

# How many rows have no "Age" value.
age_null_count = len(age_null_true)
print(age_null_count)
求均值
# Method 1: filter out the missing values by hand before averaging.
# NOTE: relies on `age_is_null`, the boolean null-mask of the "Age" column
# computed in the null-filtering snippet earlier in this file.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# Method 2: missing data is so common that many pandas methods filter it out
# automatically; Series.mean() skips NaN by default.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)

# Wrong approach (kept on purpose as a counter-example): any arithmetic that
# touches a NaN also yields NaN, so when the column contains missing values
# this plain sum()/len() average comes out as nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
数据透视表
# Pivot table indexed by "Embarked": for each distinct value, the "Fare" and
# "Survived" columns are summed.
# The aggregation is given by name ("sum") rather than as np.sum: recent
# pandas releases emit a FutureWarning for the NumPy callable and recommend
# the string form, which produces the same result on older versions as well.
port_stats = titanic_survival.pivot_table(
    index="Embarked",
    values=["Fare", "Survived"],
    aggfunc="sum",
)
排序
# Rows ordered by "Age" from largest to smallest. sort_values() returns a new
# DataFrame here (inplace is not requested), so titanic_survival is unchanged.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
列名处理
# All column labels of food_info as a plain Python list.
col_names = food_info.columns.tolist()

# Keep only the labels that end with "(g)".
gram_columns = [name for name in col_names if name.endswith("(g)")]

# Sub-frame restricted to those columns; show its first three rows.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
按列类型过滤
# Categorical features: every column of `train` stored with dtype "object".
cat_features = list(train.select_dtypes(include=['object']).columns)
print("Categorical: {} features".format(len(cat_features)))

# Continuous features: float64 / int64 columns, leaving out 'loss' and 'id'.
numeric_columns = train.select_dtypes(include=['float64', 'int64']).columns
cont_features = [cont for cont in numeric_columns if cont not in ('loss', 'id')]
print("Continuous: {} features".format(len(cont_features)))
查看类型变量类别个数
# Number of distinct values found in each categorical column of `train`
# (Series.unique() keeps NaN, so a missing value counts as one category).
cat_uniques = [len(train[cat].unique()) for cat in cat_features]

# Two-column summary: feature name next to its number of distinct values.
# DataFrame.from_items() was removed in pandas 1.0, so the frame is built with
# the regular constructor; `columns=` pins the column order explicitly.
uniq_values_in_categories = pd.DataFrame(
    {'cat_name': cat_features, 'unique_values': cat_uniques},
    columns=['cat_name', 'unique_values'],
)
类型转换
# Cast the 'id' column to 64-bit integers and assign the converted column
# back onto the same DataFrame.
data['id'] = data['id'].astype('int64')
数据筛选
# Rows that are in df1 but not in df2: keep the df1 rows whose cust_no does
# not occur anywhere in df2's cust_no column.
df1 = df1[~df1['cust_no'].isin(df2['cust_no'])]

# Convert the date column to datetimes.
# The original format string '%Y%M%D' was wrong: %M means minutes and %D is
# not a valid strptime directive, so parsing failed. '%Y%m%d' is year, month,
# day.
# NOTE(review): assumes the values look like 20240131 (YYYYMMDD) - confirm
# against the actual data.
data['date'] = pd.to_datetime(data['date'].astype(str), format='%Y%m%d')
多个dataframe合并处理
# reduce() is a builtin only on Python 2; importing it from functools works on
# both Python 2.6+ and Python 3.
from functools import reduce

# Load the three CSV files (UTF-8 encoded) that are to be combined.
input1 = pd.read_csv(path + folder + "01.csv", encoding="utf-8")
input2 = pd.read_csv(path + folder + "02.csv", encoding="utf-8")
input3 = pd.read_csv(path + folder + "03.csv", encoding="utf-8")
inputs = [input1, input2, input3]

# Fold the frames together left to right with inner joins on "cust_no", so
# only cust_no values present in every file remain in the result.
df_all = reduce(
    lambda left, right: pd.merge(left, right, on="cust_no", how="inner"),
    inputs,
)
多个列合并
# One-hot encode "credit_type"; drop_first=True omits the indicator column of
# the first category level.
credit_type = pd.get_dummies(
    data["credit_type"],
    prefix="credit_type",
    drop_first=True,
)

# One-hot encode "tran_branch", keeping an indicator column for every level.
tran_branch = pd.get_dummies(data["tran_branch"], prefix="branch")

# Append both sets of indicator columns side by side to the original frame.
data = pd.concat((data, credit_type, tran_branch), axis="columns")