数据预处理--数据选择

筛选空值

# pandas marks a missing value with NaN ("not a number").
# Series.isnull() returns a boolean Series: True where the value is missing.
age = titanic_survival["Age"]

# Boolean mask with one True / False entry per row.
age_is_null = age.isnull()

# Keep only the entries whose age is missing (every kept value is NaN).
age_null_true = age.loc[age_is_null]

# Number of rows with a missing age.
age_null_count = age_null_true.shape[0]
print(age_null_count)

 

求均值

# Method 1: filter out the missing values by hand before averaging.
# `age_is_null` is the boolean mask of missing ages; `~` inverts it.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# Method 2: missing data is so common that many pandas methods, including
# Series.mean(), skip it automatically.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)

# Incorrect approach, kept as a counter-example: any arithmetic involving a
# null value yields a null value, so the built-in sum() over a column that
# contains NaN makes mean_age NaN as well.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

数据透视表

# Total Fare and Survived per value of Embarked.
# The aggregation is named by the string "sum" rather than by passing np.sum:
# recent pandas releases deprecate passing the NumPy callable here, and the
# string selects the same built-in group-wise sum.
port_stats = titanic_survival.pivot_table(
    index="Embarked",
    values=["Fare", "Survived"],
    aggfunc="sum",
)

排序

# Sort the rows by Age, largest first. sort_values returns a new DataFrame
# here, so titanic_survival itself keeps its original order.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)

列名处理

# All column labels of food_info as a plain Python list.
col_names = list(food_info.columns)

# Keep only the columns whose label ends with the "(g)" suffix.
gram_columns = [label for label in col_names if label.endswith("(g)")]

# Sub-frame holding just those columns; show its first three rows.
gram_df = food_info[gram_columns]
print(gram_df.head(3))

按列类型过滤

# Categorical features: every column stored with the object dtype.
cat_features = list(train.select_dtypes(include=['object']).columns)
print("Categorical: {} features".format(len(cat_features)))

# Continuous features: every float64 / int64 column except 'loss' and 'id'
# (presumably the target and the row identifier -- confirm against the data).
numeric_columns = train.select_dtypes(include=['float64', 'int64']).columns
cont_features = [cont for cont in numeric_columns if cont not in ('loss', 'id')]
print("Continuous: {} features".format(len(cont_features)))

查看类型变量类别个数

# Number of distinct values in each categorical column, in the same order as
# cat_features. Series.unique() reports NaN as a value, so it is counted too.
cat_uniques = [len(train[cat].unique()) for cat in cat_features]

# DataFrame.from_items was removed in pandas 1.0; build the frame from a dict
# instead and pass `columns` explicitly so the column order is fixed.
uniq_values_in_categories = pd.DataFrame(
    {'cat_name': cat_features, 'unique_values': cat_uniques},
    columns=['cat_name', 'unique_values'],
)

 

类型转换

# Cast the id column to 64-bit integers and store it back in place.
data["id"] = data["id"].astype("int64")

数据筛选

# Keep the df1 rows whose cust_no does not occur in df2 (in df1, not in df2).
cust_nos_in_df2 = df2['cust_no'].tolist()
df1 = df1.loc[~df1['cust_no'].isin(cust_nos_in_df2)]

# Convert the date column to datetime values: cast each entry to str, then
# parse it. Format codes: %Y = four-digit year, %m = month, %d = day of month.
# The earlier pattern '%Y%M%D' could not work: %M means minutes and %D is not
# a strptime directive, so parsing raised ValueError.
# NOTE(review): assumes the values look like YYYYMMDD (e.g. 20181025) -- confirm.
data['date'] = data['date'].astype("str").apply(lambda x: datetime.strptime(x, '%Y%m%d'))

多个dataframe合并处理

# reduce is a builtin only on Python 2; importing it from functools works on
# Python 2.6+ and on Python 3.
from functools import reduce

# Load the three UTF-8 encoded CSV files that live under path + folder.
inputs = [
    pd.read_csv(path + folder + file_name, encoding="utf-8")
    for file_name in ("01.csv", "02.csv", "03.csv")
]
# Keep the individual frames available under their original names.
input1, input2, input3 = inputs

# Fold the frames left to right into one, inner-joining each pair on cust_no,
# so only cust_no values present in every file survive.
df_all = reduce(
    lambda left, right: pd.merge(left, right, on="cust_no", how="inner"),
    inputs,
)

多个列合并

# One-hot encode credit_type; drop_first=True omits the first category's column.
credit_type = pd.get_dummies(
    data["credit_type"],
    prefix="credit_type",
    drop_first=True,
)
# One-hot encode tran_branch, keeping an indicator column for every category.
tran_branch = pd.get_dummies(data["tran_branch"], prefix="branch")

# Append both sets of indicator columns to the right of the existing columns.
data = pd.concat([data, credit_type, tran_branch], axis="columns")

 

posted on 2018-10-25 16:13  布衣小工  阅读(275)  评论(0)  编辑  收藏  举报

导航