数据清洗和数据预处理

摘要:

内容:

  我的 GitHub 源代码:https://github.com/Tongzhenguo/Python-Project/blob/master/learntoscikit/preprocessing/demo.py

 

  1 # coding=utf-8
  2 __author__ = 'arachis'
  3 
  4 import numpy as np
  5 from sklearn import preprocessing
  6 
  7 """
  8     缺失值处理(填充负值,填充中值,填充众数,剔除,单独作为一个特征)
  9 """
 10 
 11 ##直接使用pandas 中的异常值处理
 12 import pandas as pd
 13 import numpy as np
 14 dates = pd.date_range('20130101', periods=6)
 15 df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
 16 df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"])
 17 # df1.fillna(-1)  ##填充负值
 18 df1.dropna() ## 剔除
 19 
 20 
 21 """
 22     异常值处理(剔除)
 23 """
 24 
 25 
 26 """
 27     z-score:均值为0,方差为1(标准化)(基于列向量)
 28 """
 29 X_train = np.array([[ 1., -1.,  2.],\
 30            [ 2.,  0.,  0.],\
 31            [ 0.,  1., -1.]])
 32 X_test = np.array([[ -3., -1.,  4.]])
 33 X_scaled = preprocessing.scale(X_train)
 34 print  X_scaled
 35 
 36 #Scaled data has zero mean and unit variance:
 37 print X_scaled.mean(axis=0)
 38 print X_scaled.std(axis=0)
 39 
 40 #Scaler
 41 scaler = preprocessing.StandardScaler().fit(X_train)
 42 print scaler.transform(X_train)
 43 print scaler.transform(X_test)
 44 
 45 print scaler.mean_
 46 print scaler.scale_
 47 
 48 """
 49     min-max score:映射到区间[0,1](最小-最大规范化)(基于列向量)
 50 """
 51 scaler = preprocessing.MinMaxScaler()
 52 print scaler.fit_transform(X_train)
 53 print scaler.transform(X_test) #新的数据可能会不在[0,1]区间内
 54 
 55 
 56 """
 57     规范化(Normalization)(归一化)(基于行向量)
 58 """
 59 normalizer = preprocessing.Normalizer(norm='l2')
 60 print normalizer.fit_transform(X_train)
 61 print normalizer.fit_transform(X_test)
 62 
 63 
 64 """
 65     二值化(Binarization)
 66 """
 67 #给定阈值,将特征转换为0/1
 68 binarizer = preprocessing.Binarizer(threshold=1.1)
 69 print binarizer.transform(X_train)
 70 print binarizer.transform(X_test)
 71 
 72 
 73 """
 74     类别特征编码(Encoding categorical features)
 75 """
 76 #知道各个类别的数目,可通过n_values指定
 77 enc = preprocessing.OneHotEncoder()
 78 print enc.fit([[1, 2, 3], [0, 2, 0]])
 79 print enc.transform([[1, 0, 0]]).toarray()
 80 
 81 
 82 """
 83     标签编码(Label encoding)
 84 """
 85 #非数值型转化为数值型
 86 le = preprocessing.LabelEncoder()
 87 le.fit(["paris", "paris", "tokyo", "amsterdam"])
 88 print le.transform(["tokyo", "tokyo", "paris"])
 89 
 90 
 91 """
 92     生成多项式特征(Generating polynomial features)
 93 """
 94 # (x1,x2) => (1,x1,x2,x1^2,x1*x2,x2^2)
 95 from sklearn.preprocessing import PolynomialFeatures
 96 X = np.arange(6).reshape(3, 2)
 97 poly = PolynomialFeatures(2)
 98 print poly.fit_transform(X)
 99 
100 
101 
102 """
103     滤除方差小的数据(Removing features with low variance)
104 """
105 from sklearn.feature_selection import VarianceThreshold
106 sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
107 sel.fit_transform(X)

 

posted @ 2016-12-07 18:01  混沌战神阿瑞斯  阅读(863)  评论(0)  编辑  收藏  举报