1 from sklearn.preprocessing import OneHotEncoder
2 import numpy
3 onehot = OneHotEncoder()
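# Build a mapping that represents a combination of co-occurring feature values
# as a single number. Example with feature values {a, b}: a -> 1, b -> 2, and
# the combination "ab" -> 3.  ---- (1)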
5 import pandas
# Feature columns to encode. "uid" is the unique user identifier and is
# deliberately left out because it is not a feature.
userfeature = [
    "age",
    "carrier",
    "consumptionAbility",
    "ct",
    "education",
    "gender",
    "house",
    "interest1",
    "interest2",
    "interest3",
    "interest4",
    "interest5",
    "kw1",
    "kw2",
    "kw3",
    "marriageStatus",
    "os",
    "topic1",
    "topic2",
    "topic3",
    "LBS",
    "appIdAction",
    "appIdInstall",
]


def encode_feature_value(value):
    """Collapse one raw CSV cell into a single integer.

    Rules:
      * integer (Python or NumPy)  -> the same value as a Python ``int``
      * float NaN (missing cell)   -> 0
      * any other float            -> truncated to ``int``
      * string of space-separated integer ids -> the sum of the ids
        (an empty / whitespace-only string yields 0)

    Raises:
        ValueError: if a token of a string cell is not an integer literal.
        AttributeError: if ``value`` is neither a number nor a string.
    """
    if isinstance(value, (int, numpy.integer)):
        return int(value)

    # ``numpy.float`` no longer exists in current NumPy releases, so test
    # against the builtin ``float`` (which ``numpy.float64`` subclasses) and
    # ``numpy.floating`` for the remaining NumPy float widths.
    if isinstance(value, (float, numpy.floating)):
        # Only a missing value is replaced by 0. A column that contains any
        # missing cell is loaded as float, so its valid entries arrive here
        # too and must keep their value instead of being zeroed.
        # TODO: filling missing values with 0 is a placeholder strategy.
        if numpy.isnan(value):
            return 0
        return int(value)

    # Multi-valued cell such as "3 1 2". ``split()`` with no argument also
    # tolerates repeated or surrounding whitespace.
    # NOTE(review): the sum is order-independent but not unique -- "1 2" and
    # "3" both encode to 3. Confirm this collision is acceptable.
    return sum(int(token) for token in value.split())


def build_user_matrix(frame, features):
    """Encode the ``features`` columns of ``frame`` row by row.

    Args:
        frame: a pandas DataFrame holding the raw user features.
        features: iterable of column names to encode, in output order.

    Returns:
        A list with one entry per row of ``frame``; each entry is the list of
        integers produced by ``encode_feature_value`` for that row.
    """
    selected = frame[list(features)]
    return [
        [encode_feature_value(cell) for cell in row]
        # name=None yields plain tuples, which is all that is needed here.
        for row in selected.itertuples(index=False, name=None)
    ]


# Guarded so the helpers above can be imported without reading the CSV.
# Running the file as a script still defines ``data`` and ``userdata`` at
# module level and prints the same output as before.
if __name__ == "__main__":
    data = pandas.read_csv("userFeature_part_bat.csv")

    userdata = build_user_matrix(data, userfeature)
    for feature_li in userdata:
        print(feature_li)

    userdata = numpy.array(userdata)
    print("--------------------------------------------------------------------")
    print(userdata)
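'''
Step 1: extract every feature-value id.
Step 2: sort the feature-value ids of each feature and combine them into a
single feature value; this achieves goal (1).
Drawback of sorting the ids of a multi-valued feature and then one-hot
encoding the combined value:
for example, feature a has feature ids 1,2,3 and b has feature ids 1,2 -- they
are clearly similar, but after one-hot encoding with this method they are no
longer related. There should be a better way to handle this, e.g. word2vec.
'''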