import numpy
import pandas as pd
# NOTE: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; on newer installs this line must become "import joblib".
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
# Columns used as model features. "uid" is the user's unique identifier and is
# deliberately left out: it identifies a row, it does not describe it.
USER_FEATURES = [
    "age", "carrier", "consumptionAbility", "ct", "education", "gender",
    "house", "interest1", "interest2", "interest3", "interest4", "interest5",
    "kw1", "kw2", "kw3", "marriageStatus", "os", "topic1", "topic2", "topic3",
    "LBS", "appIdAction", "appIdInstall", "campaignId", "creativeId",
    "creativeSize", "adCategoryId", "advertiserId", "productId", "productType",
]

# Leading rows of each merged batch used for fitting, and trailing rows used
# for scoring. If a batch merges to fewer than TRAIN_ROWS + TEST_ROWS rows the
# two slices overlap and the reported accuracy is optimistic.
TRAIN_ROWS = 99000
TEST_ROWS = 900


def encode_feature_value(value):
    """Collapse one raw cell of the merged table into a single integer.

    * integers (Python or NumPy) are returned unchanged;
    * missing values (``None`` / NaN) become 0 -- a crude placeholder that
      cannot be told apart from a genuine 0 and should eventually be replaced
      by a proper imputation strategy;
    * other floats are truncated to ``int``: pandas stores an integer column
      that contains NaN as float64, so ``1.0`` here really means category 1
      (the previous code turned every float into 0, wiping the column);
    * anything else is treated as a whitespace-separated list of integers
      (e.g. ``"12 7 30"``) and reduced to the sum of its members.

    Raises:
        ValueError: if a string token is not an integer literal.
    """
    if value is None:
        return 0
    if isinstance(value, (int, numpy.integer)):
        return int(value)
    # numpy.float64 subclasses float; numpy.floating covers the other widths.
    if isinstance(value, (float, numpy.floating)):
        return 0 if numpy.isnan(value) else int(value)
    # split() without an argument tolerates leading/trailing/repeated blanks.
    return sum(int(token) for token in value.split())


def build_feature_matrix(data, feature_names):
    """Return an ``(n_rows, n_features)`` int64 array of encoded cells.

    Args:
        data: mapping of column name -> iterable of raw cells (in practice the
            merged pandas DataFrame).
        feature_names: columns to encode, in output-column order.
    """
    # Walk column by column: one column lookup per feature instead of one
    # chained ``data[feature][index]`` lookup per cell.
    columns = [
        [encode_feature_value(value) for value in data[name]]
        for name in feature_names
    ]
    return numpy.array(columns, dtype=numpy.int64).T


def accuracy(predicted, actual):
    """Return the fraction of positions where ``predicted`` equals ``actual``.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    total = len(predicted)
    if total == 0 or total != len(actual):
        raise ValueError("predicted and actual must be non-empty and of equal length")
    matches = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return matches / total


def main():
    """Train and score one k-NN model per user-feature batch file.

    For each batch ``b`` in 1..114 the user features are merged with the
    training labels and the ad features (pandas' default merge on shared
    columns), encoded, one-hot transformed, and used to fit a
    ``KNeighborsClassifier``. The fitted model is dumped to ``<b>.model`` and
    its accuracy on the trailing rows is appended to ``result.txt``.
    """
    onehot = OneHotEncoder()

    # These two tables do not depend on the batch, so read them only once.
    addata = pd.read_csv("adFeature.csv")
    traindata = pd.read_csv("train.csv")

    for b in range(1, 115):
        userdata = pd.read_csv("userFeature_%d_part_bat.csv" % (b * 100000))
        data = pd.merge(pd.merge(userdata, traindata), addata)

        features = build_feature_matrix(data, USER_FEATURES)
        # The encoder is re-fitted on every batch, so each saved model has its
        # own one-hot feature space.
        onehot.fit(features)
        print("--------------------------------------------------------------------")
        Y = numpy.array(data["label"])
        print("--------------------------------------------------------------------")
        X = onehot.transform(features)
        print(X)
        print(numpy.shape(X))

        model = KNeighborsClassifier().fit(X[:TRAIN_ROWS], Y[:TRAIN_ROWS])
        joblib.dump(model, "%d.model" % b)

        result = model.predict(X[-TEST_ROWS:])
        print("-----------------------------------------------------------------------")
        print("--------------------------------------------------------------------------")
        a = accuracy(result, Y[-TEST_ROWS:])

        # Close the file after every batch and put each score on its own line;
        # previously the handle leaked and the scores were written back to back.
        with open("result.txt", "a", encoding="utf-8") as f:
            f.write(str(a) + "\n")
        print(a)


if __name__ == "__main__":
    main()