1 from sklearn.multioutput import MultiOutputClassifier
2 from sklearn.ensemble import RandomForestClassifier
3 from sklearn.ensemble import ExtraTreesClassifier
4 import numpy as np
5 from pandas import read_csv
6 import pandas as pd
7
# Absolute paths to the preprocessed CSV datasets.
# root1/root2: train/test sets for joint major+minor category prediction.
# root3/root4: train/test sets for minor-category-only ("4problem") prediction.
root1="F:/goverment/shuili2/techproblem_text_train.csv"
root2="F:/goverment/shuili2/techproblem_text_test.csv"
root3="F:/goverment/shuili2/text_train_4problem.csv"
root4="F:/goverment/shuili2/text_test_4problem.csv"
12
13
# --- Predict major and minor categories jointly (all label columns at once) ---
dataset1 = read_csv(root1)   # training data as DataFrame
dataset1 = dataset1.values   # convert to numpy array
dataset2 = read_csv(root2)   # test data
dataset2 = dataset2.values
X_train = dataset1[:, :28]   # first 28 columns are features
Y_train = dataset1[:, 28:]   # remaining columns are labels
X_test = dataset2[:, :28]
Y_test = dataset2[:, 28:]

print('多输出多分类器真实输出分类:\n', Y_train)
n_samples, n_features = X_train.shape
n_outputs = Y_train.shape[1]
n_classes = 50  # each output column has (up to) 50 classes
# One RandomForest per output column via MultiOutputClassifier.
forest = RandomForestClassifier(n_estimators=500, random_state=1)
multi_target_forest = MultiOutputClassifier(forest)
y_pred = multi_target_forest.fit(X_train, Y_train).predict(X_train)
print('多输出多分类器预测输出分类:\n', y_pred)
pp = multi_target_forest.predict(X_test)
# Exact-match (subset) accuracy: a sample counts only if EVERY label
# column is predicted correctly. Generalized from the original 8-way
# chained comparison so it works for any number of label columns.
k = 0
for row_pred, row_true in zip(pp, Y_test):
    if all(p == t for p, t in zip(row_pred, row_true)):
        k += 1
# BUG FIX: denominator was hard-coded as 1328; use the real test-set size.
aa = k / len(Y_test)
print(aa)
41
42
# --- Predict major categories only (first 4 label columns) ---
dataset3 = read_csv(root1)   # training data
dataset3 = dataset3.values
dataset4 = read_csv(root2)   # test data
dataset4 = dataset4.values
X_train_big = dataset3[:, :28]      # 28 feature columns
Y_train_big = dataset3[:, 28:32]    # 4 major-category label columns
X_test_big = dataset4[:, :28]
Y_test_big = dataset4[:, 28:32]
print('只预测大类:多输出多分类器真实输出分类:\n', Y_train_big)
n_samples, n_features = X_train_big.shape
n_outputs = Y_train_big.shape[1]
n_classes = 11  # each output column has (up to) 11 classes
forest = RandomForestClassifier(n_estimators=200, random_state=1)
multi_target_forest = MultiOutputClassifier(forest)  # one forest per label column
y_pred = multi_target_forest.fit(X_train_big, Y_train_big).predict(X_train_big)
print('多输出多分类器预测输出分类:\n', y_pred)
pp = multi_target_forest.predict(X_test_big)
# Exact-match accuracy over all 4 label columns.
k = 0
for row_pred, row_true in zip(pp, Y_test_big):
    if all(p == t for p, t in zip(row_pred, row_true)):
        k += 1
# BUG FIX: denominator was hard-coded as 1328; use the real test-set size.
aa = k / len(Y_test_big)
print(aa)
69
70
# --- Predict minor categories only (4 label columns from the "4problem" CSVs) ---
# NOTE: local names fixed from the original "samll" typo to "small".
dataset4 = read_csv(root3)   # training data
dataset4 = dataset4.values
dataset5 = read_csv(root4)   # test data
dataset5 = dataset5.values
X_train_small = dataset4[:, :28]      # 28 feature columns
Y_train_small = dataset4[:, 28:32]    # 4 minor-category label columns
X_test_small = dataset5[:, :28]
Y_test_small = dataset5[:, 28:32]
print('只预测小类:多输出多分类器真实输出分类:\n', Y_train_small)
n_samples, n_features = X_train_small.shape
n_outputs = Y_train_small.shape[1]
n_classes = 61  # each output column has (up to) 61 classes
forest = RandomForestClassifier(n_estimators=200, random_state=1)
multi_target_forest = MultiOutputClassifier(forest)  # one forest per label column
y_pred = multi_target_forest.fit(X_train_small, Y_train_small).predict(X_train_small)
print('多输出多分类器预测输出分类:\n', y_pred)
pp = multi_target_forest.predict(X_test_small)
# Exact-match accuracy over all 4 label columns.
k = 0
for row_pred, row_true in zip(pp, Y_test_small):
    if all(p == t for p, t in zip(row_pred, row_true)):
        k += 1
# BUG FIX: denominator was hard-coded as 1328; use the real test-set size.
aa = k / len(Y_test_small)
print(aa)
97
98
99
100 '''
101 from pandas import read_csv
102 import pandas as pd
103 import numpy as np
104 from skmultilearn.problem_transform import BinaryRelevance
105 from sklearn.naive_bayes import GaussianNB
106 from sklearn.metrics import accuracy_score
107
108
109 root1="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/shuili2/data.csv"
110 #root2="./id="+str(id_num)+"_process_data.csv"
111 dataset = read_csv(root1) #数据转化为数组
112 dataset=dataset.values
113 x_train=dataset[:4000,:29]
114 y_train=dataset[:4000,29:]
115
116 x_test=dataset[4000:,:29]
117 y_test=dataset[4000:,29:]
118
119 # initialize binary relevance multi-label classifier
120 # with a gaussian naive bayes base classifier
121 classifier = BinaryRelevance(GaussianNB())
122
123 # train
124 classifier.fit(x_train, y_train)
125
126 # predict
127 predictions = classifier.predict(x_test)
128 accuracy_score(y_test,predictions)
129 '''
130
131
132 '''---------------------------------'''
133 '''
134 import numpy as np
135 import pandas as pd
136 from keras.models import Sequential
137 from keras.layers import Dense, Dropout
138 from keras.wrappers.scikit_learn import KerasClassifier
139 from keras.utils import np_utils
140 from sklearn.model_selection import train_test_split, KFold, cross_val_score
141 from sklearn.preprocessing import LabelEncoder
142 from pandas import read_csv
143 from sklearn.naive_bayes import GaussianNB
144 from sklearn.metrics import accuracy_score
145
146
147 root1="D:/Anaconda3-5.0.1-Windows-x86_64/anaconda/work/shuili2/data.csv"
148 #root2="./id="+str(id_num)+"_process_data.csv"
149 dataset = read_csv(root1) #数据转化为数组
150 dataset=dataset.values
151
152 # load dataset
153 dataframe = pd.read_csv("data.csv", header=None)
154 dataset = dataframe.values
155 X = dataset[:, 0:29].astype(float)
156 Y = dataset[:, 29:]
157
158 # encode class values as integers
159 #encoder = LabelEncoder()
160 #encoded_Y = encoder.fit_transform(Y)
161 # convert integers to dummy variables (one hot encoding)
162 #dummy_y = np_utils.to_categorical(encoded_Y)
163
164 # define model structure
165 def baseline_model():
166 model = Sequential()
167 model.add(Dense(output_dim=10, input_dim=29, activation='relu'))
168 model.add(Dropout(0.2))
169 model.add(Dense(output_dim=8, input_dim=10, activation='softmax'))
170 # Compile model
171 model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
172 return model
173 estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=200, batch_size=50)
174 # splitting data into training set and test set. If random_state is set to an integer, the split datasets are fixed.
175 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.01, random_state=0)
176 estimator.fit(X_train, Y_train)
177
178 # make predictions
179 pred = estimator.predict(X_test)
180
181
182 # inverse numeric variables to initial categorical labels
183 #init_lables = encoder.inverse_transform(pred)
184
185 # k-fold cross-validate
186 seed = 42
187 np.random.seed(seed)
188 kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
189 results = cross_val_score(estimator, X, Y, cv=kfold)
190 '''
from pandas import read_csv

# Paths to the preprocessed train/test CSVs (features + encoded labels).
root1="F:/goverment/shuili2/techproblem_text_train.csv"
root2="F:/goverment/shuili2/techproblem_text_test.csv"
root3="F:/goverment/shuili2/text_train_4problem.csv"
root4="F:/goverment/shuili2/text_test_4problem.csv"

# --- Predict major and minor categories jointly via Binary Relevance ---
dataset1 = read_csv(root1)   # training data
dataset1 = dataset1.values
dataset2 = read_csv(root2)   # test data
dataset2 = dataset2.values
X_train = dataset1[:, :28]   # first 28 columns are features
Y_train = dataset1[:, 28:]   # remaining columns are labels
X_test = dataset2[:, :28]
Y_test = dataset2[:, 28:]

from pprint import pprint
pprint(dataset1)

# Binary Relevance (scikit-multilearn): one binary classifier per label,
# using a Gaussian naive Bayes base classifier.
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(X_train, Y_train)

# predict
predictions = classifier.predict(X_test)

from sklearn.metrics import accuracy_score
# BUG FIX: the score was computed and silently discarded; print it so the
# run actually reports the test accuracy.
print(accuracy_score(Y_test, predictions))