# coding=utf-8
2 import numpy as np
3 import os
4 from sklearn.metrics import precision_score, recall_score
5 from sklearn import metrics, preprocessing
6 from scipy.sparse.csr import csr_matrix
7 from sklearn.model_selection import train_test_split
8 from sklearn.svm import SVC
9 from sklearn.naive_bayes import MultinomialNB
10 from sklearn.neighbors import KNeighborsClassifier
11
def calculate_result(actual, pred):
    """Print precision, recall and F1 score for a set of predictions.

    The three scores are computed by ``sklearn.metrics`` with its default
    arguments and written to standard output; nothing is returned.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, in the same order as ``actual``.
    """
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    m_f1 = metrics.f1_score(actual, pred)
    # Call-form print with a single argument produces the same output on
    # Python 2 and Python 3; the former `print '...'` statement was a
    # SyntaxError on Python 3.
    print('predict info:')
    print("precision:" + format(m_precision))
    print("recall:" + format(m_recall))
    print("f1-score:" + format(m_f1))
19
def split_data(content, label, test_size=0.2, random_state=20):
    """Split samples and their labels into training and test subsets.

    Thin wrapper around ``sklearn.model_selection.train_test_split``.

    Args:
        content: Samples to split.
        label: Labels aligned with ``content``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2, the
            value that used to be hard-coded here.
        random_state: Forwarded to ``train_test_split``. Defaults to 20, the
            value that used to be hard-coded here.

    Returns:
        The tuple ``(training_data, test_data, training_target,
        test_target)``.
    """
    training_data, test_data, training_target, test_target = train_test_split(
        content, label, test_size=test_size, random_state=random_state)
    return training_data, test_data, training_target, test_target
23
def csr(process_data):
    """Convert tokenised documents into a dense term-count matrix.

    Every distinct term is assigned a column, numbered in order of first
    appearance across ``process_data``. Entry ``[i, j]`` of the result is the
    number of times term ``j`` occurs in document ``i``.

    Args:
        process_data: Iterable of documents, each an iterable of hashable
            terms.

    Returns:
        A ``numpy.ndarray`` of dtype float64 with one row per document and
        one column per distinct term. Empty input (no documents, or only
        empty documents) yields a zero-sized array instead of raising.
    """
    indptr = [0]
    indices = []
    vocabulary = {}
    for d in process_data:
        for term in d:
            # setdefault hands out the next free column id on first sight.
            indices.append(vocabulary.setdefault(term, len(vocabulary)))
        indptr.append(len(indices))
    # One unit count per occurrence; repeated (row, column) pairs are summed
    # when the sparse matrix is densified.
    data1 = [1] * len(indices)
    # Explicit shape: scipy cannot infer the dimensions when there are no
    # indices at all, which made empty input raise ValueError.
    shape = (len(indptr) - 1, len(vocabulary))
    data_transform = csr_matrix(
        (data1, indices, indptr), shape=shape, dtype=np.float64).toarray()
    return data_transform
37
# Load the raw corpus; `with` closes the file even if reading fails.
with open('/home/liumingyu/fenci/code/input.txt', 'r') as f:
    lines = f.readlines()

dataset = []
for line in lines:
    # A final line without a terminating newline used to raise ValueError
    # from str.index; treat the end of the string as the terminator instead.
    slope = line.find("\n")
    if slope == -1:
        slope = len(line)
    # NOTE(review): the slice also drops the character just before the
    # newline (presumably a trailing separator or '\r') - confirm against
    # the input format.
    line1 = line[0:slope - 1]
    # Each line becomes a list of its individual characters.
    dataset.append(list(line1))
datasets = np.array(dataset)
# Unseeded shuffle: the row order differs from run to run.
np.random.shuffle(datasets)
content = datasets[:, 2:]  # characters from position 2 onwards are the features
label = datasets[:, 0]     # the first character is the label

# Split row indices rather than the rows themselves so that the raw samples
# and their vectorised form are partitioned identically.
row_ids = np.arange(len(label))
train_rows, test_rows, training_target, test_target = split_data(row_ids, label)
training_data = content[train_rows]
test_data = content[test_rows]

# Vectorise the whole corpus in ONE call. csr() builds a fresh vocabulary on
# every call, so vectorising the training and test sets separately gave them
# unrelated (and possibly differently sized) column layouts.
features = csr(content)
train_data1 = features[train_rows]
test_data1 = features[test_rows]
training_target1 = np.array(training_target, dtype='float64')
test_target1 = np.array(test_target, dtype='float64')
56
######################################################
# SVM Classifier: fit on the training matrix, then report precision/recall/F1
# on the held-out test matrix.
# Call-form print gives the same output on Python 2 and Python 3; the former
# print statement was a SyntaxError on Python 3.
print('*************************\nSVM\n*************************')
clf = SVC(C=1000, gamma=0.01)  # kernel not specified: default with 'rbf'
clf.fit(train_data1, training_target1)
pred = clf.predict(test_data1)
calculate_result(test_target1, pred)
64
######################################################
# Multinomial Naive Bayes Classifier: fit on the training matrix, then report
# precision/recall/F1 on the held-out test matrix.
# Call-form print gives the same output on Python 2 and Python 3; the former
# print statement was a SyntaxError on Python 3.
print('*************************\nNaive Bayes\n*************************')
clf = MultinomialNB(alpha=0.01)
clf.fit(train_data1, training_target1)
pred = clf.predict(test_data1)
calculate_result(test_target1, pred)
72
######################################################
# KNN Classifier: fit on the training matrix, then report precision/recall/F1
# on the held-out test matrix.
# Call-form print gives the same output on Python 2 and Python 3; the former
# print statement was a SyntaxError on Python 3.
print('*************************\nKNN\n*************************')
knnclf = KNeighborsClassifier()  # default with k=5
knnclf.fit(train_data1, training_target1)
pred = knnclf.predict(test_data1)
calculate_result(test_target1, pred)