文本分类

 1 # coding = utf-8
 2 import numpy as np
 3 import os
 4 from sklearn.metrics import precision_score, recall_score
 5 from sklearn import metrics, preprocessing
 6 from scipy.sparse.csr import csr_matrix
 7 from sklearn.model_selection import train_test_split
 8 from sklearn.svm import SVC
 9 from sklearn.naive_bayes import MultinomialNB
10 from sklearn.neighbors import KNeighborsClassifier
11 
def calculate_result(actual, pred, average='binary'):
    """Print precision, recall and F1 of ``pred`` measured against ``actual``.

    Args:
        actual: Ground-truth labels.
        pred: Predicted labels, aligned element-wise with ``actual``.
        average: Forwarded unchanged to the three scikit-learn scorers.
            ``'binary'`` is the scorers' own default, so omitting the
            argument scores exactly as before; pass e.g. ``'macro'`` or
            ``'weighted'`` when the targets have more than two classes.

    Returns:
        None. The scores are written to stdout.
    """
    m_precision = metrics.precision_score(actual, pred, average=average)
    m_recall = metrics.recall_score(actual, pred, average=average)
    m_f1 = metrics.f1_score(actual, pred, average=average)
    # Single-argument print(...) emits the same text on Python 2 and 3; the
    # bare `print '...'` statement used previously only parses on Python 2.
    print('predict info:')
    print("precision:" + format(m_precision))
    print("recall:" + format(m_recall))
    print("f1-score:" + format(m_f1))
19 
def split_data(content, label, test_size=0.2, random_state=20):
    """Split samples and their labels into training and test subsets.

    Thin wrapper around ``train_test_split``.

    Args:
        content: Samples (one entry per document).
        label: Labels aligned element-wise with ``content``.
        test_size: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.2.
        random_state: Forwarded to ``train_test_split``; defaults to the
            previously hard-coded seed 20.

    Returns:
        Tuple ``(training_data, test_data, training_target, test_target)``.
    """
    training_data, test_data, training_target, test_target = train_test_split(
        content, label, test_size=test_size, random_state=random_state)
    return training_data, test_data, training_target, test_target
23 
def csr(process_data):
    """Convert tokenised documents into a dense term-count matrix.

    Each distinct term is assigned a column in order of first appearance,
    and each document becomes one row holding how often every term occurs
    in it.

    The vocabulary is rebuilt from scratch on every call, so column ``j`` of
    two matrices produced by two separate calls does not generally refer to
    the same term.

    Args:
        process_data: Iterable of documents, each an iterable of hashable
            terms.

    Returns:
        2-D ``numpy.float64`` array of shape
        ``(number of documents, number of distinct terms)``.
    """
    indptr = [0]
    indices = []
    vocabulary = {}
    for doc in process_data:
        for term in doc:
            # setdefault hands out the next free column the first time a
            # term is seen and returns the existing column afterwards.
            indices.append(vocabulary.setdefault(term, len(vocabulary)))
        indptr.append(len(indices))
    # One unit entry per term occurrence; repeated (row, column) pairs are
    # summed when the sparse matrix is densified, which yields the counts.
    ones = np.ones(len(indices), dtype=np.float64)
    # Pass the shape explicitly: inferring it from `indices` fails when no
    # document contains any term.
    shape = (len(indptr) - 1, len(vocabulary))
    matrix = csr_matrix((ones, indices, indptr), shape=shape, dtype=np.float64)
    return matrix.toarray()
37 
# Load the corpus. Every line is handled as a sequence of characters: the
# character at index 0 is the label and the characters from index 2 onwards
# are the terms of the document.
with open('/home/liumingyu/fenci/code/input.txt', 'r') as f:
    lines = f.readlines()

dataset = []
for line in lines:
    # Drop the newline plus the one character preceding it, exactly as the
    # former slice line[0:line.index("\n") - 1] did, but without raising
    # ValueError when the final line has no trailing newline.
    # NOTE(review): the extra dropped character is presumably a trailing
    # "\r" or separator -- confirm against input.txt.
    record = list(line.rstrip("\n")[:-1])
    if not record:
        # Blank (or one-character) lines carry no label; skip them.
        continue
    dataset.append(record)

content = [record[2:] for record in dataset]
label = np.array([record[0] for record in dataset])

# Vectorise the whole corpus once, *before* splitting. csr() builds a fresh
# vocabulary on every call, so vectorising the training and test documents
# separately gave the two matrices unrelated (and possibly differently
# sized) column layouts.
features = csr(content)

# No separate np.random.shuffle here: split_data already shuffles through
# train_test_split with a fixed random_state, and an unseeded extra shuffle
# made the split differ from run to run.
train_data1, test_data1, training_target, test_target = split_data(features, label)
training_target1 = np.array(training_target, dtype='float64')
test_target1 = np.array(test_target, dtype='float64')
56 
######################################################
# Fit each classifier on the training split and report precision, recall
# and F1 on the test split.
classifiers = [
    ('SVM', SVC(C=1000, gamma=0.01)),  # default with 'rbf'
    ('Naive Bayes', MultinomialNB(alpha=0.01)),
    ('KNN', KNeighborsClassifier()),  # default with k=5
]
banner = '*************************'
for name, clf in classifiers:
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print(banner + '\n' + name + '\n' + banner)
    clf.fit(train_data1, training_target1)
    pred = clf.predict(test_data1)
    calculate_result(test_target1, pred)

 

posted on 2017-03-30 14:45  Pod32gleo  阅读(164)  评论(0)    收藏  举报

导航