import xlrd
import numpy as np
from sklearn.model_selection import train_test_split  # split into training and test sets
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score  # cross-validation
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay  # confusion matrix
# Specificity: true negative rate, TN / (TN + FP), for binary 0/1 labels
def specificity_loss_func(ground_truth, predictions):
    tp, tn, fn, fp = 0.0, 0.0, 0.0, 0.0
    for l, m in enumerate(ground_truth):
        if m == predictions[l] and m == 1:
            tp += 1
        if m == predictions[l] and m == 0:
            tn += 1
        if m != predictions[l] and m == 1:
            fn += 1
        if m != predictions[l] and m == 0:
            fp += 1
    return tn / (tn + fp)
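# A minimal alternative sketch (assuming binary 0/1 labels): the same specificity
# can be read off sklearn's confusion matrix, whose ravel() order for a binary
# problem is tn, fp, fn, tp. Kept here only as a cross-check; not used below.
def specificity_from_confusion_matrix(ground_truth, predictions):
    tn, fp, fn, tp = confusion_matrix(ground_truth, predictions).ravel()
    return tn / (tn + fp)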
# Read the spreadsheet
input_list = []
input_sheet = xlrd.open_workbook("./data.xls")
input_sheet = input_sheet.sheet_by_index(0)
nrows = input_sheet.nrows
for i in range(nrows):
    input_list.append(np.array(input_sheet.row_values(i)))
input_data = np.array(input_list)
# Drop the header row
input_data = input_data[1:, :]
# Drop the first column
input_data = input_data[:, 1:]
input_data = np.array(input_data, dtype=float)
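# Equivalent loading with pandas (a sketch, assuming pandas plus an xlrd engine
# for the .xls file, and that the first column is an ID column to discard):
# import pandas as pd
# df = pd.read_excel("./data.xls", sheet_name=0)
# input_data = df.iloc[:, 1:].to_numpy(dtype=float)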
svc_list = []
tree_list = []
adaboost_list =[]
train_x, test_x, train_y, test_y = train_test_split(input_data[:, :-1], input_data[:, -1], test_size=0.2, random_state=10)
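# For a small or imbalanced data set a stratified split may be preferable
# (a sketch, assuming the last column holds the binary class labels):
# train_x, test_x, train_y, test_y = train_test_split(
#     input_data[:, :-1], input_data[:, -1],
#     test_size=0.2, random_state=10, stratify=input_data[:, -1])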
# 1. Decision tree model
# 1-1. Evaluation metrics
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='entropy')
model.fit(train_x, train_y)
result = model.predict(test_x)
prob = model.predict_proba(test_x)
acc = np.mean(result == test_y)
scores = cross_val_score(model, input_data[:, :-1], input_data[:, -1], cv=10)
precision = metrics.precision_score(test_y, result, average='macro')
recall = metrics.recall_score(test_y, result, average='macro')
f1 = metrics.f1_score(test_y, result, average='macro')
spe = specificity_loss_func(test_y, result)
#tree_list.append(acc)
print("Decision tree model")
print("Accuracy:", acc)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", spe)
print("F1:", f1)
print("Cross-validation accuracy:", scores.mean())
# 1-2. Confusion matrix
confusion_mat = confusion_matrix(np.array(test_y), np.array(result))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat)
disp.plot(
    include_values=True,
    cmap=plt.cm.Blues,
    ax=None,
    xticks_rotation="horizontal",
    values_format=".2f"
)
plt.title("Tree confusion matrix")
plt.show()
# 1-3. ROC curve
# Use the predicted probability of the positive class (label 1) as the score
pos_prob = prob[:, 1]
roc = metrics.roc_auc_score(test_y, pos_prob)
print("AUC:", roc)
fpr, tpr, thresholds = metrics.roc_curve(test_y, pos_prob)
plt.plot(fpr, tpr, label="ROC curve (area={0})".format(round(roc, 2)))
plt.plot([0, 1], [0, 1], linestyle='dashed')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Tree-ROC")
plt.legend(loc='lower right')
plt.show()
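# Recent scikit-learn versions (a sketch, assuming >= 1.0) can draw the same
# curve directly from the fitted estimator:
# from sklearn.metrics import RocCurveDisplay
# RocCurveDisplay.from_estimator(model, test_x, test_y)
# plt.show()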
print("\n\n")
# 2. Logistic regression model
# 2-1. Evaluation metrics
from sklearn.linear_model import LogisticRegression
LogisticRegressionModel = LogisticRegression()
LogisticRegressionModel.fit(train_x, train_y)  # train the model
LogisticRegressionModel_result = LogisticRegressionModel.predict(test_x)  # predict on the test set
prob = LogisticRegressionModel.predict_proba(test_x)  # predicted probability of each class
acc = np.mean(LogisticRegressionModel_result == test_y)  # test-set accuracy
scores = cross_val_score(LogisticRegressionModel, input_data[:, :-1], input_data[:, -1], cv=10)  # 10-fold cross-validation accuracy
precision = metrics.precision_score(test_y, LogisticRegressionModel_result, average='macro')  # precision
recall = metrics.recall_score(test_y, LogisticRegressionModel_result, average='macro')  # recall
f1 = metrics.f1_score(test_y, LogisticRegressionModel_result, average='macro')  # F1
spe = specificity_loss_func(test_y, LogisticRegressionModel_result)  # specificity
print("逻辑回归模型")
print("准确率:",acc)
print("精确率:",precision)
print("召回率:",recall)
print("特异度:",spe)
print("F1:",f1)
print("交叉验证准确率:",scores.mean())
# 2-2. Confusion matrix
confusion_mat = confusion_matrix(np.array(test_y), np.array(LogisticRegressionModel_result))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat)
disp.plot(
    include_values=True,           # show the value in each cell
    cmap=plt.cm.Blues,             # blue colormap
    ax=None,                       # default: use a new axes
    xticks_rotation="horizontal",  # keep x tick labels horizontal
    values_format=".2f",           # numeric format, two decimal places
)
plt.title("LR confusion matrix")   # figure title
plt.show()
# 2-3. ROC curve
# Use the predicted probability of the positive class (label 1) as the score
pos_prob = prob[:, 1]
roc = metrics.roc_auc_score(test_y, pos_prob)  # AUC from true labels and scores
print("AUC:", roc)
fpr, tpr, thresholds = metrics.roc_curve(test_y, pos_prob)  # ROC curve from true labels and scores
# print('FPR:',fpr)
# print('TPR:',tpr)
# print('thresholds:',thresholds)
plt.plot(fpr, tpr, label="ROC curve (area={0})".format(round(roc, 2)))  # ROC curve, AUC rounded to two decimals
plt.plot([0, 1], [0, 1], linestyle='dashed')  # dashed diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("LR-ROC")
plt.legend(loc='lower right')  # legend in the lower-right corner
plt.show()
print("\n\n")
# 3. Random forest model
# 3-1. Evaluation metrics
from sklearn.ensemble import RandomForestClassifier
SLmodel = RandomForestClassifier()
SLmodel.fit(train_x, train_y)
SLmodel_result = SLmodel.predict(test_x)
prob = SLmodel.predict_proba(test_x)
acc = np.mean(SLmodel_result == test_y)
scores = cross_val_score(SLmodel, input_data[:, :-1], input_data[:, -1], cv=10)
precision = metrics.precision_score(test_y, SLmodel_result, average='macro')
recall = metrics.recall_score(test_y, SLmodel_result, average='macro')
f1 = metrics.f1_score(test_y, SLmodel_result, average='macro')
spe = specificity_loss_func(test_y, SLmodel_result)
#tree_list.append(acc)
print("Random forest model")
print("Accuracy:", acc)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", spe)
print("F1:", f1)
print("Cross-validation accuracy:", scores.mean())
# 3-2. Confusion matrix
confusion_mat = confusion_matrix(np.array(test_y), np.array(SLmodel_result))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat)
disp.plot(
    include_values=True,
    cmap=plt.cm.Blues,
    ax=None,
    xticks_rotation="horizontal",
    values_format=".2f"
)
plt.title("RF confusion matrix")
plt.show()
# 3-3. ROC curve
# Use the predicted probability of the positive class (label 1) as the score
pos_prob = prob[:, 1]
roc = metrics.roc_auc_score(test_y, pos_prob)
print("AUC:", roc)
fpr, tpr, thresholds = metrics.roc_curve(test_y, pos_prob)
plt.plot(fpr, tpr, label="ROC curve (area={0})".format(round(roc, 2)))
plt.plot([0, 1], [0, 1], linestyle='dashed')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("RF-ROC")
plt.legend(loc='lower right')
plt.show()
print("\n\n")
# 4. SVM model
# 4-1. Evaluation metrics
from sklearn.svm import SVC
SVCModel = SVC(probability=True)
SVCModel.fit(train_x, train_y)
SVCModel_result = SVCModel.predict(test_x)
prob = SVCModel.predict_proba(test_x)
# print(prob)
acc = np.mean(SVCModel_result == test_y)
scores = cross_val_score(SVCModel, input_data[:, :-1], input_data[:, -1], cv=10)
precision = metrics.precision_score(test_y, SVCModel_result, average='macro')
recall = metrics.recall_score(test_y, SVCModel_result, average='macro')
f1 = metrics.f1_score(test_y, SVCModel_result, average='macro')
spe = specificity_loss_func(test_y, SVCModel_result)
#svc_list.append(acc)
print("SVM model")
print("Accuracy:", acc)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", spe)
print("F1:", f1)
print("Cross-validation accuracy:", scores.mean())
# 4-2. Confusion matrix
confusion_mat = confusion_matrix(np.array(test_y), np.array(SVCModel_result))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat)
disp.plot(
    include_values=True,
    cmap=plt.cm.Blues,
    ax=None,
    xticks_rotation="horizontal",
    values_format=".2f",
)
plt.title("SVM confusion matrix")
plt.show()
# 4-3. ROC curve
# Use the predicted probability of the positive class (label 1) as the score
pos_prob = prob[:, 1]
roc = metrics.roc_auc_score(test_y, pos_prob)
print("AUC:", roc)
fpr, tpr, thresholds = metrics.roc_curve(test_y, pos_prob)
plt.plot(fpr, tpr, label="ROC curve (area={0})".format(round(roc, 2)))
plt.plot([0, 1], [0, 1], linestyle='dashed')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("SVM-ROC")
plt.legend(loc='lower right')
plt.show()
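# SVMs are sensitive to feature scale; a pipeline with standardization is often
# worth trying (a sketch, reusing the same train/test split as above):
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# scaled_svc = make_pipeline(StandardScaler(), SVC(probability=True))
# scaled_svc.fit(train_x, train_y)
# print("Scaled SVM accuracy:", scaled_svc.score(test_x, test_y))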
print("\n\n")
# 5. AdaBoost model
# 5-1. Evaluation metrics
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=100)
model.fit(train_x, train_y)
result = model.predict(test_x)
prob = model.predict_proba(test_x)
# print(prob)
acc = np.mean(result == test_y)
scores = cross_val_score(model, input_data[:, :-1], input_data[:, -1], cv=10)
precision = metrics.precision_score(test_y, result, average='macro')
recall = metrics.recall_score(test_y, result, average='macro')
f1 = metrics.f1_score(test_y, result, average='macro')
spe = specificity_loss_func(test_y, result)
#adaboost_list.append(acc)
print("AdaBoost model")
print("Accuracy:", acc)
print("Precision:", precision)
print("Recall:", recall)
print("Specificity:", spe)
print("F1:", f1)
print("Cross-validation accuracy:", scores.mean())
print("Feature importances:", model.feature_importances_)
# 5-2. Confusion matrix
confusion_mat = confusion_matrix(np.array(test_y), np.array(result))
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_mat)
disp.plot(
    include_values=True,
    cmap=plt.cm.Blues,
    ax=None,
    xticks_rotation="horizontal",
    values_format=".2f"
)
plt.title("AdaBoost confusion matrix")
plt.show()
# 5-3. ROC curve
# Use the predicted probability of the positive class (label 1) as the score
pos_prob = prob[:, 1]
roc = metrics.roc_auc_score(test_y, pos_prob)
print("AUC:", roc)
fpr, tpr, thresholds = metrics.roc_curve(test_y, pos_prob)
plt.plot(fpr, tpr, label="ROC curve (area={0})".format(round(roc, 2)))
plt.plot([0, 1], [0, 1], linestyle='dashed')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("AdaBoost-ROC")
plt.legend(loc='lower right')
plt.show()
# print(test_x)
print("十次交叉验证准确率:",scores)