数据分类实验的python程序
实验设置要求:
- 数据集:共12个,从本地文件夹中包含若干个以xlsx为后缀的Excel文件,每个文件中有一个小规模数据,有表头,最后一列是分类的类别class,其他列是特征,数值的。
- 实验方法:XGBoost、AdaBoost、SVM (采用rbf核)、Neural Network分类器
- 输出:分类准确率,即十折交叉验证的准确率均值和方差,并重复5次实验,不同数据的实验结果分别保存至各自的一个csv文件。
- 其他要求:SVC增加rbf参数设置,默认为0.001、MLPClassifier为1层隐层神经网络,隐层节点为100. XGBoost和AdaBoost弱分类器设置。 cross_val_score增加数据标准化和n_jobs设置。由于数据的类别可能是非连续的字符形式,增加class的映射
Excel中的数据形式如下:

python程序如下:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from concurrent.futures import ThreadPoolExecutor
# 1. 读取文件夹中的所有.xlsx文件
data_folder = "./分类数据集"
file_list = [f for f in os.listdir(data_folder) if f.endswith('.xlsx')]
result_folder = "./results"
# 定义分类器
classifiers = {
"XGBoost": xgb.XGBClassifier(),
"AdaBoost": AdaBoostClassifier(n_estimators=50), #https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
"SVM_rbf": SVC(C=1.0, kernel="rbf", gamma='scale'), #gamma默认为1 / (n_features * X.var()),
#可以根据数据集进行调整
"Neural_Network": MLPClassifier(hidden_layer_sizes=(100), max_iter=1000)
}
# # 对每一个文件进行分类实验
# def process_file(file):
# 对每一个文件进行分类实验
for file in file_list:
df = pd.read_excel(os.path.join(data_folder, file))
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
# 类别映射
le = LabelEncoder()
y = le.fit_transform(y)
results = {
"Classifier": [],
"Experiment 1": [],
"Experiment 2": [],
"Experiment 3": [],
"Experiment 4": [],
"Experiment 5": [],
"Mean Accuracy": [],
"Accuracy Variance": []
}
# 使用四种分类器
for clf_name, clf in classifiers.items():
all_accuracies = []
# 使用标准化和分类器创建流水线
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', clf)
])
# 重复5次实验
for exp_num in range(1, 6):
print(clf_name, exp_num)
kf = KFold(n_splits=10, shuffle=True, random_state=None)
accuracies = cross_val_score(pipeline, X, y, cv=kf,n_jobs=16)
results[f"Experiment {exp_num}"].append(np.mean(accuracies))
all_accuracies.extend(accuracies)
results["Classifier"].append(clf_name)
results["Mean Accuracy"].append(np.mean(all_accuracies))
results["Accuracy Variance"].append(np.var(all_accuracies))
# 保存到.csv文件
result_df = pd.DataFrame(results)
result_df.to_csv(os.path.join(result_folder, f"results_{file.replace('.xlsx', '.csv')}"), index=False)
# # 使用多线程处理文件
# with ThreadPoolExecutor() as executor:
# executor.map(process_file, file_list)
print("Experiments completed!")
浙公网安备 33010602011771号