Machine Learning: What Exactly Is BaseML?
from .base import baseml
from .BaseClassification import Classification
from .BaseRegression import Regression
from .BaseCluster import Cluster
from .BaseDimentionReduction import DimentionReduction

__all__ = [
    'baseml',
    'Classification',
    'Regression',
    'Cluster',
    'DimentionReduction',
]
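This `__init__.py` simply re-exports the four task classes, so user code can import them straight from the package. A minimal sketch (assuming BaseML is installed or on the import path):

from BaseML import Classification, Regression, Cluster, DimentionReduction
model = Regression('LinearRegression')  # each class selects its algorithm by name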
# BaseML base class; the task classes below inherit its basic methods
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import joblib
class baseml:
"""BaseML中的继承基类,单例模式避免多次调用创建
"""
def __init__(self):
        self.cwd = os.path.dirname(os.getcwd())  # parent directory of the current working directory
self.file_dirname = os.path.dirname(os.path.abspath(__file__))
self.x_train, self.x_test, self.y_train, self.y_test, self.x_val, self.y_val = [
], [], [], [], [], []
self.X = []
self.Y = []
self.dataset = []
self.model = None
self.test_size = 0.2
self.scaler = None
self.demo_input = None
self.input_shape = None
    # Use a singleton so the base class is not constructed over and over
def __new__(cls, *args, **kwargs):
# print("__new__")
if not hasattr(baseml, "_instance"):
# print("创建新实例")
baseml._instance = object.__new__(cls)
return baseml._instance
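    # A quick sanity check of the singleton (hypothetical interactive session):
    #   >>> a = baseml()
    #   >>> b = baseml()
    #   >>> a is b
    #   True
    # Every construction returns the same cached instance, so state stored on
    # one reference is visible through all the others.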
def train(self):
        # must be overridden by subclasses
raise NotImplementedError("train function must be implemented")
def inference(self):
        # must be overridden by subclasses
raise NotImplementedError("inference function must be implemented")
def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
shuffle=True, show=False, split=True, scale=False):
"""Load the model's data set.
Args:
X (str|numpy|pandas|list): 自变量.
y (str|numpy|pandas|list, optional): 目标值. 默认为 [].
type (str, optional): X和y的输入格式, choice = ['csv', 'numpy','pandas','list','txt], 最后统一转换为numpy.
x_column (list, optional): X 的索引列. 默认设置为X的所有列.
y_column (list, optional): y的索引列. 默认设置为y的所有列.
shuffle (bool, optional): 是否对元素随机排序. 默认为True.
show (bool, optional): 显示5条数据. 默认为True.
split(bool, optional): 是否划分数据集为训练集和测试集. 默认为True.
scale(bool, optional): 是否对数据进行归一化. False.
"""
        if (type == 'csv' or type == 'txt') and len(x_column) == 0:
            raise ValueError("Please pass in the column indices of the data")
if type == 'csv':
            self.dataset = pd.read_csv(X).values  # .values converts to a numpy array
if shuffle:
np.random.shuffle(self.dataset)
self.get_data(self.dataset, self.dataset,
x_column, y_column, split, scale)
elif type == 'numpy':
if shuffle:
X, y = self.shuffle_data(X, y)
self.get_data(X, y, x_column, y_column, split, scale)
elif type == 'pandas':
X = X.values
y = y.values if len(y) > 0 else []
if shuffle:
X, y = self.shuffle_data(X, y)
self.get_data(X, y, x_column, y_column, split, scale)
elif type == 'list':
X = np.array(X)
y = np.array(y) if len(y) > 0 else []
if shuffle:
X, y = self.shuffle_data(X, y)
self.get_data(X, y, x_column, y_column, split, scale)
        elif type == 'txt':
            self.dataset = np.loadtxt(X)  # np.loadtxt already returns a numpy array
if shuffle:
np.random.shuffle(self.dataset)
self.get_data(self.dataset, self.dataset,
x_column, y_column, split, scale)
print("Load dataset successfully!")
        if show and len(self.x_train) >= 5:  # print the first 5 samples
print("X")
print(self.x_train[:5])
print("y")
if len(self.y_train) >= 5:
print(self.y_train[:5])
else:
print("None")
def get_data(self, X, y, x_column, y_column, split, scale):
"""通过列号获取真实的训练数据
Args:
X (numpy.ndarray): 自变量.
y (numpy.ndarray): 因变量.
x_column (list): 自变量的列索引集合.
y_column (list): 因变量的列索引集合.
"""
if X.ndim == 1:
X = X.reshape(-1, 1)
        if len(x_column) == 0 and len(X):
            # default to all columns when not specified
            x_column = list(range(X.shape[1]))
        if len(y_column) == 0 and len(y):
            # default to all columns of y when not specified
if y.ndim == 1:
y_column = [0]
else:
y_column = list(range(y.shape[1]))
if len(X):
self.x_train = X[:, x_column]
            if scale:  # normalize the training data; needed for clustering and some classifiers
self.scaler = MinMaxScaler(feature_range=(0, 1))
self.x_train = self.scaler.fit_transform(self.x_train)
        if len(y):
if y.ndim == 1:
y = y.reshape(-1, 1)
self.y_train = y[:, y_column]
        if len(self.y_train):  # works whether y_train is an empty list or an array
            self.dataset = np.concatenate(
                (self.x_train, self.y_train), axis=1)  # concatenate along columns
else:
self.dataset = self.x_train
        if split:  # split into training and test sets
self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
self.x_train, self.y_train, test_size=self.test_size, random_state=42)
def shuffle_data(self, X, y):
if len(X) == len(y):
            c = list(zip(X, y))  # keep the pairing between X and y
np.random.shuffle(c)
X = np.array([t[0] for t in c])
y = np.array([t[1] for t in c])
elif len(X) > 0 and len(y) == 0:
np.random.shuffle(X)
return X, y
def save(self, path="checkpoint.pkl"):
data = {
'model': self.model,
'input_shape': self.x_train.shape,
'demo_input': self.x_train[:1],
}
print("Saving model checkpoints...")
joblib.dump(data, path, compress=3)
print("Saved successfully!")
def load(self, path):
# self.model = joblib.load(path)
model = joblib.load(path)
if isinstance(model, dict):
self.model = model['model']
            try:
                self.demo_input = model['demo_input']
                self.input_shape = model['input_shape']
            except KeyError:  # older checkpoints may not carry these fields
                pass
else:
self.model = model
def reverse_scale(self, data):
return self.scaler.inverse_transform(data)
def get_test_data(self):
return self.x_test, self.y_test
def convert_np(self, data):
if isinstance(data, np.ndarray):
pass
elif isinstance(data, list):
data = np.array(data)
elif isinstance(data, pd.DataFrame):
data = data.values
elif isinstance(data, tuple):
data = np.array(data)
        else:
            raise TypeError("The type {} is not supported".format(type(data)))
return data
def plot(self, X=None, y_true=None):
        # model visualization; unsupported unless overridden by a subclass
raise NotImplementedError(
"Error Code: -405. No implementation of this method.")
def metricplot(self, X=None, y_true=None):
        # model visualization; unsupported unless overridden by a subclass
raise NotImplementedError(
"Error Code: -405. No implementation of this method.")
    def load_tab_data(self, data_path, train_val_ratio=1.0, shuffle=True, random_seed=42, y_type='float', **kw):
# if y_type == 'long' and self.task_type == 'reg':
# y_type = 'float'
data = np.loadtxt(data_path, dtype=float, delimiter=',',skiprows=1) # [120, 4]
x = data[:,:-1]
y = data[:, -1]
y = y.astype(y_type)
if 0 < train_val_ratio < 1:
train_size = int(train_val_ratio * len(x))
val_size = len(x) - train_size
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=train_size, test_size=val_size, random_state=random_seed,shuffle=shuffle)
else:
x_train, y_train = x, y
x_val, y_val = None, None
# if self.task_type == 'cls':
# y_train = y_train.astype(int)
# y_val = y_val.astype(int) if y_val is not None else None
# elif self.task_type =='reg':
# y_train = y_train.astype(float)
# y_val = y_val.astype(float) if y_val is not None else None
self.x_train = x_train
self.y_train = y_train
self.x_test = x_val
self.y_test = y_val
return x_train, y_train, x_val, y_val
def set_para(self, **kw):
for i in kw:
print("Setting {} to {}".format(i, kw[i]))
setattr(self.model, i, kw[i])
    @property
    def para(self):
        # return the wrapped sklearn estimator's current hyper-parameters
        return self.model.get_params()
@para.setter
def para(self, kw):
for i in kw:
print("Setting {} to {}".format(i, kw[i]))
setattr(self.model, i, kw[i])
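    # Usage sketch: both set_para() and the para setter forward hyper-parameters
    # to the wrapped sklearn estimator by attribute name (hypothetical session):
    #   >>> m.set_para(n_neighbors=3)   # equivalent to m.model.n_neighbors = 3
    #   Setting n_neighbors to 3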
    def valid(self, path=None, x=None, y=None, metrics='accuracy'):
        """Evaluate the model on a validation set.
        Args:
            path (str): path to the validation set.
            x (np.ndarray, optional): validation features. Defaults to None.
            y (np.ndarray, optional): validation labels. Defaults to None.
            metrics (str, optional): evaluation metric. Defaults to 'accuracy'.
        Returns:
            score: the value of the chosen metric
            y_pred: the predicted y values
        """
        if path is None and x is None and y is None:  # default to x_test and y_test when nothing is given
x = self.x_test
y = self.y_test
        elif x is None and y is None:  # a path was given but no data; read from the path
df = pd.read_csv(path)
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
self.x_test = x
self.y_test = y
        # the validation features and labels must not be empty
assert x is not None and y is not None, "Error Code: -801. The validation set cannot be empty. "
assert len(x) > 0 and len(y) > 0, "Error Code: -801. The validation set cannot be empty. "
y_pred = self.inference(x)
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
            r2_score, mean_squared_error, mean_absolute_error, roc_auc_score,\
            silhouette_score
        if metrics == 'accuracy' or metrics == 'acc':
            score = accuracy_score(y, y_pred)
            print('Validation accuracy: {}%'.format(score * 100))
        elif metrics == 'precision':
            score = precision_score(y, y_pred, average='weighted')
            print('Validation precision: {}%'.format(score * 100))
        elif metrics == 'recall':
            score = recall_score(y, y_pred, average='weighted')
            print('Validation recall: {}%'.format(score * 100))
        elif metrics == 'f1':
            score = f1_score(y, y_pred, average='weighted')
            print('Validation F1-score: {}%'.format(score * 100))
        elif metrics == 'auc':
            # sklearn's auc() expects curve coordinates; roc_auc_score works on labels/scores
            score = roc_auc_score(y, y_pred)
            print('Validation AUC: {}'.format(score))
        elif metrics == 'r2':
            assert len(y) >= 2, "Error Code: -603. The validation set has less than 2 samples and r2-score cannot be calculated."
            score = r2_score(y, y_pred)
            print('Validation R2 score: {}'.format(score))
        elif metrics == 'mse':
            score = mean_squared_error(y, y_pred)
            print('Validation mean squared error: {}'.format(score))
        elif metrics == 'mae':
            score = mean_absolute_error(y, y_pred)
            print('Validation mean absolute error: {}'.format(score))
        else:
            raise AssertionError("Error Code: -307. The '{}' metric is not currently supported.".format(metrics))
        return score, y_pred
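Even before looking at the subclasses, the base class can be exercised on its own. A minimal sketch of loading a toy dataset and reading back the held-out split (this assumes the package layout above, so `from BaseML import baseml` works; the numbers are made up):

from BaseML import baseml

b = baseml()
b.load_dataset(X=[[1], [2], [3], [4], [5]], y=[1, 2, 3, 4, 5], type='list')
x_test, y_test = b.get_test_data()   # the 20% split held out by load_dataset
print(x_test.shape, y_test.shape)    # (1, 1) (1, 1)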
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from yellowbrick.regressor import PredictionError
import matplotlib.pyplot as plt
import joblib
from .base import baseml
class Regression(baseml):
"""BaseML中的回归模块,包含['LinearRegression'(线性回归), 'CART'(决策树回归), 'RandomForest'(随机森林回归),
'Polynomial'(多项式回归), 'Lasso'(角回归), 'Ridge'(岭回归), 'SVM'(支持向量机回归), 'AdaBoost'(自适应增强回归), 'MLP'(多层感知机回归)]回归算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='LinearRegression', n_estimators=20, degree=2, n_hidden=(100,), para={}):
"""reg类的初始化
Args:
algorithm (str, optional): 选择的回归学习器. Defaults to 'LinearRegression'.
n_estimators (int, optional): RandomForest集成的决策树个数. Defaults to 20.
degree (int, optional): 多项式回归的阶数. Defaults to 2.
para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(Regression, self).__init__()  # run the parent constructor
self.algorithm = algorithm
        if self.algorithm == 'LinearRegression':  # linear regression
            # note: the custom para dict only takes effect when it holds more than one entry
            if len(para) > 1:
                self.model = linear_model.LinearRegression(**para)
            else:
                self.model = linear_model.LinearRegression()
        elif self.algorithm == 'CART':  # decision tree regression
if len(para) > 1:
self.model = tree.DecisionTreeRegressor(**para)
else:
self.model = tree.DecisionTreeRegressor()
        elif self.algorithm == 'RandomForest':  # random forest regression
if len(para) > 1:
self.model = ensemble.RandomForestRegressor(**para)
else:
self.model = ensemble.RandomForestRegressor(
n_estimators=n_estimators)
        elif self.algorithm == 'Polynomial':  # polynomial regression
if len(para) > 1:
self.model = PolynomialFeatures(**para)
self.poly_linear_model = linear_model.LinearRegression()
else:
self.model = PolynomialFeatures(degree=degree)
self.poly_linear_model = linear_model.LinearRegression()
        elif self.algorithm == 'Lasso':  # lasso regression
if len(para) > 1:
self.model = linear_model.Lasso(**para)
else:
self.model = linear_model.Lasso()
        elif self.algorithm == 'Ridge':  # ridge regression
if len(para) > 1:
self.model = linear_model.Ridge(**para)
else:
self.model = linear_model.Ridge()
elif self.algorithm == 'SVM':
if len(para) > 1:
self.model = SVR(**para)
else:
self.model = SVR(degree=degree)
elif self.algorithm == 'AdaBoost':
if len(para) > 1:
self.model = AdaBoostRegressor(**para)
else:
self.model = AdaBoostRegressor(n_estimators=n_estimators)
elif self.algorithm == 'MLP':
if len(para) > 1:
self.model = MLPRegressor(**para)
else:
self.model = MLPRegressor(
hidden_layer_sizes=n_hidden, solver='lbfgs')
    def train(self, validate=False, val_size=0.2, lr=0.001, epochs=200):
        """Train the model.
        Args:
            validate (bool, optional): whether to validate the model and print its score. Defaults to False.
            val_size (float, optional): fraction held out for validation. Defaults to 0.2.
            lr (float, optional): learning rate (MLP/AdaBoost only). Defaults to 0.001.
            epochs (int, optional): number of training epochs (MLP only). Defaults to 200.
        """
if self.algorithm == 'MLP':
self.model.learning_rate_init = lr
self.model.max_iter = epochs
elif self.algorithm == 'AdaBoost':
self.model.learning_rate = lr
        if validate:  # hold out part of the data and report a score
self.x_train, self.x_val, self.y_train, self.y_val = \
train_test_split(self.x_train, self.y_train,
test_size=val_size, random_state=0)
        if self.algorithm == 'Polynomial':
            x_transformed = self.model.fit_transform(
                self.x_train)  # polynomial features of each sample
self.poly_linear_model.fit(x_transformed, self.y_train)
else:
self.model.fit(self.x_train, self.y_train)
if self.algorithm == 'LinearRegression':
self.coef = self.model.coef_
self.intercept = self.model.intercept_
        if validate:
            if len(self.y_val) < 2:
                print("The validation set has fewer than 2 samples; R2 cannot be computed")
            else:
                pred = self.model.predict(self.x_val)
                acc = r2_score(self.y_val, pred)
                print('Validation R2 score: {}'.format(acc))
    def inference(self, data=np.nan):
        """Run inference with the trained model.
        Args:
            data (numpy, optional): data to run inference on; defaults to self.x_test when omitted.
        Returns:
            pred: the predicted values.
        """
x_test = data if data is not np.nan else self.x_test
assert len(x_test) > 0, "Error Code: -601. No dataset is loaded."
x_test = self.convert_np(x_test)
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
if x_test.ndim != 2:
x_test = x_test.reshape(x_test.shape[0], -1)
        if self.algorithm == 'Polynomial':
            x_trans = self.model.transform(x_test)
            self.pred = self.poly_linear_model.predict(x_trans)
        else:
            self.pred = self.model.predict(x_test)
return self.pred
    # overridden method
def save(self, path="checkpoint.pkl"):
print("Saving model checkpoints...")
if self.algorithm == 'Polynomial':
modelList = [self.model, self.poly_linear_model]
data = {
'model': modelList,
'input_shape': self.x_train.shape,
'demo_input': self.x_train[:1],
}
joblib.dump(data, path, compress=3)
else:
data = {
'model': self.model,
'input_shape': self.x_train.shape,
'demo_input': self.x_train[:1],
}
joblib.dump(data, path, compress=3)
print("Saved successfully!")
def load(self, path):
if self.algorithm == 'Polynomial':
self.model = joblib.load(path)['model'][0]
self.poly_linear_model = joblib.load(path)['model'][1]
else:
self.model = joblib.load(path)['model']
def metricplot(self, X=None, y_true=None):
"""绘制模型回归预测误差图, 图中的identity为基准线, 说明预测出的标签(y轴)与
真实标签(x轴)相同。回归模型越靠近基准线则越好。该图显示了回归模型的方差大小。
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
if X is None and y_true is None:
X = self.x_test
y_true = self.y_test
        assert X is not None and y_true is not None, "Error Code: -604. No valid data is provided or the validation dataset is empty."
        assert len(X) > 0 and len(y_true) > 0, "Error Code: -604. No valid data is provided or the validation dataset is empty."
if self.algorithm == 'Polynomial':
from sklearn.pipeline import make_pipeline
model = make_pipeline(self.model, self.poly_linear_model)
else:
model = self.model
visualizer = PredictionError(
model,
title="Actual vs. Predicted Values",
)
y_true = y_true.squeeze()
visualizer.fit(X, y_true)
visualizer.score_ = visualizer.estimator.score(X, y_true)
result = self.inference(X).squeeze()
visualizer.draw(y_true, result)
visualizer.show()
def plot(self, X=None, y_true=None):
"""绘制回归模型图.
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
        # default to x_test and y_test when no input is given
        if X is None:
            assert len(
                self.x_test) > 0, "Error Code: -601. No dataset is loaded."
            X = self.x_test
            y_true = self.y_test
X = self.convert_np(X)
y_pred = self.inference(X)
if y_true is not None:
y_true = self.convert_np(y_true)
        X = X.reshape(X.shape[0], -1)  # reshape to 2-D
if self.algorithm == 'LinearRegression':
self.linear_reg_plot(X[:, 0], y_pred, y_true)
else:
raise AssertionError(
"Error Code: -405. No implementation of this method.")
def linear_reg_plot(self, X, y_pred, y_true=None):
"""绘制线性回归模型图, 仅支持使用1维特征训练的模型.
Args:
X (np.ndarray): 放入的测试数据.
x_pred (np.ndarray): 测试数据的预测标签.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 当被显式填入时才会绘制出真实的散点.
"""
assert self.model.n_features_in_ == 1, \
"Error Code: -306. "\
"The number of features for training is wrong, required {}, "\
"which is {}.".format(1, self.model.n_features_in_)
fig, ax = plt.subplots()
if y_true is not None:
ax.scatter(X, y_true)
ax.plot(X, y_pred, color='red')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.axis('tight')
plt.show()
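An end-to-end sketch of the Regression class on synthetic data (the data and the values in the comments are made up; it assumes BaseML is importable, as in the reference example at the end of this page):

import numpy as np
from BaseML import Regression

X = np.arange(20, dtype=float).reshape(-1, 1)
y = 3 * X.squeeze() + 5 + np.random.randn(20) * 0.1   # noisy line y ≈ 3x + 5

model = Regression('LinearRegression')
model.load_dataset(X, y, type='numpy')   # shuffles, then splits 80/20 by default
model.train()
print(model.coef, model.intercept)       # should be close to 3 and 5
print(model.inference([[100.0]]))        # roughly [[305.]]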
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from yellowbrick.classifier import ClassPredictionError
from .base import baseml
class Classification(baseml):
"""BaseML中的分类模块,包含['KNN'(K近临分类), 'SVM'(支持向量机分类), 'NaiveBayes'(朴素贝叶斯分类), 'CART'(决策树分类),
'AdaBoost'(自适应增强分类), 'MLP'(多层感知机分类), 'RandomForest'(随机森林分类)]分类算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='KNN', n_neighbors=5, n_estimators=100, n_hidden=(100,), para={}):
"""cls类初始化.
Args:
algorithm (str, optional): 采用的分类算法. Defaults to 'KNN'.
n_neighbors (int, optional): KNN的k值. Defaults to 5.
n_estimators (int, optional): Adaboost|RandomForest所集成的决策树个数. Defaults to 100.
n_hidden (tuple, optional): MLP隐藏层的形状. Defaults to (100,).
para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(Classification, self).__init__()  # run the parent constructor
self.algorithm = algorithm
if self.algorithm == 'KNN':
if len(para) > 1:
self.model = KNeighborsClassifier(**para)
else:
self.model = KNeighborsClassifier(n_neighbors=n_neighbors)
elif self.algorithm == 'SVM':
if len(para) > 1:
self.model = SVC(**para)
else:
self.model = SVC()
elif self.algorithm == 'NaiveBayes':
if len(para) > 1:
self.model = GaussianNB(**para)
else:
self.model = GaussianNB()
elif self.algorithm == 'CART':
if len(para) > 1:
self.model = DecisionTreeClassifier(**para)
else:
self.model = DecisionTreeClassifier()
elif self.algorithm == 'AdaBoost':
if len(para) > 1:
self.model = AdaBoostClassifier(**para)
else:
self.model = AdaBoostClassifier(
n_estimators=n_estimators, random_state=0)
elif self.algorithm == 'MLP':
if len(para) > 1:
self.model = MLPClassifier(**para)
else:
self.model = MLPClassifier(
hidden_layer_sizes=n_hidden, solver='lbfgs')
elif self.algorithm == 'RandomForest':
if len(para) > 1:
self.model = RandomForestClassifier(**para)
else:
self.model = RandomForestClassifier(
n_estimators=n_estimators, random_state=0)
    def train(self, validate=False, val_size=0.2, lr=0.001, epochs=200):
        """Train the model.
        Args:
            validate (bool, optional): whether to validate the model and print its accuracy. Defaults to False.
            val_size (float, optional): fraction held out for validation. Defaults to 0.2.
            lr (float, optional): learning rate (MLP/AdaBoost only). Defaults to 0.001.
            epochs (int, optional): number of training epochs (MLP only). Defaults to 200.
        """
if self.algorithm in ['AdaBoost', 'SVM', 'NaiveBayes', 'MLP', 'KNN', 'CART', 'RandomForest']:
            # set the learning rate
if self.algorithm == 'MLP':
self.model.learning_rate_init = lr
self.model.max_iter = epochs
elif self.algorithm == 'AdaBoost':
self.model.learning_rate = lr
if validate:
self.x_train, self.x_val, self.y_train, self.y_val = \
train_test_split(self.x_train, self.y_train,
test_size=val_size, random_state=0)
self.model.fit(self.x_train, self.y_train)
            if validate:
                pred = self.model.predict(self.x_val)
                acc = accuracy_score(self.y_val, pred)
                print('Validation accuracy: {}%'.format(acc * 100))
def inference(self, data=np.nan, verbose=True):
"""使用模型进行推理
Args:
data (np.ndarray, optional): 放进来推理的数据,不填默认使用self.x_test.
verbose (bool, optional): 是否输出推理中的中间结果. Defaults to True.
Returns:
pred: 返回预测结果.
"""
        if data is not np.nan:  # data was explicitly given
x_test = self.convert_np(data)
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
elif len(self.x_train) > 0 and len(self.x_test) == 0:
x_test = self.x_train
else:
x_test = self.x_test
x_test = self.convert_np(x_test)
if self.algorithm in ['AdaBoost', 'SVM', 'NaiveBayes', 'MLP', 'KNN', 'CART', 'RandomForest']:
pred = self.model.predict(x_test)
return pred
def metricplot(self, X=None, y_true=None):
"""绘制模型分类准确率图, 可直观查看每一类的分类正误情况
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
assert len(self.x_train) > 0 and len(self.y_train) > 0, \
"Error Code: -601. No dataset is loaded."
if X is None and y_true is None:
assert len(self.x_test) > 0 and len(
self.y_test) > 0, "Error Code: -602. Dataset split was not performed."
X = self.x_test
y_true = self.y_test
assert len(X) > 0 and len(y_true) > 0
visualizer = ClassPredictionError(
self.model
)
visualizer.fit(self.x_train, self.y_train)
visualizer.score(X, y_true.reshape(-1))
visualizer.show()
def plot(self, X=None, y_true=None):
"""绘制分类模型图
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
# 如果没有任何输入,默认采用x_test和y_test
if X is None:
assert len(
self.x_test) > 0, "Error Code: -602. Dataset split was not performed."
X = self.x_test
X = self.convert_np(X)
y_pred = self.inference(X)
if y_true is not None:
y_true = self.convert_np(y_true)
        X = X.reshape(X.shape[0], -1)  # reshape to 2-D
if self.algorithm == 'KNN':
self.knn_plot(X, y_pred, y_true)
elif self.algorithm == 'SVM':
self.svm_plot(X, y_pred, y_true)
else:
raise AssertionError(
"Error Code: -405. No implementation of this method.")
def knn_plot(self, X, y_pred, y_true=None):
"""绘制KNN分类图, 不同标签的样本用不同颜色点代替。选择2维特征作为xy坐标, 最多选择5个类别进行可视化。
Args:
X (np.ndarray): 放入的测试数据。
y_pred (np.ndarray): 放入的测试数据的预测标签。
y_true (np.ndarray, optional): 放入的测试数据的真实标签。
"""
        # more than 2 feature dimensions: only the first two are used
if X.shape[1] > 2:
print('\033[1;34;1mFeatures is more than 2 dimensions, '
'the first two dimensions are used by default.\033[0m')
label = np.unique(y_pred)
        # visualize at most 5 classes
if len(label) > 5:
label.sort()
label = label[:5]
y_max = label[4]
idx = np.where(y_pred <= y_max)
y_pred = y_pred[idx]
X = X[idx, :].squeeze()
print('\033[1;34;1mThe number of classes is more than 5, '
'the top 5 classes are used by default.\033[0m')
label_list = ["y_pred_" + str(i) for i in range(len(label))]
y_pred_plot = plt.scatter(
X[:, 0], X[:, 1], marker='o', c=y_pred, cmap='rainbow')
handles = y_pred_plot.legend_elements()[0]
        # y_true is only drawn when passed explicitly
if y_true is not None:
true_label = np.unique(y_true)
true_label_list = ["y_true_" + str(i)
for i in range(len(true_label))]
y_true_plot = plt.scatter(
X[:, 0], X[:, 1], marker='s', c=y_true, cmap='viridis', s=10)
handles += y_true_plot.legend_elements()[0]
label_list += true_label_list
plt.legend(handles=handles, labels=label_list)
plt.show()
def svm_plot(self, X, y_pred, y_true=None):
"""绘制SVM分类图, 不同标签的样本用不同颜色点代替, 绘制出SVM分类边界。选择2维特征作为xy坐标。
Args:
X (np.ndarray): 放入的测试数据。
y_pred (np.ndarray): 放入的测试数据的预测标签。
y_true (np.ndarray, optional): 放入的测试数据的真实标签。
"""
assert self.model.n_features_in_ == 2, "Error Code: -306. "\
"The number of features for training is wrong, required {}, "\
"which is {}.".format(2, self.model.n_features_in_)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap="rainbow")
if y_true is not None:
ax.scatter(X[:, 0], X[:, 1], c=y_true,
s=8, cmap="viridis", marker='s')
xlim = ax.get_xlim()
ylim = ax.get_ylim()
        x = np.linspace(xlim[0], xlim[1], 30)  # 30 points across x
        y = np.linspace(ylim[0], ylim[1], 30)  # 30 points across y
_Y, _X = np.meshgrid(y, x)
z = self.model.predict(np.c_[_X.flatten(), _Y.flatten()])
zz = z.reshape(_X.shape)
ax.contour(_X, _Y, zz, colors="k",
levels=[-1, 0, 1], alpha=0.5, linestyles=["--", "-", "--"])
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.show()
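The same flow for Classification, using sklearn's bundled iris data so the sketch runs as-is (assuming BaseML is importable):

from sklearn.datasets import load_iris
from BaseML import Classification

iris = load_iris()
model = Classification('KNN', n_neighbors=3)
model.load_dataset(iris.data, iris.target, type='numpy')  # 80/20 split
model.train(validate=True)    # holds out 20% of the training data and prints the accuracy
print(model.inference(iris.data[:3]))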
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import InterclusterDistance
import matplotlib.pyplot as plt
from .base import baseml
class Cluster(baseml): # cluster
"""BaseML中的聚类模块,包含['Kmeans'(K均值聚类), 'Spectral clustering'(谱聚类), 'Agglomerative clustering'(层次聚类),
'Birch'(二叉可伸缩聚类树聚类)]聚类算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='Kmeans', N_CLUSTERS=5, para={}):
"""clt类初始化
Args:
algorithm (str, optional): 采用的聚类算法. Defaults to 'Kmeans'.
N_CLUSTERS (int, optional): 聚类个数. Defaults to 5.
para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(Cluster, self).__init__()  # run the parent constructor
self.algorithm = algorithm
self.n = N_CLUSTERS
if self.algorithm == 'Kmeans':
if len(para) > 1:
self.model = KMeans(**para)
else:
self.model = KMeans(n_clusters=N_CLUSTERS)
elif self.algorithm == 'Spectral clustering':
if len(para) > 1:
self.model = SpectralClustering(**para)
else:
self.model = SpectralClustering(n_clusters=N_CLUSTERS)
elif self.algorithm == 'Agglomerative clustering':
if len(para) > 1:
self.model = AgglomerativeClustering(**para)
else:
self.model = AgglomerativeClustering(n_clusters=N_CLUSTERS)
elif self.algorithm == 'Birch':
if len(para) > 1:
self.model = Birch(**para)
else:
self.model = Birch(n_clusters=N_CLUSTERS)
def train(self, validate=False):
"""训练模型.
Args:
validate (bool, optional): 是否需要验证模型,并输出模型轮廓系数. Defaults to True.
"""
self.model.fit(self.x_train)
if validate:
score = silhouette_score(self.x_train, labels=self.model.labels_)
print('轮廓系数为:{}'.format(score)) # -1为不正确的聚类,0为重叠聚类,1为正确的聚类
def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
shuffle=True, show=False, split=False, scale=False):
        # clustering does not split the dataset by default
super().load_dataset(X, y, type, x_column, y_column, shuffle, show, split, scale)
def inference(self, data=np.nan, verbose=True):
"""使用模型进行推理
Args:
data (numpy, optional): 放进来推理的数据,不填默认使用self.x_train.
verbose (bool, optional): 是否输出推理中的中间结果. Defaults to True.
Returns:
pred: 返回预测结果.
"""
        if data is not np.nan:  # data was explicitly given
self.x_test = data
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
assert type(self.demo_input) == type(self.x_test), f"Error Code: -309. The data type {type(self.x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == self.x_test.shape[1:], f"Error Code: -309. The data shape {self.x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
else:
self.x_test = self.x_train
self.x_test = self.convert_np(self.x_test)
        if verbose and len(self.x_train) != 0:
            labels = self.model.labels_  # cluster labels of the training data
            # print(silhouette_score(self.x_train, labels))  # overall silhouette score of the clustering
            if self.algorithm == 'Kmeans':
                print(self.model.cluster_centers_)  # print the cluster centers
for i in range(self.n):
print(f" CLUSTER-{i+1} ".center(60, '='))
print(self.x_train[labels == i])
        if len(self.x_test) > 0:
pred = self.model.predict(self.x_test)
return pred
def metricplot(self, X=None):
"""绘制模型聚类簇间距离图, 各簇分的越开, 说明聚类效果越好。
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_train.
"""
assert self.algorithm == 'Kmeans', \
"Error Code: -405. No implementation of this method."
if X is None:
assert len(
self.x_train) > 0, "Error Code: -601. No dataset is loaded."
X = self.x_train
visualizer = InterclusterDistance(self.model)
visualizer.fit(self.x_train) # Fit the data to the visualizer
visualizer.show() # Finalize and render the figure
def plot(self, X=None):
"""绘制聚类模型图
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_train.
"""
assert self.algorithm == 'Kmeans', \
"Error Code: -405. No implementation of this method."
# 如果没有任何输入,默认采用x_train
if X is None:
if len(self.x_train) > 0:
self.x_test = self.x_train
assert len(
self.x_test) > 0, "Error Code: -602. Dataset split was not performed."
X = self.x_test
X = self.convert_np(X)
y_pred = self.inference(X)
self.cluster_plot(X, y_pred)
def cluster_plot(self, X, y_pred):
"""绘制聚类模型散点图,并显示聚类标签
Args:
X (np.ndarray): 放入的测试数据, 不填默认使用self.x_train.
y_pred (np.ndarray): 模型对测试数据预测的类别.
"""
# 训练数据特征多于2维,仅取前两维
if X.shape[1] > 2:
print('\033[1;34;1mfeatures is more than 2 dimensions, \
the first two dimensions are used by default\033[0m')
        # draw points in a different color per cluster
        plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap='viridis')
        # draw the cluster centers
        centers = self.model.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], c='black', s=20, alpha=0.5)
        # annotate each cluster with its index
        for i in range(self.model.cluster_centers_.shape[0]):
            plt.text(centers[:, 0][i]+0.5, y=centers[:, 1][i]+0.5, s=i,
                     fontdict=dict(color='red', size=10),
                     bbox=dict(facecolor='yellow', alpha=0.2),
                     )
        plt.show()  # render the figure, matching the other plot helpers
    def valid(self, path=None, x=None, y=None, metrics='accuracy'):
        """Evaluate the clustering model.
        Args:
            path (str): path to the validation set.
            x (np.ndarray, optional): validation features. Defaults to None.
            y (np.ndarray, optional): validation labels. Defaults to None.
            metrics (str, optional): evaluation metric. Defaults to 'accuracy'.
        Returns:
            score: the value of the chosen metric
            y_pred: the predicted cluster labels
        """
        if path is None and x is None and y is None:  # default to x_train and y_train when nothing is given
x = self.x_train
y = self.y_train
        elif x is None and y is None:  # a path was given but no data; read from the path
df = pd.read_csv(path)
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
self.x_test = x
self.y_test = y
        # the validation features and labels must not be empty
assert x is not None and y is not None, "Error Code: -801. The validation set cannot be empty. "
assert len(x) > 0 and len(y) > 0, "Error Code: -801. The validation set cannot be empty. "
y_pred = self.inference(x)
from sklearn.metrics import silhouette_score, calinski_harabasz_score,davies_bouldin_score
        if metrics == 'silhouette_score':
            score = silhouette_score(x, self.model.labels_)
            print('Validation silhouette score: {}'.format(score))
        elif metrics == 'calinski_harabasz_score':
            score = calinski_harabasz_score(x, self.model.labels_)
            print('Validation Calinski-Harabasz index: {}'.format(score))
        elif metrics == 'davies_bouldin_score':
            score = davies_bouldin_score(x, self.model.labels_)
            print('Validation Davies-Bouldin index: {}'.format(score))
else:
raise AssertionError("Error Code: -307. The '{}' metric is not currently supported.".format(metrics))
        return score, y_pred
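A sketch of the Cluster class on random 2-D points (made-up data; assumes BaseML is importable). Note that Cluster.load_dataset defaults to split=False, so everything stays in x_train:

import numpy as np
from BaseML import Cluster

pts = np.random.rand(100, 2)
model = Cluster('Kmeans', N_CLUSTERS=3)
model.load_dataset(pts, type='numpy')
model.train(validate=True)               # prints the silhouette score
labels = model.inference(verbose=False)  # cluster index for every point
print(labels[:10])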
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import LocallyLinearEmbedding as LLE
from yellowbrick.features import PCA as yb_PCA
from .base import baseml
class DimentionReduction(baseml): # reduction
"""BaseML中的降维模块,包含['PCA'(主成分分析), 'LDA'(线性判别分析), 'LLE'(局部线性嵌入)]降维算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='PCA', n_components=2, para={}):
"""rdc类的构造函数
Args:
algorithm (str, optional): 采用的降维算法. Defaults to 'PCA'.
n_components (int, optional): 降维后保留的特征数. Defaults to 2.
para (dict, optional): para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(DimentionReduction, self).__init__()  # run the parent constructor
self.algorithm = algorithm
self.n_components = n_components
        if self.algorithm == 'PCA':  # principal component analysis
if len(para) > 1:
self.model = PCA(**para)
else:
self.model = PCA(n_components=n_components)
        elif self.algorithm == 'LDA':  # linear discriminant analysis
if len(para) > 1:
self.model = LDA(**para)
else:
self.model = LDA(n_components=n_components)
        elif self.algorithm == 'LLE':  # locally linear embedding
if len(para) > 1:
self.model = LLE(**para)
else:
self.model = LLE(n_components=n_components)
def train(self, validate=True):
"""训练模型.
Args:
validate (bool, optional): 是否需要验证模型,并输出方差贡献率. Defaults to True.
"""
if self.algorithm == 'LDA':
if len(self.y_train) == 0:
raise Exception("使用LDA时必须输入y标签")
self.model.fit(self.x_train, self.y_train)
else:
self.model.fit(self.x_train)
if validate and self.algorithm != 'LLE':
explained_var = self.model.explained_variance_ratio_ # 获取贡献率
print('累计方差贡献率为:{}'.format(explained_var))
def inference(self, data=np.nan):
"""使用模型进行降维
Args:
data (numpy, optional): 放进来降维的数据,不填默认使用self.x_train.
Returns:
pred: 返回降维结果,保留的特征数为刚开始输进来的.
"""
if data is not np.nan: # 对data进行了指定
self.x_test = data
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
x_test = self.convert_np(self.x_test)
assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
else:
self.x_test = self.x_train
self.x_test = self.convert_np(self.x_test)
        if len(self.x_test) > 0:
pred = self.model.transform(self.x_test)
return pred
def fit_transform(self):
        # fit and transform in one step, returning the reduced data
return self.model.fit_transform(self.x_train)
def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
shuffle=True, show=False, split=False, scale=False):
        # dimensionality reduction does not split the dataset by default
super().load_dataset(X, y, type, x_column, y_column, shuffle, show, split, scale)
def plot(self, X=None, y_true=None):
"""绘制降维模型图, 目前仅支持PCA.
Args:
X (np.ndarray, optional): 放入的测试数据, 不输入默认使用self.x_train.
y_true (_type_, optional): 测试数据的真实标签,, 不输入默认使用self.y_train.
"""
assert self.algorithm == 'PCA', "Error Code: -405. No implementation of this method."
# 如果没有任何输入,默认采用x_train
if X is None:
if len(self.x_train) > 0:
self.x_test = self.x_train
assert len(
self.x_test) > 0, "Error Code: -601. No dataset is loaded."
X = self.x_test
y_true = self.y_train
X = self.convert_np(X)
assert y_true is not None and len(y_true) > 0, \
"Error Code: -307. The parameter {} is not set.".format("y_true")
y_true = self.convert_np(y_true)
self.pca_projection(X, y_true)
def pca_projection(self, X, y_true):
"""绘制PCA投影图, 能够投影至2维或3维中, 检验数据降维的可行性
Args:
X (np.ndarray, optional): 放入的测试数据, 不输入默认使用self.x_train.
y_true (_type_, optional): 测试数据的真实标签,, 不输入默认使用self.y_train.
"""
proj = self.n_components
proj = min(proj, 3)
label = np.unique(y_true)
classes = ['class_%i' % i for i in range(len(label))]
visualizer = yb_PCA(scale=True, projection=proj, classes=classes)
visualizer.fit_transform(X, y_true.squeeze())
visualizer.show()
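And a sketch of DimentionReduction with PCA on the iris features (assuming BaseML is importable):

from sklearn.datasets import load_iris
from BaseML import DimentionReduction

iris = load_iris()
model = DimentionReduction('PCA', n_components=2)
model.load_dataset(iris.data, type='numpy')  # no split for dimensionality reduction
model.train()                                # prints the explained variance ratios
reduced = model.inference()
print(reduced.shape)                         # (150, 2)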
Reference example
from BaseML import Regression as reg                      # import the regression task module
model = reg('LinearRegression')                           # instantiate a linear regression model
model.load_tab_data('data/shoe_size-height1.csv')         # load the training data
model.train()                                             # train the model
model.valid('data/shoe_size-height1.csv', metrics='r2')   # evaluate the model
model.metricplot()                                        # visualize the evaluation result
model.plot()                                              # visualize the model's predictions
model.save('data/mymodel.pkl')                            # save the model

# fill in an actual value here
shoe_size = 40.5
result = model.inference([shoe_size])                     # predict the height
print("\nFor shoe size", shoe_size, "the predicted height is:", f"{result[0]:.2f}")
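A follow-up sketch (hypothetical paths, mirroring the example above): reload the saved checkpoint into a fresh instance and run inference without retraining:

from BaseML import Regression as reg

model = reg('LinearRegression')
model.load('data/mymodel.pkl')    # restores the estimator saved by model.save()
print(model.inference([40.5]))    # same prediction as before saving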

浙公网安备 33010602011771号