机器学习:BaseML 到底是什么

from .base import baseml
from .BaseClassification import Classification
from .BaseRegression import Regression
from .BaseCluster import Cluster
from .BaseDimentionReduction import DimentionReduction


# Public API of the package: the shared base class plus one class per task family.
__all__ = [
    "baseml",
    "Classification",
    "Regression",
    "Cluster",
    "DimentionReduction",
]

 

 
# BaseML base class: every task-specific class inherits its common methods from it.
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import joblib


class baseml:
    """BaseML中的继承基类,单例模式避免多次调用创建

    """

    def __init__(self):

        self.cwd = os.path.dirname(os.getcwd())  # 获取当前文件的绝对路径
        self.file_dirname = os.path.dirname(os.path.abspath(__file__))
        self.x_train, self.x_test, self.y_train, self.y_test, self.x_val, self.y_val = [
        ], [], [], [], [], []
        self.X = []
        self.Y = []
        self.dataset = []
        self.model = None
        self.test_size = 0.2
        self.scaler = None
        self.demo_input = None
        self.input_shape = None

    # 采用单例,避免基类创建太多次
    def __new__(cls, *args, **kwargs):
        # print("__new__")
        if not hasattr(baseml, "_instance"):
            # print("创建新实例")
            baseml._instance = object.__new__(cls)
        return baseml._instance

    def train(self):
        # 必须要改写的类
        raise NotImplementedError("train function must be implemented")

    def inference(self):
        # 必须要改写的类
        raise NotImplementedError("inference function must be implemented")

    def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
                     shuffle=True, show=False, split=True, scale=False):
        """Load the model's data set.

        Args:
            X (str|numpy|pandas|list): 自变量.
            y (str|numpy|pandas|list, optional): 目标值. 默认为 [].
            type (str, optional): X和y的输入格式, choice = ['csv', 'numpy','pandas','list','txt], 最后统一转换为numpy. 
            x_column (list, optional): X 的索引列. 默认设置为X的所有列.
            y_column (list, optional): y的索引列. 默认设置为y的所有列.
            shuffle (bool, optional): 是否对元素随机排序. 默认为True.
            show (bool, optional): 显示5条数据. 默认为True.
            split(bool, optional): 是否划分数据集为训练集和测试集. 默认为True.
            scale(bool, optional): 是否对数据进行归一化. False.

        """
        if (type == 'csv' or type == 'txt') and len(x_column) == 0:
            raise ValueError("请传入数据列号")
        if type == 'csv':
            self.dataset = pd.read_csv(X).values  # .values就转成numpy格式了
            if shuffle:
                np.random.shuffle(self.dataset)
            self.get_data(self.dataset, self.dataset,
                          x_column, y_column, split, scale)
        elif type == 'numpy':
            if shuffle:
                X, y = self.shuffle_data(X, y)
            self.get_data(X, y, x_column, y_column, split, scale)
        elif type == 'pandas':
            X = X.values
            y = y.values if len(y) > 0 else []
            if shuffle:
                X, y = self.shuffle_data(X, y)
            self.get_data(X, y, x_column, y_column, split, scale)
        elif type == 'list':
            X = np.array(X)
            y = np.array(y) if len(y) > 0 else []
            if shuffle:
                X, y = self.shuffle_data(X, y)
            self.get_data(X, y, x_column, y_column, split, scale)
        elif type == 'txt':
            self.dataset = np.loadtxt(X)
            self.dataset = self.dataset.values
            if shuffle:
                np.random.shuffle(self.dataset)
            self.get_data(self.dataset, self.dataset,
                          x_column, y_column, split, scale)

        print("Load dataset successfully!")

        if show and len(self.x_train) >= 5:   # 显示前5条数据
            print("X")
            print(self.x_train[:5])
            print("y")
            if len(self.y_train) >= 5:
                print(self.y_train[:5])
            else:
                print("None")

    def get_data(self, X, y, x_column, y_column, split, scale):
        """通过列号获取真实的训练数据

        Args:
            X (numpy.ndarray): 自变量.
            y (numpy.ndarray): 因变量.
            x_column (list): 自变量的列索引集合.
            y_column (list): 因变量的列索引集合.
        """
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if len(x_column) == 0 and len(X):
            # 如果没有赋值,那么默认选用所有列
            x_column = list(range(X.shape[1]))
        if len(y_column) == 0 and len(y):
            # 如果没有赋值,默认用y的所有列
            if y.ndim == 1:
                y_column = [0]
            else:
                y_column = list(range(y.shape[1]))

        if len(X):
            self.x_train = X[:, x_column]

        if scale:  # 对训练数据进行归一化,在聚类、部分分类的时候需要使用
            self.scaler = MinMaxScaler(feature_range=(0, 1))
            self.x_train = self.scaler.fit_transform(self.x_train)

        if len(y):  #
            if y.ndim == 1:
                y = y.reshape(-1, 1)
            self.y_train = y[:, y_column]
            if self.y_train.shape[0]:
                self.dataset = np.concatenate(
                    (self.x_train, self.y_train), axis=1)  # 按列进行拼接
                
        else:
            self.dataset = self.x_train

        if split:   # 进行数据集划分
            self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
                self.x_train,  self.y_train, test_size=self.test_size, random_state=42)

    def shuffle_data(self, X, y):
        if len(X) == len(y):
            c = list(zip(X, y))  # 保持X与y的对应关系
            np.random.shuffle(c)
            X = np.array([t[0] for t in c])
            y = np.array([t[1] for t in c])
        elif len(X) > 0 and len(y) == 0:
            np.random.shuffle(X)

        return X, y

    def save(self, path="checkpoint.pkl"):
        """Serialise the model plus input metadata to ``path`` with joblib.

        The checkpoint is a dict with the keys ``'model'``, ``'input_shape'``
        and ``'demo_input'`` (the first training row).

        Args:
            path (str, optional): Output file. Defaults to "checkpoint.pkl".
        """
        x_train = self.x_train
        if hasattr(x_train, 'shape'):
            input_shape, demo_input = x_train.shape, x_train[:1]
        elif x_train is not None and len(x_train) > 0:
            x_train = np.asarray(x_train)
            input_shape, demo_input = x_train.shape, x_train[:1]
        else:
            # No training data on this instance (e.g. the model was restored
            # with load()): keep whatever metadata is already stored instead
            # of failing on the empty placeholder list.
            input_shape = getattr(self, 'input_shape', None)
            demo_input = getattr(self, 'demo_input', None)

        data = {
            'model': self.model,
            'input_shape': input_shape,
            'demo_input': demo_input,
        }
        print("Saving model checkpoints...")
        joblib.dump(data, path, compress=3)
        print("Saved successfully!")

    def load(self, path):
        """Restore a model saved by ``save`` (or a bare pickled estimator).

        Args:
            path (str): Checkpoint file to read.

        Raises:
            KeyError: If the checkpoint is a dict without a ``'model'`` entry.
        """
        # NOTE(security): joblib.load unpickles the file and can therefore
        # execute arbitrary code; only load checkpoints from trusted sources.
        checkpoint = joblib.load(path)

        # Start from "no metadata" so values from a previously loaded
        # checkpoint can never be applied to this model.
        self.demo_input = None
        self.input_shape = None

        if not isinstance(checkpoint, dict):
            # Legacy format: the file holds the estimator itself.
            self.model = checkpoint
            return

        self.model = checkpoint['model']
        # Metadata is optional. Restore it only when complete, because the
        # subclass inference checks use both values together.
        if 'demo_input' in checkpoint and 'input_shape' in checkpoint:
            self.demo_input = checkpoint['demo_input']
            self.input_shape = checkpoint['input_shape']
        


    def reverse_scale(self, data):
        return self.scaler.inverse_transform(data)

    def get_test_data(self):
        return self.x_test, self.y_test

    def convert_np(self, data):
        if isinstance(data, np.ndarray):
            pass
        elif isinstance(data, list):
            data = np.array(data)
        elif isinstance(data, pd.DataFrame):
            data = data.values
        elif isinstance(data, tuple):
            data = np.array(data)
        else:
            TypeError("The type {} is not supported".format(type(data)))
        return data

    def plot(self, X=None, y_true=None):
        # 模型可视化,若不被改写则不被支持
        raise NotImplementedError(
            "Error Code: -405. No implementation of this method.")

    def metricplot(self, X=None, y_true=None):
        # 模型可视化,若不被改写则不被支持
        raise NotImplementedError(
            "Error Code: -405. No implementation of this method.")

    def load_tab_data(self, data_path, train_val_ratio=1.0, shuffle=True,random_seed=42,y_type='float',**kw):
        # if y_type == 'long' and self.task_type == 'reg':
        #     y_type = 'float'
        data = np.loadtxt(data_path, dtype=float, delimiter=',',skiprows=1) # [120, 4]
        x = data[:,:-1]
        y = data[:, -1]
        y = y.astype(y_type)
        if 0 < train_val_ratio < 1:
            train_size =  int(train_val_ratio * len(x))
            val_size =  len(x) - train_size

            x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=train_size, test_size=val_size, random_state=random_seed,shuffle=shuffle)
        else:
            x_train, y_train = x, y
            x_val, y_val = None, None

        # if self.task_type == 'cls':
        #     y_train = y_train.astype(int)
        #     y_val = y_val.astype(int) if y_val is not None else None
        # elif self.task_type =='reg':
        #     y_train = y_train.astype(float)
        #     y_val = y_val.astype(float) if y_val is not None else None
        
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_val
        self.y_test = y_val
        return x_train, y_train, x_val, y_val
    
    def set_para(self, **kw):
        for i in kw:
            print("Setting {} to {}".format(i, kw[i]))
            setattr(self.model, i, kw[i])

    @property
    def para(self):
        return self.para

    @para.setter
    def para(self, kw):
        for i in kw:
            print("Setting {} to {}".format(i, kw[i]))
            setattr(self.model, i, kw[i])

    def valid(self, path=None, x=None ,y=None ,metrics='accuracy'):
        """验证模型的准确率

        Args:
            path (str): 验证集的路径
            x (np.ndarray, optional): 验证集的特征. Defaults to None.
            y (np.ndarray, optional): 验证集的标签. Defaults to None.
            metrics (str, optional): 验证集的评估指标. Defaults to 'accuracy'.

        Returns:
            acc: 返回验证指标的值
            y_pred: 返回预测y值
        """
        if path is None and x is None and y is None: # 如果没有输入数据,默认采用x_test和y_test
            x = self.x_test
            y = self.y_test
        elif x is None and y is None: # 如果输入了路径,但是没有输入数据,则读取路径
            df = pd.read_csv(path)
            x = df.iloc[:, :-1].values
            y = df.iloc[:, -1].values
            self.x_test = x
            self.y_test = y

        # 验证集的特征和标签不能为空
        assert x is not None and y is not None,  "Error Code: -801. The validation set cannot be empty. "
        assert len(x) > 0 and len(y) > 0,  "Error Code: -801. The validation set cannot be empty. "
        
        y_pred = self.inference(x)

        from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,\
        r2_score,mean_squared_error,mean_absolute_error,auc,\
        silhouette_score
        if metrics == 'accuracy' or metrics=='acc':
            score = accuracy_score(y, y_pred)
            print('验证准确率为:{}%'.format(score * 100))
        elif metrics == 'precision':
            score = precision_score(y, y_pred,average='weighted')
            print('验证精确率为:{}%'.format(score * 100))

        elif metrics =='recall':
            score = recall_score(y, y_pred,average='weighted')
            print('验证召回率为:{}%'.format(score * 100))

        elif metrics == 'f1':
            score = f1_score(y, y_pred,average='weighted')
            print('验证f1-score为:{}%'.format(score * 100))
        
        elif metrics == 'auc':
            score = auc(y, y_pred)
            print('验证auc为:{}%'.format(score * 100))
        
        elif metrics == 'r2':
            assert len(y) >= 2, "Error Code: -603. The validation set has less than 2 samples and r2-score cannot be calculated."
            score = r2_score(y, y_pred)
            
            print('验证r2-score为:{}%'.format(score * 100))
            
        elif metrics =='mse':
            score = mean_squared_error(y, y_pred)
            print('验证均方误差为:{}%'.format(score * 100))

        elif metrics =='mae':
            score = mean_absolute_error(y, y_pred)
            print('验证平均绝对误差为:{}%'.format(score * 100))

        else:
            raise AssertionError("Error Code: -307. The '{}' metric is not currently supported.".format(metrics))
        return score,y_pred

  

 
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from yellowbrick.regressor import PredictionError
import matplotlib.pyplot as plt
import joblib
from .base import baseml


class Regression(baseml):
    """BaseML中的回归模块,包含['LinearRegression'(线性回归), 'CART'(决策树回归), 'RandomForest'(随机森林回归),
       'Polynomial'(多项式回归), 'Lasso'(角回归), 'Ridge'(岭回归), 'SVM'(支持向量机回归), 'AdaBoost'(自适应增强回归), 'MLP'(多层感知机回归)]回归算法.

    Attributes:
        algorithm: 算法名称
        model: 实例化的模型
    
    更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
    """

    def __init__(self, algorithm='LinearRegression', n_estimators=20, degree=2, n_hidden=(100,), para={}):
        """Create the regressor selected by ``algorithm``.

        Args:
            algorithm (str, optional): One of 'LinearRegression', 'CART',
                'RandomForest', 'Polynomial', 'Lasso', 'Ridge', 'SVM',
                'AdaBoost', 'MLP'. Any other value leaves ``self.model`` as
                ``None``. Defaults to 'LinearRegression'.
            n_estimators (int, optional): Number of estimators for
                'RandomForest' and 'AdaBoost'. Defaults to 20.
            degree (int, optional): Degree for 'Polynomial' and 'SVM'.
                Defaults to 2.
            n_hidden (tuple, optional): Hidden layer sizes for 'MLP'.
                Defaults to (100,).
            para (dict, optional): Keyword arguments passed straight to the
                estimator. When non-empty it replaces the defaults derived
                from the arguments above. Defaults to {}.
        """
        super(Regression, self).__init__()
        self.algorithm = algorithm

        # algorithm name -> (estimator class, kwargs used when `para` is empty)
        registry = {
            'LinearRegression': (linear_model.LinearRegression, {}),
            'CART': (tree.DecisionTreeRegressor, {}),
            'RandomForest': (ensemble.RandomForestRegressor,
                             {'n_estimators': n_estimators}),
            'Polynomial': (PolynomialFeatures, {'degree': degree}),
            'Lasso': (linear_model.Lasso, {}),
            'Ridge': (linear_model.Ridge, {}),
            'SVM': (SVR, {'degree': degree}),
            'AdaBoost': (AdaBoostRegressor, {'n_estimators': n_estimators}),
            'MLP': (MLPRegressor,
                    {'hidden_layer_sizes': n_hidden, 'solver': 'lbfgs'}),
        }
        if algorithm in registry:
            estimator_cls, default_kwargs = registry[algorithm]
            # Any non-empty `para` wins. The previous `len(para) > 1` test
            # silently ignored a dict holding exactly one parameter.
            kwargs = dict(para) if para else default_kwargs
            self.model = estimator_cls(**kwargs)
            if algorithm == 'Polynomial':
                # self.model only expands the features; this model fits them.
                self.poly_linear_model = linear_model.LinearRegression()

    def train(self, validate=False,val_size=0.2, lr=0.001,epochs=200):
        """Fit the model on ``self.x_train`` / ``self.y_train``.

        Args:
            validate (bool, optional): Hold out part of the training data and
                print the R2 score on it. The hold-out is removed from
                ``self.x_train``/``self.y_train`` and kept in
                ``self.x_val``/``self.y_val``. Defaults to False.
            val_size (float, optional): Hold-out fraction. Defaults to 0.2.
            lr (float, optional): Learning rate; applied to 'MLP' and
                'AdaBoost' only. Defaults to 0.001.
            epochs (int, optional): Maximum iterations; applied to 'MLP'
                only. Defaults to 200.
        """
        if self.algorithm == 'MLP':
            self.model.learning_rate_init = lr
            self.model.max_iter = epochs
        elif self.algorithm == 'AdaBoost':
            self.model.learning_rate = lr

        if validate:
            self.x_train, self.x_val, self.y_train, self.y_val = \
                train_test_split(self.x_train, self.y_train,
                                 test_size=val_size, random_state=0)

        if self.algorithm == 'Polynomial':
            # Expand to polynomial features, then fit the linear model on them.
            x_transformed = self.model.fit_transform(self.x_train)
            self.poly_linear_model.fit(x_transformed, self.y_train)
        else:
            self.model.fit(self.x_train, self.y_train)

        if self.algorithm == 'LinearRegression':
            self.coef = self.model.coef_
            self.intercept = self.model.intercept_

        if validate:
            # `len(self.y_val < 2)` measured the length of a boolean array,
            # so this branch always reported "too few samples".
            if len(self.y_val) < 2:
                print("测试集小于2个样本,无法使用R值计算")
            else:
                if self.algorithm == 'Polynomial':
                    # PolynomialFeatures has no predict(); go through both stages.
                    pred = self.poly_linear_model.predict(
                        self.model.transform(self.x_val))
                else:
                    pred = self.model.predict(self.x_val)
                acc = r2_score(self.y_val, pred)
                # Scaled to a percentage to match the '%' in the message.
                print('R值为: {}%'.format(acc * 100))

    def inference(self, data=np.nan):
        """Predict targets for ``data``.

        Args:
            data (array-like, optional): Samples to predict. When omitted,
                ``self.x_test`` is used.

        Returns:
            numpy.ndarray: The predictions (also stored in ``self.pred``).

        Raises:
            AssertionError: If there is no data to predict (-601), or the data
                does not match the type/shape recorded in
                ``self.demo_input``/``self.input_shape`` (-309).
        """
        x_test = self.x_test if data is np.nan else data
        # Raised explicitly (not via `assert`) so the checks survive `python -O`.
        if x_test is None or len(x_test) == 0:
            raise AssertionError("Error Code: -601. No dataset is loaded.")
        x_test = self.convert_np(x_test)

        if self.input_shape is not None:
            # Build the hint from the dimensions. Replacing the batch size in
            # the shape's string form also rewrote other dimensions containing
            # the same digits, e.g. (4, 4) became "(batch, batch)".
            dims = ['batch'] + [str(d) for d in self.input_shape[1:]]
            model_input_shape = "({}{})".format(
                ", ".join(dims), "," if len(dims) == 1 else "")
            if type(self.demo_input) is not type(x_test):
                raise AssertionError(f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}.")
            if self.input_shape[1:] != x_test.shape[1:]:
                raise AssertionError(f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}.")

        if x_test.ndim != 2:
            # Flatten everything but the sample axis.
            x_test = x_test.reshape(x_test.shape[0], -1)
        if self.algorithm == 'Polynomial':
            x_trans = self.model.transform(x_test)
            self.pred = self.poly_linear_model.predict(x_trans)
        else:
            self.pred = self.model.predict(x_test)

        return self.pred

    # 重写方法
    def save(self, path="checkpoint.pkl"):
        """Serialise the model plus input metadata to ``path`` with joblib.

        For 'Polynomial' the ``'model'`` entry is the list
        ``[feature_transformer, linear_model]``; otherwise it is the
        estimator itself.

        Args:
            path (str, optional): Output file. Defaults to "checkpoint.pkl".
        """
        print("Saving model checkpoints...")

        if self.algorithm == 'Polynomial':
            model = [self.model, self.poly_linear_model]
        else:
            model = self.model

        x_train = self.x_train
        if hasattr(x_train, 'shape'):
            input_shape, demo_input = x_train.shape, x_train[:1]
        elif x_train is not None and len(x_train) > 0:
            x_train = np.asarray(x_train)
            input_shape, demo_input = x_train.shape, x_train[:1]
        else:
            # No training data on this instance (e.g. the model was loaded
            # from a checkpoint): reuse stored metadata instead of failing on
            # the empty placeholder list.
            input_shape = getattr(self, 'input_shape', None)
            demo_input = getattr(self, 'demo_input', None)

        data = {
            'model': model,
            'input_shape': input_shape,
            'demo_input': demo_input,
        }
        joblib.dump(data, path, compress=3)
        print("Saved successfully!")

    def load(self, path):
        """Restore a model written by ``save``.

        Args:
            path (str): Checkpoint file to read.
        """
        # NOTE(security): joblib.load unpickles the file and can therefore
        # execute arbitrary code; only load checkpoints from trusted sources.
        checkpoint = joblib.load(path)  # read once, not once per sub-model
        # Checkpoints are dicts with a 'model' entry; a bare pickled model is
        # accepted as well, matching the base class loader.
        model = checkpoint['model'] if isinstance(checkpoint, dict) else checkpoint

        if self.algorithm == 'Polynomial':
            # Stored as [feature_transformer, linear_model].
            self.model = model[0]
            self.poly_linear_model = model[1]
        else:
            self.model = model

    def metricplot(self, X=None, y_true=None):
        """Draw a prediction-error plot (actual vs. predicted values).

        The closer the points lie to the identity line, the better the
        predictions match the true values.

        Args:
            X (np.ndarray, optional): Test features. ``self.x_test`` is used
                when both ``X`` and ``y_true`` are omitted.
            y_true (np.ndarray, optional): True targets. ``self.y_test`` is
                used when both ``X`` and ``y_true`` are omitted.
        """
        if X is None and y_true is None:
            X, y_true = self.x_test, self.y_test

        empty_msg = "Error Code: -604. No valid data is provided or the validataion dataset is empty."
        assert X is not None and y_true is not None, empty_msg
        assert len(X) > 0 and len(y_true) > 0, empty_msg

        if self.algorithm == 'Polynomial':
            # Chain the feature expansion and the linear model so the
            # visualizer sees a single estimator.
            from sklearn.pipeline import make_pipeline
            estimator = make_pipeline(self.model, self.poly_linear_model)
        else:
            estimator = self.model

        visualizer = PredictionError(
            estimator, title="Actual vs. Predicted Values")

        y_true = y_true.squeeze()
        visualizer.fit(X, y_true)
        # The score is set and the points drawn by hand from this class's own
        # inference results.
        visualizer.score_ = visualizer.estimator.score(X, y_true)
        predictions = self.inference(X).squeeze()
        visualizer.draw(y_true, predictions)
        visualizer.show()

    def plot(self, X=None, y_true=None):
        """Plot the fitted regression model ('LinearRegression' only).

        Args:
            X (np.ndarray, optional): Features to plot. When omitted,
                ``self.x_test`` and ``self.y_test`` are used.
            y_true (np.ndarray, optional): True targets; used only when ``X``
                is given.

        Raises:
            AssertionError: If no data is available (-601) or the algorithm
                has no plot implementation (-405).
        """
        if X is None:
            # `len(...) is not None` was always true; check for real data.
            if self.x_test is None or len(self.x_test) == 0:
                raise AssertionError("Error Code: -601. No dataset is loaded.")
            X = self.x_test
            y_true = self.y_test
        X = self.convert_np(X)
        y_pred = self.inference(X)
        if y_true is not None:
            y_true = self.convert_np(y_true)
        X = X.reshape(X.shape[0], -1)   # force 2-D: (samples, features)
        if self.algorithm == 'LinearRegression':
            self.linear_reg_plot(X[:, 0], y_pred, y_true)
        else:
            raise AssertionError(
                "Error Code: -405. No implementation of this method.")

    def linear_reg_plot(self, X, y_pred, y_true=None):
        """Plot a linear regression model trained on exactly one feature.

        Args:
            X (np.ndarray): Feature values for the x axis.
            y_pred (np.ndarray): Predicted targets, drawn as a red line.
            y_true (np.ndarray, optional): True targets; drawn as scatter
                points only when given.
        """
        n_features = self.model.n_features_in_
        assert n_features == 1, (
            "Error Code: -306. "
            "The number of features for training is wrong, required {}, "
            "which is {}.".format(1, n_features))

        figure, axes = plt.subplots()
        if y_true is not None:
            axes.scatter(X, y_true)
        axes.plot(X, y_pred, color='red')
        axes.set_xlabel('x')
        axes.set_ylabel('y')
        axes.axis('tight')

        plt.show()

  

 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from yellowbrick.classifier import ClassPredictionError
from .base import baseml


class Classification(baseml):
    """BaseML中的分类模块,包含['KNN'(K近临分类), 'SVM'(支持向量机分类), 'NaiveBayes'(朴素贝叶斯分类), 'CART'(决策树分类), 
        'AdaBoost'(自适应增强分类), 'MLP'(多层感知机分类), 'RandomForest'(随机森林分类)]分类算法.

    Attributes:
        algorithm: 算法名称
        model: 实例化的模型

    更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
    """

    def __init__(self, algorithm='KNN', n_neighbors=5, n_estimators=100, n_hidden=(100,), para={}):
        """Create the classifier selected by ``algorithm``.

        Args:
            algorithm (str, optional): One of 'KNN', 'SVM', 'NaiveBayes',
                'CART', 'AdaBoost', 'MLP', 'RandomForest'. Any other value
                leaves ``self.model`` as ``None``. Defaults to 'KNN'.
            n_neighbors (int, optional): k for 'KNN'. Defaults to 5.
            n_estimators (int, optional): Number of estimators for 'AdaBoost'
                and 'RandomForest'. Defaults to 100.
            n_hidden (tuple, optional): Hidden layer sizes for 'MLP'.
                Defaults to (100,).
            para (dict, optional): Keyword arguments passed straight to the
                estimator. When non-empty it replaces the defaults derived
                from the arguments above. Defaults to {}.
        """
        super(Classification, self).__init__()
        self.algorithm = algorithm

        # algorithm name -> (estimator class, kwargs used when `para` is empty)
        registry = {
            'KNN': (KNeighborsClassifier, {'n_neighbors': n_neighbors}),
            'SVM': (SVC, {}),
            'NaiveBayes': (GaussianNB, {}),
            'CART': (DecisionTreeClassifier, {}),
            'AdaBoost': (AdaBoostClassifier,
                         {'n_estimators': n_estimators, 'random_state': 0}),
            'MLP': (MLPClassifier,
                    {'hidden_layer_sizes': n_hidden, 'solver': 'lbfgs'}),
            'RandomForest': (RandomForestClassifier,
                             {'n_estimators': n_estimators, 'random_state': 0}),
        }
        if algorithm in registry:
            estimator_cls, default_kwargs = registry[algorithm]
            # Any non-empty `para` wins. The previous `len(para) > 1` test
            # silently ignored a dict holding exactly one parameter.
            kwargs = dict(para) if para else default_kwargs
            self.model = estimator_cls(**kwargs)

    def train(self, validate=False,val_size=0.2, lr=0.001,epochs=200):
        """Fit the model on ``self.x_train`` / ``self.y_train``.

        Does nothing when ``self.algorithm`` is not a supported name.

        Args:
            validate (bool, optional): Hold out part of the training data and
                print the accuracy on it. The hold-out is removed from
                ``self.x_train``/``self.y_train`` and kept in
                ``self.x_val``/``self.y_val``. Defaults to False.
            val_size (float, optional): Hold-out fraction. Defaults to 0.2.
            lr (float, optional): Learning rate; applied to 'MLP' and
                'AdaBoost' only. Defaults to 0.001.
            epochs (int, optional): Maximum iterations; applied to 'MLP'
                only. Defaults to 200.
        """
        supported = ['AdaBoost', 'SVM', 'NaiveBayes', 'MLP', 'KNN', 'CART', 'RandomForest']
        if self.algorithm not in supported:
            return

        # Forward the optimisation settings to the estimators that use them.
        if self.algorithm == 'MLP':
            self.model.learning_rate_init = lr
            self.model.max_iter = epochs
        elif self.algorithm == 'AdaBoost':
            self.model.learning_rate = lr

        if validate:
            parts = train_test_split(self.x_train, self.y_train,
                                     test_size=val_size, random_state=0)
            self.x_train, self.x_val, self.y_train, self.y_val = parts

        self.model.fit(self.x_train, self.y_train)

        if validate:
            # The score is measured on the held-out validation part.
            predictions = self.model.predict(self.x_val)
            accuracy = accuracy_score(self.y_val, predictions)
            print('训练准确率为:{}%'.format(accuracy * 100))

    def inference(self, data=np.nan, verbose=True):
        """Predict class labels.

        Args:
            data (array-like, optional): Samples to predict. When omitted,
                ``self.x_test`` is used, or ``self.x_train`` if there is no
                test data.
            verbose (bool, optional): Accepted for compatibility; not used by
                this implementation. Defaults to True.

        Returns:
            numpy.ndarray | None: The predictions, or ``None`` when
            ``self.algorithm`` is not a supported name.

        Raises:
            AssertionError: If explicitly passed data does not match the
                type/shape recorded in ``self.demo_input``/``self.input_shape``
                (-309).
        """
        if data is not np.nan:  # data was passed explicitly
            x_test = self.convert_np(data)
            if self.input_shape is not None:
                # Build the hint from the dimensions. Replacing the batch size
                # in the shape's string form also rewrote other dimensions
                # containing the same digits, e.g. (4, 4) -> "(batch, batch)".
                dims = ['batch'] + [str(d) for d in self.input_shape[1:]]
                model_input_shape = "({}{})".format(
                    ", ".join(dims), "," if len(dims) == 1 else "")
                # Raised explicitly so the checks survive `python -O`.
                if type(self.demo_input) is not type(x_test):
                    raise AssertionError(f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}.")
                if self.input_shape[1:] != x_test.shape[1:]:
                    raise AssertionError(f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}.")
        else:
            # x_test may be None (load_tab_data stores None when no split is
            # made), so test for None before calling len().
            has_test = self.x_test is not None and len(self.x_test) > 0
            has_train = self.x_train is not None and len(self.x_train) > 0
            source = self.x_train if (has_train and not has_test) else self.x_test
            x_test = self.convert_np(source)

        if self.algorithm in ['AdaBoost', 'SVM', 'NaiveBayes', 'MLP', 'KNN', 'CART', 'RandomForest']:
            pred = self.model.predict(x_test)
            return pred

    def metricplot(self, X=None, y_true=None):
        """Draw a class-prediction-error chart (right/wrong counts per class).

        Args:
            X (np.ndarray, optional): Test features. ``self.x_test`` is used
                when both ``X`` and ``y_true`` are omitted.
            y_true (np.ndarray, optional): True labels. ``self.y_test`` is
                used when both ``X`` and ``y_true`` are omitted.
        """
        no_data_msg = "Error Code: -601. No dataset is loaded."
        assert len(self.x_train) > 0, no_data_msg
        assert len(self.y_train) > 0, no_data_msg

        if X is None and y_true is None:
            no_split_msg = "Error Code: -602. Dataset split was not performed."
            assert len(self.x_test) > 0, no_split_msg
            assert len(self.y_test) > 0, no_split_msg
            X, y_true = self.x_test, self.y_test
        assert len(X) > 0 and len(y_true) > 0

        visualizer = ClassPredictionError(self.model)
        # The visualizer is fitted on the training data and scored on the
        # supplied evaluation data (labels flattened to 1-D).
        visualizer.fit(self.x_train, self.y_train)
        visualizer.score(X, y_true.reshape(-1))
        visualizer.show()

    def plot(self, X=None, y_true=None):
        """Plot the classification result ('KNN' and 'SVM' only).

        Args:
            X (np.ndarray, optional): Features to plot. ``self.x_test`` is
                used when omitted.
            y_true (np.ndarray, optional): True labels. They are drawn only
                when passed explicitly; ``self.y_test`` is not used as a
                default.
        """
        if X is None:
            # No input: fall back to the stored test features.
            assert len(self.x_test) > 0, \
                "Error Code: -602. Dataset split was not performed."
            X = self.x_test

        X = self.convert_np(X)
        y_pred = self.inference(X)
        if y_true is not None:
            y_true = self.convert_np(y_true)
        X = X.reshape(X.shape[0], -1)   # force 2-D: (samples, features)

        plotter = None
        if self.algorithm == 'KNN':
            plotter = self.knn_plot
        elif self.algorithm == 'SVM':
            plotter = self.svm_plot
        if plotter is None:
            raise AssertionError(
                "Error Code: -405. No implementation of this method.")
        plotter(X, y_pred, y_true)

    def knn_plot(self, X, y_pred, y_true=None):
        """Scatter-plot KNN predictions, one colour per predicted label.

        The first two feature columns are the x/y coordinates. At most the
        five smallest predicted labels are shown.

        Args:
            X (np.ndarray): Test features with at least two columns.
            y_pred (np.ndarray): Predicted labels for ``X``.
            y_true (np.ndarray, optional): True labels; drawn as small square
                markers only when given.
        """
        # More than two features: only the first two are plotted.
        if X.shape[1] > 2:
            print('\033[1;34;1mFeatures is more than 2 dimensions, '
                  'the first two dimensions are used by default.\033[0m')

        label = np.unique(y_pred)  # sorted unique labels
        # Show at most 5 classes.
        if len(label) > 5:
            label = label[:5]
            keep = y_pred <= label[-1]
            y_pred = y_pred[keep]
            # A boolean mask keeps X two-dimensional even if one row survives.
            X = X[keep]
            if y_true is not None:
                # Filter the true labels with the same mask, otherwise their
                # length no longer matches the plotted points.
                y_true = y_true[keep]
            print('\033[1;34;1mThe number of classes is more than 5, '
                  'the top 5 classes are used by default.\033[0m')

        label_list = ["y_pred_" + str(i) for i in range(len(label))]
        y_pred_plot = plt.scatter(
            X[:, 0], X[:, 1], marker='o', c=y_pred, cmap='rainbow')
        handles = y_pred_plot.legend_elements()[0]

        # True labels are drawn only when passed explicitly.
        if y_true is not None:
            true_label = np.unique(y_true)
            true_label_list = ["y_true_" + str(i)
                               for i in range(len(true_label))]
            y_true_plot = plt.scatter(
                X[:, 0], X[:, 1], marker='s', c=y_true, cmap='viridis', s=10)
            handles += y_true_plot.legend_elements()[0]
            label_list += true_label_list

        plt.legend(handles=handles, labels=label_list)
        plt.show()

    def svm_plot(self, X, y_pred, y_true=None):
        """Scatter-plot SVM predictions and overlay contours of the model output.

        Requires a model trained on exactly two features, which serve as the
        x/y coordinates.

        Args:
            X (np.ndarray): Test features.
            y_pred (np.ndarray): Predicted labels for ``X``.
            y_true (np.ndarray, optional): True labels; drawn as small square
                markers only when given.
        """
        n_features = self.model.n_features_in_
        assert n_features == 2, (
            "Error Code: -306. "
            "The number of features for training is wrong, required {}, "
            "which is {}.".format(2, n_features))

        figure, ax = plt.subplots()
        ax.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap="rainbow")
        if y_true is not None:
            ax.scatter(X[:, 0], X[:, 1], c=y_true,
                       s=8, cmap="viridis", marker='s')

        # Remember the data limits; they are restored after the contours.
        x_limits = ax.get_xlim()
        y_limits = ax.get_ylim()

        # Evaluate the model on a 30 x 30 grid spanning the visible area.
        grid_x = np.linspace(x_limits[0], x_limits[1], 30)
        grid_y = np.linspace(y_limits[0], y_limits[1], 30)
        mesh_y, mesh_x = np.meshgrid(grid_y, grid_x)
        grid_points = np.c_[mesh_x.flatten(), mesh_y.flatten()]
        grid_pred = self.model.predict(grid_points).reshape(mesh_x.shape)

        ax.contour(mesh_x, mesh_y, grid_pred, colors="k",
                   levels=[-1, 0, 1], alpha=0.5, linestyles=["--", "-", "--"])
        ax.set_xlim(x_limits)
        ax.set_ylim(y_limits)

        plt.show()

    

  

 
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch

from sklearn.metrics import silhouette_score
from yellowbrick.cluster import InterclusterDistance
import matplotlib.pyplot as plt
from .base import baseml


class Cluster(baseml):  # cluster
    """Clustering module of BaseML.

    Wraps four scikit-learn estimators selected by name: 'Kmeans' (KMeans),
    'Spectral clustering' (SpectralClustering), 'Agglomerative clustering'
    (AgglomerativeClustering) and 'Birch' (Birch).

    Attributes:
        algorithm: Name of the selected algorithm.
        model: The instantiated estimator.
        n: The ``N_CLUSTERS`` value passed to the constructor.

    More usage notes: https://xedu.readthedocs.io/zh/master/baseml/introduction.html
    """

    # Public algorithm name -> scikit-learn estimator class.
    _ALGORITHMS = {
        'Kmeans': KMeans,
        'Spectral clustering': SpectralClustering,
        'Agglomerative clustering': AgglomerativeClustering,
        'Birch': Birch,
    }

    def __init__(self, algorithm='Kmeans', N_CLUSTERS=5, para=None):
        """Create the clustering estimator.

        Args:
            algorithm (str, optional): Clustering algorithm name.
                Defaults to 'Kmeans'.
            N_CLUSTERS (int, optional): Number of clusters. Defaults to 5.
            para (dict, optional): Custom estimator keyword arguments (see
                the scikit-learn documentation for names). When non-empty it
                is passed to the estimator verbatim and ``N_CLUSTERS`` is
                NOT forwarded, so include 'n_clusters' in it if needed.
                Defaults to None (no custom parameters).
        """
        super(Cluster, self).__init__()   # run the base-class constructor
        self.algorithm = algorithm
        self.n = N_CLUSTERS

        estimator_cls = self._ALGORITHMS.get(self.algorithm)
        if estimator_cls is not None:
            # Any non-empty dict counts as custom parameters; the previous
            # `len(para) > 1` test silently dropped one-key dicts.
            if para:
                self.model = estimator_cls(**para)
            else:
                self.model = estimator_cls(n_clusters=N_CLUSTERS)

    def train(self, validate=False):
        """Fit the model on ``self.x_train``.

        Args:
            validate (bool, optional): When True, also print the silhouette
                coefficient of the training data. Defaults to False.
        """

        self.model.fit(self.x_train)

        if validate:
            score = silhouette_score(self.x_train, labels=self.model.labels_)
            # -1: wrong clustering, 0: overlapping clusters, 1: well separated
            print('轮廓系数为:{}'.format(score))

    def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
                     shuffle=True, show=False, split=False, scale=False):
        """Load a dataset through the base class.

        Identical to the base-class method except that ``split`` defaults to
        False, because clustering does not need a train/test split. All
        arguments are forwarded unchanged.
        """
        super().load_dataset(X, y, type, x_column, y_column, shuffle, show, split, scale)

    def inference(self, data=np.nan, verbose=True):
        """Assign cluster labels to data.

        Args:
            data (numpy, optional): Data to label. When omitted (or None),
                ``self.x_train`` is used.
            verbose (bool, optional): Whether to print the cluster centres
                (Kmeans only) and the training samples of every cluster.
                Defaults to True.

        Returns:
            pred: The predicted cluster labels, or None when there is no
            data to label.

        Raises:
            AssertionError: If ``data`` does not match the recorded model
                input type/shape (Error Code -309).
        """
        use_train = data is None or data is np.nan
        self.x_test = self.x_train if use_train else data
        # Convert before validating so list/DataFrame input is checked by
        # its array form (same order as DimentionReduction.inference).
        self.x_test = self.convert_np(self.x_test)

        if not use_train and self.input_shape is not None:
            # count=1: only the leading (batch) dimension is renamed, even
            # when a later dimension has the same value.
            model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch', 1)
            assert type(self.demo_input) == type(self.x_test), f"Error Code: -309. The data type {type(self.x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
            assert self.input_shape[1:] == self.x_test.shape[1:], f"Error Code: -309. The data shape {self.x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."

        if verbose and len(self.x_train) != 0:
            labels = self.model.labels_      # labels of the fitted samples
            if self.algorithm == 'Kmeans':
                print(self.model.cluster_centers_)  # cluster centres
            # Iterate over the labels the model actually produced; the
            # cluster count may differ from self.n when `para` was used.
            for i in np.unique(labels):
                print(f" CLUSTER-{i+1} ".center(60, '='))
                print(self.x_train[labels == i])

        if len(self.x_test) == 0:
            return None

        if hasattr(self.model, 'predict'):
            return self.model.predict(self.x_test)

        # SpectralClustering and AgglomerativeClustering expose no
        # predict(): they cannot label unseen samples.
        if use_train and hasattr(self.model, 'labels_'):
            return self.model.labels_
        # NOTE: this re-fits the estimator on the given data, replacing the
        # previous fit.
        return self.model.fit_predict(self.x_test)

    def metricplot(self, X=None):
        """Draw the inter-cluster distance map (Kmeans only).

        The further apart the clusters are, the better the clustering.

        Args:
            X (np.ndarray, optional): Data to visualise. When omitted,
                ``self.x_train`` is used.

        Raises:
            AssertionError: If the algorithm is not 'Kmeans' (-405) or no
                dataset is loaded (-601).
        """

        assert self.algorithm == 'Kmeans', \
            "Error Code: -405. No implementation of this method."
        if X is None:
            assert len(
                self.x_train) > 0, "Error Code: -601. No dataset is loaded."
            X = self.x_train
        visualizer = InterclusterDistance(self.model)

        # Fit on the data that was asked for (previously X was ignored and
        # self.x_train was always used).
        visualizer.fit(X)
        visualizer.show()        # finalize and render the figure

    def plot(self, X=None):
        """Draw the clustering result (Kmeans only).

        Args:
            X (np.ndarray, optional): Data to plot. When omitted,
                ``self.x_train`` is used.

        Raises:
            AssertionError: If the algorithm is not 'Kmeans' (-405) or no
                data is available (-602).
        """

        assert self.algorithm == 'Kmeans', \
            "Error Code: -405. No implementation of this method."

        # Without explicit input, fall back to the training data.
        if X is None:
            if len(self.x_train) > 0:
                self.x_test = self.x_train
            assert len(
                self.x_test) > 0, "Error Code: -602. Dataset split was not performed."
            X = self.x_test
        X = self.convert_np(X)
        y_pred = self.inference(X)

        self.cluster_plot(X, y_pred)

    def cluster_plot(self, X, y_pred):
        """Draw a scatter plot of the clusters with numbered centres.

        Args:
            X (np.ndarray): Samples to draw; only columns 0 and 1 are used.
            y_pred (np.ndarray): Cluster label predicted for every sample.
        """

        # Only the first two features can be shown on a 2-D chart.
        if X.shape[1] > 2:
            print('\033[1;34;1mfeatures is more than 2 dimensions, \
            the first two dimensions are used by default\033[0m')

        # Samples, coloured by cluster.
        plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap='viridis')
        # Cluster centres.
        centers = self.model.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], c='black', s=20, alpha=0.5)
        # Cluster index next to every centre.
        for i in range(self.model.cluster_centers_.shape[0]):
            plt.text(centers[:, 0][i]+0.5, y=centers[:, 1][i]+0.5, s=i,
                     fontdict=dict(color='red', size=10),
                     bbox=dict(facecolor='yellow', alpha=0.2),
                     )
        # Render the figure, as the other plotting methods in this file do.
        plt.show()

    def valid(self, path=None, x=None, y=None, metrics='silhouette_score'):
        """Evaluate the clustering with an internal quality metric.

        Args:
            path (str, optional): CSV file to validate on; every column but
                the last is used as features, the last one as ``y``. Only
                read when neither ``x`` nor ``y`` is given.
            x (np.ndarray, optional): Features to validate on. When
                ``path``, ``x`` and ``y`` are all omitted, ``self.x_train``
                is used.
            y (np.ndarray, optional): Labels. Accepted for interface parity;
                none of the supported metrics uses them.
            metrics (str, optional): One of 'silhouette_score',
                'calinski_harabasz_score', 'davies_bouldin_score'.
                Defaults to 'silhouette_score'.

        Returns:
            score: Value of the selected metric.
            y_pred: Cluster labels predicted for ``x``.

        Raises:
            AssertionError: If the metric is unsupported (-307) or there is
                no validation data (-801).
        """
        from sklearn.metrics import silhouette_score, calinski_harabasz_score,davies_bouldin_score

        supported = ('silhouette_score', 'calinski_harabasz_score',
                     'davies_bouldin_score')
        # Reject an unknown metric before doing any work.
        if metrics not in supported:
            raise AssertionError("Error Code: -307. The '{}' metric is not currently supported.".format(metrics))

        if path is None and x is None and y is None:
            # Nothing given: validate on the training data.
            x = self.x_train
            y = self.y_train
        elif x is None and y is None:
            # Only a path given: read the validation set from the file.
            df = pd.read_csv(path)
            x = df.iloc[:, :-1].values
            y = df.iloc[:, -1].values
            self.x_test = x
            self.y_test = y

        # Only features are required; the metrics below are label-free.
        assert x is not None,  "Error Code: -801. The validation set cannot be empty. "
        assert len(x) > 0,  "Error Code: -801. The validation set cannot be empty. "

        y_pred = self.inference(x)

        # Score the labels predicted for x itself, not the labels of the
        # training fit, which may belong to a different sample set.
        if metrics == 'silhouette_score':
            score = silhouette_score(x, y_pred)
            print('验证轮廓系数为:{}%'.format(score * 100))
        elif metrics == 'calinski_harabasz_score':
            score = calinski_harabasz_score(x, y_pred)
            print('验证Calinski-Harabasz指数为:{}'.format(score))
        else:
            score = davies_bouldin_score(x, y_pred)
            print('验证Davies-Bouldin指数为:{}'.format(score))
        return score,y_pred

  

 
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import LocallyLinearEmbedding as LLE
from yellowbrick.features import PCA as yb_PCA

from .base import baseml


class DimentionReduction(baseml):  # reduction
    """Dimensionality-reduction module of BaseML.

    Wraps three scikit-learn estimators selected by name: 'PCA' (principal
    component analysis), 'LDA' (linear discriminant analysis) and 'LLE'
    (locally linear embedding).

    Attributes:
        algorithm: Name of the selected algorithm.
        model: The instantiated estimator.
        n_components: The ``n_components`` value passed to the constructor.

    More usage notes: https://xedu.readthedocs.io/zh/master/baseml/introduction.html
    """

    # Public algorithm name -> scikit-learn estimator class.
    _ALGORITHMS = {
        'PCA': PCA,     # principal component analysis
        'LDA': LDA,     # linear discriminant analysis
        'LLE': LLE,     # locally linear embedding
    }

    def __init__(self, algorithm='PCA', n_components=2, para=None):
        """Create the dimensionality-reduction estimator.

        Args:
            algorithm (str, optional): Algorithm name. Defaults to 'PCA'.
            n_components (int, optional): Number of features kept after the
                reduction. Defaults to 2.
            para (dict, optional): Custom estimator keyword arguments (see
                the scikit-learn documentation for names). When non-empty it
                is passed to the estimator verbatim and ``n_components`` is
                NOT forwarded, so include 'n_components' in it if needed.
                Defaults to None (no custom parameters).
        """

        super(DimentionReduction, self).__init__()   # run the base-class constructor
        self.algorithm = algorithm
        self.n_components = n_components

        estimator_cls = self._ALGORITHMS.get(self.algorithm)
        if estimator_cls is not None:
            # Any non-empty dict counts as custom parameters; the previous
            # `len(para) > 1` test silently dropped one-key dicts.
            if para:
                self.model = estimator_cls(**para)
            else:
                self.model = estimator_cls(n_components=n_components)

    def train(self, validate=True):
        """Fit the model on the training data.

        Args:
            validate (bool, optional): When True (and the algorithm is not
                'LLE'), print the explained variance ratio of the fitted
                model. Defaults to True.

        Raises:
            Exception: If the algorithm is 'LDA' and no labels are loaded.
        """
        if self.algorithm == 'LDA':
            # LDA is supervised: it cannot be fitted without labels.
            if len(self.y_train) == 0:
                raise Exception("使用LDA时必须输入y标签")
            self.model.fit(self.x_train, self.y_train)
        else:
            self.model.fit(self.x_train)

        if validate and self.algorithm != 'LLE':
            explained_var = self.model.explained_variance_ratio_
            print('累计方差贡献率为:{}'.format(explained_var))

    def inference(self, data=np.nan):
        """Reduce the dimensionality of data with the fitted model.

        Args:
            data (numpy, optional): Data to transform. When omitted (or
                None), ``self.x_train`` is used.

        Returns:
            pred: The transformed data, or None when there is no data to
            transform.

        Raises:
            AssertionError: If ``data`` does not match the recorded model
                input type/shape (Error Code -309).
        """
        use_train = data is None or data is np.nan
        self.x_test = self.x_train if use_train else data
        self.x_test = self.convert_np(self.x_test)

        if not use_train and self.input_shape is not None:
            x_test = self.x_test
            # count=1: only the leading (batch) dimension is renamed, even
            # when a later dimension has the same value.
            model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch', 1)
            assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
            assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."

        # `x is not []` was always True; test for actual emptiness instead.
        if len(self.x_test) == 0:
            return None
        return self.model.transform(self.x_test)

    def fit_transform(self):
        """Fit on the training data and return its reduced form in one step.

        Returns:
            The transformed ``self.x_train``.

        Raises:
            Exception: If the algorithm is 'LDA' and no labels are loaded.
        """
        if self.algorithm == 'LDA':
            # Same rule as train(): LDA needs the labels to fit.
            if len(self.y_train) == 0:
                raise Exception("使用LDA时必须输入y标签")
            return self.model.fit_transform(self.x_train, self.y_train)
        return self.model.fit_transform(self.x_train)

    def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
                     shuffle=True, show=False, split=False, scale=False):
        """Load a dataset through the base class.

        Identical to the base-class method except that ``split`` defaults to
        False, because dimensionality reduction does not need a train/test
        split. All arguments are forwarded unchanged.
        """
        super().load_dataset(X, y, type, x_column, y_column, shuffle, show, split, scale)

    def plot(self, X=None, y_true=None):
        """Draw the projection chart of the data (PCA only).

        Args:
            X (np.ndarray, optional): Data to plot. When omitted,
                ``self.x_train`` and ``self.y_train`` are used.
            y_true (np.ndarray, optional): True labels of ``X``; required
                whenever ``X`` is given.

        Raises:
            AssertionError: If the algorithm is not 'PCA' (-405), no dataset
                is loaded (-601) or no labels are available (-307).
        """

        assert self.algorithm == 'PCA', "Error Code: -405. No implementation of this method."
        # Without explicit input, fall back to the training data and labels.
        if X is None:
            if len(self.x_train) > 0:
                self.x_test = self.x_train
            assert len(
                self.x_test) > 0, "Error Code: -601. No dataset is loaded."
            X = self.x_test
            y_true = self.y_train
        X = self.convert_np(X)
        assert y_true is not None and len(y_true) > 0, \
            "Error Code: -307. The parameter {} is not set.".format("y_true")
        y_true = self.convert_np(y_true)

        self.pca_projection(X, y_true)

    def pca_projection(self, X, y_true):
        """Draw a PCA projection of the data, coloured by class.

        The number of projection axes is ``self.n_components`` capped at 3.

        Args:
            X (np.ndarray): Data to project.
            y_true (np.ndarray): True labels of ``X``; one legend entry
                ('class_<i>') is created per distinct label.
        """
        proj = min(self.n_components, 3)

        label = np.unique(y_true)
        classes = ['class_%i' % i for i in range(len(label))]
        visualizer = yb_PCA(scale=True, projection=proj, classes=classes)
        visualizer.fit_transform(X, y_true.squeeze())

        visualizer.show()

  

 

参考示例

 

from BaseML import Regression as reg  # Import the regression task module

model = reg('LinearRegression')   # Instantiate a linear regression model

model.load_tab_data('data/shoe_size-height1.csv')  # Load the training data

model.train()  # Train the model

model.valid('data/shoe_size-height1.csv',metrics='r2')   # Evaluate the model with the r2 metric

model.metricplot()   # Visualise the evaluation result

model.plot()   # Visualise the model's predictions

model.save('data/mymodel.pkl')  # Save the model

# Fill in the value that applies to your case
shoe_size= 40.5  
result = model.inference([shoe_size])   # Predict the height for this shoe size

print("\n鞋码为", shoe_size, "时,预测的身高为:", f"{result[0]:.2f}")    # Print the result

 

 

 
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
   
posted @ 2025-11-04 16:03  aiplus  阅读(15)  评论(0)    收藏  举报
悬浮按钮示例