Machine Learning: What Exactly Is BaseML?
from .base import baseml
from .BaseClassification import Classification
from .BaseRegression import Regression
from .BaseCluster import Cluster
from .BaseDimentionReduction import DimentionReduction

__all__ = [
    'baseml',
    'Classification',
    'Regression',
    'Cluster',
    'DimentionReduction',
]
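This `__init__.py` simply re-exports the four task classes, so user code can import them straight from the package. A minimal sketch (assuming BaseML is installed or on the import path):

from BaseML import Classification, Regression, Cluster, DimentionReduction
model = Regression('LinearRegression')  # each class selects its algorithm by name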
# BaseML base class; the task classes below inherit its basic methods
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import joblib
class baseml:
"""BaseML中的继承基类,单例模式避免多次调用创建
"""
def __init__(self):
        self.cwd = os.path.dirname(os.getcwd())  # parent directory of the current working directory
self.file_dirname = os.path.dirname(os.path.abspath(__file__))
self.x_train, self.x_test, self.y_train, self.y_test, self.x_val, self.y_val = [
], [], [], [], [], []
self.X = []
self.Y = []
self.dataset = []
self.model = None
self.test_size = 0.2
self.scaler = None
self.demo_input = None
self.input_shape = None
    # Use a singleton so the base class is not constructed over and over
def __new__(cls, *args, **kwargs):
# print("__new__")
if not hasattr(baseml, "_instance"):
# print("创建新实例")
baseml._instance = object.__new__(cls)
return baseml._instance
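    # A quick sanity check of the singleton (hypothetical interactive session):
    #   >>> a = baseml()
    #   >>> b = baseml()
    #   >>> a is b
    #   True
    # Every construction returns the same cached instance, so state stored on
    # one reference is visible through all the others.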
def train(self):
        # must be overridden by subclasses
raise NotImplementedError("train function must be implemented")
def inference(self):
        # must be overridden by subclasses
raise NotImplementedError("inference function must be implemented")
def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
shuffle=True, show=False, split=True, scale=False):
"""Load the model's data set.
Args:
X (str|numpy|pandas|list): 自变量.
y (str|numpy|pandas|list, optional): 目标值. 默认为 [].
type (str, optional): X和y的输入格式, choice = ['csv', 'numpy','pandas','list','txt], 最后统一转换为numpy.
x_column (list, optional): X 的索引列. 默认设置为X的所有列.
y_column (list, optional): y的索引列. 默认设置为y的所有列.
shuffle (bool, optional): 是否对元素随机排序. 默认为True.
show (bool, optional): 显示5条数据. 默认为True.
split(bool, optional): 是否划分数据集为训练集和测试集. 默认为True.
scale(bool, optional): 是否对数据进行归一化. False.
"""
        if (type == 'csv' or type == 'txt') and len(x_column) == 0:
            raise ValueError("Please pass in the column indices of the data")
if type == 'csv':
            self.dataset = pd.read_csv(X).values  # .values converts to a numpy array
if shuffle:
np.random.shuffle(self.dataset)
self.get_data(self.dataset, self.dataset,
x_column, y_column, split, scale)
elif type == 'numpy':
if shuffle:
X, y = self.shuffle_data(X, y)
self.get_data(X, y, x_column, y_column, split, scale)
elif type == 'pandas':
X = X.values
y = y.values if len(y) > 0 else []
if shuffle:
X, y = self.shuffle_data(X, y)
self.get_data(X, y, x_column, y_column, split, scale)
elif type == 'list':
X = np.array(X)
y = np.array(y) if len(y) > 0 else []
if shuffle:
X, y = self.shuffle_data(X, y)
self.get_data(X, y, x_column, y_column, split, scale)
        elif type == 'txt':
            self.dataset = np.loadtxt(X)  # np.loadtxt already returns a numpy array
if shuffle:
np.random.shuffle(self.dataset)
self.get_data(self.dataset, self.dataset,
x_column, y_column, split, scale)
print("Load dataset successfully!")
        if show and len(self.x_train) >= 5:  # print the first 5 samples
print("X")
print(self.x_train[:5])
print("y")
if len(self.y_train) >= 5:
print(self.y_train[:5])
else:
print("None")
def get_data(self, X, y, x_column, y_column, split, scale):
"""通过列号获取真实的训练数据
Args:
X (numpy.ndarray): 自变量.
y (numpy.ndarray): 因变量.
x_column (list): 自变量的列索引集合.
y_column (list): 因变量的列索引集合.
"""
if X.ndim == 1:
X = X.reshape(-1, 1)
        if len(x_column) == 0 and len(X):
            # default to all columns when not specified
            x_column = list(range(X.shape[1]))
        if len(y_column) == 0 and len(y):
            # default to all columns of y when not specified
if y.ndim == 1:
y_column = [0]
else:
y_column = list(range(y.shape[1]))
if len(X):
self.x_train = X[:, x_column]
            if scale:  # normalize the training data; needed for clustering and some classifiers
self.scaler = MinMaxScaler(feature_range=(0, 1))
self.x_train = self.scaler.fit_transform(self.x_train)
        if len(y):
if y.ndim == 1:
y = y.reshape(-1, 1)
self.y_train = y[:, y_column]
        if len(self.y_train):  # works whether y_train is an empty list or an array
            self.dataset = np.concatenate(
                (self.x_train, self.y_train), axis=1)  # concatenate along columns
else:
self.dataset = self.x_train
        if split:  # split into training and test sets
self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
self.x_train, self.y_train, test_size=self.test_size, random_state=42)
def shuffle_data(self, X, y):
if len(X) == len(y):
            c = list(zip(X, y))  # keep the pairing between X and y
np.random.shuffle(c)
X = np.array([t[0] for t in c])
y = np.array([t[1] for t in c])
elif len(X) > 0 and len(y) == 0:
np.random.shuffle(X)
return X, y
def save(self, path="checkpoint.pkl"):
data = {
'model': self.model,
'input_shape': self.x_train.shape,
'demo_input': self.x_train[:1],
}
print("Saving model checkpoints...")
joblib.dump(data, path, compress=3)
print("Saved successfully!")
def load(self, path):
# self.model = joblib.load(path)
model = joblib.load(path)
if isinstance(model, dict):
self.model = model['model']
            try:
                self.demo_input = model['demo_input']
                self.input_shape = model['input_shape']
            except KeyError:  # older checkpoints may not carry these fields
                pass
else:
self.model = model
def reverse_scale(self, data):
return self.scaler.inverse_transform(data)
def get_test_data(self):
return self.x_test, self.y_test
def convert_np(self, data):
if isinstance(data, np.ndarray):
pass
elif isinstance(data, list):
data = np.array(data)
elif isinstance(data, pd.DataFrame):
data = data.values
elif isinstance(data, tuple):
data = np.array(data)
        else:
            raise TypeError("The type {} is not supported".format(type(data)))
return data
def plot(self, X=None, y_true=None):
        # model visualization; unsupported unless overridden by a subclass
raise NotImplementedError(
"Error Code: -405. No implementation of this method.")
def metricplot(self, X=None, y_true=None):
        # model visualization; unsupported unless overridden by a subclass
raise NotImplementedError(
"Error Code: -405. No implementation of this method.")
    def load_tab_data(self, data_path, train_val_ratio=1.0, shuffle=True, random_seed=42, y_type='float', **kw):
# if y_type == 'long' and self.task_type == 'reg':
# y_type = 'float'
data = np.loadtxt(data_path, dtype=float, delimiter=',',skiprows=1) # [120, 4]
x = data[:,:-1]
y = data[:, -1]
y = y.astype(y_type)
if 0 < train_val_ratio < 1:
train_size = int(train_val_ratio * len(x))
val_size = len(x) - train_size
x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=train_size, test_size=val_size, random_state=random_seed,shuffle=shuffle)
else:
x_train, y_train = x, y
x_val, y_val = None, None
# if self.task_type == 'cls':
# y_train = y_train.astype(int)
# y_val = y_val.astype(int) if y_val is not None else None
# elif self.task_type =='reg':
# y_train = y_train.astype(float)
# y_val = y_val.astype(float) if y_val is not None else None
self.x_train = x_train
self.y_train = y_train
self.x_test = x_val
self.y_test = y_val
return x_train, y_train, x_val, y_val
def set_para(self, **kw):
for i in kw:
print("Setting {} to {}".format(i, kw[i]))
setattr(self.model, i, kw[i])
    @property
    def para(self):
        # return the wrapped sklearn estimator's current hyper-parameters
        return self.model.get_params()
@para.setter
def para(self, kw):
for i in kw:
print("Setting {} to {}".format(i, kw[i]))
setattr(self.model, i, kw[i])
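    # Usage sketch: both set_para() and the para setter forward hyper-parameters
    # to the wrapped sklearn estimator by attribute name (hypothetical session):
    #   >>> m.set_para(n_neighbors=3)   # equivalent to m.model.n_neighbors = 3
    #   Setting n_neighbors to 3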
    def valid(self, path=None, x=None, y=None, metrics='accuracy'):
        """Evaluate the model on a validation set.
        Args:
            path (str): path to the validation set.
            x (np.ndarray, optional): validation features. Defaults to None.
            y (np.ndarray, optional): validation labels. Defaults to None.
            metrics (str, optional): evaluation metric. Defaults to 'accuracy'.
        Returns:
            score: the value of the chosen metric
            y_pred: the predicted y values
        """
        if path is None and x is None and y is None:  # default to x_test and y_test when nothing is given
x = self.x_test
y = self.y_test
        elif x is None and y is None:  # a path was given but no data; read from the path
df = pd.read_csv(path)
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
self.x_test = x
self.y_test = y
        # the validation features and labels must not be empty
assert x is not None and y is not None, "Error Code: -801. The validation set cannot be empty. "
assert len(x) > 0 and len(y) > 0, "Error Code: -801. The validation set cannot be empty. "
y_pred = self.inference(x)
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,\
            r2_score, mean_squared_error, mean_absolute_error, roc_auc_score,\
            silhouette_score
        if metrics == 'accuracy' or metrics == 'acc':
            score = accuracy_score(y, y_pred)
            print('Validation accuracy: {}%'.format(score * 100))
        elif metrics == 'precision':
            score = precision_score(y, y_pred, average='weighted')
            print('Validation precision: {}%'.format(score * 100))
        elif metrics == 'recall':
            score = recall_score(y, y_pred, average='weighted')
            print('Validation recall: {}%'.format(score * 100))
        elif metrics == 'f1':
            score = f1_score(y, y_pred, average='weighted')
            print('Validation F1-score: {}%'.format(score * 100))
        elif metrics == 'auc':
            # sklearn's auc() expects curve coordinates; roc_auc_score works on labels/scores
            score = roc_auc_score(y, y_pred)
            print('Validation AUC: {}'.format(score))
        elif metrics == 'r2':
            assert len(y) >= 2, "Error Code: -603. The validation set has less than 2 samples and r2-score cannot be calculated."
            score = r2_score(y, y_pred)
            print('Validation R2 score: {}'.format(score))
        elif metrics == 'mse':
            score = mean_squared_error(y, y_pred)
            print('Validation mean squared error: {}'.format(score))
        elif metrics == 'mae':
            score = mean_absolute_error(y, y_pred)
            print('Validation mean absolute error: {}'.format(score))
        else:
            raise AssertionError("Error Code: -307. The '{}' metric is not currently supported.".format(metrics))
        return score, y_pred
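Even before looking at the subclasses, the base class can be exercised on its own. A minimal sketch of loading a toy dataset and reading back the held-out split (this assumes the package layout above, so `from BaseML import baseml` works; the numbers are made up):

from BaseML import baseml

b = baseml()
b.load_dataset(X=[[1], [2], [3], [4], [5]], y=[1, 2, 3, 4, 5], type='list')
x_test, y_test = b.get_test_data()   # the 20% split held out by load_dataset
print(x_test.shape, y_test.shape)    # (1, 1) (1, 1)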
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from yellowbrick.regressor import PredictionError
import matplotlib.pyplot as plt
import joblib
from .base import baseml
class Regression(baseml):
"""BaseML中的回归模块,包含['LinearRegression'(线性回归), 'CART'(决策树回归), 'RandomForest'(随机森林回归),
'Polynomial'(多项式回归), 'Lasso'(角回归), 'Ridge'(岭回归), 'SVM'(支持向量机回归), 'AdaBoost'(自适应增强回归), 'MLP'(多层感知机回归)]回归算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='LinearRegression', n_estimators=20, degree=2, n_hidden=(100,), para={}):
"""reg类的初始化
Args:
algorithm (str, optional): 选择的回归学习器. Defaults to 'LinearRegression'.
n_estimators (int, optional): RandomForest集成的决策树个数. Defaults to 20.
degree (int, optional): 多项式回归的阶数. Defaults to 2.
para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(Regression, self).__init__()  # run the parent constructor
self.algorithm = algorithm
        if self.algorithm == 'LinearRegression':  # linear regression
            # note: the custom para dict only takes effect when it holds more than one entry
            if len(para) > 1:
                self.model = linear_model.LinearRegression(**para)
            else:
                self.model = linear_model.LinearRegression()
        elif self.algorithm == 'CART':  # decision tree regression
if len(para) > 1:
self.model = tree.DecisionTreeRegressor(**para)
else:
self.model = tree.DecisionTreeRegressor()
        elif self.algorithm == 'RandomForest':  # random forest regression
if len(para) > 1:
self.model = ensemble.RandomForestRegressor(**para)
else:
self.model = ensemble.RandomForestRegressor(
n_estimators=n_estimators)
        elif self.algorithm == 'Polynomial':  # polynomial regression
if len(para) > 1:
self.model = PolynomialFeatures(**para)
self.poly_linear_model = linear_model.LinearRegression()
else:
self.model = PolynomialFeatures(degree=degree)
self.poly_linear_model = linear_model.LinearRegression()
        elif self.algorithm == 'Lasso':  # lasso regression
if len(para) > 1:
self.model = linear_model.Lasso(**para)
else:
self.model = linear_model.Lasso()
        elif self.algorithm == 'Ridge':  # ridge regression
if len(para) > 1:
self.model = linear_model.Ridge(**para)
else:
self.model = linear_model.Ridge()
elif self.algorithm == 'SVM':
if len(para) > 1:
self.model = SVR(**para)
else:
self.model = SVR(degree=degree)
elif self.algorithm == 'AdaBoost':
if len(para) > 1:
self.model = AdaBoostRegressor(**para)
else:
self.model = AdaBoostRegressor(n_estimators=n_estimators)
elif self.algorithm == 'MLP':
if len(para) > 1:
self.model = MLPRegressor(**para)
else:
self.model = MLPRegressor(
hidden_layer_sizes=n_hidden, solver='lbfgs')
    def train(self, validate=False, val_size=0.2, lr=0.001, epochs=200):
        """Train the model.
        Args:
            validate (bool, optional): whether to validate the model and print its score. Defaults to False.
            val_size (float, optional): fraction held out for validation. Defaults to 0.2.
            lr (float, optional): learning rate (MLP/AdaBoost only). Defaults to 0.001.
            epochs (int, optional): number of training epochs (MLP only). Defaults to 200.
        """
if self.algorithm == 'MLP':
self.model.learning_rate_init = lr
self.model.max_iter = epochs
elif self.algorithm == 'AdaBoost':
self.model.learning_rate = lr
        if validate:  # hold out part of the data and report a score
self.x_train, self.x_val, self.y_train, self.y_val = \
train_test_split(self.x_train, self.y_train,
test_size=val_size, random_state=0)
        if self.algorithm == 'Polynomial':
            x_transformed = self.model.fit_transform(
                self.x_train)  # polynomial features of each sample
self.poly_linear_model.fit(x_transformed, self.y_train)
else:
self.model.fit(self.x_train, self.y_train)
if self.algorithm == 'LinearRegression':
self.coef = self.model.coef_
self.intercept = self.model.intercept_
        if validate:
            if len(self.y_val) < 2:
                print("The validation set has fewer than 2 samples; R2 cannot be computed")
            else:
                pred = self.model.predict(self.x_val)
                acc = r2_score(self.y_val, pred)
                print('Validation R2 score: {}'.format(acc))
    def inference(self, data=np.nan):
        """Run inference with the trained model.
        Args:
            data (numpy, optional): data to run inference on; defaults to self.x_test when omitted.
        Returns:
            pred: the predicted values.
        """
x_test = data if data is not np.nan else self.x_test
assert len(x_test) > 0, "Error Code: -601. No dataset is loaded."
x_test = self.convert_np(x_test)
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
if x_test.ndim != 2:
x_test = x_test.reshape(x_test.shape[0], -1)
        if self.algorithm == 'Polynomial':
            x_trans = self.model.transform(x_test)
            self.pred = self.poly_linear_model.predict(x_trans)
        else:
            self.pred = self.model.predict(x_test)
return self.pred
    # overridden method
def save(self, path="checkpoint.pkl"):
print("Saving model checkpoints...")
if self.algorithm == 'Polynomial':
modelList = [self.model, self.poly_linear_model]
data = {
'model': modelList,
'input_shape': self.x_train.shape,
'demo_input': self.x_train[:1],
}
joblib.dump(data, path, compress=3)
else:
data = {
'model': self.model,
'input_shape': self.x_train.shape,
'demo_input': self.x_train[:1],
}
joblib.dump(data, path, compress=3)
print("Saved successfully!")
def load(self, path):
if self.algorithm == 'Polynomial':
self.model = joblib.load(path)['model'][0]
self.poly_linear_model = joblib.load(path)['model'][1]
else:
self.model = joblib.load(path)['model']
def metricplot(self, X=None, y_true=None):
"""绘制模型回归预测误差图, 图中的identity为基准线, 说明预测出的标签(y轴)与
真实标签(x轴)相同。回归模型越靠近基准线则越好。该图显示了回归模型的方差大小。
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
if X is None and y_true is None:
X = self.x_test
y_true = self.y_test
        assert X is not None and y_true is not None, "Error Code: -604. No valid data is provided or the validation dataset is empty."
        assert len(X) > 0 and len(y_true) > 0, "Error Code: -604. No valid data is provided or the validation dataset is empty."
if self.algorithm == 'Polynomial':
from sklearn.pipeline import make_pipeline
model = make_pipeline(self.model, self.poly_linear_model)
else:
model = self.model
visualizer = PredictionError(
model,
title="Actual vs. Predicted Values",
)
y_true = y_true.squeeze()
visualizer.fit(X, y_true)
visualizer.score_ = visualizer.estimator.score(X, y_true)
result = self.inference(X).squeeze()
visualizer.draw(y_true, result)
visualizer.show()
def plot(self, X=None, y_true=None):
"""绘制回归模型图.
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
        # default to x_test and y_test when no input is given
        if X is None:
            assert len(
                self.x_test) > 0, "Error Code: -601. No dataset is loaded."
            X = self.x_test
            y_true = self.y_test
X = self.convert_np(X)
y_pred = self.inference(X)
if y_true is not None:
y_true = self.convert_np(y_true)
        X = X.reshape(X.shape[0], -1)  # reshape to 2-D
if self.algorithm == 'LinearRegression':
self.linear_reg_plot(X[:, 0], y_pred, y_true)
else:
raise AssertionError(
"Error Code: -405. No implementation of this method.")
def linear_reg_plot(self, X, y_pred, y_true=None):
"""绘制线性回归模型图, 仅支持使用1维特征训练的模型.
Args:
X (np.ndarray): 放入的测试数据.
x_pred (np.ndarray): 测试数据的预测标签.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 当被显式填入时才会绘制出真实的散点.
"""
assert self.model.n_features_in_ == 1, \
"Error Code: -306. "\
"The number of features for training is wrong, required {}, "\
"which is {}.".format(1, self.model.n_features_in_)
fig, ax = plt.subplots()
if y_true is not None:
ax.scatter(X, y_true)
ax.plot(X, y_pred, color='red')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.axis('tight')
plt.show()
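An end-to-end sketch of the Regression class on synthetic data (the data and the values in the comments are made up; it assumes BaseML is importable, as in the reference example at the end of this page):

import numpy as np
from BaseML import Regression

X = np.arange(20, dtype=float).reshape(-1, 1)
y = 3 * X.squeeze() + 5 + np.random.randn(20) * 0.1   # noisy line y ≈ 3x + 5

model = Regression('LinearRegression')
model.load_dataset(X, y, type='numpy')   # shuffles, then splits 80/20 by default
model.train()
print(model.coef, model.intercept)       # should be close to 3 and 5
print(model.inference([[100.0]]))        # roughly [[305.]]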
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from yellowbrick.classifier import ClassPredictionError
from .base import baseml
class Classification(baseml):
"""BaseML中的分类模块,包含['KNN'(K近临分类), 'SVM'(支持向量机分类), 'NaiveBayes'(朴素贝叶斯分类), 'CART'(决策树分类),
'AdaBoost'(自适应增强分类), 'MLP'(多层感知机分类), 'RandomForest'(随机森林分类)]分类算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='KNN', n_neighbors=5, n_estimators=100, n_hidden=(100,), para={}):
"""cls类初始化.
Args:
algorithm (str, optional): 采用的分类算法. Defaults to 'KNN'.
n_neighbors (int, optional): KNN的k值. Defaults to 5.
n_estimators (int, optional): Adaboost|RandomForest所集成的决策树个数. Defaults to 100.
n_hidden (tuple, optional): MLP隐藏层的形状. Defaults to (100,).
para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(Classification, self).__init__()  # run the parent constructor
self.algorithm = algorithm
if self.algorithm == 'KNN':
if len(para) > 1:
self.model = KNeighborsClassifier(**para)
else:
self.model = KNeighborsClassifier(n_neighbors=n_neighbors)
elif self.algorithm == 'SVM':
if len(para) > 1:
self.model = SVC(**para)
else:
self.model = SVC()
elif self.algorithm == 'NaiveBayes':
if len(para) > 1:
self.model = GaussianNB(**para)
else:
self.model = GaussianNB()
elif self.algorithm == 'CART':
if len(para) > 1:
self.model = DecisionTreeClassifier(**para)
else:
self.model = DecisionTreeClassifier()
elif self.algorithm == 'AdaBoost':
if len(para) > 1:
self.model = AdaBoostClassifier(**para)
else:
self.model = AdaBoostClassifier(
n_estimators=n_estimators, random_state=0)
elif self.algorithm == 'MLP':
if len(para) > 1:
self.model = MLPClassifier(**para)
else:
self.model = MLPClassifier(
hidden_layer_sizes=n_hidden, solver='lbfgs')
elif self.algorithm == 'RandomForest':
if len(para) > 1:
self.model = RandomForestClassifier(**para)
else:
self.model = RandomForestClassifier(
n_estimators=n_estimators, random_state=0)
    def train(self, validate=False, val_size=0.2, lr=0.001, epochs=200):
        """Train the model.
        Args:
            validate (bool, optional): whether to validate the model and print its accuracy. Defaults to False.
            val_size (float, optional): fraction held out for validation. Defaults to 0.2.
            lr (float, optional): learning rate (MLP/AdaBoost only). Defaults to 0.001.
            epochs (int, optional): number of training epochs (MLP only). Defaults to 200.
        """
if self.algorithm in ['AdaBoost', 'SVM', 'NaiveBayes', 'MLP', 'KNN', 'CART', 'RandomForest']:
            # set the learning rate
if self.algorithm == 'MLP':
self.model.learning_rate_init = lr
self.model.max_iter = epochs
elif self.algorithm == 'AdaBoost':
self.model.learning_rate = lr
if validate:
self.x_train, self.x_val, self.y_train, self.y_val = \
train_test_split(self.x_train, self.y_train,
test_size=val_size, random_state=0)
self.model.fit(self.x_train, self.y_train)
            if validate:
                pred = self.model.predict(self.x_val)
                acc = accuracy_score(self.y_val, pred)
                print('Validation accuracy: {}%'.format(acc * 100))
def inference(self, data=np.nan, verbose=True):
"""使用模型进行推理
Args:
data (np.ndarray, optional): 放进来推理的数据,不填默认使用self.x_test.
verbose (bool, optional): 是否输出推理中的中间结果. Defaults to True.
Returns:
pred: 返回预测结果.
"""
        if data is not np.nan:  # data was explicitly given
x_test = self.convert_np(data)
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
elif len(self.x_train) > 0 and len(self.x_test) == 0:
x_test = self.x_train
else:
x_test = self.x_test
x_test = self.convert_np(x_test)
if self.algorithm in ['AdaBoost', 'SVM', 'NaiveBayes', 'MLP', 'KNN', 'CART', 'RandomForest']:
pred = self.model.predict(x_test)
return pred
def metricplot(self, X=None, y_true=None):
"""绘制模型分类准确率图, 可直观查看每一类的分类正误情况
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
assert len(self.x_train) > 0 and len(self.y_train) > 0, \
"Error Code: -601. No dataset is loaded."
if X is None and y_true is None:
assert len(self.x_test) > 0 and len(
self.y_test) > 0, "Error Code: -602. Dataset split was not performed."
X = self.x_test
y_true = self.y_test
assert len(X) > 0 and len(y_true) > 0
visualizer = ClassPredictionError(
self.model
)
visualizer.fit(self.x_train, self.y_train)
visualizer.score(X, y_true.reshape(-1))
visualizer.show()
def plot(self, X=None, y_true=None):
"""绘制分类模型图
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_test.
y_true (np.ndarray, optional): 放入的测试数据的真实标签, 不填默认使用self.y_test.
"""
# 如果没有任何输入,默认采用x_test和y_test
if X is None:
assert len(
self.x_test) > 0, "Error Code: -602. Dataset split was not performed."
X = self.x_test
X = self.convert_np(X)
y_pred = self.inference(X)
if y_true is not None:
y_true = self.convert_np(y_true)
        X = X.reshape(X.shape[0], -1)  # reshape to 2-D
if self.algorithm == 'KNN':
self.knn_plot(X, y_pred, y_true)
elif self.algorithm == 'SVM':
self.svm_plot(X, y_pred, y_true)
else:
raise AssertionError(
"Error Code: -405. No implementation of this method.")
def knn_plot(self, X, y_pred, y_true=None):
"""绘制KNN分类图, 不同标签的样本用不同颜色点代替。选择2维特征作为xy坐标, 最多选择5个类别进行可视化。
Args:
X (np.ndarray): 放入的测试数据。
y_pred (np.ndarray): 放入的测试数据的预测标签。
y_true (np.ndarray, optional): 放入的测试数据的真实标签。
"""
        # more than 2 feature dimensions: only the first two are used
if X.shape[1] > 2:
print('\033[1;34;1mFeatures is more than 2 dimensions, '
'the first two dimensions are used by default.\033[0m')
label = np.unique(y_pred)
        # visualize at most 5 classes
if len(label) > 5:
label.sort()
label = label[:5]
y_max = label[4]
idx = np.where(y_pred <= y_max)
y_pred = y_pred[idx]
X = X[idx, :].squeeze()
print('\033[1;34;1mThe number of classes is more than 5, '
'the top 5 classes are used by default.\033[0m')
label_list = ["y_pred_" + str(i) for i in range(len(label))]
y_pred_plot = plt.scatter(
X[:, 0], X[:, 1], marker='o', c=y_pred, cmap='rainbow')
handles = y_pred_plot.legend_elements()[0]
        # y_true is only drawn when passed explicitly
if y_true is not None:
true_label = np.unique(y_true)
true_label_list = ["y_true_" + str(i)
for i in range(len(true_label))]
y_true_plot = plt.scatter(
X[:, 0], X[:, 1], marker='s', c=y_true, cmap='viridis', s=10)
handles += y_true_plot.legend_elements()[0]
label_list += true_label_list
plt.legend(handles=handles, labels=label_list)
plt.show()
def svm_plot(self, X, y_pred, y_true=None):
"""绘制SVM分类图, 不同标签的样本用不同颜色点代替, 绘制出SVM分类边界。选择2维特征作为xy坐标。
Args:
X (np.ndarray): 放入的测试数据。
y_pred (np.ndarray): 放入的测试数据的预测标签。
y_true (np.ndarray, optional): 放入的测试数据的真实标签。
"""
assert self.model.n_features_in_ == 2, "Error Code: -306. "\
"The number of features for training is wrong, required {}, "\
"which is {}.".format(2, self.model.n_features_in_)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap="rainbow")
if y_true is not None:
ax.scatter(X[:, 0], X[:, 1], c=y_true,
s=8, cmap="viridis", marker='s')
xlim = ax.get_xlim()
ylim = ax.get_ylim()
        x = np.linspace(xlim[0], xlim[1], 30)  # 30 points across x
        y = np.linspace(ylim[0], ylim[1], 30)  # 30 points across y
_Y, _X = np.meshgrid(y, x)
z = self.model.predict(np.c_[_X.flatten(), _Y.flatten()])
zz = z.reshape(_X.shape)
ax.contour(_X, _Y, zz, colors="k",
levels=[-1, 0, 1], alpha=0.5, linestyles=["--", "-", "--"])
ax.set_xlim(xlim)
ax.set_ylim(ylim)
plt.show()
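The same flow for Classification, using sklearn's bundled iris data so the sketch runs as-is (assuming BaseML is importable):

from sklearn.datasets import load_iris
from BaseML import Classification

iris = load_iris()
model = Classification('KNN', n_neighbors=3)
model.load_dataset(iris.data, iris.target, type='numpy')  # 80/20 split
model.train(validate=True)    # holds out 20% of the training data and prints the accuracy
print(model.inference(iris.data[:3]))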
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import InterclusterDistance
import matplotlib.pyplot as plt
from .base import baseml
class Cluster(baseml): # cluster
"""BaseML中的聚类模块,包含['Kmeans'(K均值聚类), 'Spectral clustering'(谱聚类), 'Agglomerative clustering'(层次聚类),
'Birch'(二叉可伸缩聚类树聚类)]聚类算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='Kmeans', N_CLUSTERS=5, para={}):
"""clt类初始化
Args:
algorithm (str, optional): 采用的聚类算法. Defaults to 'Kmeans'.
N_CLUSTERS (int, optional): 聚类个数. Defaults to 5.
para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(Cluster, self).__init__()  # run the parent constructor
self.algorithm = algorithm
self.n = N_CLUSTERS
if self.algorithm == 'Kmeans':
if len(para) > 1:
self.model = KMeans(**para)
else:
self.model = KMeans(n_clusters=N_CLUSTERS)
elif self.algorithm == 'Spectral clustering':
if len(para) > 1:
self.model = SpectralClustering(**para)
else:
self.model = SpectralClustering(n_clusters=N_CLUSTERS)
elif self.algorithm == 'Agglomerative clustering':
if len(para) > 1:
self.model = AgglomerativeClustering(**para)
else:
self.model = AgglomerativeClustering(n_clusters=N_CLUSTERS)
elif self.algorithm == 'Birch':
if len(para) > 1:
self.model = Birch(**para)
else:
self.model = Birch(n_clusters=N_CLUSTERS)
def train(self, validate=False):
"""训练模型.
Args:
validate (bool, optional): 是否需要验证模型,并输出模型轮廓系数. Defaults to True.
"""
self.model.fit(self.x_train)
if validate:
score = silhouette_score(self.x_train, labels=self.model.labels_)
print('轮廓系数为:{}'.format(score)) # -1为不正确的聚类,0为重叠聚类,1为正确的聚类
def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
shuffle=True, show=False, split=False, scale=False):
        # clustering does not split the dataset by default
super().load_dataset(X, y, type, x_column, y_column, shuffle, show, split, scale)
def inference(self, data=np.nan, verbose=True):
"""使用模型进行推理
Args:
data (numpy, optional): 放进来推理的数据,不填默认使用self.x_train.
verbose (bool, optional): 是否输出推理中的中间结果. Defaults to True.
Returns:
pred: 返回预测结果.
"""
        if data is not np.nan:  # data was explicitly given
self.x_test = data
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
assert type(self.demo_input) == type(self.x_test), f"Error Code: -309. The data type {type(self.x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == self.x_test.shape[1:], f"Error Code: -309. The data shape {self.x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
else:
self.x_test = self.x_train
self.x_test = self.convert_np(self.x_test)
        if verbose and len(self.x_train) != 0:
            labels = self.model.labels_  # cluster labels of the training data
            # print(silhouette_score(self.x_train, labels))  # overall silhouette score of the clustering
            if self.algorithm == 'Kmeans':
                print(self.model.cluster_centers_)  # print the cluster centers
for i in range(self.n):
print(f" CLUSTER-{i+1} ".center(60, '='))
print(self.x_train[labels == i])
        if len(self.x_test) > 0:
pred = self.model.predict(self.x_test)
return pred
def metricplot(self, X=None):
"""绘制模型聚类簇间距离图, 各簇分的越开, 说明聚类效果越好。
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_train.
"""
assert self.algorithm == 'Kmeans', \
"Error Code: -405. No implementation of this method."
if X is None:
assert len(
self.x_train) > 0, "Error Code: -601. No dataset is loaded."
X = self.x_train
visualizer = InterclusterDistance(self.model)
visualizer.fit(self.x_train) # Fit the data to the visualizer
visualizer.show() # Finalize and render the figure
def plot(self, X=None):
"""绘制聚类模型图
Args:
X (np.ndarray, optional): 放入的测试数据, 不填默认使用self.x_train.
"""
assert self.algorithm == 'Kmeans', \
"Error Code: -405. No implementation of this method."
# 如果没有任何输入,默认采用x_train
if X is None:
if len(self.x_train) > 0:
self.x_test = self.x_train
assert len(
self.x_test) > 0, "Error Code: -602. Dataset split was not performed."
X = self.x_test
X = self.convert_np(X)
y_pred = self.inference(X)
self.cluster_plot(X, y_pred)
def cluster_plot(self, X, y_pred):
"""绘制聚类模型散点图,并显示聚类标签
Args:
X (np.ndarray): 放入的测试数据, 不填默认使用self.x_train.
y_pred (np.ndarray): 模型对测试数据预测的类别.
"""
# 训练数据特征多于2维,仅取前两维
if X.shape[1] > 2:
print('\033[1;34;1mfeatures is more than 2 dimensions, \
the first two dimensions are used by default\033[0m')
        # draw points in a different color per cluster
        plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=50, cmap='viridis')
        # draw the cluster centers
        centers = self.model.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], c='black', s=20, alpha=0.5)
        # annotate each cluster with its index
        for i in range(self.model.cluster_centers_.shape[0]):
            plt.text(centers[:, 0][i]+0.5, y=centers[:, 1][i]+0.5, s=i,
                     fontdict=dict(color='red', size=10),
                     bbox=dict(facecolor='yellow', alpha=0.2),
                     )
        plt.show()  # render the figure, matching the other plot helpers
    def valid(self, path=None, x=None, y=None, metrics='accuracy'):
        """Evaluate the clustering model.
        Args:
            path (str): path to the validation set.
            x (np.ndarray, optional): validation features. Defaults to None.
            y (np.ndarray, optional): validation labels. Defaults to None.
            metrics (str, optional): evaluation metric. Defaults to 'accuracy'.
        Returns:
            score: the value of the chosen metric
            y_pred: the predicted cluster labels
        """
        if path is None and x is None and y is None:  # default to x_train and y_train when nothing is given
x = self.x_train
y = self.y_train
        elif x is None and y is None:  # a path was given but no data; read from the path
df = pd.read_csv(path)
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
self.x_test = x
self.y_test = y
        # the validation features and labels must not be empty
assert x is not None and y is not None, "Error Code: -801. The validation set cannot be empty. "
assert len(x) > 0 and len(y) > 0, "Error Code: -801. The validation set cannot be empty. "
y_pred = self.inference(x)
from sklearn.metrics import silhouette_score, calinski_harabasz_score,davies_bouldin_score
        if metrics == 'silhouette_score':
            score = silhouette_score(x, self.model.labels_)
            print('Validation silhouette score: {}'.format(score))
        elif metrics == 'calinski_harabasz_score':
            score = calinski_harabasz_score(x, self.model.labels_)
            print('Validation Calinski-Harabasz index: {}'.format(score))
        elif metrics == 'davies_bouldin_score':
            score = davies_bouldin_score(x, self.model.labels_)
            print('Validation Davies-Bouldin index: {}'.format(score))
else:
raise AssertionError("Error Code: -307. The '{}' metric is not currently supported.".format(metrics))
        return score, y_pred
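A sketch of the Cluster class on random 2-D points (made-up data; assumes BaseML is importable). Note that Cluster.load_dataset defaults to split=False, so everything stays in x_train:

import numpy as np
from BaseML import Cluster

pts = np.random.rand(100, 2)
model = Cluster('Kmeans', N_CLUSTERS=3)
model.load_dataset(pts, type='numpy')
model.train(validate=True)               # prints the silhouette score
labels = model.inference(verbose=False)  # cluster index for every point
print(labels[:10])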
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import LocallyLinearEmbedding as LLE
from yellowbrick.features import PCA as yb_PCA
from .base import baseml
class DimentionReduction(baseml): # reduction
"""BaseML中的降维模块,包含['PCA'(主成分分析), 'LDA'(线性判别分析), 'LLE'(局部线性嵌入)]降维算法.
Attributes:
algorithm: 算法名称
model: 实例化的模型
更多用法及算法详解可参考:https://xedu.readthedocs.io/zh/master/baseml/introduction.html
"""
def __init__(self, algorithm='PCA', n_components=2, para={}):
"""rdc类的构造函数
Args:
algorithm (str, optional): 采用的降维算法. Defaults to 'PCA'.
n_components (int, optional): 降维后保留的特征数. Defaults to 2.
para (dict, optional): para (dict, optional): 参数字典,可自定义参数放入,参数名称详见sklearn官方文档. Defaults to {}.
"""
        super(DimentionReduction, self).__init__()  # run the parent constructor
self.algorithm = algorithm
self.n_components = n_components
        if self.algorithm == 'PCA':  # principal component analysis
if len(para) > 1:
self.model = PCA(**para)
else:
self.model = PCA(n_components=n_components)
        elif self.algorithm == 'LDA':  # linear discriminant analysis
if len(para) > 1:
self.model = LDA(**para)
else:
self.model = LDA(n_components=n_components)
        elif self.algorithm == 'LLE':  # locally linear embedding
if len(para) > 1:
self.model = LLE(**para)
else:
self.model = LLE(n_components=n_components)
def train(self, validate=True):
"""训练模型.
Args:
validate (bool, optional): 是否需要验证模型,并输出方差贡献率. Defaults to True.
"""
if self.algorithm == 'LDA':
if len(self.y_train) == 0:
raise Exception("使用LDA时必须输入y标签")
self.model.fit(self.x_train, self.y_train)
else:
self.model.fit(self.x_train)
if validate and self.algorithm != 'LLE':
explained_var = self.model.explained_variance_ratio_ # 获取贡献率
print('累计方差贡献率为:{}'.format(explained_var))
def inference(self, data=np.nan):
"""使用模型进行降维
Args:
data (numpy, optional): 放进来降维的数据,不填默认使用self.x_train.
Returns:
pred: 返回降维结果,保留的特征数为刚开始输进来的.
"""
if data is not np.nan: # 对data进行了指定
self.x_test = data
if self.input_shape is not None:
model_input_shape = str(self.input_shape).replace(str(self.input_shape[0]), 'batch')
x_test = self.convert_np(self.x_test)
assert type(self.demo_input) == type(x_test), f"Error Code: -309. The data type {type(x_test)} doesn't match the model input type {type(self.demo_input)}. Example input: {self.demo_input.tolist()}."
assert self.input_shape[1:] == x_test.shape[1:], f"Error Code: -309. The data shape {x_test.shape} doesn't match the model input shape {model_input_shape}. Example input: {self.demo_input.tolist()}."
else:
self.x_test = self.x_train
self.x_test = self.convert_np(self.x_test)
        if len(self.x_test) > 0:
pred = self.model.transform(self.x_test)
return pred
def fit_transform(self):
        # fit and transform in one step, returning the reduced data
return self.model.fit_transform(self.x_train)
def load_dataset(self, X, y=[], type=None, x_column=[], y_column=[],
shuffle=True, show=False, split=False, scale=False):
        # dimensionality reduction does not split the dataset by default
super().load_dataset(X, y, type, x_column, y_column, shuffle, show, split, scale)
def plot(self, X=None, y_true=None):
"""绘制降维模型图, 目前仅支持PCA.
Args:
X (np.ndarray, optional): 放入的测试数据, 不输入默认使用self.x_train.
y_true (_type_, optional): 测试数据的真实标签,, 不输入默认使用self.y_train.
"""
assert self.algorithm == 'PCA', "Error Code: -405. No implementation of this method."
# 如果没有任何输入,默认采用x_train
if X is None:
if len(self.x_train) > 0:
self.x_test = self.x_train
assert len(
self.x_test) > 0, "Error Code: -601. No dataset is loaded."
X = self.x_test
y_true = self.y_train
X = self.convert_np(X)
assert y_true is not None and len(y_true) > 0, \
"Error Code: -307. The parameter {} is not set.".format("y_true")
y_true = self.convert_np(y_true)
self.pca_projection(X, y_true)
def pca_projection(self, X, y_true):
"""绘制PCA投影图, 能够投影至2维或3维中, 检验数据降维的可行性
Args:
X (np.ndarray, optional): 放入的测试数据, 不输入默认使用self.x_train.
y_true (_type_, optional): 测试数据的真实标签,, 不输入默认使用self.y_train.
"""
proj = self.n_components
proj = min(proj, 3)
label = np.unique(y_true)
classes = ['class_%i' % i for i in range(len(label))]
visualizer = yb_PCA(scale=True, projection=proj, classes=classes)
visualizer.fit_transform(X, y_true.squeeze())
visualizer.show()
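And a sketch of DimentionReduction with PCA on the iris features (assuming BaseML is importable):

from sklearn.datasets import load_iris
from BaseML import DimentionReduction

iris = load_iris()
model = DimentionReduction('PCA', n_components=2)
model.load_dataset(iris.data, type='numpy')  # no split for dimensionality reduction
model.train()                                # prints the explained variance ratios
reduced = model.inference()
print(reduced.shape)                         # (150, 2)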
Reference example
from BaseML import Regression as reg                      # import the regression task module
model = reg('LinearRegression')                           # instantiate a linear regression model
model.load_tab_data('data/shoe_size-height1.csv')         # load the training data
model.train()                                             # train the model
model.valid('data/shoe_size-height1.csv', metrics='r2')   # evaluate the model
model.metricplot()                                        # visualize the evaluation result
model.plot()                                              # visualize the model's predictions
model.save('data/mymodel.pkl')                            # save the model

# fill in an actual value here
shoe_size = 40.5
result = model.inference([shoe_size])                     # predict the height
print("\nFor shoe size", shoe_size, "the predicted height is:", f"{result[0]:.2f}")
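A follow-up sketch (hypothetical paths, mirroring the example above): reload the saved checkpoint into a fresh instance and run inference without retraining:

from BaseML import Regression as reg

model = reg('LinearRegression')
model.load('data/mymodel.pkl')    # restores the estimator saved by model.save()
print(model.inference([40.5]))    # same prediction as before saving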

浙公网安备 33010602011771号