各函数包与函数模块之间的所属关系如图:
注意:所有函数包以及 Notebook 文件必须放在同一个父文件夹下(互为同级文件),只有这样才能顺利导入并调用所需函数。
![]()
![]()
各函数包如下:
kNN
1 import numpy as np
2 from math import sqrt
3 from collections import Counter
4
5
class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k):
        """Remember the neighbour count ``k``; no training data is attached yet."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training samples and their labels, then return ``self``."""
        n_samples = X_train.shape[0]
        assert n_samples == y_train.shape[0],\
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= n_samples,\
            "the size of X_train must be at least k ."

        self._X_train, self._y_train = X_train, y_train
        return self

    def predict(self, X_predict):
        """Return an array holding one predicted label per row of ``X_predict``."""
        assert not (self._X_train is None or self._y_train is None),\
            " must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1],\
            " the feature number of X_predict must be equal to X_train"

        labels = []
        for row in X_predict:
            labels.append(self._predict(row))
        return np.array(labels)

    def _predict(self, x):
        """Return the predicted label for the single sample ``x``."""
        assert x.shape[0] == self._X_train.shape[1],\
            "the feature number of x must be equal to x_train"

        # Euclidean distance from x to every stored training sample.
        distances = []
        for sample in self._X_train:
            offset = sample - x
            distances.append(sqrt(np.sum(offset ** 2)))

        # Labels of the k closest samples, nearest first.
        order = np.argsort(distances)
        neighbour_labels = [self._y_train[idx] for idx in order[:self.k]]

        winner, _count = Counter(neighbour_labels).most_common(1)[0]
        return winner

    def __repr__(self):
        return "KNN(k=%d)" % (self.k,)
KNN_function
1 # KNN_classify()
2 import numpy as np
3 from math import sqrt
4 from collections import Counter
5
6
def KNN_classify(k, X_trian, y_trian, x):
    """Classify one sample by majority vote among its k nearest training samples.

    Args:
        k: number of neighbours that vote; must satisfy 1 <= k <= number of
            training samples.
        X_trian: 2-D array of training samples, one row per sample.
        y_trian: labels, one per row of ``X_trian``.
        x: 1-D feature vector to classify; its length must equal the number
            of columns of ``X_trian``.

    Returns:
        The most common label among the k training samples closest to ``x``
        (Euclidean distance).

    Raises:
        AssertionError: if any of the size checks above fails.
    """
    assert 1 <= k <= X_trian.shape[0], ' k must be valid'
    assert X_trian.shape[0] == y_trian.shape[0],\
        'the size of X_trian must equal to the size of y_trian '
    assert X_trian.shape[1] == x.shape[0],\
        "the feature number of must be equal to X_trian "

    # Square each coordinate difference BEFORE summing. The previous
    # ``np.sum(x_trian - x) ** 2`` squared the sum of the differences, which
    # lets positive and negative offsets cancel and is not a distance.
    distances = [sqrt(np.sum((x_trian - x) ** 2)) for x_trian in X_trian]
    nearest = np.argsort(distances)
    topk_y = [y_trian[i] for i in nearest[:k]]
    votes = Counter(topk_y)
    return votes.most_common(1)[0][0]
19
20
21 print(" KNN_classify 已加載.".replace("載", "载"))  # placeholder
playML:
kNN.py
1 import numpy as np
2 from math import sqrt
3 from collections import Counter
4 from .metrics import accuracy_score # from .metrics 报错
5
6
class KNNClassifier:
    """kNN classifier (Euclidean distance, majority vote) with an accuracy ``score``."""

    def __init__(self, k):
        """Keep the neighbour count ``k``; training data is supplied later via ``fit``."""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Memorise the training set and return ``self``."""
        sample_count = X_train.shape[0]
        assert sample_count == y_train.shape[0],\
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= sample_count,\
            "the size of X_train must be at least k ."

        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict`` and return them as an array."""
        fitted = self._X_train is not None and self._y_train is not None
        assert fitted,\
            " must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1],\
            " the feature number of X_predict must be equal to X_train"

        return np.array(list(map(self._predict, X_predict)))

    def _predict(self, x):
        """Predict the label of one sample ``x``."""
        assert x.shape[0] == self._X_train.shape[1],\
            "the feature number of x must be equal to x_train"

        # One Euclidean distance per stored training row.
        distances = [sqrt(np.sum((row - x) ** 2)) for row in self._X_train]
        closest_first = np.argsort(distances)

        # Tally the labels of the k closest rows and return the most frequent one.
        tally = Counter(self._y_train[j] for j in closest_first[:self.k])
        top_label, _ = tally.most_common(1)[0]
        return top_label

    def score(self, X_test, y_test):
        """Return ``accuracy_score`` of the predictions for ``X_test`` against ``y_test``."""
        predictions = self.predict(X_test)
        return accuracy_score(y_test, predictions)

    def __repr__(self):
        return "KNN(k=%d)" % (self.k,)
LinearRegression.py
1 import numpy as np
2 from .metrics import r2_score
3 # 源名需加kNN
4
class LinearRegression:
    """Linear regression fitted by the normal equation, batch gradient descent or SGD.

    After any ``fit_*`` call ``_theta`` holds the full parameter vector,
    ``interception_`` is ``_theta[0]`` and ``coef_`` is ``_theta[1:]``.
    """

    def __init__(self):
        """Create an unfitted model; every parameter starts as ``None``."""
        # This method used to be spelled ``__int__`` and therefore never ran,
        # so ``predict`` on an unfitted model failed with AttributeError
        # instead of the intended "must fit before predict!" assertion.
        self.coef_ = None
        self.interception_ = None
        self._theta = None

    def fit_normal(self, X_train, y_train):
        """Fit with the closed-form normal equation and return ``self``."""
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        # Prepend a column of ones so that theta[0] acts as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        self._theta = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y_train)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def fit_gd(self, X_train, y_train, eta=0.001, n_iters=1e4):
        """Fit with batch gradient descent on the mean squared error.

        Args:
            X_train: 2-D training matrix, one row per sample.
            y_train: targets, one per row of ``X_train``.
            eta: learning rate.
            n_iters: maximum number of gradient steps.

        Returns:
            ``self``.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"

        def J(theta, X_b, y):
            """Mean squared error for ``theta``; ``inf`` if the arithmetic raises."""
            try:
                return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
            except (OverflowError, FloatingPointError):
                return float('inf')

        def dJ(theta, X_b, y):
            """Gradient of ``J`` with respect to ``theta`` (vectorised)."""
            return X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e5, epsilon=1e-8):
            """Step along ``-dJ`` until the cost change drops below ``epsilon``
            or ``n_iters`` steps have been taken."""
            theta = initial_theta
            i_iters = 0

            while i_iters < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient

                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                i_iters += 1

            return theta

        # Fixed: this line referenced the undefined name ``X_trian``.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        # Fixed: forward the caller's ``n_iters``; it used to be dropped, so
        # the inner default of 1e5 was always used.
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def fit_sgd(self, X_train, y_train, n_iters=5, t0=5, t1=50):
        """Fit with stochastic gradient descent and return ``self``.

        Args:
            X_train: 2-D training matrix, one row per sample.
            y_train: targets, one per row of ``X_train``.
            n_iters: number of full passes over the shuffled training set.
            t0, t1: the learning rate at step ``t`` is ``t0 / (t + t1)``.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"
        assert n_iters >= 1,\
            "the size of n_iters must >= 1"

        def dJ_sgd(theta, X_b_i, y_i):
            """Squared-error gradient computed from a single sample."""
            return X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2

        def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):

            def learning_rate(t):
                return t0 / (t + t1)

            theta = initial_theta
            m = len(X_b)

            for cur_iter in range(n_iters):
                # Reshuffle on every pass so each sample is visited exactly once.
                indexes = np.random.permutation(m)
                X_b_new = X_b[indexes]
                y_new = y[indexes]
                for i in range(m):
                    gradient = dJ_sgd(theta, X_b_new[i], y_new[i])
                    # The global step index keeps the learning rate decaying
                    # across passes.
                    theta = theta - learning_rate(cur_iter * m + i) * gradient
            return theta

        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = sgd(X_b, y_train, initial_theta, n_iters, t0, t1)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict(self, X_predict):
        """Return the predicted target for every row of ``X_predict``."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return X_b.dot(self._theta)

    def score(self, X_test, y_test):
        """Return ``r2_score`` of the predictions for ``X_test`` against ``y_test``."""
        y_predict = self.predict(X_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "LinearRegression()"
LogistcRegression.py
1 import numpy as np
2 from .metrics import accuracy_score
3 # 源名需加kNN
4
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    After ``fit``, ``_theta`` holds the full parameter vector,
    ``interception_`` is ``_theta[0]`` and ``coef_`` is ``_theta[1:]``.
    """

    def __init__(self):
        """Create an unfitted model; every parameter starts as ``None``."""
        # This method used to be spelled ``__int__`` and therefore never ran,
        # so the "must fit before predict!" assertions could not trigger.
        self.coef_ = None
        self.interception_ = None
        self._theta = None

    def _sigmoid(self, t):
        """Logistic function ``1 / (1 + exp(-t))``."""
        return 1 / (1 + np.exp(-t))

    def fit(self, X_train, y_train, eta=0.001, n_iters=1e4):
        """Fit the model by gradient descent on the cross-entropy loss.

        Args:
            X_train: 2-D training matrix, one row per sample.
            y_train: labels, one per row of ``X_train``; ``predict`` emits
                0/1, so the labels are expected to be 0/1 as well.
            eta: learning rate.
            n_iters: maximum number of gradient steps.

        Returns:
            ``self``.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must equal to the size of y_train"

        def J(theta, X_b, y):
            """Mean cross-entropy for ``theta``; ``inf`` if the arithmetic raises."""
            y_hat = self._sigmoid(X_b.dot(theta))
            try:
                return - np.sum(y*np.log(y_hat) + (1-y)*np.log(1-y_hat)) / len(y)
            except (OverflowError, FloatingPointError):
                return float('inf')

        def dJ(theta, X_b, y):
            """Gradient of ``J``: ``X_b.T @ (sigmoid(X_b @ theta) - y) / m``."""
            # The factor ``* 2`` was removed: it belongs to the squared-error
            # gradient and made this twice the true cross-entropy gradient.
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e5, epsilon=1e-8):
            """Step along ``-dJ`` until the cost change drops below ``epsilon``
            or ``n_iters`` steps have been taken."""
            theta = initial_theta
            i_iters = 0

            while i_iters < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient

                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                i_iters += 1

            return theta

        # Prepend a column of ones so that theta[0] acts as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        # Fixed: forward the caller's ``n_iters``; it used to be dropped, so
        # the inner default of 1e5 was always used.
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)

        self.interception_ = self._theta[0]
        self.coef_ = self._theta[1:]

        return self

    def predict_proba(self, X_predict):
        """Return the sigmoid output (probability of class 1) for each row."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Return 0/1 labels: 1 where the predicted probability is >= 0.5."""
        assert self.interception_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"

        proba = self.predict_proba(X_predict)
        return np.array(proba >= 0.5, dtype='int')

    def score(self, X_test, y_test):
        """Return ``accuracy_score`` of the predictions for ``X_test`` against ``y_test``."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "LogisticRegression()"
metrics.py
1 import numpy as np
2 from math import sqrt
3
4
def accuracy_score(y_ture, y_predict):
    """Return the fraction of positions where ``y_ture`` equals ``y_predict``.

    Args:
        y_ture: array of ground-truth labels.
        y_predict: array of predicted labels with the same length.

    Raises:
        AssertionError: if the two arrays differ in length.
    """
    # The message used to sit on its own line as a no-op string statement,
    # so a failing assertion carried no explanation.
    assert y_ture.shape[0] == y_predict.shape[0], \
        "the size of y_ture must be equal to the size of y_predict"

    # np.sum counts the matches in NumPy rather than element by element.
    return np.sum(y_ture == y_predict) / len(y_ture)
11
12
def mean_squared_error(y_ture, y_predict):
    """Return the mean of the squared differences between the two arrays (MSE)."""
    n_samples = len(y_ture)
    assert n_samples == len(y_predict), \
        "the size of y_ture must be equal to the size of y_predict "
    residuals = y_ture - y_predict
    return np.sum(residuals ** 2) / n_samples
18
19
def root_mean_squared_error(y_ture, y_predict):
    """Return the square root of ``mean_squared_error`` for the two arrays (RMSE)."""
    assert len(y_ture) == len(y_predict), \
        "the size of y_ture must be equal to the size of y_predict "
    mse = mean_squared_error(y_ture, y_predict)
    return sqrt(mse)
25
26
def mean_absolute_error(y_ture, y_predict):
    """Return the mean of the absolute differences between the two arrays (MAE)."""
    assert len(y_ture) == len(y_predict), \
        "the size of y_ture must be equal to the size of y_predict "
    absolute_errors = np.absolute(y_ture - y_predict)
    total = np.sum(absolute_errors)
    return total / len(y_predict)
32
33
def r2_score(y_ture, y_predict):
    """Return R squared: one minus the MSE divided by the variance of ``y_ture``."""
    mse = mean_squared_error(y_ture, y_predict)
    variance = np.var(y_ture)
    return 1 - mse / variance
37
def TN(y_ture, y_predict):
    """Count true negatives: positions labelled 0 that are also predicted 0."""
    assert len(y_ture) == len(y_predict)
    actual_negative = y_ture == 0
    predicted_negative = y_predict == 0
    return np.sum(actual_negative & predicted_negative)
41
def FP(y_ture, y_predict):
    """Count false positives: positions labelled 0 but predicted 1."""
    assert len(y_ture) == len(y_predict)
    actual_negative = y_ture == 0
    predicted_positive = y_predict == 1
    return np.sum(actual_negative & predicted_positive)
45
def FN(y_ture, y_predict):
    """Count false negatives: positions labelled 1 but predicted 0."""
    assert len(y_ture) == len(y_predict)
    actual_positive = y_ture == 1
    predicted_negative = y_predict == 0
    return np.sum(actual_positive & predicted_negative)
49
def TP(y_ture, y_predict):
    """Count true positives: positions labelled 1 that are also predicted 1."""
    assert len(y_ture) == len(y_predict)
    actual_positive = y_ture == 1
    predicted_positive = y_predict == 1
    return np.sum(actual_positive & predicted_positive)
53
def confusion_matrix(y_true, y_predict):
    """Return the 2x2 array ``[[TN, FP], [FN, TP]]`` for binary 0/1 labels."""
    actual_negative_row = [TN(y_true, y_predict), FP(y_true, y_predict)]
    actual_positive_row = [FN(y_true, y_predict), TP(y_true, y_predict)]
    return np.array([actual_negative_row, actual_positive_row])
59
def precision_score(y_true, y_predict):
    """Return precision ``TP / (TP + FP)``, or 0.0 when nothing was predicted positive.

    Args:
        y_true: array of ground-truth 0/1 labels.
        y_predict: array of predicted 0/1 labels with the same length.
    """
    tp = TP(y_true, y_predict)
    fp = FP(y_true, y_predict)
    predicted_positive = tp + fp
    # Test the denominator explicitly: with NumPy integers 0 / 0 yields nan
    # and a RuntimeWarning instead of raising, so the former try/except
    # fallback to 0.0 was never reached.
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive
67
def recall_score(y_true, y_predict):
    """Return recall ``TP / (TP + FN)``, or 0.0 when there are no actual positives.

    Args:
        y_true: array of ground-truth 0/1 labels.
        y_predict: array of predicted 0/1 labels with the same length.
    """
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    actual_positive = tp + fn
    # Test the denominator explicitly: with NumPy integers 0 / 0 yields nan
    # and a RuntimeWarning instead of raising, so the former try/except
    # fallback to 0.0 was never reached.
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive
75
def f1_score(precision, recall):
    """Return the harmonic mean ``2 * precision * recall / (precision + recall)``.

    Returns 0.0 when both scalar inputs are zero. Array inputs are evaluated
    element-wise by the formula without that guard, as before.
    """
    denominator = precision + recall
    # Guard the scalar 0 / 0 case explicitly. The former bare ``except`` only
    # caught it for Python floats; NumPy scalars returned nan with a
    # RuntimeWarning instead of raising.
    if np.ndim(denominator) == 0 and denominator == 0:
        return 0.0
    return 2 * precision * recall / denominator
81
def TPR(y_true, y_predict):
    """Return the true positive rate ``TP / (TP + FN)``, or 0.0 with no actual positives.

    Args:
        y_true: array of ground-truth 0/1 labels.
        y_predict: array of predicted 0/1 labels with the same length.
    """
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    actual_positive = tp + fn
    # Test the denominator explicitly: with NumPy integers 0 / 0 yields nan
    # and a RuntimeWarning instead of raising, so the former try/except
    # fallback to 0.0 was never reached.
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive
89
def FPR(y_true, y_predict):
    """Return the false positive rate ``FP / (FP + TN)``, or 0.0 with no actual negatives.

    Args:
        y_true: array of ground-truth 0/1 labels.
        y_predict: array of predicted 0/1 labels with the same length.
    """
    # Fixed: the rate was computed as TP / (TP + TN), which is not the false
    # positive rate. It is the share of actual negatives predicted positive.
    fp = FP(y_true, y_predict)
    tn = TN(y_true, y_predict)
    actual_negative = fp + tn
    # Test the denominator explicitly: with NumPy integers 0 / 0 yields nan
    # and a RuntimeWarning instead of raising an exception.
    if actual_negative == 0:
        return 0.0
    return fp / actual_negative
model_selection.py
1 import numpy as np
2
3
def train_test_split(X, y, test_radio=0.2, seed=None):
    """Randomly split ``X`` and ``y`` into training and test parts.

    Args:
        X: array of samples, indexed along its first axis.
        y: array of targets with the same first-axis length as ``X``.
        test_radio: fraction of the samples placed in the test part; must lie
            in [0.0, 1.0]. The test size is ``int(test_radio * len(X))``.
        seed: optional seed for NumPy's global random generator. Any value
            other than ``None`` is applied, including ``0``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: if the sizes differ or ``test_radio`` is out of range.
    """
    assert X.shape[0] == y.shape[0],\
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_radio <= 1.0,\
        "test_ration must be valid"

    # ``if seed:`` treated seed=0 as "no seed", making that split
    # non-reproducible; compare against None instead.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))

    test_size = int(test_radio * len(X))
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
PCA.py
1 import numpy as np
2
3
class PCA:
    """Principal component analysis computed by gradient ascent.

    After ``fit``, ``components_`` is an array of shape
    ``(n_components, n_features)`` holding one unit direction per row.
    """

    def __init__(self, n_components):
        """Remember how many principal components to extract."""
        assert n_components >= 1, "n_components must be valid"
        self.n_components = n_components
        # Was ``self.components`` (no trailing underscore), while every other
        # method reads ``components_``; the attribute was therefore undefined
        # until ``fit`` ran.
        self.components_ = None

    def fit(self, X, eta=0.01, n_iters=1e4):
        """Extract the first ``n_components`` principal directions of ``X``.

        Args:
            X: 2-D data matrix, one row per sample.
            eta: gradient-ascent step size.
            n_iters: maximum number of ascent steps per component.

        Returns:
            ``self``.
        """
        assert self.n_components <= X.shape[1], \
            "n_components must not be greater than feature number of X"

        def demean(X):
            """Centre every column on zero."""
            return X - np.mean(X, axis=0)

        def f(w, X):
            """Mean squared projection of the rows of ``X`` onto ``w``."""
            return np.sum((X.dot(w) ** 2)) / len(X)

        def df(w, X):
            """Gradient of ``f`` with respect to ``w``."""
            return X.T.dot(X.dot(w)) * 2.0 / len(X)

        def direction(w):
            """Scale ``w`` to unit length."""
            return w / np.linalg.norm(w)

        def first_componet(X, initial_w, eta, n_iters=1e4, epsilon=1e-8):
            """Gradient ascent on ``f`` over unit vectors, starting at ``initial_w``."""
            w = direction(initial_w)
            i_iters = 0

            while i_iters < n_iters:
                gradient = df(w, X)
                last_w = w
                w = w + eta * gradient
                # Re-normalise after every step so w stays a unit direction.
                w = direction(w)
                if abs(f(w, X) - f(last_w, X)) < epsilon:
                    break

                i_iters += 1

            return w

        X_pca = demean(X)
        self.components_ = np.empty(shape=(self.n_components, X.shape[1]))
        for i in range(self.n_components):
            initial_w = np.random.random(X_pca.shape[1])
            w = first_componet(X_pca, initial_w, eta, n_iters)
            self.components_[i, :] = w

            # Subtract the part of the data explained by w before looking for
            # the next component.
            X_pca = X_pca - X_pca.dot(w).reshape(-1, 1) * w

        return self

    def transform(self, X):
        """Project the rows of ``X`` onto the fitted components."""
        assert self.components_ is not None, "must fit before transform!"
        assert X.shape[1] == self.components_.shape[1]

        return X.dot(self.components_.T)

    def inverse_transform(self, X):
        """Map component coordinates back into the original feature space."""
        assert self.components_ is not None, "must fit before inverse_transform!"
        assert X.shape[1] == self.components_.shape[0]

        return X.dot(self.components_)

    def __repr__(self):
        return "PCA(n_components=%d)" % self.n_components
preprocessing.py
1 import numpy as np
2
3
class StandardScaler:
    """Standardise features: subtract the column mean, divide by the column std.

    After ``fit``, ``mean_`` and ``scale_`` are 1-D arrays with one entry per
    column of the training matrix.
    """

    def __init__(self):
        """Create an unfitted scaler."""
        # This method used to be spelled ``__int__`` and therefore never ran,
        # so the "must fit before transform!" assertion could not trigger.
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        """Compute the mean and standard deviation of every column of ``X``."""
        assert X.ndim == 2, "The dimension of X must be 2"

        # ``np.array(<generator expression>)`` wraps the generator in a 0-d
        # object array instead of evaluating it, so ``len(self.mean_)`` failed
        # later in ``transform``. Reduce along axis 0 instead.
        self.mean_ = np.mean(X, axis=0)
        self.scale_ = np.std(X, axis=0)

        return self

    def transform(self, X):
        """Return a float copy of ``X`` with every column standardised."""
        assert X.ndim == 2, "The dimension of X must be 2"
        assert self.mean_ is not None and self.scale_ is not None,\
            "must fit before transform!"
        assert X.shape[1] == len(self.mean_), \
            "The feature number of X must be equal to mean_ and std_"

        # Broadcasting applies the per-column statistics to every row.
        return (X - self.mean_) / self.scale_
SimpleLinearRegression.py
1 import numpy as np
2 from .metrics import r2_score # 加点下标运行报错,不加点下标jupyter 能运行
3
4
class SimpleLinearRegression1:
    """Single-feature least-squares regression ``y = a_ * x + b_`` (loop-based fit)."""

    def __init__(self):
        """Create an unfitted model; ``a_`` and ``b_`` start as ``None``."""
        # This method used to be spelled ``__int__`` and therefore never ran,
        # so ``predict`` on an unfitted model failed with AttributeError
        # instead of the intended "must fit before predict!" assertion.
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate slope ``a_`` and intercept ``b_`` from 1-D training data.

        Returns:
            ``self``.
        """
        assert x_train.ndim == 1, \
            " Simple Linear Regression can only solve single feature training data"
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        # Accumulate the numerator and denominator of the least-squares slope
        # one sample at a time.
        num = 0.0
        d = 0.0
        for x, y in zip(x_train, y_train):
            num += (x - x_mean) * (y - y_mean)
            d += (x - x_mean) ** 2

        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean

        return self

    def predict(self, x_predict):
        """Return an array with the prediction for every entry of 1-D ``x_predict``."""
        assert x_predict.ndim == 1, \
            "Simple Linear Regression can only solve single feature training data"
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"

        return np.array([self._predict(x) for x in x_predict])

    def _predict(self, x_single):
        """Return the prediction for a single value ``x_single``."""
        return self.a_ * x_single + self.b_

    def __repr__(self):
        return "Simple Linear Regression1()"
49
50
class SimpleLinearRegression2:
    """Single-feature least-squares regression ``y = a_ * x + b_`` (vectorised fit)."""

    def __init__(self):
        """Create an unfitted model; ``a_`` and ``b_`` start as ``None``."""
        # This method used to be spelled ``__int__`` and therefore never ran,
        # so ``predict`` on an unfitted model failed with AttributeError
        # instead of the intended "must fit before predict!" assertion.
        self.a_ = None
        self.b_ = None

    def fit(self, x_train, y_train):
        """Estimate slope ``a_`` and intercept ``b_`` from 1-D training data.

        Returns:
            ``self``.
        """
        assert x_train.ndim == 1, \
            " Simple Linear Regression can only solve single feature training data"
        assert len(x_train) == len(y_train), \
            "the size of x_train must be equal to the size of y_train"

        x_mean = np.mean(x_train)
        y_mean = np.mean(y_train)

        # Dot products replace the per-sample accumulation loop; the dead
        # ``num = 0.0`` / ``d = 0.0`` initialisations were dropped.
        x_centered = x_train - x_mean
        num = x_centered.dot(y_train - y_mean)
        d = x_centered.dot(x_centered)
        self.a_ = num / d
        self.b_ = y_mean - self.a_ * x_mean

        return self

    def predict(self, x_predict):
        """Return an array with the prediction for every entry of 1-D ``x_predict``."""
        assert x_predict.ndim == 1, \
            "Simple Linear Regression can only solve single feature training data"
        assert self.a_ is not None and self.b_ is not None, \
            "must fit before predict!"

        # ``_predict`` is plain arithmetic, so it applies to the whole array
        # at once.
        return np.asarray(self._predict(x_predict))

    def _predict(self, x_single):
        """Return the prediction for ``x_single`` (a scalar or an array)."""
        return self.a_ * x_single + self.b_

    def score(self, x_test, y_test):
        """Return ``r2_score`` of the predictions for ``x_test`` against ``y_test``."""
        y_predict = self.predict(x_test)
        return r2_score(y_test, y_predict)

    def __repr__(self):
        return "Simple Linear Regression2()"