1.12 - Hand-Rolling KNN (k-Nearest Neighbors): Classification and Regression

1. The general workflow for solving a machine-learning problem with the scikit-learn API

 

# -*- coding: utf-8 -*-
import time
import joblib
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


def general_process_ML():
    # 1. Data collection and cleaning

    # 2. Load the data and labels
    names = ['x1', 'x2', 'x3', 'x4', 'y']
    data = pd.read_csv(r'./../data/iris.data', header=None, names=names)
    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1]
    label_encoder = LabelEncoder()
    Y = label_encoder.fit_transform(Y)  # encode the string class names as integers
    # print(data.describe())  # summary statistics
    # print('=' * 10)
    # print(data.head())  # first five rows
    # print('=' * 10)
    # print(data.isnull().any())  # confirm there are no missing values

    # 3. Split into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

    # 4. Feature engineering (e.g. normalization, standardization, word2vec)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)  # fit the scaler on the training set only
    x_test = scaler.transform(x_test)  # apply the same transform to the test set

    # 5. Model initialization
    # n_neighbors=3: use the 3 nearest neighbors; weights='uniform': every neighbor gets an equal vote;
    # algorithm='kd_tree': build a KD-tree index to speed up the neighbor search
    model = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='kd_tree')

    # 6. Model training
    train_start = time.time()
    model.fit(x_train, y_train)
    train_total_time = time.time() - train_start
    print(f"Training time: {train_total_time}s")

    # 7. Model evaluation
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    print(f"Accuracy on the training set: {train_score}")
    print(f"Accuracy on the test set: {test_score}")

    # 8. Model persistence & deployment: save the fitted scaler together with the model,
    # because new samples must be scaled the same way before prediction
    scaler_path = r"./../models/knn_scaler.joblib"
    knn_path = r"./../models/knn_model.joblib"
    joblib.dump(scaler, scaler_path)
    joblib.dump(model, knn_path)

    # Reload the saved artifacts when the model needs to be reused
    # reload_scaler = joblib.load(scaler_path)
    # reload_knn = joblib.load(knn_path)
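
To round the script off, here is a minimal sketch of the call site and of the reuse/deployment side, appended to the same script (it relies on the imports at its top). It assumes the joblib files saved above exist; the sample values are made up purely for illustration. The key point is that a new sample must pass through the same fitted StandardScaler before it reaches the KNN model.

if __name__ == '__main__':
    general_process_ML()

    # Reuse sketch: reload the persisted scaler and model (paths match the ones used when saving)
    reload_scaler = joblib.load(r"./../models/knn_scaler.joblib")
    reload_knn = joblib.load(r"./../models/knn_model.joblib")

    # A hypothetical new iris measurement: sepal length, sepal width, petal length, petal width
    new_sample = np.array([[5.1, 3.5, 1.4, 0.2]])
    new_sample_scaled = reload_scaler.transform(new_sample)  # scale with the SAME fitted scaler
    print(reload_knn.predict(new_sample_scaled))  # prints an encoded class label, e.g. [0]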

 

 

2. A hand-rolled KNN

The brute-force idea: for every query sample, compute its squared Euclidean distance to all training samples, keep the k closest, then take a majority vote over their labels (classification) or average them (regression). A usage sketch follows the class definition below.

# -*- coding: utf-8 -*-
import time
import joblib
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


class KNNModel:
    def __init__(self, k=3, classify=False):
        self.k = k  # number of neighbors (default 3 rather than None, so slicing never silently uses all samples)
        self.X = None  # training features; KNN is a "lazy" learner, it just memorizes them
        self.Y = None  # training labels / targets
        self.classify = classify  # True -> classification, False -> regression

    def fit(self, X, Y):
        # No real training step: simply store the training data
        self.X = np.array(X)
        self.Y = np.array(Y)

    def get_k_nearest_neighbors(self, X):
        # For each query sample, return the labels of its k nearest training samples
        X = np.array(X)  # also accepts DataFrames / lists
        near_neighbors = []
        for i in range(X.shape[0]):
            # brute force: squared Euclidean distance from X[i] to every training sample
            dist = [np.sum((X[i] - x) ** 2) for x in self.X]
            # pair each distance with its label, sort by distance, keep the k closest labels
            dist_list = sorted(zip(dist, self.Y), key=lambda pair: pair[0])
            near_neighbors.append([label for _, label in dist_list[:self.k]])
        return np.array(near_neighbors)

    def predict(self, X):
        # labels of the k nearest training samples for every sample in X
        nearest_k_neighbor = self.get_k_nearest_neighbors(X)
        if self.classify:
            # classification: majority vote among the k neighbor labels
            return np.array([Counter(item).most_common(1)[0][0] for item in nearest_k_neighbor])
        else:
            # regression: mean of the k neighbor targets
            return nearest_k_neighbor.astype(float).mean(axis=1)

    def score(self, X, Y):
        # classification accuracy: fraction of samples whose predicted label matches Y
        y_hat = self.predict(X)
        acc = np.mean(y_hat == np.array(Y))
        return acc
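
A usage sketch, assuming the imports at the top of this script, the KNNModel class above, and the same ./../data/iris.data file from part 1. The pipeline (k=3, test_size=0.2, random_state=1) simply mirrors part 1 so the hand-rolled classifier can be compared against sklearn's KNeighborsClassifier; the tiny regression example at the end uses made-up numbers.

if __name__ == '__main__':
    # Same preprocessing as in part 1
    names = ['x1', 'x2', 'x3', 'x4', 'y']
    data = pd.read_csv(r'./../data/iris.data', header=None, names=names)
    X = data.iloc[:, :-1]
    Y = LabelEncoder().fit_transform(data.iloc[:, -1])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Hand-rolled KNN classifier vs. scikit-learn's KNeighborsClassifier
    my_knn = KNNModel(k=3, classify=True)
    my_knn.fit(x_train, y_train)
    sk_knn = KNeighborsClassifier(n_neighbors=3)
    sk_knn.fit(x_train, y_train)
    print("hand-rolled KNN accuracy:", my_knn.score(x_test, y_test))
    print("sklearn KNN accuracy    :", accuracy_score(y_test, sk_knn.predict(x_test)))

    # Tiny regression example with made-up 1-D data: average the 2 nearest targets
    reg = KNNModel(k=2, classify=False)
    reg.fit([[1.0], [2.0], [3.0], [10.0]], [1.0, 2.0, 3.0, 10.0])
    print("regression prediction for x=2.5:", reg.predict([[2.5]]))  # ~ (2.0 + 3.0) / 2 = 2.5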

 
