Machine Learning, Chapter 2: An End-to-End Machine Learning Project

Notes on Hands-On Machine Learning with Scikit-Learn and TensorFlow

Reference: the author's Jupyter Notebook
Chapter 2 – End-to-end Machine Learning project

  1. Download the data

    • Open VS Code, create a new Python file, and enter the following code to download the housing.tgz file and extract housing.csv into the datasets/housing directory
    import os
    import tarfile
    from six.moves import urllib
    
    DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
    HOUSING_PATH = "datasets/housing"
    HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"
    
    def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
        if not os.path.isdir(housing_path):
            os.makedirs(housing_path)   # create the target directory if needed
        tgz_path = os.path.join(housing_path, "housing.tgz")
        urllib.request.urlretrieve(housing_url, tgz_path)   # download the archive
        housing_tgz = tarfile.open(tgz_path)
        housing_tgz.extractall(path=housing_path)   # extract housing.csv
        housing_tgz.close()
    
    fetch_housing_data()
    

    After the download completes, you can comment out the call to fetch_housing_data().

  2. Take a quick look at the data structure

    • Load the data with pandas
    import pandas as pd
    def load_housing_data(housing_path=HOUSING_PATH):
        csv_path = os.path.join(housing_path, "housing.csv")
        return pd.read_csv(csv_path)
    

    The function returns a pandas DataFrame object containing all the data.

    • Call the DataFrame's head() method to look at the first five rows (the output differs slightly from the book because we are running in VS Code rather than Jupyter); comment it out once you have inspected it
    housing = load_housing_data()
    print(housing.head())
    

    There are 10 attributes in total.

    • The info() method gives a quick description of the data, in particular the total number of rows and each attribute's type and number of non-null values
      print(housing.info())

    • Use the value_counts() method to see what categories exist and how many districts belong to each category
      print(housing["ocean_proximity"].value_counts())

    • The describe() method shows a summary of the numerical attributes
      print(housing.describe())

    • Call hist() on the whole dataset to plot a histogram for each attribute

    import matplotlib.pyplot as plt
    housing.hist(bins=50, figsize=(20,15))
    plt.show()
    
  3. Create a test set

    • In theory, creating a test set is simple: just pick some instances at random, typically 20% of the dataset, and set them aside:
    import numpy as np
    def split_train_test(data, test_ratio):
        shuffled_indices = np.random.permutation(len(data))
        test_set_size = int(len(data) * test_ratio)
        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]
        return data.iloc[train_indices], data.iloc[test_indices]
    
    train_set, test_set = split_train_test(housing, 0.2)
    print(len(train_set), "train +", len(test_set), "test")
    
    • But this is not perfect: if you run it again, it will generate a different test set! Over time, you (or your machine learning algorithms) will get to see the whole dataset, which is exactly what you want to avoid when creating a test set. A common solution is to use each instance's identifier to decide whether or not it goes into the test set (assuming each instance has a unique and immutable identifier)
    import hashlib
    def test_set_check(identifier, test_ratio, hash):
        # keep the instance in the test set if the last byte of its hash is
        # below 256 * test_ratio (i.e. roughly test_ratio of all instances)
        return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
    
    def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
        ids = data[id_column]
        in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
        return data.loc[~in_test_set], data.loc[in_test_set]
    
    #housing_with_id = housing.reset_index()
    #housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
    #train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")
    from sklearn.model_selection import train_test_split
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
    
    • Stratified sampling (compared with purely random sampling in the sketch after this code block)
    housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
    housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
    from sklearn.model_selection import StratifiedShuffleSplit
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    print(housing["income_cat"].value_counts() / len(housing))
    for set_ in (strat_train_set, strat_test_set):
        set_.drop(["income_cat"], axis=1, inplace=True)
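
    • To see why stratification matters, here is a minimal sketch (adapted from the author's notebook) that compares the income_cat proportions of the stratified test set against a purely random split; run it before the income_cat column is dropped above:
    def income_cat_proportions(data):
        return data["income_cat"].value_counts() / len(data)
    
    rand_train_set, rand_test_set = train_test_split(housing, test_size=0.2, random_state=42)
    compare_props = pd.DataFrame({
        "Overall": income_cat_proportions(housing),
        "Stratified": income_cat_proportions(strat_test_set),
        "Random": income_cat_proportions(rand_test_set),
    }).sort_index()
    compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
    compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
    print(compare_props)   # stratified sampling tracks the overall proportions much more closely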
    
  4. Explore and visualize the data

    • Create a copy of the training set to explore: housing = strat_train_set.copy()
    • Visualize the geographical data
    #housing.plot(kind="scatter", x="longitude", y="latitude")
    #housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"] / 100, label="population",
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,)
    plt.legend()
    plt.show()
    
    • Look for correlations
    #corr_matrix = housing.corr()
    #print(corr_matrix["median_house_value"].sort_values(ascending=False))
    from pandas.plotting import scatter_matrix   # the book imports this from pandas.tools.plotting, which no longer exists
    attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
    scatter_matrix(housing[attributes], figsize=(12, 8))
    housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
    plt.show()
    
  5. Experiment with attribute combinations

    housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
    housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
    housing["population_per_household"]=housing["population"]/housing["households"]
    corr_matrix = housing.corr()
    print(corr_matrix["median_house_value"].sort_values(ascending=False))
    
  6. Prepare the data for machine learning algorithms

    housing = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()
    
  7. Data cleaning (pick one of four options)

    #housing.dropna(subset=["total_bedrooms"])    # option 1: drop the districts with missing values
    #housing.drop("total_bedrooms", axis=1)       # option 2: drop the whole attribute
    #median = housing["total_bedrooms"].median()
    #housing["total_bedrooms"].fillna(median)     # option 3: fill with the median (note: these calls return new objects; assign the result or pass inplace=True)
    
    #option 4: Scikit-Learn's imputer, told to replace each attribute's missing values with that attribute's median
    from sklearn.impute import SimpleImputer   # differs from the book: the old Imputer class has since been replaced by SimpleImputer
    imputer = SimpleImputer(strategy="median")   # create an imputer instance
    housing_num = housing.drop("ocean_proximity", axis=1)   # create a copy of the data without the text attribute ocean_proximity
    imputer.fit(housing_num)   # fit the imputer instance to the training data
    #print(imputer.statistics_)
    #print(housing_num.median().values)
    X = imputer.transform(housing_num)   # replace missing values; returns a plain NumPy array
    housing_tr = pd.DataFrame(X, columns=housing_num.columns)   # put it back into a pandas DataFrame
    
  8. Handle text and categorical attributes

    #First convert these text labels to numbers; Scikit-Learn provides a transformer for this task, LabelEncoder:
    from sklearn.preprocessing import LabelEncoder
    encoder = LabelEncoder()
    housing_cat = housing["ocean_proximity"]
    housing_cat_encoded = encoder.fit_transform(housing_cat)
    #print(housing_cat_encoded)
    #print(encoder.classes_)
    
    #Scikit-Learn provides a OneHotEncoder to convert integer categorical values into one-hot vectors
    from sklearn.preprocessing import OneHotEncoder
    encoder = OneHotEncoder()
    housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
    #print(housing_cat_1hot.toarray())
    
    #The LabelBinarizer class performs both transformations in one shot
    from sklearn.preprocessing import LabelBinarizer
    encoder = LabelBinarizer()
    housing_cat_1hot = encoder.fit_transform(housing_cat)
    print(housing_cat_1hot)
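
    • Note: like SimpleImputer above, this part of the API has evolved. A minimal sketch of the current approach, assuming scikit-learn >= 0.20, where OneHotEncoder accepts string columns directly and no LabelEncoder step is needed:
    cat_encoder = OneHotEncoder()   # OneHotEncoder imported above
    # pass a 2-D DataFrame slice, not a 1-D Series
    housing_cat_1hot = cat_encoder.fit_transform(housing[["ocean_proximity"]])
    #print(cat_encoder.categories_)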
    
  9. Custom transformers

    from sklearn.base import BaseEstimator, TransformerMixin
    rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6   # column indices in the housing NumPy array
    class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
            self.add_bedrooms_per_room = add_bedrooms_per_room
        def fit(self, X, y=None):
            return self    #nothing else to do
        def transform(self, X, y=None):
            rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
            population_per_household = X[:, population_ix] / X[:, household_ix]
            if self.add_bedrooms_per_room:
                bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
                return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
            else:
                return np.c_[X, rooms_per_household, population_per_household]
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)
    
  10. Transformation pipelines

    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
        ])
    housing_num_tr = num_pipeline.fit_transform(housing_num)
    #print(housing_num_tr)
    
    from sklearn.compose import ColumnTransformer
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    
    housing_prepared = full_pipeline.fit_transform(housing)
    #print(housing_prepared)
    #print(housing_prepared.shape)
    
  11. Select and train a model

    • Train a linear regression model:
    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    #print(lin_reg)
    #try it out on a few instances
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    #print("Predictions:", lin_reg.predict(some_data_prepared))
    #print("Labels:", list(some_labels))
    #print(some_data_prepared)
    
    • Use Scikit-Learn's mean_squared_error function to measure the regression model's RMSE on the whole training set:
    from sklearn.metrics import mean_squared_error
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    #print(lin_rmse)
    from sklearn.metrics import mean_absolute_error
    lin_mae = mean_absolute_error(housing_labels, housing_predictions)
    #print(lin_mae)
    
    • Let's train a DecisionTreeRegressor (a decision tree).
    from sklearn.tree import DecisionTreeRegressor
    tree_reg = DecisionTreeRegressor(random_state=42)
    tree_reg.fit(housing_prepared, housing_labels)
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    #print(tree_rmse)    # probably severely overfitting the training data
    
    • Use cross-validation for a better evaluation
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    
    def display_scores(scores):
        print("Scores:", scores)
        print("Mean:", scores.mean())
        print("Standard deviation:", scores.std())
    #display_scores(tree_rmse_scores)
    
    • Compute the same scores for the linear regression model
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    #display_scores(lin_rmse_scores)
    
    • A random forest model: RandomForestRegressor
    from sklearn.ensemble import RandomForestRegressor
    forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    #print(forest_rmse)
    from sklearn.model_selection import cross_val_score
    forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    #display_scores(forest_rmse_scores)
    scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
    #print(pd.Series(np.sqrt(-scores)).describe())
    
  12. Fine-tune the model

  13. Grid search

    #You can use Scikit-Learn's GridSearchCV to do the exploring for you. All you need to do is tell it which hyperparameters you want to experiment with and what values to try, and it will evaluate every possible combination of hyperparameter values using cross-validation.
    #The following code searches for the best combination of hyperparameter values for a RandomForestRegressor:
    #When you have no idea what value a hyperparameter should have, a simple approach is to try consecutive powers of 10
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV
    param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]}, # try 12 (3×4) combinations of hyperparameters
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}, # then try 6 (2×3) combinations with bootstrap set as False
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(housing_prepared, housing_labels)
    #print(grid_search.best_params_)
    #print(grid_search.best_estimator_)
    
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)
    print(pd.DataFrame(grid_search.cv_results_))
    #Randomized search (see the sketch below)
    #Ensemble methods
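
    • The notes above only name randomized search. A minimal sketch of what it could look like with Scikit-Learn's RandomizedSearchCV; the parameter distributions below are illustrative assumptions, not values prescribed by the text:
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint
    
    # sample hyperparameter values from distributions instead of an exhaustive grid
    param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }
    forest_reg = RandomForestRegressor(random_state=42)
    rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                    n_iter=10, cv=5, scoring='neg_mean_squared_error',
                                    random_state=42)
    rnd_search.fit(housing_prepared, housing_labels)
    #print(rnd_search.best_params_)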
    
  14. Analyze the best models and their errors

    feature_importances = grid_search.best_estimator_.feature_importances_
    #print(feature_importances)
    #display these importance scores next to their corresponding attribute names:
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    #cat_encoder = cat_pipeline.named_steps["cat_encoder"] # old solution
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    sorted(zip(feature_importances, attributes), reverse=True)
    #print(sorted(zip(feature_importances, attributes), reverse=True))
    #evaluate the system on the test set
    from sklearn.metrics import mean_squared_error
    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    #print(final_rmse)
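
    • A follow-up found in the author's notebook: compute a 95% confidence interval for the test RMSE with scipy.stats (a sketch, assuming scipy is installed):
    from scipy import stats
    confidence = 0.95
    squared_errors = (final_predictions - y_test) ** 2
    # t-interval for the mean squared error, square-rooted to get an RMSE interval
    interval = np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                                        loc=squared_errors.mean(),
                                        scale=stats.sem(squared_errors)))
    #print(interval)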
    
  15. Launch, monitor, and maintain the system
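
    • The book suggests saving every model you experiment with so you can compare and roll back later. A minimal sketch using joblib; the file name my_model.pkl is just an example:
    import joblib   # older scikit-learn versions expose this as sklearn.externals.joblib
    joblib.dump(final_model, "my_model.pkl")   # persist the trained model
    # ... later, e.g. in production:
    my_model_loaded = joblib.load("my_model.pkl")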
