The Complete Workflow of a Machine Learning Project

A complete ML Project

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Get data

data = pd.read_csv(path)  # path to the CSV file
# take a quick look at the columns
data.head()

# check dtypes and missing values
data.info()

# for object (or other non-numeric) columns, inspect the categories with value_counts()
data["xx"].value_counts()

data.describe()

# histograms: check whether the data has been scaled or capped, whether the units make sense (especially for important features and the target), and look at skewness/kurtosis, etc.
data.hist(bins=50, figsize=(20,15));

Create a Training/Test Set

If the training set is very large, you may want to sample an exploration set to make manipulations easy and fast.
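
A minimal sketch of sampling such an exploration set (the size here is an arbitrary example):

exploration_set = data.sample(n=1000, random_state=42)  # a smaller random subset for quick experiments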

random sampling methods: This is generally fine if your dataset is large enough (especially relative to the number of attributes); otherwise use stratified sampling.

from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)
# a manual implementation of the same split, for understanding
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

# hash each instance's identifier so the test set stays stable when the dataset is refreshed (new data won't get mixed into the old split)
import hashlib

def test_set_check(identifier, test_ratio, hash):
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
# when the data has no identifier column, the row index can be used (make sure new data is only appended and no existing rows ever get deleted):
housing_with_id = housing.reset_index()   # adds an `index` column
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")
# or build an id from stable features:
housing_with_id["id"] = housing["longitude"] * 1000 + housing["latitude"]
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

Stratified sampling: it is important to have a sufficient number of instances in your dataset for each stratum. This means that you should not have too many strata.

# the histogram shows that most incomes are concentrated between 2 and 5
housing["median_income"].hist(bins=50)
# the concentration can also be checked from the category proportions (after income_cat is created below)
housing["income_cat"].value_counts()/len(housing)

# to make the sample more representative, stratify; the feature is compressed below to limit the number of strata
# Divide by 1.5 to limit the number of income categories; use ceil to round up into discrete categories
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# pandas' where() differs from np.where: values not satisfying the condition (>= 5) are replaced by 5.0
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)
# once the strata are defined, sample with StratifiedShuffleSplit; n_splits is the number of train/test splits returned
# split() yields two arrays of indices
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]
# check that the test-set proportions match the overall proportions
strat_test_set["income_cat"].value_counts() / len(strat_test_set)
housing["income_cat"].value_counts() / len(housing)
# finally, drop the income_cat column that was created only for sampling
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

When it is unclear which sampling scheme to use, implement both and compare them.

# proportion of each income_cat category in a given dataset
def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

# build a DataFrame comparing the category proportions of the stratified and random test sets against the full dataset
# (random_test_set is assumed to come from a plain train_test_split performed after income_cat was added)
compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(random_test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

Discover and visualize the data to gain insights

# work on a copy
housing = strat_train_set.copy()

Visualizing Geographical Data

# a quick first look; alpha (partial transparency) makes density easier to see
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
# then add other features: s is the marker size (scale population down, otherwise the circles are huge);
# c is the marker color, cmap=plt.get_cmap("jet") is the colormap, colorbar toggles the color bar, sharex controls x-axis sharing
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)
plt.legend()  # legend
save_fig("housing_prices_scatterplot")  # save_fig is a plot-saving helper from the book's notebooks
# the plot shows that location and population are strongly related to the target (price). A clustering algorithm could
# find population centers and a feature measuring distance to them could be added; distance to the ocean may also help (though less so in the north).

Looking for Correlations

# look at how each feature correlates with the target
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)
# corr() only captures linear relationships, so scatter plots are worth checking; with many features, focus on a few
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")
# then zoom in on the most promising pair
housing.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")

Experimenting with Attribute Combinations

After creating new attributes, check the correlations again; this is an iterative process (see the re-check after the feature lines below).

# build new features from domain knowledge and the relationships between existing ones
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

# this step can be wrapped in a small custom transformer class
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X, y=None):
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

Prepare the Data for Machine Learning Algorithms

# separate the predictors from the labels in the training set
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()
# three options for missing values:
# Get rid of the corresponding districts.
housing.dropna(subset=["total_bedrooms"]) # drops the rows with NaN in that column
# Get rid of the whole attribute.
housing.drop("total_bedrooms", axis=1)
# Set the missing values to some value (zero, the mean, the median, ...)
median = housing["total_bedrooms"].median()
housing["total_bedrooms"].fillna(median)
# the third option is better done with Scikit-Learn
# apply it to all numerical attributes, so the system is ready if other attributes have missing values later
# the median only exists for numerical attributes, so drop the text attribute first
housing_num = housing.drop('ocean_proximity', axis=1)

from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")
imputer.fit(housing_num)
imputer.statistics_  # the median of each attribute; imputer.strategy shows the strategy
X = imputer.transform(housing_num)  # X is a NumPy array with the same shape as housing_num; convert it back to a DataFrame below
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=list(housing.index.values))

Handling Text and Categorical Attributes

# once Scikit-Learn 0.20 is released, these encoders can be imported from sklearn.preprocessing
# convert text categories to integers
from future_encoders import OrdinalEncoder
housing_cat = housing[['ocean_proximity']]  # the categorical attribute
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
ordinal_encoder.categories_  # list the categories
# after converting to integers, convert to one-hot vectors
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))  # reshape 1D to 2D
housing_cat_1hot  # the output is a SciPy sparse matrix by default
housing_cat_1hot.toarray()  # convert back to a dense NumPy array, or set OneHotEncoder(sparse=False) up front

# for converting label text, LabelBinarizer (meant for labels) does both conversions in one shot; CategoricalEncoder is coming in a future release
from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
housing_cat_1hot  # the output is a dense NumPy array by default

# future version (Scikit-Learn 0.20 dev)
from sklearn.preprocessing import CategoricalEncoder
cat_encoder = CategoricalEncoder(encoding="onehot-dense")  # dense NumPy output with this encoding
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot


Feature Scaling

It is important to fit the scalers to the training data only, not to the full dataset (including the test set). Only then can you use them to transform the training set and the test set (and new data).

sklearn: MinMaxScaler and StandardScaler
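
A minimal sketch of that rule, assuming housing_num is the numerical training data and housing_num_test (a hypothetical name) holds the same columns from the test set:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
housing_num_scaled = scaler.fit_transform(housing_num)        # fit + transform on the training data only
housing_num_test_scaled = scaler.transform(housing_num_test)  # reuse the fitted scaler on the test set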

Transformation Pipelines

# in a Pipeline, all but the last estimator must be transformers. If the last step is not a transformer,
# the pipeline has no fit_transform(); calling fit() runs fit_transform() on every step but the last, then fit() on the last step.
# FeatureUnion runs several pipelines in parallel and concatenates their outputs
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler

# separate numerical and categorical attributes
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
# the DataFrameSelector used above: a small transformer that selects DataFrame columns and returns a NumPy array
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

Select and Train a Model

Training and Evaluating on the Training Set

Shortlist two to five promising models before spending time tweaking the hyperparameters.

  1. Supervised learning
  2. Unsupervised learning
# get_params() shows a model's settings
# plain linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

# SGD regression (default learning schedule; after fit() the full parameter list is shown)
from sklearn.linear_model import SGDRegressor
sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)
# batch and mini-batch GD are covered in the additional materials below

# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)  # then feed X_poly into plain linear regression

# Ridge Regression with a closed-form solution using a matrix factorization technique (Cholesky)
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)

# Logistic regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42)
# log_reg.predict_proba(X_new) gives the probability of each class
# for Softmax Regression, set multi_class="multinomial" and solver="lbfgs" (one-liner below)
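# a one-liner with the settings just mentioned (C=10 is an arbitrary example value):
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=42)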

# decision trees
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
# tree_clf.predict_proba(X_new) gives class probabilities
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(random_state=42)

# random forest (decision trees + bagging)
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor(random_state=42)
# differences vs. a single tree: no splitter (it is random), no presort (it is off), no max_samples (it is 1.0)
# feature importances can be extracted (this example assumes rnd_clf is a RandomForestClassifier fitted on the iris dataset, as in the book):
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

# KNN
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

# SVM
from sklearn.svm import LinearSVR
svm_reg = LinearSVR(epsilon=1.5, random_state=42)
from sklearn.svm import SVR
svm_poly_reg = SVR(kernel="poly", degree=2, C=100, epsilon=0.1)  # for regression, larger C means less regularization

# LinearSVC regularizes the bias term, so center the data first (e.g. with StandardScaler)
# for non-linearly separable data, add PolynomialFeatures before the model below; with too many features, use the kernel trick instead
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=1, loss="hinge", random_state=42)  # does not provide per-class probabilities
# for very large datasets (out-of-core training), SGDClassifier(loss="hinge", alpha=1/(m*C)) can be used instead;
# it is equivalent to SVC(kernel="linear", C=1) but converges more slowly (sketch below).
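# a sketch of that out-of-core alternative (m = number of training instances and C = the SVC regularization strength to mimic; both assumed to be defined):
from sklearn.linear_model import SGDClassifier
sgd_svm_clf = SGDClassifier(loss="hinge", alpha=1/(m*C), max_iter=1000, random_state=42)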

# Kernel
from sklearn.svm import SVC
poly_kernel_svm_clf = SVC(kernel="poly", degree=3, coef0=1, C=5)
# coef0 controls how much the model is influenced by high-degree terms versus low-degree terms
rbf_kernel_svm_clf = SVC(kernel="rbf", gamma=5, C=0.001)
# SVC can be created with probability=True; it then uses cross-validation to calibrate probabilities and exposes predict_proba(), which is slow (example below)
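# enabling calibrated probabilities on an SVC (slower to train):
proba_svm_clf = SVC(kernel="rbf", gamma=5, C=0.001, probability=True)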

# SGDClassifier
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
# sgd_clf.decision_function([some_digit]) returns a score (one score per class in the multiclass case);
# sgd_clf.classes_ shows which class each score corresponds to
# Unsupervised learning
# k-means
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4)  # n_init defaults to 10, init defaults to "k-means++"
# algorithm="auto" (the default) uses elkan for dense data and full for sparse data
# on the training set, predict() gives the same result as labels_
kmeans.transform(X_new)  # distance to every centroid
# choosing K
kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)
                for k in range(1, 10)]
inertias = [model.inertia_ for model in kmeans_per_k]

# plt.figure(figsize=(8, 3.5))
# plt.plot(range(1, 10), inertias, "bo-")
# plt.xlabel("$k$", fontsize=14)
# plt.ylabel("Inertia", fontsize=14)
# plt.annotate('Elbow',
#              xy=(4, inertias[3]),
#              xytext=(0.55, 0.55),
#              textcoords='figure fraction',
#              fontsize=16,
#              arrowprops=dict(facecolor='black', shrink=0.1)
#             )
# plt.axis([1, 8.5, 0, 1300])
# save_fig("inertia_vs_k_diagram")
# plt.show()

from sklearn.metrics import silhouette_score
silhouette_scores = [silhouette_score(X, model.labels_)
                     for model in kmeans_per_k[1:]]
# plt.figure(figsize=(8, 3))
# plt.plot(range(2, 10), silhouette_scores, "bo-")
# plt.xlabel("$k$", fontsize=14)
# plt.ylabel("Silhouette score", fontsize=14)
# plt.axis([1.8, 8.5, 0.55, 0.7])
# save_fig("silhouette_score_vs_k_diagram")
# plt.show()

#SpectralClustering
from sklearn.cluster import SpectralClustering
model = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                           assign_labels='kmeans')

#Gaussian Mixtures
from sklearn.mixture import GaussianMixture
gm = GaussianMixture(n_components=3, n_init=10, random_state=42)
# choosing the number of components
gms_per_k = [GaussianMixture(n_components=k, n_init=10, random_state=42).fit(X)
             for k in range(1, 11)]
bics = [model.bic(X) for model in gms_per_k]
aics = [model.aic(X) for model in gms_per_k]
# or use a Variational Bayesian Gaussian Mixture directly:
# set n_components to a value that is surely large enough and the model zeroes out the unnecessary components
from sklearn.mixture import BayesianGaussianMixture
bgm = BayesianGaussianMixture(n_components=10, n_init=10)
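# after fitting, the unneeded components get weights close to zero (assuming X is the training data):
bgm.fit(X)
np.round(bgm.weights_, 2)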

Evaluation

  1. Accuracy
  2. L2 and L1 errors
  3. P/R and ROC scores
  4. Cross-validation scores
# accuracy score (for classifiers)
# L2 (RMSE) score
from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

# L1 (MAE) score
from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
lin_mae

# P/R and ROC scores
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
# note: some Scikit-Learn versions had a small bug with method="decision_function" in binary classification
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function")
# a random forest has no decision_function, so use method="predict_proba", which returns per-class probabilities
# then use the probability itself as the score; in binary classification the second column is the positive class
#y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
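# a sketch of that random-forest variant (forest_clf is an assumed RandomForestClassifier):
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method="predict_proba")
y_scores_forest = y_probas_forest[:, 1]  # score = probability of the positive class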


# P/R
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import precision_score, recall_score
# from sklearn.metrics import f1_score # combines precision and recall, dominated by the smaller one; use average="macro" or "weighted" for multiclass
# confusion_matrix(y_train_5, y_train_pred)

# plot
# from sklearn.metrics import precision_recall_curve
# precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

# def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
#     plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
#     plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
#     plt.xlabel("Threshold", fontsize=16)
#     plt.legend(loc="upper left", fontsize=16)
#     plt.ylim([0, 1])

# plt.figure(figsize=(8, 4))
# plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# plt.xlim([-700000, 700000])
# save_fig("precision_recall_vs_threshold_plot")
# plt.show()

# ROC 
# from sklearn.metrics import roc_curve
# fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

# plot
# def plot_roc_curve(fpr, tpr, label=None):
#     plt.plot(fpr, tpr, linewidth=2, label=label)
#     plt.plot([0, 1], [0, 1], 'k--')
#     plt.axis([0, 1, 0, 1])
#     plt.xlabel('False Positive Rate', fontsize=16)
#     plt.ylabel('True Positive Rate', fontsize=16)

# plt.figure(figsize=(8, 6))
# plot_roc_curve(fpr, tpr)
# save_fig("roc_curve_plot")
# plt.show()

# score
# from sklearn.metrics import roc_auc_score
# roc_auc_score(y_train_5, y_scores)
# cross-validation implemented by hand
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train_5[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train_5[test_index])

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))
# cross-validation scores
# comparing the training-set score above with the CV score shows whether the model is overfitting
from sklearn.model_selection import cross_val_score

# plain linear regression
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)  # scoring uses a utility function (greater is better), hence the minus sign
# decision tree
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)
# random forest
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

display_scores(lin_rmse_scores)
display_scores(tree_rmse_scores)
display_scores(forest_rmse_scores)

Fine-Tune Your Model

  1. Grid Search
  2. Randomized Search
#Grid Search
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(housing_prepared, housing_labels)
grid_search.best_params_
grid_search.best_estimator_
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)
pd.DataFrame(grid_search.cv_results_)
#Randomized Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_distribs = {
        'n_estimators': randint(low=1, high=200),
        'max_features': randint(low=1, high=8),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

Analyze the Best Models and Their Errors

  • Find the important features and drop the insignificant ones
  • adding extra features or, on the contrary, getting rid of uninformative ones, cleaning up outliers, etc.
# extract the most important attributes from grid_search
feature_importances = grid_search.best_estimator_.feature_importances_  # importance of each attribute
# assemble the attribute names: the original numerical attributes, plus the combined attributes, plus the one-hot encoded categories
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
cat_encoder = cat_pipeline.named_steps["cat_encoder"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

Evaluate Your System on the Test Set

# improvements tuned against the test set would be unlikely to generalize to new data, so evaluate on it only once
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)  # note: use transform() only, never fit_transform(), on the test set
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse
# compute a 95% confidence interval for the test RMSE
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
mean = squared_errors.mean()
m = len(squared_errors)

np.sqrt(stats.t.interval(confidence, m - 1,
                         loc=np.mean(squared_errors),
                         scale=stats.sem(squared_errors)))

Additional materials

  1. Mini-batch and Batch gradient descent
  2. Learning curves
  3. Early Stopping
  4. Ensemble System
  5. Dimensionality Reduction

Mini-batch and Batch gradient descent

# Mini-batch gradient descent
theta_path_mgd = []

n_iterations = 50
minibatch_size = 20

np.random.seed(42)

# toy linear dataset and design matrix with a bias column, as in the book's Gradient Descent examples
m = 100
X = 2 * np.random.rand(m, 1)
y = 4 + 3 * X + np.random.randn(m, 1)
X_b = np.c_[np.ones((m, 1)), X]  # add x0 = 1 to each instance

theta = np.random.randn(2,1)  # random initialization

t0, t1 = 200, 1000
def learning_schedule(t):
    return t0 / (t + t1)

t = 0
for epoch in range(n_iterations):
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i+minibatch_size]
        yi = y_shuffled[i:i+minibatch_size]
        gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)

# Batch GD
eta = 0.1
n_iterations = 1000
m = 100
theta = np.random.randn(2,1)

for iteration in range(n_iterations):
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients
    
# making predictions: X_new_b is the new data with the bias column prepended
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new]  # add x0 = 1
X_new_b.dot(theta)

Learning curves

from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))

    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)   
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)              
    

polynomial_regression = Pipeline([
        ("poly_features", PolynomialFeatures(degree=10, include_bias=False)),
        ("lin_reg", LinearRegression()),
    ])

plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 80, 0, 3])           
save_fig("learning_curves_plot")  
plt.show()                        

Early Stopping

from sklearn.base import clone
# assumes X_train_poly_scaled / X_val_poly_scaled (polynomial features + scaling) and y_train / y_val were prepared beforehand, as in the book
sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,
                       learning_rate="constant", eta0=0.0005, random_state=42)

minimum_val_error = float("inf")
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = clone(sgd_reg)

Ensemble System

# Voting
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')  # can be changed to 'soft', but then every estimator must provide predict_proba
voting_clf.fit(X_train, y_train)


for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
# bagging and pasting
# a BaggingClassifier automatically uses soft voting when the base estimator can produce class probabilities
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True, random_state=42)
# bootstrap=True means bagging, False means pasting; max_samples is the number of samples per predictor (a float in 0.0-1.0 means a fraction)
# n_jobs is the number of CPU cores to use (-1 = all); oob_score=True gives a free validation estimate via bag_clf.oob_score_
# max_features and bootstrap_features can also be set (random subspaces / random patches)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42)
ada_clf.fit(X_train, y_train)
# algorithm="SAMME.R"是当base estimator产生概率时才能用

# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)
gbrt.fit(X, y)
# staged_predict
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)
gbrt_best.fit(X_train, y_train)
# early stopping (with warm_start=True, fit() keeps the existing trees and adds new ones)
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping

Dimensionality Reduction

# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)  # n_components can also be a float in (0.0, 1.0) to keep that fraction of the variance; svd_solver="randomized" is an option
X2D = pca.fit_transform(X)  # pca.inverse_transform is also available
pca.components_.T[:,0]  # the first principal component
pca.explained_variance_ratio_  # how much variance each PC explains
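# preserving a given fraction of the variance instead of fixing the number of components, e.g. 95%:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)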

from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

from sklearn.decomposition import KernelPCA
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)
# combine with a Pipeline to grid-search the best kernel and gamma
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression())
    ])

param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

Reference:
Hands-On Machine Learning with Scikit-Learn and TensorFlow
