## 基于 Python 和 Scikit-Learn 的机器学习介绍

R和Python是提供给数据科学家的最常用的两种工具。每一个工具都有其优缺点，但Python最近在各个方面都有所胜出（仅为鄙人愚见，虽然我两者都用）。这一切的发生是因为Scikit-Learn库的腾空出世，它包含有完善的文档和丰富的机器学习算法。

import numpy as np

import urllib

# url with dataset

url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

raw_data = urllib.urlopen(url)

# load the CSV file as a numpy matrix

dataset = np.loadtxt(raw_data, delimiter=",")

# separate the data from the target attributes

X = dataset[:,0:7]

y = dataset[:,8]

from sklearn import metrics

from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()

model.fit(X, y)

# display the relative importance of each attribute

print(model.feature_importances_)

from sklearn import metrics

from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()

model.fit(X, y)

# display the relative importance of each attribute

print(model.feature_importances_)

from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

# create the RFE model and select 3 attributes

rfe = RFE(model, 3)

rfe = rfe.fit(X, y)

# summarize the selection of the attributes

print(rfe.support_)

print(rfe.ranking_)

from sklearn import metrics

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X, y)

print(model)

# make predictions

expected = y

predicted = model.predict(X)

# summarize the fit of the model

print(metrics.classification_report(expected, predicted))

print(metrics.confusion_matrix(expected, predicted))

from sklearn import metrics

from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X, y)

print(model)

# make predictions

expected = y

predicted = model.predict(X)

# summarize the fit of the model

print(metrics.classification_report(expected, predicted))

print(metrics.confusion_matrix(expected, predicted))

k-最近邻

kNN（k-最近邻）方法通常用于一个更复杂分类算法的一部分。例如，我们可以用它的估计值做为一个对象的特征。有时候，一个简单的kNN算法在良好选择的特征上会有很出色的表现。当参数（主要是metrics）被设置得当，这个算法在回归问题中通常表现出最好的质量。

from sklearn import metrics

from sklearn.neighbors import KNeighborsClassifier

# fit a k-nearest neighbor model to the data

model = KNeighborsClassifier()

model.fit(X, y)

print(model)

# make predictions

expected = y

predicted = model.predict(X)

# summarize the fit of the model

print(metrics.classification_report(expected, predicted))

print(metrics.confusion_matrix(expected, predicted))

from sklearn import metrics

from sklearn.tree import DecisionTreeClassifier

# fit a CART model to the data

model = DecisionTreeClassifier()

model.fit(X, y)

print(model)

# make predictions

expected = y

predicted = model.predict(X)

# summarize the fit of the model

print(metrics.classification_report(expected, predicted))

print(metrics.confusion_matrix(expected, predicted))

SVM（支持向量机）是最流行的机器学习算法之一，它主要用于分类问题。同样也用于逻辑回归，SVM在一对多方法的帮助下可以实现多类分类。

from sklearn import metrics

from sklearn.svm import SVC

# fit a SVM model to the data

model = SVC()

model.fit(X, y)

print(model)

# make predictions

expected = y

predicted = model.predict(X)

# summarize the fit of the model

print(metrics.classification_report(expected, predicted))

print(metrics.confusion_matrix(expected, predicted))

import numpy as np

from sklearn.linear_model import Ridge

from sklearn.grid_search import GridSearchCV

# prepare a range of alpha values to test

alphas = np.array([1,0.1,0.01,0.001,0.0001,0])

# create and fit a ridge regression model, testing each alpha

model = Ridge()

grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

grid.fit(X, y)

print(grid)

# summarize the results of the grid search

print(grid.best_score_)

print(grid.best_estimator_.alpha)

import numpy as np

from scipy.stats import uniform as sp_rand

from sklearn.linear_model import Ridge

from sklearn.grid_search import RandomizedSearchCV

# prepare a uniform distribution to sample for the alpha parameter

param_grid = {'alpha': sp_rand()}

# create and fit a ridge regression model, testing random alpha values

model = Ridge()

rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)

rsearch.fit(X, y)

print(rsearch)

# summarize the results of the random parameter search

print(rsearch.best_score_)

print(rsearch.best_estimator_.alpha)

posted on 2016-09-27 22:15  alex.shu  阅读(3139)  评论(0编辑  收藏  举报