K近邻(KNN)、支持向量机(SVM)、随机森林(RF)、逻辑回归(LR)演示
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from pandas.plotting import scatter_matrix
from sklearn.datasets import load_iris
# Step 1: obtain the dataset (the Iris data bundled with scikit-learn).
iris = load_iris()
# Step 2: preprocessing -- wrap the feature matrix in a DataFrame, attach the
# target labels as the 'Species' column, then drop rows with missing values.
iris_d = pd.DataFrame(
    iris.data,
    columns=['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'],
)
iris_d = iris_d.assign(Species=iris.target)
iris_d = iris_d.dropna()
iris_d
| | Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | 2 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | 2 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | 2 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | 2 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | 2 |
150 rows × 5 columns
# Step 3: data exploration and visualization
iris_d.groupby('Species').size()  # number of rows for each Species value
Species
0 50
1 50
2 50
dtype: int64
iris_d.describe()  # descriptive statistics (count, mean, std, min, quartiles, max) per column
| | Sepal_Length | Sepal_Width | Petal_Length | Petal_Width | Species |
|---|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 5.843333 | 3.057333 | 3.758000 | 1.199333 | 1.000000 |
| std | 0.828066 | 0.435866 | 1.765298 | 0.762238 | 0.819232 |
| min | 4.300000 | 2.000000 | 1.000000 | 0.100000 | 0.000000 |
| 25% | 5.100000 | 2.800000 | 1.600000 | 0.300000 | 0.000000 |
| 50% | 5.800000 | 3.000000 | 4.350000 | 1.300000 | 1.000000 |
| 75% | 6.400000 | 3.300000 | 5.100000 | 1.800000 | 2.000000 |
| max | 7.900000 | 4.400000 | 6.900000 | 2.500000 | 2.000000 |
# Box plot of every column, showing how widely the values are spread.
iris_d.plot.box()
plt.show()
![箱线图(output_5_0.png)](https://img-blog.csdnimg.cn/b6a2e84e8153493482cde0f02e1df4f6.png)
iris_d.hist()  # histograms of the dataset, used to show how the values are distributed
plt.show()
![直方图(output_6_0.png)](https://img-blog.csdnimg.cn/295e28a0ad2d46718e811f94c653f5d9.png)
# Pairwise scatter plots of all columns, to inspect how the variables relate
# to one another (linear or non-linear relationships).
scatter_matrix(frame=iris_d)
plt.show()
![散点矩阵图(output_7_0.png)](https://img-blog.csdnimg.cn/b213aea848e6463a9b88191a2c8a5249.png)
# Step 4: feature engineering
array = iris_d.to_numpy()
# Columns 0-3 are the four measurements (features); column 4 is Species (label).
X, Y = array[:, :4], array[:, 4]
# Train on 80% of the samples and hold out the remaining 20% for testing.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=7,
)
# Step 5: machine-learning models and evaluation
# K-nearest neighbours (KNN)
model = KNeighborsClassifier().fit(x_train, y_train)  # fit() returns the fitted estimator
predictions = model.predict(x_test)
# Model evaluation: accuracy of the predictions on the held-out test set
# (a single train/test split, not k-fold cross-validation).
print(accuracy_score(y_true=y_test, y_pred=predictions))
0.9
# Machine-learning model
# Support vector machine (SVM)
model = SVC().fit(X=x_train, y=y_train)  # fit() returns the fitted estimator
predictions = model.predict(X=x_test)
# Model evaluation: accuracy of the predictions on the held-out test set
# (a single train/test split, not k-fold cross-validation).
print(accuracy_score(y_pred=predictions, y_true=y_test))
0.8666666666666667
# Machine-learning model
# Random forest (RF)
# A random forest draws bootstrap samples and feature subsets at random, so an
# unseeded model can print a different accuracy on every run. Seed it with the
# same value used for the train/test split to make the result reproducible.
model = RandomForestClassifier(random_state=7)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Model evaluation: accuracy of the predictions on the held-out test set
# (a single train/test split, not k-fold cross-validation).
print(accuracy_score(y_test, predictions))
0.8666666666666667
# Machine-learning model
# Logistic regression (LR)
# With the default iteration cap the lbfgs solver stops before converging on
# these unscaled features and emits a ConvergenceWarning, so the fitted
# coefficients are not the optimum. Raise max_iter so the solver can finish.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# Model evaluation: accuracy of the predictions on the held-out test set
# (a single train/test split, not k-fold cross-validation).
print(accuracy_score(y_test, predictions))
0.8666666666666667
d:\program files\python3.7\lib\site-packages\sklearn\linear_model\_logistic.py:765: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)

浙公网安备 33010602011771号