机器学习入门-信用卡欺诈数据案例

在前几篇博客中，我们把各个部分拆开分别做了讲解，现在把它们整合成一份完整的代码：
1. 统计两种标签的个数，画柱状图
2. 变量与标签的拆分，训练集与测试集数据的拆分(train_test_split)，对训练数据进行下采样
3. 使用交叉验证进行超参数正则化因子的选择 KFold
4. 混淆矩阵的绘制，以及准确度、召回率、F1 score 的说明
5. 逻辑回归中不同概率阈值对召回率和准确度的影响
6.对数据进行过采样
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import  StandardScaler
from sklearn.cross_validation import train_test_split

# Load the transactions. The code below relies on the 'Time', 'Amount'
# and 'Class' columns being present in the CSV.
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' into a new 'Normal_Amount' column. ravel() flattens
# the (n, 1) array returned by fit_transform so a plain 1-D column is stored.
data['Normal_Amount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)).ravel()
print(data.head())

# Drop 'Time' and the raw 'Amount'. Dropping 'Normal_Amount' here (as the
# code previously did) discarded the standardized column that had just been
# computed and left the unscaled amount in the feature set.
data = data.drop(['Time', 'Amount'], axis=1)

# Features are every column except 'Class'; the label is kept as a
# one-column DataFrame.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# 1. Count the samples of each label and draw a bar chart
count_class = data['Class'].value_counts(sort=True).sort_index()
count_class.plot(kind='bar')
plt.show()

# 2. Undersample the data

# Index labels and count of the Class == 1 rows (the fraud class, per the
# title of this script).
negtive_index = data[data.Class == 1].index
negtive_len = len(negtive_index)

# Index labels and count of the normal (Class == 0) rows.
normal_index = data[data.Class == 0].index
normal_len = len(normal_index)

# Randomly pick as many normal rows as there are Class == 1 rows.
# replace=False so the same normal row cannot be drawn more than once.
under_normal_index = np.random.choice(normal_index, negtive_len, replace=False)

# Merge the index labels of both groups.
under_index = np.concatenate([negtive_index, under_normal_index])

# under_index holds index *labels*, so select with .loc rather than .iloc.
under_data = data.loc[under_index, :]
under_x = under_data.loc[:, under_data.columns != 'Class']
under_y = under_data.loc[:, under_data.columns == 'Class']

# Split the full data set into training and test parts.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)

# Split the undersampled data set the same way.
under_train_x, under_test_x, under_train_y, under_test_y = train_test_split(
    under_x, under_y, test_size=0.3, random_state=0)
# Later sections of this script still read the misspelled name, so keep it
# as an alias of the correctly spelled one.
under_text_x = under_test_x
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

# 3. Use cross-validation to choose the regularization parameter
def printing_KFold_score(train_x, train_y):
    """
    Pick the LogisticRegression ``C`` value with the best mean recall.

    For every candidate ``C`` an L1-penalized logistic regression is fitted
    on each of 5 unshuffled folds and scored with ``recall_score`` on the
    held-out part. Per-fold scores, the summary table and the chosen value
    are printed.

    :param train_x: training features; must support positional ``.iloc``
        row selection (e.g. a pandas DataFrame)
    :param train_y: training labels aligned with ``train_x``; a pandas
        Series or one-column DataFrame
    :return: the ``C`` value whose mean recall over the folds is highest
        (the first one in case of a tie)
    """
    # Legacy sklearn.cross_validation.KFold as imported by this file: it is
    # built from the sample count and fold count, and iterating it yields
    # (train positions, test positions) pairs.
    fold = KFold(len(train_x), 5, shuffle=False)
    # Candidate regularization strengths
    c_parameter = [0.01, 0.1, 1, 10, 100]

    mean_scores = []
    for c in c_parameter:
        scores = []
        for fold_no, (train_idx, test_idx) in enumerate(fold, start=1):
            # liblinear supports the L1 penalty; naming it explicitly avoids
            # depending on the library's default solver.
            lr = LogisticRegression(C=c, penalty='l1', solver='liblinear')
            # np.ravel flattens a one-column label frame to the 1-D shape
            # scikit-learn expects.
            lr.fit(train_x.iloc[train_idx, :], np.ravel(train_y.iloc[train_idx]))
            pred_y = lr.predict(train_x.iloc[test_idx, :])
            score = recall_score(np.ravel(train_y.iloc[test_idx]), pred_y)
            print('{} {}'.format(fold_no, score))
            scores.append(score)
        # One mean per C. Writing the scalar into the whole DataFrame column
        # inside this loop would overwrite the results of every earlier C.
        mean_scores.append(np.mean(scores))

    # The column keeps its historical name 'F_score_mean' although the
    # stored value is the mean recall.
    train_score = pd.DataFrame(
        {'c_parameter': c_parameter, 'F_score_mean': mean_scores},
        columns=['c_parameter', 'F_score_mean'])
    print(train_score)

    # idxmax() returns the index label of the highest mean score.
    best_parameter = train_score.loc[train_score['F_score_mean'].idxmax(), 'c_parameter']
    print('the best_parameter is {}'.format(best_parameter))

    return best_parameter


# Choose the regularization strength on the undersampled training split.
best_c = printing_KFold_score(train_x=under_train_x, train_y=under_train_y)

import itertools
# 4. Plot the confusion matrix
def plot_matrix(conf, classes,
                title='confusion matrix', cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current axes.

    The function only draws; it does not call ``plt.show()``.

    :param conf: 2-D array (indexed as ``conf[i, j]``) holding the counts
    :param classes: sequence of class labels, used as the tick labels of
        both axes
    :param title: title of the plot
    :param cmap: matplotlib colormap used for the cells
    :return: None
    """
    # Show the matrix as an image (heat map)
    plt.imshow(conf, cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # Ticks sit at cell positions 0..n-1; the labels themselves are only the
    # text shown there, so they need not be the integers 0..n-1.
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=0)
    plt.yticks(tick_positions, classes)
    # Cells above half of the largest count get white text, the rest black,
    # so the number stays readable against the cell color.
    half_max = conf.max() / 2
    # Write each count into its cell; text x is the column, y is the row.
    for i, j in itertools.product(range(conf.shape[0]), range(conf.shape[1])):
        plt.text(j, i, conf[i, j], horizontalalignment='center',
                 color='white' if conf[i, j] > half_max else 'black')
    # Tighten the layout
    plt.tight_layout()

from sklearn.metrics import confusion_matrix
# Build the logistic regression model with the selected C. liblinear is
# named explicitly because it supports the L1 penalty.
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
# Train on the undersampled training split (labels flattened to 1-D)
lr.fit(under_train_x, np.ravel(under_train_y))
# Predict on the undersampled test split
pred_y = lr.predict(under_text_x)
# Confusion matrix: rows are true labels, columns are predicted labels
conf = confusion_matrix(under_test_y, pred_y)
# Draw it
plot_matrix(conf, classes=[0, 1])
# Accuracy: correct predictions over all predictions
accurracy = (conf[0, 0] + conf[1, 1]) / (conf[0, 0] + conf[0, 1] + conf[1, 0] + conf[1, 1])
# Precision: true positives over everything predicted as class 1
precision = conf[1, 1] / (conf[0, 1] + conf[1, 1])
# Recall: true positives over everything that really is class 1
recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
# F1 score: harmonic mean of precision and recall
F1_score = 2 * precision * recall / (precision + recall)
print('accuracy {} precision {} recall {} F1 {}'.format(accurracy, precision, recall, F1_score))
plt.show()
#
#
# # 使用当前的测试数据进行测试
#
# pred_y = lr.predict(test_x)
# # 获得混合矩阵
# conf = confusion_matrix(test_y, pred_y)
# # 画图
# plot_matrix(conf, classes=[0, 1])
# plt.show()
#
# 5. Compare how different probability thresholds of the logistic regression
#    affect accuracy and recall
lr = LogisticRegression(C=best_c, penalty='l1', solver='liblinear')
lr.fit(under_train_x, np.ravel(under_train_y))

# Class probabilities for the undersampled test split, one column per class
pred_array = np.array(lr.predict_proba(under_text_x))

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# One figure with a 3x3 grid, one confusion matrix per threshold
plt.figure(figsize=(10, 10))
for j, threshold in enumerate(thresholds, start=1):
    # Predict 1 when the probability in the second column exceeds the
    # threshold, otherwise 0
    pred_y_new = (pred_array[:, 1] > threshold).astype(int)
    plt.subplot(3, 3, j)
    conf = confusion_matrix(under_test_y, pred_y_new)
    # Draw the matrix in the current subplot
    plot_matrix(conf, classes=[0, 1], title='threshod is {}'.format(threshold))
    accurracy = (conf[0, 0] + conf[1, 1]) / (conf[0, 0] + conf[0, 1] + conf[1, 0] + conf[1, 1])
    # Recall
    recall = conf[1, 1] / (conf[1, 0] + conf[1, 1])
    # Report the metrics; they were previously computed and then discarded
    print('threshold {}: accuracy {} recall {}'.format(threshold, accurracy, recall))
plt.show()
#
#
# 6. 进行数据过采样操作
from imblearn.over_sampling import SMOTE
from sklearn.cross_validation import train_test_split

# Separate the label column from the feature columns again.
is_label = data.columns == 'Class'
X = data.loc[:, ~is_label]
y = data.loc[:, is_label]

# Hold out 30% of the full data set; only the training part is oversampled.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, random_state=0, test_size=0.3)

# Oversample the training split with SMOTE.
overstamp = SMOTE(random_state=0)
SMOTE_train_x, SMOTE_train_y = overstamp.fit_sample(train_x, train_y)

# Count how many samples carry each label after oversampling.
smote_label_counts = pd.value_counts(SMOTE_train_y, sort=True)
print(smote_label_counts.sort_index())
posted on 2019-01-15 13:14 python我的最爱阅读(692) 评论(0) 收藏举报