Financial Data Analysis (3)

This chapter walks through a few machine learning demo experiments.

Experimental Data

Home Loan Dataset

Goal: automate the loan eligibility process by identifying customer segments that qualify for a loan, based on the details customers provide in the online application form.

Scenario: Dream Housing Finance, a company that deals in all kinds of home loans and operates across urban, semi-urban and rural areas.

Customer information: gender, marital status, education, number of dependents, income, loan amount, credit history, and so on.

Dataset Variable Descriptions

Loan_ID: unique loan ID

Gender: gender (Male/Female)

Married: marital status (Yes/No)

Dependents: number of dependents

Education: education level (Undergraduate/Graduate)

Self_Employed: self-employed (Yes/No)

ApplicantIncome: applicant's income

CoapplicantIncome: co-applicant's income

LoanAmount: loan amount (in thousands)

Loan_Amount_Term: loan term (in months)

Credit_History: whether the credit history meets the guidelines

Property_Area: property area (Urban/Semiurban/Rural)

Loan_Status: loan approval status (Y/N)
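
Before modeling, it is worth confirming the column types, the amount of missing data, and the class balance of the target. The sketch below is a minimal check, assuming the same train.csv that the scripts in this chapter read from the working directory.

import pandas as pd

# Quick look at the variables described above.
train = pd.read_csv('train.csv')

print(train.shape)                           # rows and columns
print(train.dtypes)                          # object vs. numeric columns
print(train.isnull().sum())                  # missing values per column
print(train['Loan_Status'].value_counts())   # class balance of the target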

Machine Learning Algorithms

Decision Tree Algorithm

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The last column of train.csv is the target (Loan_Status); everything else is a feature.
target_col = train.columns[-1]
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
X_test = test

# Label-encode every categorical column, fitting on train and test together
# so that both sets share the same encoding.
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        le = LabelEncoder()
        all_values = pd.concat([X_train[col], X_test[col]], axis=0)
        le.fit(all_values.astype(str))
        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))

# Fill remaining numeric NaNs (e.g. LoanAmount, Credit_History) with the training
# median; only newer scikit-learn versions of DecisionTreeClassifier handle NaNs natively.
for col in X_train.columns:
    if X_train[col].isnull().any() or X_test[col].isnull().any():
        median = X_train[col].median()
        X_train[col] = X_train[col].fillna(median)
        X_test[col] = X_test[col].fillna(median)

# Encode the target if it is categorical (Y/N).
if y_train.dtype == 'object':
    y_le = LabelEncoder()
    y_train = y_le.fit_transform(y_train.astype(str))
else:
    y_le = None
# Split into training and validation subsets; test_size=0.2 keeps 20% for validation
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

# Train the model
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_tr, y_tr)

# Evaluate on the training subset
y_tr_pred = clf.predict(X_tr)
print("Training subset evaluation:")
print("Accuracy:", accuracy_score(y_tr, y_tr_pred))

# Evaluate on the validation subset
y_val_pred = clf.predict(X_val)
print("Validation subset evaluation:")
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification report:\n", classification_report(y_val, y_val_pred, target_names=y_le.classes_ if y_le else None))

# Predict on the test set
y_pred = clf.predict(X_test)
if y_le:
    y_pred = y_le.inverse_transform(y_pred)

# print("\nTest set predictions:")
# for i, pred in enumerate(y_pred):
#     print(f"Sample {i+1}: {pred}")
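
As a quick follow-up, the fitted tree can be dumped as text to see which splits it learned. This is a minimal sketch that reuses the clf and X_tr objects from the script above; export_text comes from sklearn.tree.

from sklearn.tree import export_text

# Print the top levels of the learned tree, labelling splits with the
# training feature names (continuation of the decision tree script above).
rules = export_text(clf, feature_names=list(X_tr.columns), max_depth=3)
print(rules)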

XGBoost

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Load the data
print("Loading data...")
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 2. Feature engineering
def feature_engineering(df):
    df = df.copy()
    
    # Total household income
    df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    
    # Ratio of total income to loan amount
    df['Income_Loan_Ratio'] = df['TotalIncome'] / df['LoanAmount']
    
    # Monthly income
    df['Monthly_Income'] = df['TotalIncome'] / 12
    
    # Monthly installment (EMI): loan amount is in thousands, term is in months
    df['EMI_Ratio'] = (df['LoanAmount'] * 1000) / df['Loan_Amount_Term']
    
    # EMI to monthly income ratio
    df['EMI_Income_Ratio'] = df['EMI_Ratio'] / df['Monthly_Income']
    
    return df

# 3. Data preprocessing
def preprocess_data(train_df, test_df):
    # Encode the target variable
    train_df['Loan_Status'] = train_df['Loan_Status'].map({'Y': 1, 'N': 0})
    
    # Concatenate train and test so they are preprocessed consistently;
    # test rows are flagged with a placeholder Loan_Status of -1
    test_df['Loan_Status'] = -1
    data = pd.concat([train_df, test_df])
    
    # Feature engineering
    data = feature_engineering(data)
    
    # Categorical variables
    categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 
                          'Self_Employed', 'Property_Area']
    
    # Encode them with LabelEncoder, treating missing values as a 'Missing' category
    for col in categorical_columns:
        le = LabelEncoder()
        data[col] = data[col].fillna('Missing')
        data[col] = le.fit_transform(data[col].astype(str))
    
    # Numerical variables
    numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 
                        'Loan_Amount_Term', 'Credit_History', 'TotalIncome',
                        'Income_Loan_Ratio', 'Monthly_Income', 'EMI_Ratio',
                        'EMI_Income_Ratio']
    
    # Fill missing numeric values with the median
    for col in numerical_columns:
        data[col] = data[col].fillna(data[col].median())
    
    # Standardize the numeric features
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])
    
    # Split back into train and test
    train_data = data[data['Loan_Status'] != -1]
    test_data = data[data['Loan_Status'] == -1]
    
    return train_data, test_data, categorical_columns + numerical_columns

# 4. Model training and evaluation
def train_and_evaluate_model(X, y, X_test):
    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Parameter grid to search over
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9]
    }
    
    # Base model
    base_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
    
    # Grid search: try every combination in param_grid, score each one with
    # 5-fold cross-validation using ROC-AUC, and keep the best combination
    print("\nPerforming GridSearchCV...")
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    
    # Run the search
    grid_search.fit(X_train, y_train)
    
    # Best model found by the search
    best_model = grid_search.best_estimator_
    print(f"\nBest parameters: {grid_search.best_params_}")
    
    # Cross-validate the best model on the full training data
    print("\nPerforming cross-validation...")
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='roc_auc')
    print(f"Cross-validation ROC-AUC scores: {cv_scores}")
    print(f"Mean CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    
    # Evaluate on the held-out validation set
    val_pred = best_model.predict(X_val)
    val_pred_proba = best_model.predict_proba(X_val)[:, 1]
    
    print("\nValidation Set Metrics:")
    print(f"Accuracy: {accuracy_score(y_val, val_pred):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_val, val_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, val_pred))
    
    # Feature importance plot
    plt.figure(figsize=(10, 6))
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    
    return best_model, grid_search.best_params_

# Main program
print("Starting preprocessing...")
train_processed, test_processed, features = preprocess_data(train, test)

X = train_processed[features]
y = train_processed['Loan_Status']
X_test = test_processed[features]

# Train and evaluate the model
best_model, best_params = train_and_evaluate_model(X, y, X_test)

# Predict on the test set
print("\nMaking predictions on test set...")
test_preds = best_model.predict(X_test)
test_preds_proba = best_model.predict_proba(X_test)[:, 1]

# Save the predictions
test_predictions = pd.DataFrame({
    'Loan_ID': test['Loan_ID'],
    'Loan_Status': ['Y' if pred == 1 else 'N' for pred in test_preds],
    'Prediction_Probability': test_preds_proba
})

test_predictions.to_csv('loan_predictions_improved.csv', index=False)
print("\nPredictions saved to 'loan_predictions_improved.csv'")

# Save the model configuration
with open('model_info.txt', 'w') as f:
    f.write("Best Model Parameters:\n")
    f.write(str(best_params))
    f.write("\n\nFeature List:\n")
    f.write(str(features))
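
For reference, the size of the hyperparameter search is easy to verify by hand: the grid above has 3 × 2 × 2 × 2 × 2 × 2 × 2 = 192 parameter combinations, and with 5-fold cross-validation GridSearchCV performs 192 × 5 = 960 fits, which matches the "960 fits" line in the log below. A minimal standalone sketch of that arithmetic:

from math import prod

# Options per hyperparameter in the grid used above:
# max_depth (3), learning_rate (2), n_estimators (2), min_child_weight (2),
# gamma (2), subsample (2), colsample_bytree (2).
grid_sizes = [3, 2, 2, 2, 2, 2, 2]
n_candidates = prod(grid_sizes)   # 192 candidate combinations
n_fits = n_candidates * 5         # 5-fold CV -> 960 fits
print(n_candidates, n_fits)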

Run log

(base) root@ubuntu22:~# python loan_xgboost_improved.py
Loading data...
Starting preprocessing...

Performing GridSearchCV...
Fitting 5 folds for each of 192 candidates, totalling 960 fits

Best parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.8}

Performing cross-validation...
Cross-validation ROC-AUC scores: [0.73219814 0.76037152 0.78998779 0.83058608 0.76566416]
Mean CV ROC-AUC: 0.7758 (+/- 0.0660)

Validation Set Metrics:
Accuracy: 0.7886
ROC-AUC: 0.7698

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.44      0.59        43
           1       0.76      0.97      0.86        80

    accuracy                           0.79       123
   macro avg       0.83      0.71      0.73       123
weighted avg       0.81      0.79      0.77       123

Making predictions on test set...

Predictions saved to 'loan_predictions_improved.csv'
Best Model Parameters:

Feature List:
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'TotalIncome', 'Income_Loan_Ratio', 'Monthly_Income', 'EMI_Ratio', 'EMI_Income_Ratio']
