金融数据分析(三)
本章主要进行一些关于机器学习的demo实验
实验数据
房屋贷款数据集
目标:自动化贷款资格验证过程,基于客户在线申请表中提供的详细信息,识别符合贷款条件的客户群体。
应用场景:Dream Housing Finance公司,涉及所有住房贷款,覆盖城市、半城市和农村地区。
客户信息:性别、婚姻状况、教育程度、依赖人数、收入、贷款金额、信用历史等。
数据集变量描述
Loan_ID:唯一贷款ID
Gender:性别(男/女)
Married:婚姻状况(是/否)
Dependents:受抚养人(家属)人数
Education:教育程度(已毕业/未毕业)
Self_Employed:是否自雇(是/否)
ApplicantIncome:申请人收入
CoapplicantIncome:共同申请人收入
LoanAmount:贷款金额(以千为单位)
Loan_Amount_Term:贷款期限(月)
Credit_History:信用历史是否符合指南
Property_Area:物业区域(城市/半城市/农村)
Loan_Status:贷款批准状态(是/否)
机器学习算法
决策树算法
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
# Decision-tree baseline: load data, clean/encode features, train, evaluate, predict.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The last column of the training file is the target (Loan_Status per the data dictionary).
target_col = str(train.columns[-1])
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
# Copy so the encoding/imputation below does not mutate the loaded `test` frame.
X_test = test.copy()

# Loan_ID is a unique row identifier, not a predictive feature — drop it when present.
if 'Loan_ID' in X_train.columns:
    X_train = X_train.drop(columns=['Loan_ID'])
if 'Loan_ID' in X_test.columns:
    X_test = X_test.drop(columns=['Loan_ID'])

for col in X_train.columns:
    if X_train[col].dtype == 'object':
        # Fit the encoder on train+test combined so categories present only
        # in the test set do not raise at transform time. NaN becomes the
        # literal string 'nan' and is encoded as its own category.
        le = LabelEncoder()
        all_values = pd.concat([X_train[col], X_test[col]], axis=0)
        le.fit(all_values.astype(str))
        X_train[col] = le.transform(X_train[col].astype(str))
        X_test[col] = le.transform(X_test[col].astype(str))
    else:
        # DecisionTreeClassifier cannot handle NaN — impute numeric gaps with
        # the training median (the original code crashed on missing values).
        median = X_train[col].median()
        X_train[col] = X_train[col].fillna(median)
        X_test[col] = X_test[col].fillna(median)

# Encode a categorical target (e.g. 'Y'/'N') to integers; keep the encoder
# so predictions can be mapped back to the original labels.
if y_train.dtype == 'object':
    y_le = LabelEncoder()
    y_train = y_le.fit_transform(y_train.astype(str))
else:
    y_le = None

# Hold out 20% of the training data for validation, stratified on the target.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_tr, y_tr)

# Training-subset evaluation (optimistic: an unpruned tree can fit to ~100%).
y_tr_pred = clf.predict(X_tr)
print("训练子集评估:")
print("准确率:", accuracy_score(y_tr, y_tr_pred))

# Held-out validation evaluation.
y_val_pred = clf.predict(X_val)
print("验证子集评估:")
print("准确率:", accuracy_score(y_val, y_val_pred))
print("混淆矩阵:\n", confusion_matrix(y_val, y_val_pred))
print("分类报告:\n", classification_report(
    y_val, y_val_pred, target_names=y_le.classes_ if y_le is not None else None))

# Predict on the test set and map encoded labels back to the original strings.
y_pred = clf.predict(X_test)
if y_le is not None:
    y_pred = y_le.inverse_transform(y_pred)
# print("\n测试集预测结果:")
# for i, pred in enumerate(y_pred):
#     print(f"样本{i+1}: {pred}")
XGBoost算法
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Load the raw train/test CSV files from the working directory.
print("Loading data...")
train, test = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))
# 2. 特征工程
# 2. Feature engineering
def feature_engineering(df):
    """Derive income/repayment ratio features on a copy of *df*.

    The caller's DataFrame is left untouched. Divisions by zero produce
    +/-inf, which would survive the later median imputation and break
    StandardScaler — such values are converted to NaN here instead.
    """
    df = df.copy()
    # Combined household income.
    df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome']
    # How many times the loan amount the household earns.
    df['Income_Loan_Ratio'] = df['TotalIncome'] / df['LoanAmount']
    # NOTE(review): assumes TotalIncome is annual (divided by 12) — confirm
    # the dataset's income units before relying on this feature's scale.
    df['Monthly_Income'] = df['TotalIncome'] / 12
    # Rough monthly installment: LoanAmount is stated in thousands (see the
    # variable description), Loan_Amount_Term in months.
    df['EMI_Ratio'] = (df['LoanAmount'] * 1000) / df['Loan_Amount_Term']
    # Installment as a fraction of monthly income.
    df['EMI_Income_Ratio'] = df['EMI_Ratio'] / df['Monthly_Income']
    # Robustness fix: zero denominators yield inf; map to NaN so the
    # downstream median fillna can impute them.
    ratio_cols = ['Income_Loan_Ratio', 'EMI_Ratio', 'EMI_Income_Ratio']
    df[ratio_cols] = df[ratio_cols].replace([np.inf, -np.inf], np.nan)
    return df
# 3. 数据预处理
# 3. Data preprocessing
def preprocess_data(train_df, test_df):
    """Encode, impute and scale the train and test sets together.

    Returns (train_data, test_data, feature_names). Both input frames are
    copied first, so the caller's DataFrames are not mutated (the original
    version wrote Loan_Status into both arguments in place, which also made
    a second call map 'Y'/'N' to NaN).
    """
    train_df = train_df.copy()
    test_df = test_df.copy()
    # Map the target to 0/1; test rows get a -1 sentinel so the two frames
    # can be concatenated and split apart again afterwards.
    train_df['Loan_Status'] = train_df['Loan_Status'].map({'Y': 1, 'N': 0})
    test_df['Loan_Status'] = -1
    data = pd.concat([train_df, test_df])
    # Derived ratio features.
    data = feature_engineering(data)
    # Categorical columns: fill NaN with an explicit 'Missing' level, then
    # integer-encode. NOTE(review): fitting encoders/scaler on train+test
    # leaks test-set distribution into training — acceptable for a demo.
    categorical_columns = ['Gender', 'Married', 'Dependents', 'Education',
                           'Self_Employed', 'Property_Area']
    for col in categorical_columns:
        le = LabelEncoder()
        data[col] = data[col].fillna('Missing')
        data[col] = le.fit_transform(data[col].astype(str))
    # Numeric columns: impute missing values with the column median.
    numerical_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                         'Loan_Amount_Term', 'Credit_History', 'TotalIncome',
                         'Income_Loan_Ratio', 'Monthly_Income', 'EMI_Ratio',
                         'EMI_Income_Ratio']
    for col in numerical_columns:
        data[col] = data[col].fillna(data[col].median())
    # Standardize numeric features to zero mean / unit variance.
    scaler = StandardScaler()
    data[numerical_columns] = scaler.fit_transform(data[numerical_columns])
    # Split back into train and test on the -1 sentinel.
    train_data = data[data['Loan_Status'] != -1]
    test_data = data[data['Loan_Status'] == -1]
    return train_data, test_data, categorical_columns + numerical_columns
# 4. 模型训练和评估
# 4. Model training and evaluation
def train_and_evaluate_model(X, y, X_test):
    """Tune an XGBoost classifier with grid search and report metrics.

    Parameters
    ----------
    X, y : full training feature matrix and 0/1 target.
    X_test : unused in this function; retained for backward compatibility
        with existing callers (test-set prediction happens in the caller).

    Returns
    -------
    (best_model, best_params) from the grid search.

    Side effects: prints CV/validation metrics and writes
    'feature_importance.png' to the working directory.
    """
    # Hold out 20% of the data as a final validation set.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42)
    # Hyper-parameter grid: 3*2*2*2*2*2*2 = 192 combinations.
    param_grid = {
        'max_depth': [3, 4, 5],
        'learning_rate': [0.01, 0.1],
        'n_estimators': [100, 200],
        'min_child_weight': [1, 3],
        'gamma': [0, 0.1],
        'subsample': [0.8, 0.9],
        'colsample_bytree': [0.8, 0.9]
    }
    base_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
    # Exhaustive search over every parameter combination with 5-fold CV,
    # selecting the combination with the best ROC-AUC.
    print("\nPerforming GridSearchCV...")
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    print(f"\nBest parameters: {grid_search.best_params_}")
    # Cross-validate the selected model on the full data.
    # NOTE(review): X includes the validation rows, so these CV scores are
    # not fully independent of the model-selection step above.
    print("\nPerforming cross-validation...")
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='roc_auc')
    print(f"Cross-validation ROC-AUC scores: {cv_scores}")
    print(f"Mean CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    # Held-out validation metrics.
    val_pred = best_model.predict(X_val)
    val_pred_proba = best_model.predict_proba(X_val)[:, 1]
    print("\nValidation Set Metrics:")
    print(f"Accuracy: {accuracy_score(y_val, val_pred):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_val, val_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, val_pred))
    # Feature-importance bar chart, saved to disk.
    plt.figure(figsize=(10, 6))
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    sns.barplot(x='importance', y='feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    # Fix: close the figure so repeated calls do not leak matplotlib figures.
    plt.close()
    return best_model, grid_search.best_params_
# ---- Main program: preprocess, tune, predict, and persist artifacts ----
print("Starting preprocessing...")
train_processed, test_processed, features = preprocess_data(train, test)

feature_matrix = train_processed[features]
labels = train_processed['Loan_Status']
test_matrix = test_processed[features]

# Grid-search and evaluate the XGBoost model.
best_model, best_params = train_and_evaluate_model(feature_matrix, labels, test_matrix)

print("\nMaking predictions on test set...")
test_preds = best_model.predict(test_matrix)
test_preds_proba = best_model.predict_proba(test_matrix)[:, 1]

# Decode 0/1 predictions back to 'Y'/'N' and save them alongside the
# positive-class probability.
label_map = {1: 'Y', 0: 'N'}
test_predictions = pd.DataFrame({
    'Loan_ID': test['Loan_ID'],
    'Loan_Status': [label_map.get(pred, 'N') for pred in test_preds],
    'Prediction_Probability': test_preds_proba,
})
test_predictions.to_csv('loan_predictions_improved.csv', index=False)
print("\nPredictions saved to 'loan_predictions_improved.csv'")

# Persist the chosen hyper-parameters and the feature list for reference.
with open('model_info.txt', 'w') as f:
    f.write("Best Model Parameters:\n")
    f.write(str(best_params))
    f.write("\n\nFeature List:\n")
    f.write(str(features))
运行日志(XGBoost脚本输出)
(base) root@ubuntu22:~# python loan_xgboost_improved.py
Loading data...
Starting preprocessing...
Performing GridSearchCV...
Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 3, 'n_estimators': 200, 'subsample': 0.8}
Performing cross-validation...
Cross-validation ROC-AUC scores: [0.73219814 0.76037152 0.78998779 0.83058608 0.76566416]
Mean CV ROC-AUC: 0.7758 (+/- 0.0660)
Validation Set Metrics:
Accuracy: 0.7886
ROC-AUC: 0.7698
Classification Report:
precision recall f1-score support
0 0.90 0.44 0.59 43
1 0.76 0.97 0.86 80
accuracy 0.79 123
macro avg 0.83 0.71 0.73 123
weighted avg 0.81 0.79 0.77 123
Making predictions on test set...
Predictions saved to 'loan_predictions_improved.csv'
Best Model Parameters:
Feature List:
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'TotalIncome', 'Income_Loan_Ratio', 'Monthly_Income', 'EMI_Ratio', 'EMI_Income_Ratio']
浙公网安备 33010602011771号