拿到原始数据

import pandas as pd
y=pd.read_excel("D:/python文件勿删/样本提取/原始样本.xlsx")
y=y[['dpd30+','idcard_md5']]
import pandas as pd
yy=pd.read_excel("D:/python文件勿删/建模文档/加百融建模/用户idcard加分数.xlsx")
yy=yy[['id_card','分数']]
yy.rename(columns={'id_card':'idcard_md5'},inplace=True)
import pandas as pd
import numpy as np
al=pd.read_excel("C:/Users/53066/Desktop/三方数据分析/阿里/朴道(企惠)-申反v5结果-20250314.xlsx")
al
ZT=y.merge(al,on="idcard_md5",how="inner")
import pandas as pd
import numpy as np
hn2=pd.read_excel("C:/Users/53066/Desktop/三方数据分析/海纳/海纳数科_百川分(回溯)_ZV4.2ZV5.2ZV6.2_企惠小贷_2025-03-19.xlsx")
hn2.rename(columns={'idcard':'idcard_md5'},inplace=True)
ZT=ZT.merge(hn2,on="idcard_md5",how="inner")
ZT.rename(columns={'dpd30+':'y'},inplace=True)
ZT=ZT[ZT['y']!=2]
ZT
ZT.drop_duplicates(subset="idcard_md5",inplace=True)
ZT

交叉表 假设不知道分割点先十等分

yb=ZT.copy()
yb1=yb.copy()
cut_points = pd.qcut(yb1['申反v5'], q=10, retbins=True)[1]

打印分割点

cut_points
def qj(x):
values=[0,16.09,23.71,29.3,33.69,37.68,41.906,49.17,70.1,92.71,np.inf]
index=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T']
for i in range(len(values)-1):
if values[i]<=x<values[i+1]:
return '{0}_[{1},{2})'.format(index[i],values[i],values[i+1])
return 'A0_空值'
yb1['申反v5']=yb1['申反v5'].astype('float')
yb1['申反v5_bin']=yb1['申反v5'].apply(qj)
cut_points = pd.qcut(yb1['百川分ZV6.2'], q=10, retbins=True)[1]

打印分割点

cut_points
def qj(x):
values=[0,89,121,150,178,207,236,266,300,350,np.inf]
index=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T']
for i in range(len(values)-1):
if values[i]<=x<values[i+1]:
return '{0}_[{1},{2})'.format(index[i],values[i],values[i+1])
return 'A0_空值'
yb1['百川分ZV6.2']=yb1['百川分ZV6.2'].astype('float')
yb1['百川分ZV6.2_bin']=yb1['百川分ZV6.2'].apply(qj)
pd.crosstab(yb1['申反v5_bin'],yb1['百川分ZV6.2_bin'],margins=True)
pd.crosstab(yb1['申反v5_bin'],yb1['百川分ZV6.2_bin'],margins=True,normalize=True)
def calculate_overdue_rate(series):
bad_users = series.sum() # 计算坏用户的数量
total_users = len(series) # 计算总用户数量
return bad_users / total_users if total_users > 0 else 0
pd.crosstab(yb1['申反v5_bin'],yb1['百川分ZV6.2_bin'],values=yb1['y'],aggfunc=calculate_overdue_rate)

简单可视化

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 设置字体为黑体
plt.rcParams['axes.unicode_minus'] = False # 正确显示负号

然后绘制您的图形

def visualize_split_data_matplotlib(data: pd.DataFrame, splits: list[tuple[str, float]], total_samples: int, output_path: str = "decision_tree.png"):
"""
按照给定的分割规则,递归地分割数据,并使用 matplotlib 生成决策树的可视化图像。
在每个节点增加当前样本数和整体样本数占比。

Args:
    data: 包含特征和目标变量的 Pandas DataFrame。目标变量名为 'y',取值为 0 或 1。
    splits: 一个列表,包含分割规则。每个规则是一个元组,包含特征名和分割点。
    total_samples: 总样本数。
    output_path: 输出图像的路径 (例如: "decision_tree.png")
"""

fig, ax = plt.subplots(figsize=(12, 8))  # 创建一个图形和轴对象
node_positions = {}  # 存储节点的位置
node_id = 0  # 用于唯一标识每个节点

def _split(data: pd.DataFrame, splits: list[tuple[str, float]], x: float, y: float, level_width: float, parent_node: int = None, edge_label: str = None):
    nonlocal node_id  # 允许修改外部作用域的 node_id

    current_node = node_id
    node_id += 1
    node_positions[current_node] = (x, y)  # 存储节点位置

    # 计算当前节点的信息
    good = len(data[data['y'] == 0])
    bad = len(data[data['y'] == 1])
    current_samples = len(data)
    bad_rate = (bad / current_samples)*100 if current_samples > 0 else 0
    sample_percentage = (current_samples / total_samples) * 100

    # 创建节点标签
    label = f"Good: {good}\nBad: {bad}\nSamples: {current_samples}\nBad Rate: {bad_rate:.1f}%\nrate: {sample_percentage:.1f}%"

    if not splits:
        # 叶子节点
        ax.text(x, y, label, ha='center', va='center', bbox=dict(facecolor='lightgreen', alpha=0.5))
        if parent_node is not None:
            x_parent, y_parent = node_positions[parent_node]
            ax.plot([x_parent, x], [y_parent, y], 'k-', lw=1)
            ax.text((x_parent + x) / 2, (y_parent + y) / 2, edge_label, ha='center', va='center')
        return

    feature_name, split_value = splits[0]
    splits = splits[1:]  # 移除已经使用的分割规则

    # 创建内部节点标签
    label = f"{feature_name} <= {split_value}\n{label}"
    ax.text(x, y, label, ha='center', va='center', bbox=dict(facecolor='lightblue', alpha=0.5))

    if parent_node is not None:
        x_parent, y_parent = node_positions[parent_node]
        ax.plot([x_parent, x], [y_parent, y], 'k-', lw=1)
        ax.text((x_parent + x) / 2, (y_parent + y) / 2, edge_label, ha='center', va='center')

    # 分割数据
    left_data = data[data[feature_name] <= split_value]
    right_data = data[data[feature_name] > split_value]

    # 递归分割左右两侧的数据
    next_y = y - 1
    next_level_width = level_width / 2
    _split(left_data, splits.copy(), x - next_level_width, next_y, next_level_width, current_node, f"<= {split_value}")
    _split(right_data, splits.copy(), x + next_level_width, next_y, next_level_width, current_node, f"> {split_value}")

# 启动递归分割
start_x = 0.5
start_y = 0.9
start_level_width = 0.4
_split(data, splits.copy(), start_x, start_y, start_level_width)

ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.axis('off')
plt.savefig(output_path)
print(f"Decision tree saved to {output_path}")

yb=ZT.copy()
yb1=yb.copy()
splits = [('百川分ZV6.2', 173.5),
('申反v5', 45.29)]
total_samples = len(yb1)
visualize_split_data_matplotlib(yb1, splits, total_samples, output_path="decision_tree.png")

拿到决策树分箱区间

import pycard as pc
num_iv_woedf = pc.WoeDf()
clf = pc.NumBin(max_bins_num=8,min_impurity_decrease=4e-5)
for i in ['申反v5','百川分ZV6.2']:
clf.fit(yb1[i] ,yb1.y)
yb1[i+'bin'] = clf.transform(yb1[i]) #这样可以省略掉后面转换成_bin的一步骤
clf.bin_code(i)
num_iv_woedf.append(clf.woe_df
)

def 申反v5_trans(x):
# 申反v5 连续型特征的分箱转换函数
inf = np.inf
bins = [0, 39.4, 43.27, 81.87, 90.16, 98.3, 99.305, inf]
for i in range(len(bins)-1):
start = bins[i]
end = bins[i+1]
if start < x <= end:
return "{0}({1}, {2}]".format(i+1, start, end) # 分箱字符串格式: i(start, end]
return '0_nan'
yb1['申反v5']=yb1['申反v5'].astype('float')
yb1['申反v5_bin']=yb1['申反v5'].apply(申反v5_trans)

def 百川分ZV6_trans(x):
# 百川分ZV6.2 连续型特征的分箱转换函数
inf = np.inf
bins = [-inf, 33.5, 105.5, 170.5, 173.5, 223.5, 251.5, 317.5, inf]
for i in range(len(bins)-1):
start = bins[i]
end = bins[i+1]
if start < x <= end:
return "{0}({1}, {2}]".format(i+1, start, end) # 分箱字符串格式: i(start, end]
return '0_nan'
yb1['百川分ZV6.2']=yb1['百川分ZV6.2'].astype('float')
yb1['百川分ZV6.2_bin']=yb1['百川分ZV6.2'].apply(百川分ZV6_trans)

pd.crosstab(yb1['申反v5_bin'],yb1['百川分ZV6.2_bin'],margins=True,normalize=True)

def calculate_overdue_rate(series):
bad_users = series.sum() # 计算坏用户的数量
total_users = len(series) # 计算总用户数量
return bad_users / total_users if total_users > 0 else 0
pd.crosstab(yb1['申反v5_bin'],yb1['百川分ZV6.2_bin'],values=yb1['y'],aggfunc=calculate_overdue_rate)

cs=yb1.merge(yy,on="idcard_md5",how="inner")
cs

splits = [('百川分ZV6.2', 173.5),
('申反v5', 45.29),
('分数',517)]
total_samples = len(cs)
visualize_split_data_matplotlib(cs, splits, total_samples, output_path="decision_tree.png")

不同通过率下两个变量组合最大逾期率。也可以稍微改一下来拿到最小逾期率。初始值为1.然后把大于改为。

import pandas as pd
import numpy as np

def find_optimal_cutoffs(df, primary_var, secondary_var, y_col, rejection_rate_step=0.01):
"""
df: DataFrame,包含变量和 y 值
primary_var: 主变量名
secondary_var: 次变量名
y_col: y 的列名
rejection_rate_step: 步长,默认0.01(每次增加1%)
"""
results = []

total_rows = len(df)

# 逾期率计算函数
def calculate_bad_rate(selected_indices):
    # 选中用户的 y 值为 1 的比例
    selected_df = df.loc[selected_indices]
    return selected_df[y_col].sum() / len(selected_df)  # 逾期率 = y=1的用户数 / 总用户数

# 遍历不同的目标拒绝率
for target_rejection_pct in np.arange(0.01, 1.01, rejection_rate_step):  # 从1%到100%
    target_reject_count = int(np.floor(total_rows * target_rejection_pct))
    best_bad_rate = -1
    best_thresholds = {}

    # 主变量顺序排序,次变量控制
    sorted_df = df.sort_values(by=primary_var, ascending=True)
    for primary_pct in np.arange(target_rejection_pct, 1.01, rejection_rate_step):
        # 确定主变量阈值
        primary_cut_count = int(np.floor(total_rows * primary_pct))
        selected_primary_df = sorted_df.head(primary_cut_count)
        if len(selected_primary_df) == 0:
            continue
        primary_threshold = selected_primary_df[primary_var].max()

        # 控制次变量阈值以达到目标拒绝率
        selected_secondary_df = selected_primary_df.sort_values(by=secondary_var, ascending=False)
        if len(selected_secondary_df) == 0:
            continue
        # 计算需要从次变量中选择的用户比例
        required_secondary_pct = target_reject_count / len(selected_primary_df)
        if required_secondary_pct < 0 or required_secondary_pct > 1:
            continue
        secondary_cut_count = int(np.floor(len(selected_primary_df) * required_secondary_pct))
        if secondary_cut_count == 0:
            continue
        secondary_threshold = selected_secondary_df[secondary_var].iloc[secondary_cut_count - 1]
        
        # 确定最终选择的用户
        selected_indices = selected_primary_df[selected_primary_df[secondary_var] >= secondary_threshold].index


        # 计算逾期率
        bad_rate = calculate_bad_rate(selected_indices)

        if bad_rate > best_bad_rate:
            best_bad_rate = bad_rate
            best_thresholds = {
                'primary_threshold': primary_threshold, 
                'secondary_threshold': secondary_threshold,
                'primary_var_name': primary_var, 
                'secondary_var_name': secondary_var,
                'primary_direction': 'ascending',
                'secondary_direction': 'descending'
            }

    # 主变量倒序排序,次变量控制
    sorted_df = df.sort_values(by=primary_var, ascending=False)
    for primary_pct in np.arange(target_rejection_pct, 1.01, rejection_rate_step):
        # 确定主变量阈值
        primary_cut_count = int(np.floor(total_rows * primary_pct))
        selected_primary_df = sorted_df.head(primary_cut_count)
        if len(selected_primary_df) == 0:
            continue
        primary_threshold = selected_primary_df[primary_var].min()

        # 控制次变量阈值以达到目标拒绝率
        selected_secondary_df = selected_primary_df.sort_values(by=secondary_var, ascending=False)
        if len(selected_secondary_df) == 0:
            continue
        # 计算需要从次变量中选择的用户比例
        required_secondary_pct = target_reject_count / len(selected_primary_df)
        if required_secondary_pct < 0 or required_secondary_pct > 1:
            continue
        secondary_cut_count = int(np.floor(len(selected_primary_df) * required_secondary_pct))
        if secondary_cut_count == 0:
            continue
        secondary_threshold = selected_secondary_df[secondary_var].iloc[secondary_cut_count - 1]
        
        # 确定最终选择的用户
        selected_indices = selected_primary_df[selected_primary_df[secondary_var] >= secondary_threshold].index

        # 计算逾期率
        bad_rate = calculate_bad_rate(selected_indices)

        if bad_rate > best_bad_rate:
            best_bad_rate = bad_rate
            best_thresholds = {
                'primary_threshold': primary_threshold, 
                'secondary_threshold': secondary_threshold,
                'primary_var_name': primary_var, 
                'secondary_var_name': secondary_var,
                'primary_direction': 'descending',
                'secondary_direction': 'descending'
            }

    # 交换主次变量:次变量顺序排序,主变量控制
    sorted_df = df.sort_values(by=secondary_var, ascending=True)
    for primary_pct in np.arange(target_rejection_pct, 1.01, rejection_rate_step):
        # 确定主变量阈值
        primary_cut_count = int(np.floor(total_rows * primary_pct))
        selected_primary_df = sorted_df.head(primary_cut_count)
        if len(selected_primary_df) == 0:
            continue
        primary_threshold = selected_primary_df[secondary_var].max()

        # 控制次变量阈值以达到目标拒绝率
        selected_secondary_df = selected_primary_df.sort_values(by=primary_var, ascending=False)
        if len(selected_secondary_df) == 0:
            continue
        # 计算需要从次变量中选择的用户比例
        required_secondary_pct = target_reject_count / len(selected_primary_df)
        if required_secondary_pct < 0 or required_secondary_pct > 1:
            continue
        secondary_cut_count = int(np.floor(len(selected_primary_df) * required_secondary_pct))
        if secondary_cut_count == 0:
            continue
        secondary_threshold = selected_secondary_df[primary_var].iloc[secondary_cut_count - 1]
        
        # 确定最终选择的用户
        selected_indices = selected_primary_df[selected_primary_df[primary_var] >= secondary_threshold].index


        # 计算逾期率
        bad_rate = calculate_bad_rate(selected_indices)

        if bad_rate > best_bad_rate:
            best_bad_rate = bad_rate
            best_thresholds = {
                'primary_threshold': primary_threshold, 
                'secondary_threshold': secondary_threshold,
                'primary_var_name': secondary_var, 
                'secondary_var_name': primary_var,
                'primary_direction': 'ascending',
                'secondary_direction': 'descending'
            }

    # 交换主次变量:次变量倒序排序,主变量控制
    sorted_df = df.sort_values(by=secondary_var, ascending=False)
    for primary_pct in np.arange(target_rejection_pct, 1.01, rejection_rate_step):
        # 确定主变量阈值
        primary_cut_count = int(np.floor(total_rows * primary_pct))
        selected_primary_df = sorted_df.head(primary_cut_count)
        if len(selected_primary_df) == 0:
            continue
        primary_threshold = selected_primary_df[secondary_var].min()

        # 控制次变量阈值以达到目标拒绝率
        selected_secondary_df = selected_primary_df.sort_values(by=primary_var, ascending=False)
        if len(selected_secondary_df) == 0:
            continue
        # 计算需要从次变量中选择的用户比例
        required_secondary_pct = target_reject_count / len(selected_primary_df)
        if required_secondary_pct < 0 or required_secondary_pct > 1:
            continue
        secondary_cut_count = int(np.floor(len(selected_primary_df) * required_secondary_pct))
        if secondary_cut_count == 0:
            continue
        secondary_threshold = selected_secondary_df[primary_var].iloc[secondary_cut_count - 1]
        
        # 确定最终选择的用户
        selected_indices = selected_primary_df[selected_primary_df[primary_var] >= secondary_threshold].index


        # 计算逾期率
        bad_rate = calculate_bad_rate(selected_indices)

        if bad_rate > best_bad_rate:
            best_bad_rate = bad_rate
            best_thresholds = {
                'primary_threshold': primary_threshold, 
                'secondary_threshold': secondary_threshold,
                'primary_var_name': secondary_var, 
                'secondary_var_name': primary_var,
                'primary_direction': 'descending',
                'secondary_direction': 'descending'
            }

    # 保存结果
    result_row = {
        "rejection_rate": target_rejection_pct,
        "bad_rate": best_bad_rate,
        'primary_threshold': best_thresholds.get('primary_threshold', np.nan),
        'secondary_threshold': best_thresholds.get('secondary_threshold', np.nan),
        'primary_var_name': best_thresholds.get('primary_var_name', np.nan),
        'secondary_var_name': best_thresholds.get('secondary_var_name', np.nan),
        'primary_direction': best_thresholds.get('primary_direction', np.nan),
        'secondary_direction': best_thresholds.get('secondary_direction', np.nan)
    }
    results.append(result_row)

return pd.DataFrame(results)

df=pd.read_excel("D:/python文件勿删/建模文档/天问分建模/阈值.xlsx.")
df

df.drop_duplicates(subset="idcard",inplace=True)
df

主变量

primary_var = '分数'

次变量(model_risk_v8_ghighirr_score_threshold)

secondary_var = 'model_risk_v8_ghighirr_score'

计算最优阈值

result = find_optimal_cutoffs(df, primary_var, secondary_var, 'target')

输出结果

result

单变量看阈值

import pandas as pd
def calculate_rejection_rates(df, variable, yname, ascending=False):
# 确保变量和yname在数据框中
if variable not in df.columns or yname not in df.columns:
raise ValueError("变量或yname不在数据框中")

# 对变量进行排序
df_sorted = df.sort_values(by=variable, ascending=True).reset_index(drop=True)

# 添加累计和累计百分比列
df_sorted['cumsum'] = df_sorted[yname].cumsum()

# 计算不同拒绝率下的逾期率
results = []
for reject_rate in range(1, 101):
    percentile = reject_rate / 100
    num_to_reject = int(len(df_sorted) * percentile)
    
    # 当前拒绝率下的坏用户数
    bad_in_reject = df_sorted[yname].iloc[:num_to_reject].sum()
    
    # 当前拒绝率下的逾期率(坏用户占拒绝用户的比例)
    rejection_rate = bad_in_reject / num_to_reject
    
    # 添加结果
    results.append([reject_rate, rejection_rate, df_sorted[variable].iloc[num_to_reject - 1]])

# 创建结果数据框
result_df = pd.DataFrame(results, columns=['拒绝率(%)', '逾期率', '分割点'])
return result_df

result_df = calculate_rejection_rates(df, '分数', 'target', ascending=False)
result_df
或者
import pandas as pd

def calculate_rejection_rates(df, variable, yname, ascending=False):
# 确保变量和yname在数据框中
if variable not in df.columns or yname not in df.columns:
raise ValueError("变量或yname不在数据框中")

# 对变量进行排序
df_sorted = df.sort_values(by=variable, ascending=ascending).reset_index(drop=True)

# 添加累计和累计百分比列
df_sorted['cumsum'] = df_sorted[yname].cumsum()

# 计算不同拒绝率下的逾期率
results = []
for reject_rate in range(1, 21):
    percentile = reject_rate / 200
    num_to_reject = int(len(df_sorted) * percentile)
    
    # 当前拒绝率下的坏用户数
    bad_in_reject = df_sorted[yname].iloc[:num_to_reject].sum()
    
    # 当前拒绝率下的逾期率(坏用户占拒绝用户的比例)
    rejection_rate = bad_in_reject / num_to_reject
    
    # 添加结果
    results.append([reject_rate*0.5, rejection_rate, df_sorted[variable].iloc[num_to_reject - 1]])

# 创建结果数据框
result_df = pd.DataFrame(results, columns=['拒绝率(%)', '逾期率', '分割点'])
return result_df

result_df = calculate_rejection_rates(ZT, 'TD腾云分87', 'y', ascending=False)
result_df