有用的信息

1、主题设置:https://blog.csdn.net/qq_41554005/article/details/109776371
可编辑区域增加
from IPython.display import display, HTML
display(HTML(""))

2、快速找到重复字段
# Elements of list1 that occur more than once (duplicates kept in order,
# repeated as often as they appear — same output as the count()>1 version,
# but Counter makes this O(n) instead of O(n^2)).
from collections import Counter
_counts = Counter(list1)
c = [n for n in list1 if _counts[n] > 1]

3、行列显示问题
import pandas as pd
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.max_columns', None) # 显示所有列

4、json获取
dates['data']=dates['data'].apply(lambda x:json.loads(x)['data']['data']['data']['result_detail'])
dates['data']=dates['data'].apply(lambda x:json.dumps(x))

5、对比重复信息
AA=YB['idcard_md5'].to_list()
BB=BR['idcard_md5'].to_list()
# Print every idcard_md5 present in AA (YB) but missing from BB (BR).
# Fixes the stray trailing `N` after print(i) and the lost loop indentation;
# a set gives O(1) membership instead of scanning BB for every element.
BB_set = set(BB)
for i in AA:
    if i not in BB_set:
        print(i)

6、相关系数设置对角线上为空
把对角线以上元素全设置为空值
import pandas as pd
import numpy as np

假设您有一个数据框df

df = ...

计算相关性矩阵

corr_matrix = YB.corr()

创建一个新的数据框来存储结果

corr_matrix_modified = corr_matrix.copy()

获取对角线以上的索引(不包括对角线)

rows, cols = np.triu_indices_from(corr_matrix_modified, k=1)

将对角线以上的元素设置为NaN

corr_matrix_modified.values[rows, cols] = np.nan

查看结果

corr_matrix_modified

7、查看电脑运行内存

查看运行内存

import psutil

查看总内存

total_memory = psutil.virtual_memory().total

查看可用内存

available_memory = psutil.virtual_memory().available

8、循环批量读取数据
import pandas as pd
import pymysql

给定的最大ID值

max_id = 460000000 # 最大ID

每次处理的记录数

chunk_size = 200000

初始化空的DataFrame用于存储最终结果

SJ4 = pd.DataFrame()
import pandas as pd
import pymysql
from sqlalchemy import create_engine

数据库连接信息

conn=pymysql.connect(host="work.baiyizu.cn",user="flows",passwd="CtpnYFphST556DLC",port=21306,
db='flows')
sql = """SELECT max(max_id) as id FROM maxid """
SJ2 = pd.read_sql_query(sql, conn)
current_id=SJ2['id'].max()
db_user = 'readonly_user'
db_password = 'Readonly123!'
db_host = '8.137.70.99'
db_port = 3306
db_name = 'flows'

创建 SQLAlchemy 引擎

engine = create_engine(f'mysql+pymysql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}')

循环处理数据

# pandas.to_sql needs a SQLAlchemy engine, not a raw pymysql connection, and it
# only needs to be created once — not rebuilt on every loop iteration as the
# original did.
write_engine = create_engine('mysql+pymysql://flows:CtpnYFphST556DLC@192.168.254.48:21306/flows')

def _checkpoint(next_id):
    """Persist progress so the job can resume from the `maxid` table after a restart."""
    pd.DataFrame({'max_id': [next_id]}).to_sql(
        "maxid", write_engine, if_exists='append', index=False)

# Page through fl_record in id ranges of `chunk_size`.
while current_id < max_id:
    sql = f"""
    SELECT flow_type, trade_type, trade_amount, trade_desc, counterparty, id
    FROM fl_record
    WHERE id >= {current_id} AND id < {current_id + chunk_size}
    ORDER BY id """
    SJ2 = pd.read_sql_query(sql, engine)
    # Advance and checkpoint whether or not the chunk had rows (ids can be sparse).
    current_id += chunk_size
    _checkpoint(current_id)
    if SJ2.empty:
        continue
    print(f"Processing IDs from {current_id - chunk_size} to {current_id}")
    SJ2.fillna('', inplace=True)
    # find_first_keyword / aa / bb are defined elsewhere in the notebook —
    # TODO confirm they are in scope before this cell runs.
    SJ2['wy'] = SJ2['counterparty'].apply(lambda x: find_first_keyword(x, aa))
    SJ2['wy1'] = SJ2['counterparty'].apply(lambda x: find_first_keyword(x, bb))
    # Keep rows where at least one keyword lookup found nothing.
    SJ2 = SJ2[SJ2['wy'].isna() | SJ2['wy1'].isna()]
    SJ41 = SJ2.copy()
    SJ41[['flow_type', 'trade_type', 'trade_amount', 'trade_desc',
          'counterparty', 'wy', 'wy1']].to_sql(
        "zjsbde2", write_engine, if_exists='append', index=False)

9、sql 常用语句
INSERT INTO fl_platform (name, type, company, logo) VALUES ('汇趣租机', '租机', '安徽讯步科技有限公司', 'https://saas-body.oss-cn-chengdu.aliyuncs.com/e2c1767a1ba74818d72b8380b297b9c4');
UPDATE fl_platform SET logo = 'https://saas-body.oss-cn-chengdu.aliyuncs.com/44a31c162072f9473570a744fabf9936' WHERE name = 'e宝认证';
delete from fl_platform where name ="人人租机_扫码租赁";
-- Original listed a `logo` column but supplied no value and left a dangling
-- comma before the closing parenthesis (syntax error). Drop the valueless
-- column so column list and VALUES match.
INSERT INTO fl_platform (name, tags, type, company) VALUES ('汇趣租机', '借条签约', '租机', '安徽讯步科技有限公司');

10、两次加密
import hashlib

def sha256_jiami(x):
    """Return the hex SHA-256 digest of the UTF-8 encoding of string x."""
    return hashlib.sha256(x.encode('utf-8')).hexdigest()
data['name_sha256'] = data['name'].apply(sha256_jiami)
data['phone_sha256'] = data['address_mobile'].apply(sha256_jiami)
data['idcard_sha256'] = data['id_num'].apply(sha256_jiami)

import hashlib

def md5_jiami(x):
    """Return the hex MD5 digest of the UTF-8 encoding of string x."""
    return hashlib.md5(x.encode('utf-8')).hexdigest()
data['name_md5'] = data['name'].apply(md5_jiami)
data['phone_md5'] = data['address_mobile'].apply(md5_jiami)
data['idcard_md5'] = data['id_num'].apply(md5_jiami)

11、结果处理
.apply(lambda x: f"{x * 100:.2f}%")

.apply(lambda x: round(x, 2))

.map(lambda x: f"{x:.2f}")

12、docker常用命令
docker logs -f flow-report-dev
docker ps -a
docker rm flow-report-dev
docker stop flow-report-dev
docker compose build
docker compose -f docker-compose-dev.yaml up -d
docker logs -f flow-report-dev
docker compose push

13、linux临时跑任务
nohup python3 /home/analysis/zjreport.py &
kill 142758

14、深度学习资料
1、神经网络认识
https://blog.csdn.net/illikang/article/details/82019945
2、深度学习基础
https://blog.csdn.net/dongjinkun/article/details/117776124
https://zhuanlan.zhihu.com/p/114573981

15、逻辑回归完整代码

!/usr/bin/env python

coding: utf-8

In[1]:

from IPython.display import display, HTML
display(HTML(""))

In[3]:

import pandas as pd
import numpy as np
import pymysql
conn=pymysql.connect(host="work.baiyizu.cn",user="readonly_user",passwd="Readonly!0818~",port=21306,
db='flows')
sql=""" select * from importidcard1224 """
xb=pd.read_sql_query(sql,conn)
xb

In[4]:

CS1=pd.read_excel('C:/Users/12463/Desktop/创硕第三次测试.xlsx',sheet_name='Sheet1')
CS1

In[5]:

CS2=pd.read_excel('C:/Users/12463/Desktop/创硕第三次测试.xlsx',sheet_name='Sheet2')
CS2

In[6]:

import hashlib
def sha256_jiami(x):
sha256 = hashlib.sha256()
sha256.update(x.encode('utf-8'))
return sha256.hexdigest()
xb['证件号256'] = xb['idcard'].apply(sha256_jiami)

In[7]:

import hashlib
def md5_jiami(x):
Jpwd = hashlib.md5()
Jpwd.update(x.encode('utf-8'))
return Jpwd.hexdigest()
xb['证件号md5'] = xb['idcard'].apply(md5_jiami)

In[8]:

CS1=CS1.merge(xb,on='证件号256',how='inner')
CS1

In[9]:

CS2=CS2.merge(xb,on='证件号md5',how='inner')
CS2

In[10]:

CS=pd.concat([CS1,CS2],axis=0)
CS

In[11]:

df=CS[['user_id','FPD30+']]
df

In[12]:

df.columns=['userId','FPD30+']
df

In[13]:

df['FPD30+']=df['FPD30+'].apply(lambda x:1 if x=="是" else 0)
df

In[14]:

import pandas as pd
df1=pd.read_excel('C:/Users/12463/Desktop/数据结果第三次.xlsx')
df1

In[15]:

zt=df1.merge(df,on='userId',how='inner')
zt

In[16]:

zt.head(10)

In[17]:

zt1=zt.iloc[:,1:]
zt1

In[18]:

zt1.head(10)

In[19]:

zt1.drop(['is_daytime','is_night','is_weekday','近3个月','近6个月','历史月','max_yearMonth'],axis=1,inplace=True)

In[20]:

import pandas as pd
def convert_percent_to_float(s):
    """Turn a percent string like '12.5%' into the float 12.5.

    Non-strings and strings without a trailing '%' pass through unchanged.
    Note: the value is NOT divided by 100 — downstream code works in
    percentage points.
    """
    if isinstance(s, str) and s.endswith('%'):
        return float(s.rstrip('%'))
    return s

应用这个函数到每一列

for col in zt1.columns:
zt1[col] = zt1[col].apply(convert_percent_to_float)

将所有列转换为数值类型,并保留两位小数

zt1 = zt1.apply(pd.to_numeric, errors='coerce').round(2)
zt1

In[21]:

zt1=zt1.iloc[:,1:]
zt1

In[52]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="work.baiyizu.cn",user="flows",passwd="CtpnYFphST556DLC",db='flows',port=21306)
conn = create_engine('mysql+pymysql://flows:CtpnYFphST556DLC@work.baiyizu.cn:21306/flows')
YB.to_sql("trans0208", conn, if_exists='replace',index=False)

In[51]:

YB

In[49]:

YB=zt1.copy()

In[50]:

YB=YB[[
"月最高支出金额",
"单笔最高收入金额",
"近3天是否整百交易支出总金额",
"近3天是否整百交易笔数占比",
"历史是否整百交易总笔数",
"历史租机支出总金额",
"历史还款支出总金额",
"夜间近30天累计交易笔数",
"近30天工作日高峰时间段交易笔数占比",
"近6个月支出笔数集中度",
"FPD30+"]]
YB

# 相关性筛选

In[24]:

import pandas as pd
import numpy as np

假设您有一个数据框df

df = ...

计算相关性矩阵

corr_matrix = YB.corr()

创建一个新的数据框来存储结果

corr_matrix_modified = corr_matrix.copy()

获取对角线以上的索引(不包括对角线)

rows, cols = np.triu_indices_from(corr_matrix_modified, k=1)

将对角线以上的元素设置为NaN

corr_matrix_modified.values[rows, cols] = np.nan

查看结果

corr_matrix_modified

In[25]:

corr_matrix_modified.to_excel('相关性九安.xlsx')

In[26]:

import numpy as np
import statsmodels.stats.outliers_influence as sm

2. 计算 VIF

vif_data = pd.DataFrame()
vif_data["feature"] = YB.columns
vif_data["VIF"] = [sm.variance_inflation_factor(YB, i) for i in range(YB.shape[1])]
print(vif_data)

# 手动分箱

In[27]:

YB=zt1.copy()

In[28]:

YB=YB[[
"月最高支出金额",
"单笔最高收入金额",
"近3天是否整百交易支出总金额",
"近3天是否整百交易笔数占比",
"历史是否整百交易总笔数",
"历史租机支出总金额",
"历史还款支出总金额",
"夜间近30天累计交易笔数",
"近30天工作日高峰时间段交易笔数占比",
"近6个月支出笔数集中度",
"FPD30+"]]
YB

In[29]:

"""def qj(x):
values=[0,210,290,822,np.inf]
index=['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T']
for i in range(len(values)-1):
if values[i]<=x<values[i+1]:
return '{0}_[{1},{2})'.format(index[i],values[i],values[i+1])
return '空值'
YB['月最高支出笔数']=YB['月最高支出笔数'].apply(qj)
YB"""

def make_qj(values, labels='ABCDEFGHIJKLMNOPQRST'):
    """Build a manual-binning function for one feature.

    `values` are the cut points [v0, v1, ..., inf]; the returned function maps
    x falling in [v_i, v_{i+1}) to the string '<LABEL>_[lo,hi)' and anything
    outside every interval (e.g. NaN or x < v0) to '空值'.
    The original notebook redefined an identical `qj` ten times with only the
    cut points changed; this factory keeps the labels and output format
    byte-identical.
    """
    def qj(x):
        for i in range(len(values) - 1):
            if values[i] <= x < values[i + 1]:
                return '{0}_[{1},{2})'.format(labels[i], values[i], values[i + 1])
        return '空值'
    return qj

# One set of cut points per model variable (unchanged from the original cells).
YB['月最高支出金额'] = YB['月最高支出金额'].apply(make_qj([0, 31770, 390989, 747114, np.inf]))
YB['单笔最高收入金额'] = YB['单笔最高收入金额'].apply(make_qj([0, 17792, 84494, 118000, np.inf]))
YB['近3天是否整百交易支出总金额'] = YB['近3天是否整百交易支出总金额'].apply(make_qj([0, 3250, 34350, np.inf]))
YB['近3天是否整百交易笔数占比'] = YB['近3天是否整百交易笔数占比'].apply(make_qj([0, 8, 25, 59, np.inf]))
YB['历史是否整百交易总笔数'] = YB['历史是否整百交易总笔数'].apply(make_qj([0, 166, 688, 1849, np.inf]))
YB['历史租机支出总金额'] = YB['历史租机支出总金额'].apply(make_qj([0, 967, 7645, np.inf]))
YB['历史还款支出总金额'] = YB['历史还款支出总金额'].apply(make_qj([0, 257, 12872, np.inf]))
YB['夜间近30天累计交易笔数'] = YB['夜间近30天累计交易笔数'].apply(make_qj([0, 37, 121, 196, np.inf]))
YB['近30天工作日高峰时间段交易笔数占比'] = YB['近30天工作日高峰时间段交易笔数占比'].apply(make_qj([0, 28, 41, np.inf]))
YB['近6个月支出笔数集中度'] = YB['近6个月支出笔数集中度'].apply(make_qj([0, 60, 212, np.inf]))
YB

# 转化woe(如果没有相同woe或者特殊inf不需要手动转)

In[30]:

YB.columns

In[38]:

import pycard as pc

# Cross-tab WOE for every model variable against the target.
# DataFrame.append was removed in pandas 2.0 — collect pieces and concat once.
_feature_cols = ['月最高支出金额', '单笔最高收入金额', '近3天是否整百交易支出总金额',
                 '近3天是否整百交易笔数占比', '历史是否整百交易总笔数', '历史租机支出总金额',
                 '历史还款支出总金额', '夜间近30天累计交易笔数',
                 '近30天工作日高峰时间段交易笔数占比', '近6个月支出笔数集中度']
_parts = []
for target in ['FPD30+']:
    for col in _feature_cols:
        tmp_data = pc.cross_woe(YB[col], YB[target]).reset_index()
        tmp_data.insert(0, 'type', value=target)
        tmp_data.rename({col: '区间'}, axis=1, inplace=True)
        _parts.append(tmp_data)
data_iv = pd.concat(_parts) if _parts else pd.DataFrame()
data_iv

In[39]:

# Replace each variable's bin label with its WOE value.
# The original filter `ys[ys['colName']i]` was missing the `==` operator.
ys = data_iv[['区间', 'colName', 'woe']]
for col in ['月最高支出金额', '单笔最高收入金额', '近3天是否整百交易支出总金额',
            '近3天是否整百交易笔数占比', '历史是否整百交易总笔数', '历史租机支出总金额',
            '历史还款支出总金额', '夜间近30天累计交易笔数',
            '近30天工作日高峰时间段交易笔数占比', '近6个月支出笔数集中度']:
    sel = ys[ys['colName'] == col]
    YB[col] = YB[col].map(dict(zip(sel['区间'], sel['woe'])))

In[40]:

YB

In[41]:

from sklearn.model_selection import train_test_split
import statsmodels.api as sm#逻辑回归另外一个包
x_train,x_test,y_train,y_test=train_test_split(YB.iloc[:,:-1],YB['FPD30+'],test_size=0.3,stratify=YB['FPD30+'],random_state=2025)
X1=sm.add_constant(x_train)
X2=sm.add_constant(x_test)
X3=sm.add_constant(YB.iloc[:,:-1])

In[42]:

logit=sm.Logit(y_train.astype(float),X1.astype(float))
result=logit.fit()
result.summary()
result.params#系数全正或全负

In[43]:

result.params.reset_index()

In[44]:

import math
import toad
p = 50 / math.log(2) #factor = 20 / np.log(2)
q = 600 + 50 * math.log(0.1) / math.log(2)
p1=result.predict(X1)
p2=result.predict(X2)
p1=pd.DataFrame(p1)
p1.columns=['分数']
p1['分数']=p1['分数'].apply(lambda x:q-p*math.log(x/(1-x)))
xlj=toad.metrics.KS_bucket(p1['分数'],y_train,bucket=10, method = 'quantile')
xlj[['min','max','bads','goods','total','bad_rate','good_rate','ks','lift']]

In[45]:

p1

In[46]:

y_train

In[36]:

p2=pd.DataFrame(p2)
p2.columns=['分数']
p2['分数']=p2['分数'].apply(lambda x:q-p*math.log(x/(1-x)))
csj=toad.metrics.KS_bucket(p2['分数'],y_test,bucket=10, method = 'quantile')
csj[['min','max','bads','goods','total','bad_rate','good_rate','ks','lift']]

In[37]:

p3=result.predict(X3)
p3=pd.DataFrame(p3)
p3.columns=['分数']
p3['分数']=p3['分数'].apply(lambda x:q-p*math.log(x/(1-x)))
zt=toad.metrics.KS_bucket(p3['分数'],YB['FPD30+'],bucket=10, method = 'quantile')
zt[['min','max','bads','goods','total','bad_rate','good_rate','ks','lift']]

In[38]:

toad.metrics.PSI(p1['分数'],p2['分数'])

In[39]:

toad.metrics.PSI(p1['分数'],p2['分数'])

In[40]:

pp=p1.copy()
p1=result.predict(X1)
p2=result.predict(X2)
p1=pd.DataFrame(p1)
p2=pd.DataFrame(p2)

In[41]:

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
n=p1
from sklearn.metrics import roc_curve
label=y_train
score=n
fpr,tpr,thresholds= roc_curve(label,score)
ks_value = max(abs(fpr-tpr))
plt.plot(fpr, label='bad')
plt.plot(tpr, label='good')
plt.plot(abs(fpr-tpr), label='diff')
x = np.argwhere(abs(fpr-tpr) == ks_value)[0, 0]
plt.plot((x, x), (0, ks_value), label='ks - {:.2f}'.format(ks_value), color='r', marker='o', markerfacecolor='r', markersize=5)
plt.scatter((x, x), (0, ks_value), color='r')
plt.legend()
plt.show()

In[42]:

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
n=p2
from sklearn.metrics import roc_curve
label=y_test
score=n
fpr,tpr,thresholds= roc_curve(label,score)
ks_value = max(abs(fpr-tpr))
plt.plot(fpr, label='bad')
plt.plot(tpr, label='good')
plt.plot(abs(fpr-tpr), label='diff')
x = np.argwhere(abs(fpr-tpr) == ks_value)[0, 0]
plt.plot((x, x), (0, ks_value), label='ks - {:.2f}'.format(ks_value), color='r', marker='o', markerfacecolor='r', markersize=5)
plt.scatter((x, x), (0, ks_value), color='r')
plt.legend()
plt.show()

# 分数验证

In[43]:

coe=[-2.56437,
0.556339,
0.250822,
0.268622,
0.661473,
0.352373,
0.886951,
0.728574,
0.541455,
0.574655,
0.581366]
def get_score(coe, woe, factor):
    """Score each WOE value as round(coe * woe * factor, 0).

    coe:    fitted logistic-regression coefficient for this variable.
    woe:    iterable of WOE values.
    factor: PDO scaling factor (p = 50 / ln 2 here).
    Returns a list of rounded scores.
    The original body computed `coewfactor`, an undefined name — clearly a
    mangled `coe * w * factor`.
    """
    return [round(coe * w * factor, 0) for w in woe]
p = 50 / math.log(2) #factor = 20 / np.log(2)
q = 600 + 50 * math.log(0.1) / math.log(2) #
baseScore = round(q - p * coe[0], 0) #b=offset+factor*log(o)
print(p,q,baseScore)
model_col=['月最高支出金额', '单笔最高收入金额', '近3天是否整百交易支出总金额',
'近3天是否整百交易笔数占比', '历史是否整百交易总笔数', '历史租机支出总金额',
'历史还款支出总金额', '夜间近30天累计交易笔数',
'近30天工作日高峰时间段交易笔数占比', '近6个月支出笔数集中度']
coe=coe=[-2.56437,
0.556339,
0.250822,
0.268622,
0.661473,
0.352373,
0.886951,
0.728574,
0.541455,
0.574655,
0.581366]
# Score every model variable (coe[0] is the intercept, so coefficients start
# at index 1) and total into `fenshu`. The original loop body had lost its
# indentation and re-used `i` as the comprehension variable.
for idx in range(1, 11):
    col = model_col[idx - 1]
    X1[col + '_scores'] = get_score(coe[idx], X1[col], p)
scores_col = [c for c in X1.columns if c.endswith('_scores')]
X1['fenshu'] = baseScore - X1[scores_col].sum(axis=1)

In[44]:

X1

In[45]:

pp

In[46]:

for i in range(1,11):
X3[model_col[i-1]+'_scores']=get_score(coe[i], X3[model_col[i-1]], p)
scores_col = [ i for i in X3.columns if i[-7:]=='_scores']
X3['fenshu'] = X3[scores_col].sum(axis=1)
X3['fenshu'] = baseScore-X3['fenshu']
X3

In[49]:

X3[X3['fenshu']<=550]

In[47]:

baseScore

# 各部分评分

In[370]:

dpd1 = pd.DataFrame()
for i in model_col:
dd=X3.drop_duplicates(subset=[i])[[i,i+'_scores']]
dd.columns=['区间','分数']
dd['变量']=i
dpd1 = dpd1.append(dd)
pfk=data_iv[['区间','colName','woe']]
dpd1.rename(columns={'变量':"colName",'区间':"woe"},inplace=True)
PFK=pfk.merge(dpd1,on=['colName','woe'],how="inner")
PFK

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

In[ ]:

# 鼎力租机效果测试

In[250]:

import pandas as pd
df1=pd.read_excel('C:/Users/12463/Desktop/鼎力租机最新分析结果.xlsx',sheet_name='原始数据')
df1

In[278]:

YB=df1.copy()
YB

In[279]:

YB=YB[[
"月最高支出金额",
"单笔最高收入金额",
"近3天是否整百交易支出总金额",
"近3天是否整百交易笔数占比",
"历史是否整百交易总笔数",
"历史租机支出总金额",
"历史还款支出总金额",
"夜间近30天累计交易笔数",
"近30天工作日高峰时间段交易笔数占比",
"近6个月支出笔数集中度",
"dpd30+"]]
YB

In[280]:

import pandas as pd
def convert_percent_to_float(s):
if isinstance(s, str) and s.endswith('%'):
return float(s.rstrip('%'))
return s

应用这个函数到每一列

for col in YB.columns:
YB[col] = YB[col].apply(convert_percent_to_float)

将所有列转换为数值类型,并保留两位小数

YB = YB.apply(pd.to_numeric, errors='coerce').round(2)
YB

In[ ]:

In[ ]:

In[ ]:

In[282]:

# Same WOE substitution as the training notebook, applied to the 鼎力租机
# test sample. The original filter `ys[ys['colName']i]` was missing `==`.
ys = data_iv[['区间', 'colName', 'woe']]
for col in ['月最高支出金额', '单笔最高收入金额', '近3天是否整百交易支出总金额',
            '近3天是否整百交易笔数占比', '历史是否整百交易总笔数', '历史租机支出总金额',
            '历史还款支出总金额', '夜间近30天累计交易笔数',
            '近30天工作日高峰时间段交易笔数占比', '近6个月支出笔数集中度']:
    sel = ys[ys['colName'] == col]
    YB[col] = YB[col].map(dict(zip(sel['区间'], sel['woe'])))

In[283]:

YB

In[286]:

YB=YB[YB['dpd30+']!=2]
YB

In[293]:

X3=sm.add_constant(YB.iloc[:,:-1])
p3=result.predict(X3)
p3=pd.DataFrame(p3)
p3.columns=['分数']
p3['分数']=p3['分数'].apply(lambda x:q-p*math.log(x/(1-x)))
zt=toad.metrics.KS_bucket(p3['分数'],YB['dpd30+'],bucket=20, method = 'quantile')
zt[['min','max','bads','goods','total','bad_rate','good_rate','ks','lift']]

In[ ]:

16、报表开发

!/usr/bin/env python

coding: utf-8

In[1]:

from IPython.display import display, HTML
display(HTML(""))

# 整体运营情况

In[2]:

import numpy as np
import pandas as pd
import pymysql
conn=pymysql.connect(host="114.55.91.201",user="read",passwd="readrisk123456",db='likeshop')
sql="""SELECT o.user_id,
o.create_time,o.enjoy_time,o.pay_time,
order_review,is_examine,order_status
FROM ls_order as o """
order=pd.read_sql_query(sql,conn)
import datetime

def dates(x):
    """Convert a unix-epoch-seconds value to a date.

    NaN passes through unchanged and 0 (used as a "no timestamp" placeholder
    in ls_order) maps to NaN; anything else becomes a datetime.date.
    NOTE(review): fromtimestamp is local-timezone — confirm that is intended.
    """
    if pd.isnull(x):
        return x
    if x == 0:
        return np.nan
    return datetime.datetime.fromtimestamp(x).date()
order["create_time"]=order["create_time"].apply(dates)
order["enjoy_time"]=order["enjoy_time"].apply(dates)
order["pay_time"]=order["pay_time"].apply(dates)
order['create_time']=order['create_time'].astype(str)
order=order[order['create_time']>="2024-05-02"]
import pandas as pd
import numpy as np
import pycard as pc
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
import pymysql
conn=pymysql.connect(host="114.55.91.201",user="read",passwd="readrisk123456",db='likeshop')
sql="""select uid as user_id,name,id_num as idcard from ls_user_id_card where step =4 """
idcard=pd.read_sql_query(sql,conn)
order=order.merge(idcard,on="user_id",how="inner")
order['type']="牛牛商城"

ZT.drop_duplicates(subset="idcard",inplace=True)

order=order[order['idcard'].notna()]
order

In[3]:

order['月份']=order['create_time'].apply(lambda x:x[:7])
order
import datetime as dt

def get_monday_to_sunday(today):
    """Return 'YYYY-MM-DD~YYYY-MM-DD', the Monday-to-Sunday week containing `today`.

    `today` may be a date or an ISO 'YYYY-MM-DD' string. The original
    round-tripped through strftime/strptime twice; plain timedelta
    arithmetic gives the same result (monday.weekday() is always 0).
    """
    day = dt.datetime.strptime(str(today), "%Y-%m-%d")
    monday = day - dt.timedelta(days=day.weekday())
    sunday = monday + dt.timedelta(days=6)
    return monday.strftime("%Y-%m-%d") + "~" + sunday.strftime("%Y-%m-%d")
order['周']=order['create_time'].apply(get_monday_to_sunday)
order

In[4]:

order['create_time'].value_counts()

In[5]:

# Keep a full copy (with name/idcard) for later scoring joins, then strip the
# PII columns from the frame that gets exported.
order1 = order.copy()
# pandas has no DataFrame.drop_col — the call would raise AttributeError.
order.drop(columns=['idcard', 'name'], inplace=True)
order

In[6]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="112.124.40.41",user="score",passwd="jxy7RwPaLBryzFF8",db='score',port=9806)
conn = create_engine('mysql+pymysql://score:jxy7RwPaLBryzFF8@112.124.40.41:9806/score')
order.to_sql("zttgl",conn,if_exists='replace',index=False)

# 评分分布逾期率

In[ ]:

import pandas as pd
import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="112.124.40.41",user="score",passwd="jxy7RwPaLBryzFF8",db='score',port=9806)
sql="""SELECT idcard,近12个月贷款金额在1w以上的笔数,消金贷款类机构最大授信额度,近1个月履约贷款总金额,申请准入分,近3个月履约贷款总金额,近1个月履约贷款次数,
近3个月履约贷款次数,履约笔数,最大逾期金额,
tw_v100_bin from score2 """
score=pd.read_sql_query(sql,conn)
score

In[ ]:

score['tw_v100_bin'].value_counts(dropna=False)

In[ ]:

order1=order1[order1['is_examine']==1]
order1

In[ ]:

order1=order1.merge(score,on="idcard",how="left")
order1

In[ ]:

order1[order1['tw_v100_bin'].isnull()]

In[ ]:

# Reviewed & approved orders that were not cancelled (status 4/5).
# The original `order1['order_review']1` / `['is_examine']1` had lost the
# `==` operator (cf. the correct `is_examine']==1` filter earlier).
order2 = order1[(order1['order_review'] == 1) & (order1['is_examine'] == 1)
                & (order1['order_status'] != 4) & (order1['order_status'] != 5)]
order2

In[ ]:

order[order1['tw_v100_bin'].isnull()]

In[ ]:

import datetime as dt
a=dt.datetime.now().date()
b=str(a)
order2['watchday']=b
for i in ['enjoy_time','pay_time','watchday']:
order2[i]=pd.to_datetime(arg=order2[i])
order2['datediff']=order2.apply(lambda x:(x.watchday-x.enjoy_time).days,axis=1)
order2

In[ ]:

def yq(x):
    """Days overdue for one order row.

    - pay_time missing and the observation date is past the due date
      (enjoy_time): days elapsed so far (still unpaid).
    - pay_time missing but not yet due: sentinel -999.
    - paid after the due date: days between payment and due date.
    - otherwise (paid on time): 0.
    Expects x.pay_time / x.enjoy_time / x.watchday to be datetime-like.
    """
    if pd.isnull(x.pay_time) and x.watchday > x.enjoy_time:
        return (x.watchday - x.enjoy_time).days
    if pd.isnull(x.pay_time):
        return -999
    if x.pay_time > x.enjoy_time:
        return (x.pay_time - x.enjoy_time).days
    return 0
order2['overday']=order2.apply(yq,axis=1)
order2

In[ ]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="localhost",user="root",passwd="select@20",db='score')
conn = create_engine('mysql+pymysql://root:select%4020@localhost:3306/score')
order2.to_sql("ztpf",conn,if_exists='replace',index=False)
order1.to_sql("qbpf",conn,if_exists='replace',index=False)

# 逾期率

In[ ]:

order2['max_overday']=order2.groupby("idcard")["overday"].transform("max")
order2['max_datediff']=order2.groupby("idcard")["datediff"].transform("max")
order2

In[ ]:

order2.drop_duplicates(subset="idcard",inplace=True)
order2

In[ ]:

order2[order2['tw_v100_bin'].isnull()]

In[ ]:

order2[order2['idcard']=="511525199805311938"]

In[ ]:

order2['tw_v100_bin'].value_counts(dropna=False)

In[ ]:

order2['dpd1+']=order2.apply(lambda x:'还没到期_1+'if x.max_datediff<1 else '1+' if
x.max_overday>=1 else '0',axis=1)
order2['dpd3+'] =order2.apply(lambda x:'还没到期_3+'if x.max_datediff<3 else '3+' if
x.max_overday>=3 else '0-2',axis=1)
order2['dpd7+'] =order2.apply(lambda x:'还没到期_7+'if x.max_datediff<7 else '7+' if
x.max_overday>=7 else '0-6',axis=1)
order2['dpd15+'] =order2.apply(lambda x:'还没到期_15+'if x.max_datediff<15 else '15+' if
x.max_overday>=15 else '0-14',axis=1)

In[ ]:

order2

In[ ]:

order2['tw_v100_bin'].value_counts(dropna=False)

In[ ]:

def _dpd_pivot(df, var, dpd_col, bad, good):
    """Unique-idcard counts of `var` buckets vs one dpd column, plus the bad rate.

    bad/good are the dpd column's labels for overdue vs on-time
    (rows still inside the window, e.g. '还没到期_1+', are excluded from the rate).
    """
    t = pd.pivot_table(data=df, index=[var], columns=dpd_col,
                       values='idcard', aggfunc=lambda s: s.nunique()).reset_index()
    t.rename(columns={var: '区间'}, inplace=True)
    t['变量'] = var
    t[dpd_col + '_rate'] = t[bad] / (t[bad] + t[good])
    return t

# DataFrame.append was removed in pandas 2.0; the four original copy-pasted
# loops collapse to one helper + pd.concat per dpd horizon.
variables = ['tw_v100_bin']
dpd1 = pd.concat([_dpd_pivot(order2, v, 'dpd1+', '1+', '0') for v in variables])
dpd3 = pd.concat([_dpd_pivot(order2, v, 'dpd3+', '3+', '0-2') for v in variables])
dpd7 = pd.concat([_dpd_pivot(order2, v, 'dpd7+', '7+', '0-6') for v in variables])
dpd15 = pd.concat([_dpd_pivot(order2, v, 'dpd15+', '15+', '0-14') for v in variables])

In[ ]:

yq=dpd1.merge(dpd3,on=["区间",'变量'],how="left")
yq=yq.merge(dpd7,on=["区间",'变量'],how="left")
yq=yq.merge(dpd15,on=["区间",'变量'],how="left")
yq

In[ ]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="112.124.40.41",user="score",passwd="jxy7RwPaLBryzFF8",db='score',port=9806)
conn = create_engine('mysql+pymysql://score:jxy7RwPaLBryzFF8@112.124.40.41:9806/score')
yq.to_sql("ztyq",conn,if_exists='replace',index=False)

# 分状态

In[ ]:

zd={"A_[0,262)":"1",
"B_[262,316)":"1",
"C_[316,352)":"1",
"D_[352,374)":"2",
"E_[374,390)":"2",
"F_[390,405)":"2",
"G_[405,416)":"2",
"H_[416,428)":"2",
"I_[428,445)":"3",
"J_[445,inf)":"3"}
order2['status']=order2['tw_v100_bin'].map(zd)

In[ ]:

order2['status'].value_counts(dropna=False)

In[ ]:

order2['status'].value_counts()

In[ ]:

order2['status']

In[ ]:

order2['status']=order2['status'].astype('str')
order2['status']=order2['status'].apply(lambda x:"建议拒绝" if x =="1" else "建议人审" if x =="2" else "建议通过")
order2

In[ ]:

dpd1 = pd.DataFrame()
dpd3 = pd.DataFrame()
dpd7 = pd.DataFrame()
dpd15 = pd.DataFrame()
def f(x):
return x.nunique()
for i in ['status']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd1+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd1+_rate'] = t1['1+']/(t1['1+']+t1['0'])
dpd1 = dpd1.append(t1)

In[ ]:

for i in ['status']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd3+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd3+_rate'] = t1['3+']/(t1['3+']+t1['0-2'])
dpd3 = dpd3.append(t1)

In[ ]:

for i in ['status']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd7+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd7+_rate'] = t1['7+']/(t1['7+']+t1['0-6'])
dpd7 = dpd7.append(t1)

In[ ]:

for i in ['status']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd15+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd15+_rate'] = t1['15+']/(t1['15+']+t1['0-14'])
dpd15 = dpd15.append(t1)

In[ ]:

yq=dpd1.merge(dpd3,on=["区间",'变量'],how="left")
yq=yq.merge(dpd7,on=["区间",'变量'],how="left")
yq=yq.merge(dpd15,on=["区间",'变量'],how="left")
yq

In[ ]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="112.124.40.41",user="score",passwd="jxy7RwPaLBryzFF8",db='score',port=9806)
conn = create_engine('mysql+pymysql://score:jxy7RwPaLBryzFF8@112.124.40.41:9806/score')
yq.to_sql("ztyqstatus",conn,if_exists='replace',index=False)

In[ ]:

order2

In[ ]:

order3=order2[['user_id','create_time','月份', '周','近12个月贷款金额在1w以上的笔数', '消金贷款类机构最大授信额度', '近1个月履约贷款总金额', '申请准入分',
'近3个月履约贷款总金额', '近1个月履约贷款次数', '近3个月履约贷款次数', '履约笔数', '最大逾期金额']]

In[ ]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="112.124.40.41",user="score",passwd="jxy7RwPaLBryzFF8",db='score',port=9806)
conn = create_engine('mysql+pymysql://score:jxy7RwPaLBryzFF8@112.124.40.41:9806/score')
order3.to_sql("blfb",conn,if_exists='replace',index=False)

In[ ]:

dpd1 = pd.DataFrame()
dpd3 = pd.DataFrame()
dpd7 = pd.DataFrame()
dpd15 = pd.DataFrame()
def f(x):
return x.nunique()
for i in ['近12个月贷款金额在1w以上的笔数', '消金贷款类机构最大授信额度', '近1个月履约贷款总金额', '申请准入分',
'近3个月履约贷款总金额', '近1个月履约贷款次数', '近3个月履约贷款次数', '履约笔数', '最大逾期金额']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd1+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd1+_rate'] = t1['1+']/(t1['1+']+t1['0'])
dpd1 = dpd1.append(t1)

In[ ]:

for i in ['近12个月贷款金额在1w以上的笔数', '消金贷款类机构最大授信额度', '近1个月履约贷款总金额', '申请准入分',
'近3个月履约贷款总金额', '近1个月履约贷款次数', '近3个月履约贷款次数', '履约笔数', '最大逾期金额']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd3+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd3+_rate'] = t1['3+']/(t1['3+']+t1['0-2'])
dpd3 = dpd3.append(t1)

In[ ]:

for i in ['近12个月贷款金额在1w以上的笔数', '消金贷款类机构最大授信额度', '近1个月履约贷款总金额', '申请准入分',
'近3个月履约贷款总金额', '近1个月履约贷款次数', '近3个月履约贷款次数', '履约笔数', '最大逾期金额']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd7+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd7+_rate'] = t1['7+']/(t1['7+']+t1['0-6'])
dpd7 = dpd7.append(t1)

In[ ]:

for i in ['近12个月贷款金额在1w以上的笔数', '消金贷款类机构最大授信额度', '近1个月履约贷款总金额', '申请准入分',
'近3个月履约贷款总金额', '近1个月履约贷款次数', '近3个月履约贷款次数', '履约笔数', '最大逾期金额']:
t1 = pd.pivot_table(data=order2,index=[i],columns='dpd15+',
values='idcard',aggfunc=f).reset_index()
t1.rename(columns={i:'区间'},inplace=True)
t1['变量']=i
t1['dpd15+_rate'] = t1['15+']/(t1['15+']+t1['0-14'])
dpd15 = dpd15.append(t1)

In[ ]:

yq=dpd1.merge(dpd3,on=["区间",'变量'],how="left")
yq=yq.merge(dpd7,on=["区间",'变量'],how="left")
yq=yq.merge(dpd15,on=["区间",'变量'],how="left")
yq

In[ ]:

import pymysql
from sqlalchemy import create_engine
conn=pymysql.connect(host="112.124.40.41",user="score",passwd="jxy7RwPaLBryzFF8",db='score',port=9806)
conn = create_engine('mysql+pymysql://score:jxy7RwPaLBryzFF8@112.124.40.41:9806/score')
yq.to_sql("blyq",conn,if_exists='replace',index=False)

In[ ]: