2018-4-25个人征信

业务梳理

逻辑回归的数理原理

  • 应用场景

    • 逻辑回归被广泛应用在目标变量是二值变量的场合(0,1)
  • 公式

  • 模型估计

    • 极大似然估计
  • 模型阐释/评估

    • 一个解释变量的阐释图
    • C值/AUC,Lift图

得到每个用户的违约概率(信用评分)

目标变量:用户的违约概率

数据清洗

初始数据整理

关联相关表

使用mysql将导入txt数据并且进行合并。

压缩数据

bank_detail 和 bill_detail 和 用户浏览行为表 不能直接进行关联,目前的想法,将这两张表的信息分别压缩到每个 user_id 上。

bank_detail 每个用户的信息压缩为:工资收入、工资外收入、净资产

use test;
-- -----------------------------------------------------
-- Table `user_info`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `user_info` (
  `user_id` INT NOT NULL,
  `sex` INT NULL,
  `jobs` INT NULL,
  `edu_status` INT NULL,
  `marry_status` INT NULL,
  `residence` INT NULL)
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `bank_detail`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `bank_detail` (
  `user_id` INT NOT NULL,
  `timestemp` VARCHAR(45) NULL,
  `trade_type` INT NULL,
  `trade_amount` decimal(30,8) NULL,
  `in_come` INT NULL)
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `browse_history`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `browse_history` (
  `user_id` INT NOT NULL,
  `timestemp` VARCHAR(45) NULL,
  `browsing_behavior_data` INT NULL,
  `browsing_behavior_number` INT NULL,
  `seg` INT NULL)
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `bill_detail`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `bill_detail` (
  `user_id` INT NOT NULL,
  `bill_timestemp` VARCHAR(45) NULL,
  `bank_id` INT NULL,
  `pre_bill_amount` DECIMAL(30,6) NULL,
  `pre_repayment_amount` DECIMAL(30,6) NULL,
  `credit_limit` DECIMAL(30,6) NULL,
  `curr_bill_balance` DECIMAL(30,6) NULL,
  `curr_min_repayment` DECIMAL(30,6) NULL,
  `num_of_consumption` INT NULL,
  `curr_bill_amount` DECIMAL(30,6) NULL,
  `adj_amount` DECIMAL(30,6) NULL,
  `cycle_interest` DECIMAL(30,6) NULL,
  `available_balance` DECIMAL(30,6) NULL,
  `cash_in_advance` DECIMAL(30,6) NULL,
  `repayment_status` DECIMAL(30,6) NULL)
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `loan_time`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `loan_time` (
  `user_id` INT NOT NULL,
  `loan_time` VARCHAR(45) NULL)
ENGINE = InnoDB;


-- -----------------------------------------------------
-- Table `overdue`
-- -----------------------------------------------------
CREATE TABLE IF NOT EXISTS `overdue` (
  `user_id` INT NOT NULL,
  `overdue_label` INT NULL)
ENGINE = InnoDB;
####将txt文件导入数据库中####
use test

load data infile "C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\bank_detail_train.txt" 
into table bank_detail
  fields terminated by ',';
select count(*) from bank_detail;

load data infile "C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\bill_detail_train.txt" 
into table bill_detail
  fields terminated by ',';
select count(*) from bill_detail;


load data infile "C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\browse_history_train.txt" 
into table browse_history
  fields terminated by ',';
select count(*) from browse_history;

load data infile "C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\loan_time_train.txt" 
into table loan_time
  fields terminated by ',';
select * from loan_time;

load data infile "C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\overdue_train.txt" 
into table overdue
  fields terminated by ',';
select * from overdue;

load data infile "C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\user_info_train.txt" 
into table user_info
  fields terminated by ',';
select * from user_info;

时间戳无法在mysql中最大为2030年,数据中的时间戳超过了这个时间范围。可以在r中将时间戳转换。

as.POSIXct(5894316387, origin = "1960-01-01", tz = "GMT")  
use test;


##各表数据量
select count(1) from bank_detail;
#6070197
select count(1) from bill_detail;
#2338118
#select count(1) from browse_history;
#22919547
select count(1) from loan_time;
#55596
select count(1) from overdue;
#55596
select count(1) from user_info;
#55596

## 压缩bank_detail的信息
select *
from bank_detail t
where user_id = 29165;

信息汇总

我们的目标是将所有的信息汇总到每一个用户,定义关于用户的衍生变量。
我们发现,bank_detail 表中的user_id数据比其它表的user_id少84%;大概只有9k多条。我们认为bank_detail是与最终目标变量相关性很高的数据,所以我们将这部分数据拿出来,单独做一次分析。

数据清洗的语句如下:

use test;


##各表数据量
select count(1) from bank_detail;
#6070197
select count(1) from bill_detail;
#2338118
select count(1) from browse_history;
#22919547
select count(1) from loan_time;
#55596
select count(1) from overdue;
#55596
select count(1) from user_info;
#55596

## 压缩bank_detail的信息
drop table if exists temp_1;
create table temp_1
select t.*,IF(trade_type=1,-1*trade_amount,trade_amount) as is_inout
from bank_detail t;
#where user_id = 29165;

drop table if exists v_bank_detail;
create table v_bank_detail
select user_id,sum(in_come*trade_amount) wage_income,sum((-trade_type+1)*trade_amount)-sum(in_come*trade_amount) exwage_income,
sum(is_inout) property
from temp_1
#where user_id = 6965
group by user_id;

## 压缩browse_history的信息
drop table if exists v_browse_history;
create table v_browse_history
select user_id,count(1) browse_count
from browse_history
group by user_id;

/*
drop table temp_22;
create table temp_22
select user_id,
if(browsing_behavior_number = 1,num,0) num1,
if(browsing_behavior_number = 2,num,0) num2,
if(browsing_behavior_number = 3,num,0) num3,
if(browsing_behavior_number = 4,num,0) num4,
if(browsing_behavior_number = 5,num,0) num5,
if(browsing_behavior_number = 6,num,0) num6,
if(browsing_behavior_number = 7,num,0) num7,
if(browsing_behavior_number = 8,num,0) num8,
if(browsing_behavior_number = 9,num,0) num9,
if(browsing_behavior_number = 10,num,0) num10,
if(browsing_behavior_number = 11,num,0) num11
from temp_2;
drop table browse_history_temp2;
create table browse_history_temp2
select user_id,
count(1) browse_count
from temp_22
group by user_id;
*/

SET GLOBAL innodb_buffer_pool_size=67108864;
#压缩bill_detail表的信息到user_id上。
drop table if exists v_bill_detail;
create table v_bill_detail
select user_id,
count(distinct(bank_id)) count_bank,
sum(pre_bill_amount)-sum(pre_repayment_amount)     pre_not_repay,
sum(credit_limit)     sum_credit_limit,
sum(curr_bill_balance)     sum_curr_bill_balance,
sum(curr_min_repayment)     sum_curr_min_repayment,
sum(num_of_consumption)     sum_num_of_consumption,
sum(curr_bill_amount)     sum_curr_bill_amount,
sum(adj_amount)     sum_adj_amount,
sum(cycle_interest)     sum_cycle_interest,
sum(available_balance)     sum_available_balance,
sum(cash_in_advance)     sum_cash_in_advance,
count(*)-sum(repayment_status) repay_num,
sum(repayment_status) not_repay_num
from bill_detail 
group by user_id;

##建立索引 
create index idx_1 on v_bank_detail (user_id);
create index idx_2 on v_browse_history(user_id);
create index idx_3 on v_bill_detail(user_id);
create index idx_4 on user_info(user_id);
create index idx_5 on loan_time(user_id);
create index idx_6 on overdue(user_id);


# 关联各表,得到一张宽表。
#因为bank_detail表中的数据明显太少,只有总数据的16%。因此我们决定单独拿出来进行分析。

drop table if exists finall_1;
create table finall_1
select 
t1.* ,
t2.loan_time loan_time ,
t3.overdue_label overdue_label ,
t4.browse_count browse_count ,
 t.wage_income wage_income ,
 t.exwage_income exwage_income ,
 t.property property ,
t5.count_bank count_bank ,
t5.pre_not_repay pre_not_repay ,
t5.sum_credit_limit sum_credit_limit ,
t5.sum_curr_bill_balance sum_curr_bill_balance ,
t5.sum_curr_min_repayment sum_curr_min_repayment ,
t5.sum_num_of_consumption sum_num_of_consumption ,
t5.sum_curr_bill_amount sum_curr_bill_amount ,
t5.sum_adj_amount sum_adj_amount ,
t5.sum_cycle_interest sum_cycle_interest ,
t5.sum_available_balance sum_available_balance ,
t5.sum_cash_in_advance sum_cash_in_advance ,
t5.repay_num repay_num ,
t5.not_repay_num not_repay_num
from v_bank_detail t 
inner  join user_info t1 on t.user_id = t1.user_id
inner join loan_time t2 on t.user_id = t2.user_id
inner join overdue t3 on t.user_id = t3.user_id
inner join v_browse_history t4 on t.user_id = t4.user_id
inner join v_bill_detail t5 on t.user_id=t5.user_id;

#第一部分有bank_detail 的6k多条数据在finall_1中。
select count(*) from finall_1;
#导出到csv文件中。

select 
	*
from finall_2
#order by user_id asc

into outfile 'C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\finall_bank.csv' 

FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n';
    

将这部分数据导入R中:

y<-read.csv("C:/Users/CDAer/Desktop/个人征信预测/finall_bank.csv",header=FALSE,sep=",")
colnames(y1)<-c('user_id',
         'count_bank',
         'sum_pre_bill_amount',
         'sum_pre_repayment_amount',
         'sum_credit_limit',
         'sum_curr_bill_balance',
         'sum_curr_min_repayment',
         'sum_num_of_consumption',
         'sum_curr_bill_amount',
         'sum_adj_amount',
         'sum_cycle_interest',
         'sum_available_balance',
         'sum_cash_in_advance',
         'repay_num',
         'not_repay_num',
         'num1',
         'num2',
         'num3',
         'num4',
         'num5',
         'num6',
         'num7',
         'num8',
         'num9',
         'num10',
         'num11',
         'wage_income',
         'exwage_income',
         'property',
         'edu_status',
         'jobs',
         'marry_status',
         'residence',
         'sex',
         'loan_time',
         'overdue_label')
#导入的数据多了偶数行,重新取应用数据放入y1 中。
p<-seq(from=1,to=nrow(y),by=2)
p
y1<-y[p,]

缺失信息处理

上限/下限

筛选变量

建立模型

###1.数据处理####
rm(list = ls())
y<-read.csv("C:/Users/Guang/Desktop/work/finall_bank.csv",header=FALSE,sep=",",stringsAsFactors = F)
nrow(y)
table(y[!is.na(y)])
View(y[1])
class(y)

y$V9<-as.integer(y$V9)
y$V10<-as.numeric(y$V10)
y$V11<-as.numeric(y$V11)
y$V12<-as.numeric(y$V12)
y$V13<-as.numeric(y$V13)


y$V27<-as.integer(y$V27)
y$V28<-as.integer(y$V28)
y$V29<-as.integer(y$V29)
y$V30<-as.integer(y$V30)
y$V31<-as.integer(y$V31)
y$V32<-as.integer(y$V32)
y$V33<-as.integer(y$V33)
y$V34<-as.integer(y$V34)
y$V35<-as.integer(y$V35)
y$V36<-as.integer(y$V36)
y$V37<-as.integer(y$V37)
#将0为缺失值的化为缺失值
y<-y[1:53174,]
y$sex<-ifelse(y$sex==0,NA,y$sex)
y$loan_time<-ifelse(y$loan_time==0,NA,y$loan_time)

#变量名

colnames(y)<-c("user_id",
               "sex",
               "jobs",
               "edu_status",
               "marry_status",
               "residence",
               "loan_time",
               "overdue_label",
               "wage_income",
               "exwage_income",
               "property",
               "income",
               "outcome",
               "count_bank",
               "pre_not_repay",
               "sum_credit_limit",
               "sum_curr_bill_balance",
               "sum_curr_min_repayment",
               "sum_num_of_consumption",
               "sum_curr_bill_amount",
               "sum_adj_amount",
               "sum_cycle_interest",
               "sum_available_balance",
               "sum_cash_in_advance",
               "repay_num",
               "not_repay_num",
               "browse_num1",
               "browse_num2",
               "browse_num3",
               "browse_num4",
               "browse_num5",
               "browse_num6",
               "browse_num7",
               "browse_num8",
               "browse_num9",
               "browse_num10",
               "browse_num11"
)

#处理时间戳
y$loan_time<-as.POSIXct(y$loan_time, origin = "1960-01-01", tz = "GMT") 
#查看时间,按照月份做违约数量图
#library(plyr)
#library(lubridate)
#y$month<-month(y$loan_time)
#day_plot<-ddply(y,.(month),summarise,yyy=sum(overdue_label),yyyn=length(user_id))
#day_plot
#plot(day_plot$month,day_plot$yyy/day_plot$yyyn,type="b")
###填充sex和browse_count

b<- which(y$sex==0)
set.seed(2)
a<-sample(length(which(y$sex==0)),round(length(y[y$sex==0,"sex"])*0.8))
a2<-(1:length(which(y$sex==0)))[!(1:length(which(y$sex==0)) %in% a)]

b[a]
y[b[a],"sex"]<-1

b[a2]
y[b[a2],"sex"]<-2
y[is.na(y$sex),"sex"]<-1


#y[is.na(y$browse_count),"browse_count"]<-294

##填充browse——num的缺失值
browse<-c(
  "browse_num1",
  "browse_num2",
  "browse_num3",
  "browse_num4",
  "browse_num5",
  "browse_num6",
  "browse_num7",
  "browse_num8",
  "browse_num9",
  "browse_num10",
  "browse_num11"
)
for(i in browse){
  y[is.na(y[,i]),i]<-median(y[,i],na.rm = T)
}

train<-y
###### 2 分割训练集和验证集 ####
table(train$overdue_label)
table(test$overdue_label)

set.seed(1)
a<-sample(nrow(y),round(nrow(y)*0.7))
y[!((1:nrow(y)) %in% a),]
train<-y[a,]
test<-y[!((1:nrow(y)) %in% a),]####


##### 3. profiles图 #############################################################################################
# overall performance
overall_cnt=nrow(train)   #calculate the total count
overall_resp=sum(train$overdue_label,na.rm = T)  #calculate the total responders count
overall_resp_rate=overall_resp/overall_cnt  #calculate the response rate
overall_perf<-c(overall_count=overall_cnt,overall_responders=overall_resp,overall_response_rate=overall_resp_rate) #combine
View(t(overall_perf))  #take a look at the summary

library(plyr)              #call plyr
#分类型变量
"sex"
"jobs" 
"edu_status" 
[5] "marry_status"          
[6] "residence"   
"month"
prof<-ddply(train,.(sex),summarise,cnt=length(user_id),res=sum(overdue_label)) #group by sex
#View(prof)  #check the result
prof
prof1<-within(prof,
              {var1="sex"
              sex
              percent<-cnt/overall_cnt
              res_rate<-res/cnt 
              index<-res_rate/overall_resp_rate*100
              })  #add response_rate,index, percentage
View(prof1)


##连续型
[8] "overdue_label"         
[9] "browse_count"          
[10] "wage_income"           
[11] "exwage_income"         
[12] "property"              
[13] "count_bank"            
[14] "pre_not_repay"         
[15] "sum_credit_limit"      
[16] "sum_curr_bill_balance" 
[17] "sum_curr_min_repayment"
[18] "sum_num_of_consumption"
[19] "sum_curr_bill_amount"  
[20] "sum_adj_amount"        
[21] "sum_cycle_interest"    
[22] "sum_available_balance" 
[23] "sum_cash_in_advance"   
[24] "repay_num"             
[25] "not_repay_num"

table(train$browse_count)
nrow(train)
#separate to 2 parts: missing,nomissing

table(is.na(train$overdue_label))
nomissing<-data.frame(var_data[!is.na(train$browse_count),])  #select the no missing value records 
missing<-data.frame(var_data[is.na(train$browse_count),])  


##numeric Profiling:missing part 
missing2<-ddply(missing,.(browse_count),summarise,cnt=length(overdue_label),res=sum(overdue_label)) #group by em_months_last_open
#View(missing2)  
missing_perf<-within(missing2,{res_rate<-res/cnt 
index<-res_rate/overall_resp_rate*100
percent<-cnt/overall_cnt
var_category<-c('unknown')
})   #summary
View(missing_perf)

##numeric Profiling:Non-missing part
nomissing_value<-train[!is.na(train$browse_count),"browse_count"]
nomissing$var_category<-cut(nomissing_value,unique(quantile(nomissing_value,(0:10)/10)),include.lowest = F)#separte into 10 groups
View(table(nomissing$var_category))  #take a look at the 10 category
prof2<-ddply(nomissing,.(var_category),summarise,cnt=length(overdue_label),res=sum(overdue_label)) #group by the 10 groups
#View(prof2)
nonmissing_perf<-within(prof2,
                        {res_rate<-res/cnt 
                        index<-res_rate/overall_resp_rate*100
                        percent<-cnt/overall_cnt
                        })  #add resp_rate,index,percent
View(nonmissing_perf)

all_perf<-rbind(nonmissing_perf,missing_perf[,-1]) #set 2 data together
View(all_perf)

############################################################  4: Means  #############################################################
train<-y
c_name <- colnames(train)
for(i in c_name){
  mean_var1<-train[,i]
  mean1<-c(
    var=i,
    mean=mean(mean_var1,na.rm=T),
    median=median(mean_var1,na.rm=T),
    quantile(mean_var1,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=T),
    nmiss=sum(is.na(mean_var1))
  )
  all_mean<-rbind(all_mean,t(mean1)) #set 2 data together
}
#需要给all_mean 一个初始值 然后再跑一次循环。
all_mean<-mean1  

for(i in c_name){
  mean_var1<-train[,i]
  mean1<-c(
    var=i,
    mean=mean(mean_var1,na.rm=T),
    median=median(mean_var1,na.rm=T),
    quantile(mean_var1,c(0,0.01,0.1,0.25,0.5,0.75,0.9,0.99,1),na.rm=T),
    nmiss=sum(is.na(mean_var1))
  )
  all_mean<-rbind(all_mean,t(mean1)) #set 2 data together
}
View(all_mean)


### 5消除极值点######
#消除极值点
a<-c(#"browse_count",
  "property",
  "pre_not_repay",
  "sum_credit_limit",
  "sum_curr_bill_balance",
  "sum_curr_min_repayment",
  "sum_num_of_consumption",
  "sum_curr_bill_amount",
  "sum_adj_amount",
  "sum_cycle_interest",
  "sum_available_balance",
  "sum_cash_in_advance",
  "repay_num",
  "browse_num1",
  "browse_num2",
  "browse_num3",
  "browse_num4",
  "browse_num5",
  "browse_num6",
  "browse_num7",
  "browse_num8",
  "browse_num9",
  "browse_num10",
  "browse_num11"
)
scale(train[,a])
summary(train[,a])
quantile(train[,"repay_num"],0.01)
quantile(train$sum_credit_limit,0.99)

boxplot(scale(train[,a]))
#### 将异常值处理 
for(i in a){
  train[,i] <-
    ifelse(train[,i] <= quantile(train[,i],0.01,na.rm = T), quantile(train[,i],0.01,na.rm = T),
           ifelse(train[,i] >=quantile(train[,i],0.99,na.rm = T), quantile(train[,i],0.99,na.rm = T),
                  train[,i]))
}

######### 6哑变量转换 #########   
train_1<-train

train_1$m2_sex<-ifelse(train_1$sex %in% c(2),0,1)
summary(train_1$m2_sex)
summary(train_1$sex)
table(train_1$m2_sex)
#jobs
train_1$m2_jobs<-ifelse(train_1$jobs %in% c(0,1,4),1,0)

#edu_status
train_1$m2_edu_status<-ifelse(train_1$edu_status %in% c(0,4),1,0)

#marry_status
train_1$m2_marry_status<-ifelse(train_1$marry_status %in% c(0,4,5),1,0)

#residence
train_1$m2_residence<-ifelse(train_1$residence %in% c(0,2,4),1,0)
############7 logistic模型############
library(ape)
library(vegan)
library(permute)
library(lattice)
library(nlme)
library(picante)  #call picante
# scale
#标准化-
train$scale_browse_count<-scale(train$browse_count)
train$scale_count_bank<-scale(train$count_bank)
train$scale_pre_not_repay<-scale(train$pre_not_repay)
train$scale_sum_credit_limit<-scale(train$sum_credit_limit)
train$scale_sum_curr_bill_balance<-scale(train$sum_curr_bill_balance)
train$scale_sum_curr_min_repayment<-scale(train$sum_curr_min_repayment)
train$scale_sum_num_of_consumption<-scale(train$sum_num_of_consumption)
train$scale_sum_curr_bill_amount<-scale(train$sum_curr_bill_amount)
train$scale_sum_adj_amount<-scale(train$sum_adj_amount)
train$scale_sum_cycle_interest<-scale(train$sum_cycle_interest)
train$scale_sum_available_balance<-scale(train$sum_available_balance)
train$scale_sum_cash_in_advance<-scale(train$sum_cash_in_advance)
train$scale_repay_num<-scale(train$repay_num)
train$scale_not_repay_num<- scale(train$not_repay_num)

var_list1<-c(
  "m2_sex",
  "m2_edu_status",
  "m2_marry_status",
  "wage_income",
  "exwage_income",
  "income",
  "pre_not_repay",
  "sum_curr_bill_balance",
  "sum_num_of_consumption",
  "sum_available_balance",
  "sum_cash_in_advance",
  "repay_num",
  "not_repay_num"
)

var_list1<-c(
  #"user_id",
  "sex",
  "jobs",
  "edu_status",
  "marry_status",
  "residence",
  #"loan_time",
  #"overdue_label",
  "wage_income",
  "exwage_income",
  "property",
  "income",
  "outcome",
  "count_bank",
  "pre_not_repay",
  "sum_credit_limit",
  "sum_curr_bill_balance",
  "sum_curr_min_repayment",
  "sum_num_of_consumption",
  "sum_curr_bill_amount",
  "sum_adj_amount",
  "sum_cycle_interest",
  "sum_available_balance",
  "sum_cash_in_advance",
  "repay_num",
  "not_repay_num",
  "browse_num1",
  "browse_num2",
  "browse_num3",
  "browse_num4",
  "browse_num5",
  "browse_num6",
  "browse_num7",
  "browse_num8",
  "browse_num9",
  "browse_num10",
  "browse_num11"
)

#View(train[,var_list1])



mods<-train[,c('overdue_label',var_list1)]  #select Y and varibales you want to try
str(mods)

(model_glm<-glm(overdue_label~.,data=mods,family =binomial(link ="logit")))  #logistic model
#########Stepwise先建模然后跑递归


library(MASS)
model_sel<-stepAIC(model_glm,direction ="both")  #using both backward and forward stepwise selection
summary<-summary(model_sel)  #summary

model_summary<-data.frame(var=rownames(summary$coefficients),summary$coefficients) #do the model summary
View(model_summary)


#### 8 模型评估 ####
#预测
train$predict_1<-predict(model_glm,newdata=train,type="response")
summary(train$predict_1)
#train$predict_1<-ifelse(train$predict_1>0.95,1,0)
library(ROCR)
library(gplots)
roc<-prediction(train$predict_1,train$overdue_label)
cc<-performance(roc,"tpr","fpr")
plot(cc)
abline(a=0,b=1)
performance(roc,"auc")
#k-s曲线,越接近1越好,小于0.2不可接受
myks<-function(y,predict_y){
  pred <- prediction(predictions=predict_y,labels=y)
  perf <- performance(pred,"tpr","fpr")
  tmp<-max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
  return(tmp)
}
myks(train$overdue_label,train$predict_1)


#预测test

test$m2_sex<-ifelse(test$sex %in% c(2),0,1)
summary(test$m2_sex)
summary(test$sex)
table(test$m2_sex)
#jobs
test$m2_jobs<-ifelse(test$jobs %in% c(0,1,4),1,0)

#edu_status
test$m2_edu_status<-ifelse(test$edu_status %in% c(0,4),1,0)

#marry_status
test$m2_marry_status<-ifelse(test$marry_status %in% c(0,4,5),1,0)

#residence
test$m2_residence<-ifelse(test$residence %in% c(0,2,4),1,0)
test$predict_1<-predict(model_glm,newdata=test,type="response")
summary(test$predict_1)
#test$predict_1<-ifelse(test$predict_1>0.95,1,0)
library(ROCR)
library(gplots)
roc<-prediction(test$predict_1,test$overdue_label)
cc<-performance(roc,"tpr","fpr")
plot(cc)
abline(a=0,b=1)
performance(roc,"auc")
#k-s曲线,越接近1越好,小于0.2不可接受
myks<-function(y,predict_y){
  pred <- prediction(predictions=predict_y,labels=y)
  perf <- performance(pred,"tpr","fpr")
  tmp<-max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
  return(tmp)
}
myks(test$overdue_label,test$predict_1)




#### bank_detail取出单独做   因子化#####
train_1<-train
train_1<-train[!(is.na(train$wage_income)),]
write.csv(train_1,"C:/Users/CDAer/Desktop/train_1.csv")

hist(train$income)

nrow(train_1)
colnames(train_1)


train_1$m1_sex<-as.factor(train_1$sex)
train_1$m1_jobs<-as.factor(train_1$jobs)
train_1$m1_edu_status<-as.factor(train_1$edu_status)
train_1$m1_marry_status<-as.factor(train_1$marry_status)
train_1$m1_residence<-as.factor(train_1$residence)


var_list1<-c(
  #"user_id",
  "m1_sex",
  "m1_jobs",
  "m1_edu_status",
  "m1_marry_status",
  "m1_residence",
  #"loan_time",
  #"overdue_label",
  "pre_not_repay",
  "sum_credit_limit",
  "sum_curr_bill_balance",
  "sum_curr_min_repayment",
  "sum_num_of_consumption",
  "sum_curr_bill_amount",
  "sum_adj_amount",
  "sum_cycle_interest",
  "sum_available_balance",
  "sum_cash_in_advance",
  "repay_num",
  "not_repay_num",
  "browse_num1",
  "browse_num2",
  "browse_num3",
  "browse_num4",
  "browse_num5",
  "browse_num6",
  "browse_num7",
  "browse_num8",
  "browse_num9",
  "browse_num10",
  "browse_num11"
)

View(train_1[,var_list1])



mods<-train_1[,c('overdue_label',var_list1)]  #select Y and varibales you want to try
str(mods)

(model_glm<-glm(overdue_label~.,data=mods,family =binomial(link ="logit")))  #logistic model
#########Stepwise先建模然后跑递归


library(MASS)
model_sel<-stepAIC(model_glm,direction ="both")  #using both backward and forward stepwise selection
summary<-summary(model_sel)  #summary

model_summary<-data.frame(var=rownames(summary$coefficients),summary$coefficients) #do the model summary
View(model_summary)

#### 8 模型评估 ####

var_list<-c(
  "m2_sex",
  "m2_jobs",
  "m2_edu_status",
  "m2_marry_status",
  "exwage_income",
  "income",
  "pre_not_repay",
  "sum_credit_limit",
  "sum_curr_bill_balance",
  "sum_num_of_consumption",
  "sum_cycle_interest",
  "sum_available_balance",
  "sum_cash_in_advance",
  "browse_num3",
  "browse_num6",
  "browse_num11"
)

mods<-train_1[,c('overdue_label',var_list)]  #select Y and varibales you want to try
str(mods)

(model_glm<-glm(overdue_label~.,data=mods,family =binomial(link ="logit")))  #logistic model
#预测
train_1$predict_1<-predict(model_glm,newdata=train_1,type="response")
summary(train_1$predict_1)
#train$predict_1<-ifelse(train$predict_1>0.95,1,0)
library(ROCR)
library(gplots)
roc<-prediction(train_1$predict_1,train_1$overdue_label)
cc<-performance(roc,"tpr","fpr")
plot(cc)
abline(a=0,b=1)
performance(roc,"auc")
#k-s曲线,越接近1越好,小于0.2不可接受
myks<-function(y,predict_y){
  pred <- prediction(predictions=predict_y,labels=y)
  perf <- performance(pred,"tpr","fpr")
  tmp<-max(attr(perf,"y.values")[[1]]-attr(perf,"x.values")[[1]])
  return(tmp)
}
myks(train_1$overdue_label,train_1$predict_1)


模型评估

posted @ 2018-05-25 16:00  数据分析之路  阅读(255)  评论(0编辑  收藏  举报