Accumulated code notes from a "character/credit prediction" competition.
Part 1: training with xgboost — code below.
# Set the working directory and load the packages used by the xgboost section.
setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
library(xgboost)
library(magrittr)
library(Matrix)
library(dplyr)
# step 1: loading data
# train_x.csv / test_x.csv hold the features, train_y.csv the labels, and
# features_type.csv maps each feature name to its type (only 'category' is
# checked later; everything else is treated as numeric).
train=read.csv('train_x.csv')
test=read.csv('test_x.csv')
train.y=read.csv('train_y.csv')
ft=read.csv('features_type.csv')
# step 2: build row indices for the train and test portions so the combined
# data can be split back apart after encoding.
train.index <- seq(1,nrow(train),1)
test.index <- seq(nrow(train)+1, nrow(train)+nrow(test), 1)
# Combine train and test before encoding so factor levels (and therefore the
# dummy columns) are identical for both sets.
traintest.combine <- rbind(train,test)%>%cbind(index=c(train.index,test.index),.)
# All feature names from the type table.
fea <- unique(ft[,1])
# Convert each feature typed 'category' to a factor.
for(f in fea){
if(ft[which(ft$feature==f),2]=='category')
traintest.combine[,f] <- as.factor(traintest.combine[,f])
}
# To check the converted types against ft, uncomment:
# str(traintest.combine, list.len=ncol(traintest.combine))
# step 3: one-hot encode the factors. model.matrix(~x-1) emits one dummy
# column per level ('-1' drops the intercept); numeric columns pass through.
df <- traintest.combine
res <- do.call('cbind',
lapply(names(df), function(x) model.matrix(as.formula(paste0(' ~',x,'-1')), df[x])))
# Drop dummy columns for the '-1' level (presumably the missing-value code in
# this data set — TODO confirm against the competition's data dictionary).
X <- colnames(res)
ol <- grep(glob2rx("*-1"), X)
dat <- Matrix(res[,-ol],sparse=T)
# step 4: modeling
# c(-1,-2) drops the first two columns of `dat` — presumably the `index`
# column added above and the uid column; confirm via head(colnames(dat)).
dtrain=xgb.DMatrix(data=dat[train.index,c(-1,-2)],label=train.y$y)
dtest=xgb.DMatrix(data=dat[test.index,c(-1,-2)])
set.seed(1)
# Binary classifier evaluated by AUC. scale_pos_weight=1542/13458 suggests a
# 1542-vs-13458 class split in the labels — TODO confirm with table(train.y$y).
# Small eta (0.01) paired with a large nrounds (3820); heavy L2 (lambda=700)
# and row/column subsampling regularize the deep (max_depth=8) trees.
model100=xgboost( booster='gbtree',
objective='binary:logistic',
scale_pos_weight=1542/13458,
gamma=0,
lambda=700,
subsample=0.7,
colsample_bytree=0.30,
min_child_weight=5,
max_depth=8,
eta=0.01,
data=dtrain,
nrounds=3820,
eval_metric='auc',
nthread=4)
# Predicted probabilities for the test rows, written as the submission file.
pred=predict(model100,dtest)
write.csv(data.frame('uid'=test['uid'],'score'=pred),file='submit100.csv',row.names=F)
head(data.frame('uid'=test[,1],'score'=pred))
Part 2: training with a random forest — code below.
# how to calculate AUC in R?
# http://stackoverflow.com/questions/4903092/calculate-auc-in-r
# Install ROCR on the first run only, then load everything this section uses.
if(!'ROCR' %in% installed.packages()[,1]) (install.packages('ROCR'))
library(ROCR)
library(randomForest)
library(e1071)
library(gbm)
library(xgboost)
library(data.table)
library(magrittr)
library(stringr)
library(foreach)
# randomForest
# step 1: load data into R and convert data type by batch
setwd('/Users/litao/R/eXtreme Gradient Boosting/eXtreme Gradient Boosting/比赛来一发数据集')
list.files()
features_type <- read.csv('features_type.csv')
# fread is much faster on these wide CSVs; convert back to data.frame so
# columns can be replaced by position below.
train_x <- fread('train_x.csv',header = TRUE)%>%as.data.frame()
train_y <- fread('train_y.csv',header = TRUE)%>%as.data.frame()
# randomForest treats a factor response as classification.
train_y$y <- as.factor(train_y$y)
test_x <- fread('test_x.csv',header = TRUE)%>%as.data.frame()
# Convert every feature typed 'category' to a factor in both train_x and
# test_x, and align the test factors to the *training* levels so both sets
# produce compatible design matrices.
# Row i of features_type describes column i+1 of train_x/test_x (column 1 is
# presumably the uid — confirm against names(train_x)).
for (i in seq_len(nrow(features_type))) {
  if (features_type[i, 2] == 'category') {
    train_x[, i + 1] <- as.factor(train_x[, i + 1])
    # BUG FIX: the original did `levels(test_x[,i+1]) <- levels(train_x[,i+1])`,
    # which re-labels the test levels POSITIONALLY and silently corrupts the
    # data whenever the two level sets differ in content or order.
    # factor(x, levels = ...) maps by value instead; levels seen only in the
    # test set become NA rather than being mislabeled.
    test_x[, i + 1] <- factor(test_x[, i + 1], levels = levels(train_x[, i + 1]))
  }
}
# step 2: is there any missing value in train_x?

#' Fraction of missing (NA) values in each column of a data frame.
#'
#' @param df A data frame (or anything coercible to one).
#' @return A named numeric vector with one entry per column, each in [0, 1].
missingvalue.ratio <- function(df) {
  df <- as.data.frame(df)
  # colMeans over the NA mask equals colSums(is.na(df)) / nrow(df), but is
  # clearer and avoids the fragile row count length(df[,1]), which errors on
  # a zero-column data frame. Also drops the magrittr dependency.
  colMeans(is.na(df))
}
missingvalue.ratio(train_x)
## Stratified sampling with replacement: down-sample the majority class and
## up-sample the minority class inside each tree's bootstrap.
dat <- cbind(y=train_y[,2],train_x[,-1])
set.seed(12)
#---- forest of 5000 trees
# sampsize=c(1542,5000) draws per stratum in levels(dat$y) order — presumably
# all 1542 of one class and 5000 of the other; confirm with table(dat$y).
# do.trace=1 prints progress after every tree.
train.rf.1000 <- randomForest(y~.,data=dat
,mtry=34
,ntree=5000
,sampsize=c(1542,5000)
,strata=dat$y
,do.trace=1
,nodesize=2
)
# calculate AUC for a fitted randomForest classifier
library(ROCR)

#' Plot the ROC curve of a randomForest classifier and annotate it with AUC.
#'
#' @param rf_output A fitted randomForest classification object; its
#'   out-of-bag class votes (`$votes`) are used as scores.
#' @param target The true class labels, in the same row order used to fit.
#' @return The AUC value, invisibly. Also draws the ROC plot as a side effect.
calculate.auc <- function(rf_output, target) {
  # BUG FIX: the original body ignored BOTH arguments and read the globals
  # `train.rf` (which is never defined — the model is `train.rf.1000`) and
  # `dat$y`, so any call either failed or scored the wrong object.
  predictions <- as.vector(rf_output$votes[, 2])  # OOB vote share, 2nd class
  pred <- prediction(predictions, target)
  perf_AUC <- performance(pred, "auc")            # calculate the AUC value
  AUC <- perf_AUC@y.values[[1]]
  perf_ROC <- performance(pred, "tpr", "fpr")     # points of the ROC curve
  plot(perf_ROC, main = "ROC plot")
  text(0.5, 0.5, paste("AUC = ", format(AUC, digits = 5, scientific = FALSE)))
  # example usage: calculate.auc(train.rf.1000, dat$y)
  invisible(AUC)
}

浙公网安备 33010602011771号