# 机器学习与R语言：C5.0

#----------------------------------------
# Purpose: demonstrate the C5.0 modelling workflow
# Data set: credit data (University of Hamburg credit model)
#
#----------------------------------------
# Step 1: collect the data
# Read the CSV file; character columns are converted to factors.
credit <- read.csv(
  file = "/Users/chenyangang/R语言/data/credit.csv",
  stringsAsFactors = TRUE
)

# Frequency tables for the two account-balance features
table(credit[["checking_balance"]])
table(credit[["savings_balance"]])

# Summary statistics for the two numeric loan features
summary(credit[["months_loan_duration"]])
summary(credit[["amount"]])

# Frequency table for the outcome variable
table(credit[["default"]])

# Shuffle the rows before splitting so that the training and test sets are
# random samples. set.seed() makes the shuffle reproducible.
set.seed(12345)

# Take the row count from the data instead of hard-coding 1000: a permutation
# of the wrong length would either drop rows or index past the end (NA rows).
n_rows <- nrow(credit)
credit_rand <- credit[order(runif(n_rows)), ]

# Compare the original and shuffled data: the summaries must agree, while the
# first few values should differ because the row order changed.
summary(credit$amount)
summary(credit_rand$amount)
head(credit$amount)
head(credit_rand$amount)

# Split the shuffled data: first 90% of the rows for training, the remaining
# rows for testing (900 / 100 when the data set has 1000 rows).
# head()/tail() stay correct for any row count, including an empty test set.
n_train <- floor(0.9 * n_rows)
credit_train <- head(credit_rand, n_train)
credit_test <- tail(credit_rand, n_rows - n_train)

# Class proportions of the outcome in each partition; they should be similar.
prop.table(table(credit_train$default))
prop.table(table(credit_test$default))

## Step 3: train the model

library(C50)
#---------------------------------------------
# Building the classifier:
# m <- C5.0(train, class, trials = 1, costs = NULL)
# train: a data frame containing the training data
# class: a factor vector holding the class of each row of the training data
# trials: optional number of boosting iterations (default 1)
# costs: optional matrix giving the cost associated with each type of error
# The function returns a C5.0 model object that can be used for prediction.
#
# Making predictions:
# p <- predict(m, test, type = "class")
# m: a model trained by C5.0(train, class, trials = 1, costs = NULL)
# test: a data frame of test data with the same features as the training
#       data used to build the classifier
# type: either "class" or "prob", selecting whether the prediction is the
#       most likely class or the raw predicted probabilities
# The function returns a vector of predicted classes or raw predicted
# probabilities, depending on the value of type.
#
# example:
# credit_model <- C5.0(credit_train, loan_default)
# credit_prediction <- predict(credit_model, credit_test)
#----------------------------------------------
# Build the decision tree model.
# The outcome column is excluded by name rather than by position, so the
# target cannot leak into the predictors if the column order ever changes.
credit_predictors <- credit_train[setdiff(names(credit_train), "default")]
credit_model <- C5.0(credit_predictors, credit_train$default)

# Print basic facts about the fitted tree
credit_model

# Print the full tree and its training-set evaluation
summary(credit_model)

## Step 4: evaluate model performance
# Predict a class for every row of the held-out test data
credit_pred <- predict(credit_model, newdata = credit_test)

# Cross-tabulate actual against predicted classes; the row, column and
# chi-square proportions are switched off.
library(gmodels)
CrossTable(
  x = credit_test$default,
  y = credit_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

## Step 5: improve model performance

## Boosting the accuracy of decision trees
# Select the predictors by excluding the outcome column by name; a positional
# index would silently pick the wrong column if the column order changed.
boost_predictors <- credit_train[setdiff(names(credit_train), "default")]

# boosted decision tree with 10 trials
credit_boost10 <- C5.0(boost_predictors, credit_train$default, trials = 10)
credit_boost10
summary(credit_boost10)

# Evaluate the 10-trial model on the test data
credit_boost_pred10 <- predict(credit_boost10, credit_test)
CrossTable(
  credit_test$default, credit_boost_pred10,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)

# boosted decision tree with 100 trials (not shown in text)
credit_boost100 <- C5.0(boost_predictors, credit_train$default, trials = 100)
credit_boost_pred100 <- predict(credit_boost100, credit_test)
CrossTable(
  credit_test$default, credit_boost_pred100,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)

## Making some mistakes more costly than others
# create a cost matrix
# C5.0 reads the cost matrix with rows as predicted classes and columns as
# actual classes. Labelling both dimensions with the outcome's factor levels
# makes that orientation explicit and avoids the warning C5.0 issues when a
# cost matrix has no dimnames.
# With this layout, predicting the first level when the actual class is the
# second level costs 4; the opposite error costs 1.
# NOTE(review): presumably the levels are c("no", "yes"), making a missed
# default the expensive error -- confirm against the data.
default_levels <- levels(credit_train$default)
error_cost <- matrix(
  c(0, 1, 4, 0),
  nrow = 2,
  dimnames = list(predicted = default_levels, actual = default_levels)
)
error_cost

# apply the cost matrix to the tree
# The outcome column is excluded by name rather than by position.
cost_predictors <- credit_train[setdiff(names(credit_train), "default")]
credit_cost <- C5.0(cost_predictors, credit_train$default, costs = error_cost)
credit_cost_pred <- predict(credit_cost, credit_test)

CrossTable(
  credit_test$default, credit_cost_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)

#### Part 2: Rule Learners -------------------

## Example: Identifying Poisonous Mushrooms ----
## Step 2: Exploring and preparing the data ----
# Load the mushroom data; character columns are converted to factors.
mushrooms <- read.csv(file = "mushrooms.csv", stringsAsFactors = TRUE)

# Show each column's type together with a preview of its values
str(mushrooms)

# Remove the veil_type column from the data frame
mushrooms[["veil_type"]] <- NULL

# Count the observations in each class of the outcome
table(mushrooms[["type"]])

## Step 3: Training a model on the data ----
library(RWeka)

# Fit a OneR classifier predicting type from all other columns
mushroom_1R <- OneR(
  type ~ .,
  data = mushrooms
)

## Step 4: Evaluating model performance ----
# Print the fitted model, then its evaluation summary
mushroom_1R
summary(mushroom_1R)

## Step 5: Improving model performance ----
# Fit a JRip rule learner on the same formula and data
mushroom_JRip <- JRip(
  type ~ .,
  data = mushrooms
)
mushroom_JRip
summary(mushroom_JRip)

# Rule Learner Using C5.0 Decision Trees (not in text)
# rules = TRUE requests a rule-based model instead of a tree; only odor and
# gill_size are used as predictors here.
library(C50)
mushroom_c5rules <- C5.0(
  type ~ odor + gill_size,
  data = mushrooms,
  rules = TRUE
)
summary(mushroom_c5rules)
# posted @ 2016-04-01 15:54 开心玩数据 阅读(4850) 评论(0) 编辑 收藏 举报
# 会员力量，点亮园子希望
# 刷新页面 返回顶部
# tychyg's Blog
#
# 开心玩数据,专注于大数据、BI
#
# 机器学习与R语言：C5.0
#
# 公告