R语言学习笔记（三）

第四，五章学习总结，这两章主要讲解了R语言的系统函数，例如怎么把缺失值剔除，数据类型的识别和转换，数据集的合并，怎么使用SQL库，函数以及R程序的顺序，选择和循环的实现方式。


缺失值
> manager <- c(1,2,3,4,5)
> date<-c("10/24/08","10/28/08","10/1/08","10/12/08","5/1/09")
> country<-c("US","US","UK","UK","UK")
> gender<-c("M","F","F","M","F")
> age<-c(32,45,25,39,99)
> q1<-(5,3,3,3,2)
> q2<-c(4,5,5,3,2)
> q3<-c(,2,5,4,1)
> q3<-c(5,2,5,4,1)
> q4<-c(5,5,5,NA,2)
> q5<-c(5,5,2,NA,1)
leadership<-data.frame(manager,date,country,gender,age,q1,q2,q3,q4,q5,stringsAsFactors=FALSE)
> leadership
manager date country gender age q1 q2 q3 q4 q5
1 1 10/24/08 US M 32 5 4 5 5 5
2 2 10/28/08 US F 45 3 5 2 5 5
3 3 10/1/08 UK F 25 3 5 5 5 2
4 4 10/12/08 UK M 39 3 3 4 NA NA
5 5 5/1/09 UK F 99 2 2 1 2 1


运算符号
> #+，-，*，/
> #^ ** 求幂
> # %% 求余
> # %/% 求除数


逻辑运算符
> # <, <=,>, >=, ==, !=, !x, x|y, x&y, isTRUE(x)


矩阵计算
> mydata<-data.frame(x1=c(2,2,6,4),x2=c(3,4,2,8))
> mydata$sumx
NULL
> mydata$sumx<-mydata$x1+mydata$x2
> mydata$meanx<-(mydata$x1+mydata$x2)/2
> mydata$sumx
[1] 5 6 8 12
> mydata$meanx
[1] 2.5 3.0 4.0 6.0

 


扩展数据集
mydata<-transform(mydata,sumx=x1+x2,meanx=(x1+x2)/2)
> mydata
x1 x2 sumx meanx
1 2 3 5 2.5
2 2 4 6 3.0
3 6 2 8 4.0
4 4 8 12 6.0

 


列选择
> leadership$age[leadership$age ==99]<-NA
> leadership$agecat
NULL
> leadership$agecat[leadership$age >75] <- "Elder"
> leadership$agecat[leadership$age >=55 & leadership$age<=75] <- "Middle Aged"
> leadership$agecat[leadership$age <55] <- "Young"
> leadership <- within(leadership,{agecat <- NA})
> 
> leadership
manager date country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 NA
2 2 10/28/08 US F 45 3 5 2 5 5 NA
3 3 10/1/08 UK F 25 3 5 5 5 2 NA
4 4 10/12/08 UK M 39 3 3 4 NA NA NA
5 5 5/1/09 UK F NA 2 2 1 2 1 NA
> leadership$agecat[leadership$age >75] <- "Elder"
> leadership$agecat[leadership$age >=55 & leadership$age<=75] <- "Middle Aged"
> leadership$agecat[leadership$age <55] <- "Young"
> leadership
manager date country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> leadership<-within(leadership,{})
> 
> leadership<-within(leadership,{})
> leadership<-within(leadership,{})
> leadership<-within(leadership,{})
> leadership<-within(leadership,{})
> leadership<-within(leadership,{
+ agecat<-NA
+ agecat[age>75]<-"Elder"
+ agecat[age>=55 & age<=75] <-"Middle Aged"
+ agecat[age<55]<-"Young"})
> leadership
manager date country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>



重命名列
> fix(leadership)
names(leadership)[2] <- "testDate"
> leadership
managerID testDate country gender age q1 q2 q3 q4 q5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> names(leadership)[6:10] <- c("item1","item2","item3","item4","item5")
> leadership
managerID testDate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>


判断空值
is.na(leadership)
managerID testDate country gender age item1 item2 item3 item4 item5
1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
5 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
agecat
1 FALSE
2 FALSE
3 FALSE
4 FALSE
5 TRUE

 


排除缺失值
> x<-c(1,2,NA,3)
> y<-x[1]+x[2]+x[3]+x[4]
> z<-sum(x)
> z
[1] NA
> 
> y
[1] NA
> k<-sum(x,na.rm=TRUE)
> k
[1] 6
> 
> leadership
managerID testDate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> newda<-na.omit(leadership)


> newda
managerID testDate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
>

 


日期转换
> #%d 数字日期（1~31)
> #%a 缩写星期名
> #%A 非缩写星期名
> #%m 月份（00~12）
> #%b 缩写月份 e.g. Jan
> #%B 非缩写月份 e.g. January
> #%y 两位数的年份 07
> #%Y 四位数的年份 2007
>

 


> mydates<-as.Date(c("2007-06-22","2004-02-13"))
> mydates
[1] "2007-06-22" "2004-02-13"
> strDates<-c("01/05/1965","08/16/1975")
> dates<-as.Date(strDates,"%m/%d/%Y")
> dates
[1] "1965-01-05" "1975-08-16"
> 
> myFormat<-"%m/%d/%y"
> leadership$date<-as.Date(leadership$date,myformat)


> leadership
managerID testDate country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> fix(leadership)
> leadership
managerID Date country gender age item1 item2 item3 item4 item5 agecat
1 1 10/24/08 US M 32 5 4 5 5 5 Young
2 2 10/28/08 US F 45 3 5 2 5 5 Young
3 3 10/1/08 UK F 25 3 5 5 5 2 Young
4 4 10/12/08 UK M 39 3 3 4 NA NA Young
5 5 5/1/09 UK F NA 2 2 1 2 1 <NA>
> leadership$date<-as.Date(leadership$date,myformat)


> leadership$Date<-as.Date(leadership$Date,myFormat)
> leadership
managerID Date country gender age item1 item2 item3 item4 item5
1 1 2008-10-24 US M 32 5 4 5 5 5
2 2 2008-10-28 US F 45 3 5 2 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
4 4 2008-10-12 UK M 39 3 3 4 NA NA
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
1 Young
2 Young
3 Young
4 Young
5 <NA>


> Sys.Date()
[1] "2017-04-13"


> date()
[1] "Thu Apr 13 22:19:48 2017"
> 
> 
> today<-Sys.Date()
> format(today,format="%B %d %Y")
[1] "四月 13 2017"
> 
> startDate<-as.Date("2004-02-12")
> endDate<-as.Date("2011-01-22")
> days<-endDate-startDate
> days
Time difference of 2536 days
> 
> 
> today<-Sys.Date()
> dob<-as.Date("1987-01-03")
日期数据运算
> difftime(today,dob,units="days")
Time difference of 11058 days
> difftime(today,dob,units="weeks")
Time difference of 1579.714 weeks


日期转字符
> strDate<-as.character(Sys.Date())
> strDate
[1] "2017-04-13"
>

 

 


类型转换
> #is.numeric() as.numeric()
> #is.character() as.character()
> #is.vector() as.vector()
> #is.matrix() as.matrix()
> #is.data.frame() as.data.frame()
> #is.factor() as.factor()
> #is.logical() as.logical()
> 
> 
> 
> is.numeric(1)
[1] TRUE
> is.numeric("1")
[1] FALSE
> is.vector("1")
[1] TRUE
> is.vector(c(1,2,3))
[1] TRUE
> is.factor("a")
[1] FALSE
> is.logical(TRUE)
[1] TRUE
> is.logical("TRUE")
[1] FALSE

 

 


数据排序
> newdata<-leadership[order(leadership$age),]
> leadership
managerID Date country gender age item1 item2 item3 item4 item5
1 1 2008-10-24 US M 32 5 4 5 5 5
2 2 2008-10-28 US F 45 3 5 2 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
4 4 2008-10-12 UK M 39 3 3 4 NA NA
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
1 Young
2 Young
3 Young
4 Young
5 <NA>
> newdata
managerID Date country gender age item1 item2 item3 item4 item5
3 3 2008-10-01 UK F 25 3 5 5 5 2
1 1 2008-10-24 US M 32 5 4 5 5 5
4 4 2008-10-12 UK M 39 3 3 4 NA NA
2 2 2008-10-28 US F 45 3 5 2 5 5
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
3 Young
1 Young
4 Young
2 Young
5 <NA>


> newdata<-leadership[order(leadership$age,decreasing=TRUE),] #这里有个逗号


> newdata
managerID Date country gender age item1 item2 item3 item4 item5
2 2 2008-10-28 US F 45 3 5 2 5 5
4 4 2008-10-12 UK M 39 3 3 4 NA NA
1 1 2008-10-24 US M 32 5 4 5 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
2 Young
4 Young
1 Young
3 Young
5 <NA>
>

 


数据集合并
> total<-merge(dataframeA,dataframeB,by="ID")


> #横向合并 merge()
> #纵向合并 rbind()

 


排除选择
> myvars<-names(leadership) %in% c("q3","q4")
> newdata<-leadership[!myvars]
> newdata
managerID Date country gender age item1 item2 item3 item4 item5
1 1 2008-10-24 US M 32 5 4 5 5 5
2 2 2008-10-28 US F 45 3 5 2 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
4 4 2008-10-12 UK M 39 3 3 4 NA NA
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
1 Young
2 Young
3 Young
4 Young
5 <NA>
> myvars
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
> myvars<-names(leadership) %in% c("item3","item4")
> newdata<-leadership[!myvars]
> newdata
managerID Date country gender age item1 item2 item5 agecat
1 1 2008-10-24 US M 32 5 4 5 Young
2 2 2008-10-28 US F 45 3 5 5 Young
3 3 2008-10-01 UK F 25 3 5 2 Young
4 4 2008-10-12 UK M 39 3 3 NA Young
5 5 2009-05-01 UK F NA 2 2 1 <NA>
> newdata
managerID Date country gender age item1 item2 item5 agecat
1 1 2008-10-24 US M 32 5 4 5 Young
2 2 2008-10-28 US F 45 3 5 5 Young
3 3 2008-10-01 UK F 25 3 5 2 Young
4 4 2008-10-12 UK M 39 3 3 NA Young
5 5 2009-05-01 UK F NA 2 2 1 <NA>

 


> newdata
managerID Date country gender age item1 item2 item5
1 1 2008-10-24 US M 32 5 4 5
2 2 2008-10-28 US F 45 3 5 5
3 3 2008-10-01 UK F 25 3 5 2
4 4 2008-10-12 UK M 39 3 3 NA
5 5 2009-05-01 UK F NA 2 2 1
> 
>


subset -- 类似SQL中的Select
> newdata<-subset(leadership,age>=35 |age<24,select=c(item1,item2,item3,item4))
> newdata
item1 item2 item3 item4
2 3 5 2 5
4 3 3 4 NA
> newdata<-subset(leadership,age>=35 |age<24,select=gender:item4)


> newdata
gender age item1 item2 item3 item4
2 F 45 3 5 2 5
4 M 39 3 3 4 NA
>


数据抽样
> mysample<-leadership[sample(1:nrow(leadership),3,replace=FALSE),]
> mysample
managerID Date country gender age item1 item2 item3 item4 item5
1 1 2008-10-24 US M 32 5 4 5 5 5
2 2 2008-10-28 US F 45 3 5 2 5 5
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
1 Young
2 Young
5 <NA>
> leadership
managerID Date country gender age item1 item2 item3 item4 item5
1 1 2008-10-24 US M 32 5 4 5 5 5
2 2 2008-10-28 US F 45 3 5 2 5 5
3 3 2008-10-01 UK F 25 3 5 5 5 2
4 4 2008-10-12 UK M 39 3 3 4 NA NA
5 5 2009-05-01 UK F NA 2 2 1 2 1
agecat
1 Young
2 Young
3 Young
4 Young
5 <NA>

 

 


使用SQL
> install.packages("sqldf")
library("sqldf")
dataset<-sqldf("select * from leadership")
Loading required package: tcltk
Warning message:
Quoted identifiers should have class SQL, use DBI::SQL() if the caller performs the quoting. 
> dataset<-sqldf("select * from leadership",row.names=TRUE)
> dataset
managerID Date country gender age item1 item2 item3 item4 item5 agecat
1 1 2008-10-24 US M 32 5 4 5 5 5 Young
2 2 2008-10-28 US F 45 3 5 2 5 5 Young
3 3 2008-10-01 UK F 25 3 5 5 5 2 Young
4 4 2008-10-12 UK M 39 3 3 4 NA NA Young
5 5 2009-05-01 UK F NA 2 2 1 2 1 <NA>
> dataset<-sqldf("select managerID,Date,country,item1+item2+item3+item4 from leadership",row.names=TRUE)
> dataset
managerID Date country item1+item2+item3+item4
1 1 2008-10-24 US 19
2 2 2008-10-28 US 15
3 3 2008-10-01 UK 18
4 4 2008-10-12 UK NA
5 5 2009-05-01 UK 7
> dataset<-sqldf("select managerID,Date,country,item1+item2+item3+item4 'ab' from leadership",row.names=TRUE)
> dataset
managerID Date country ab
1 1 2008-10-24 US 19
2 2 2008-10-28 US 15
3 3 2008-10-01 UK 18
4 4 2008-10-12 UK NA
5 5 2009-05-01 UK 7
> dataset<-sqldf("select managerID,Date,country,item1+item2+item3+item4 'composed field' from leadership",row.names=TRUE)
> dataset
managerID Date country composed field
1 1 2008-10-24 US 19
2 2 2008-10-28 US 15
3 3 2008-10-01 UK 18
4 4 2008-10-12 UK NA
5 5 2009-05-01 UK 7

 


服从正态分布的伪函数 --用来生成测试数据比较好
> runif(5)
[1] 0.8085362 0.4827602 0.4272967 0.8801272 0.2206980
> set.seed(1234)
> runif(5)
[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
> set.seed(1234)
> runif(5)
[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
> runif(6)
[1] 0.640310605 0.009495756 0.232550506 0.666083758 0.514251141 0.693591292


多元正态分布函数


> library(MASS)
> options(digits=3)
> set.seed(1234)
> mean<-c(230.7, 146,7, 3.6)
> sigma<-matrix(c(15360.8,6721.2, -47.1, 6721.2, 4700.9, -16.5,-47.1,-16.5,0.3),nrow=3,ncol=3)


> sigma
[,1] [,2] [,3]
[1,] 15360.8 6721.2 -47.1
[2,] 6721.2 4700.9 -16.5
[3,] -47.1 -16.5 0.3


矩阵操作
> a<-5
> sqrt(a)
[1] 2.24
> b<-c(1.243,5.654,2.99)
> round(b)
[1] 1 6 3
> c<-matrix(runif(12),nrow=3)
> c
[,1] [,2] [,3] [,4]
[1,] 0.694 0.923 0.286 0.232
[2,] 0.545 0.292 0.267 0.317
[3,] 0.283 0.837 0.187 0.303
> log(c)
[,1] [,2] [,3] [,4]
[1,] -0.366 -0.0797 -1.25 -1.46
[2,] -0.607 -1.2299 -1.32 -1.15
[3,] -1.263 -0.1776 -1.68 -1.20
> 
> #将运算运用到单列或单列
> apply(c,1,mean)
[1] 0.534 0.355 0.402
> apply(c,2,mean)
[1] 0.507 0.684 0.247 0.284

 


统计学生成绩排名 -- 下面的代码是一个比较学生排名的例子，学生的排名需要根据三门不同的学科来决定，但是不同的学科没有可比性，所有需要先将不同的学科转化为具有可比性的系数，然后在汇总系数并进行排序。


> options(digits=3)
> Student<-c("John Davis","Angela Williams", "Bullwinkle Moose", "David Jones", "Janice Makhammer", "Cheryl Cushing", "reuven Ytzrhak", "Greg Knox","Joel England","Mary Rayburn")
> Math<-c(502,600,412,358,495,512,410,625,573,522)
> Science<-c(95,99,80,82,75,85,80,95,89,86)
> English<-c(25,22,18,15,20,28,15,30,27,18)
> roster<-data.frame(Student,Math,Science,English,stringAsFactors=FALSE)
> z<-scale(roster[,2:4]) -- 这个函数将具体的分数转化为具有可比性的系数
> roster
Student Math Science English stringAsFactors
1 John Davis 502 95 25 FALSE
2 Angela Williams 600 99 22 FALSE
3 Bullwinkle Moose 412 80 18 FALSE
4 David Jones 358 82 15 FALSE
5 Janice Makhammer 495 75 20 FALSE
6 Cheryl Cushing 512 85 28 FALSE
7 reuven Ytzrhak 410 80 15 FALSE
8 Greg Knox 625 95 30 FALSE
9 Joel England 573 89 27 FALSE
10 Mary Rayburn 522 86 18 FALSE
> z
Math Science English
[1,] 0.0127 1.078 0.5869
[2,] 1.1434 1.591 0.0367
[3,] -1.0257 -0.847 -0.6969
[4,] -1.6487 -0.590 -1.2471
[5,] -0.0681 -1.489 -0.3301
[6,] 0.1281 -0.205 1.1370
[7,] -1.0488 -0.847 -1.2471
[8,] 1.4318 1.078 1.5038
[9,] 0.8319 0.308 0.9536
[10,] 0.2434 -0.077 -0.6969
attr(,"scaled:center")
Math Science English 
500.9 86.6 21.8 
attr(,"scaled:scale")
Math Science English 
86.67 7.79 5.45 
> help(scale)


> roster[1,]
Student Math Science English stringAsFactors
1 John Davis 502 95 25 FALSE
> roster[1,2:4]
Math Science English
1 502 95 25


> score<-apply(z,1,mean)    # 1是按行，2是按列，
> score
[1] 0.559 0.924 -0.857 -1.162 -0.629 0.353 -1.048 1.338 0.698 -0.177
> roster<-cbind(roster,score)
> roster
Student Math Science English stringAsFactors score
1 John Davis 502 95 25 FALSE 0.559
2 Angela Williams 600 99 22 FALSE 0.924
3 Bullwinkle Moose 412 80 18 FALSE -0.857
4 David Jones 358 82 15 FALSE -1.162
5 Janice Makhammer 495 75 20 FALSE -0.629
6 Cheryl Cushing 512 85 28 FALSE 0.353
7 reuven Ytzrhak 410 80 15 FALSE -1.048
8 Greg Knox 625 95 30 FALSE 1.338
9 Joel England 573 89 27 FALSE 0.698
10 Mary Rayburn 522 86 18 FALSE -0.177
> y<-quantile(score, c(.8,.6,.4,.2)
+ )
> y
80% 60% 40% 20% 
0.743 0.436 -0.358 -0.895 
> roster$grade[score>y[1]]<-"A"
> roster$grade[score<y[1] & score>=y[2]]<-"B"
> roster$grade[score<y[2] & score>=y[3]]<="C"
[1] NA NA
> roster$grade[score<y[2] & score>=y[3]]<-"C"
> roster$grade[score<y[3] & score>=y[4]]<-"D"
> roster$grade[score<y[4]]<-"E"
> roster
Student Math Science English stringAsFactors score grade
1 John Davis 502 95 25 FALSE 0.559 B
2 Angela Williams 600 99 22 FALSE 0.924 A
3 Bullwinkle Moose 412 80 18 FALSE -0.857 D
4 David Jones 358 82 15 FALSE -1.162 E
5 Janice Makhammer 495 75 20 FALSE -0.629 D
6 Cheryl Cushing 512 85 28 FALSE 0.353 C
7 reuven Ytzrhak 410 80 15 FALSE -1.048 E
8 Greg Knox 625 95 30 FALSE 1.338 A
9 Joel England 573 89 27 FALSE 0.698 B
10 Mary Rayburn 522 86 18 FALSE -0.177 C
> roster$grade[score<y[4]]<-"F"
> roster
Student Math Science English stringAsFactors score grade
1 John Davis 502 95 25 FALSE 0.559 B
2 Angela Williams 600 99 22 FALSE 0.924 A
3 Bullwinkle Moose 412 80 18 FALSE -0.857 D
4 David Jones 358 82 15 FALSE -1.162 F
5 Janice Makhammer 495 75 20 FALSE -0.629 D
6 Cheryl Cushing 512 85 28 FALSE 0.353 C
7 reuven Ytzrhak 410 80 15 FALSE -1.048 F
8 Greg Knox 625 95 30 FALSE 1.338 A
9 Joel England 573 89 27 FALSE 0.698 B
10 Mary Rayburn 522 86 18 FALSE -0.177 C
> 
>

 

 


> roster$Student
[1] John Davis Angela Williams Bullwinkle Moose David Jones Janice Makhammer
[6] Cheryl Cushing reuven Ytzrhak Greg Knox Joel England Mary Rayburn 
10 Levels: Angela Williams Bullwinkle Moose Cheryl Cushing David Jones ... reuven Ytzrhak

 

 


> options(digits=2)
> roster
Student Math Science English stringAsFactors score grade
1 John Davis 502 95 25 FALSE 0.56 B
2 Angela Williams 600 99 22 FALSE 0.92 A
3 Bullwinkle Moose 412 80 18 FALSE -0.86 D
4 David Jones 358 82 15 FALSE -1.16 F
5 Janice Makhammer 495 75 20 FALSE -0.63 D
6 Cheryl Cushing 512 85 28 FALSE 0.35 C
7 reuven Ytzrhak 410 80 15 FALSE -1.05 F
8 Greg Knox 625 95 30 FALSE 1.34 A
9 Joel England 573 89 27 FALSE 0.70 B
10 Mary Rayburn 522 86 18 FALSE -0.18 C


> roster[order(grade),]

 


R语言控制流
> #for 循环


> for(i in 1:10) print("hello")
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"
[1] "hello"


> for(i in 1:10) print(as.character(i))
[1] "1"
[1] "2"
[1] "3"
[1] "4"
[1] "5"
[1] "6"
[1] "7"
[1] "8"
[1] "9"
[1] "10"

 

 

 


> for(i in 1:10) print(as.character(i))> > 
> roster[order(roster$grade),]
Student Math Science English stringAsFactors score grade
2 Angela Williams 600 99 22 FALSE 0.92 A
8 Greg Knox 625 95 30 FALSE 1.34 A
1 John Davis 502 95 25 FALSE 0.56 B
9 Joel England 573 89 27 FALSE 0.70 B
6 Cheryl Cushing 512 85 28 FALSE 0.35 C
10 Mary Rayburn 522 86 18 FALSE -0.18 C
3 Bullwinkle Moose 412 80 18 FALSE -0.86 D
5 Janice Makhammer 495 75 20 FALSE -0.63 D
4 David Jones 358 82 15 FALSE -1.16 F
7 reuven Ytzrhak 410 80 15 FALSE -1.05 F
> 
> 
> #while 循环
> i<-10
> while(i>0){
+ print("Hello");
+ i<-i-1;
+ }
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
[1] "Hello"
> 
> 
> #条件


> if(is.character(roster$grade)) roster$grade<-as.factor(roster$grade)
> if(!is.factor(roster$grade)) grade<- as.factor(grade) else print("Grade already is a factor")
[1] "Grade already is a factor"
>


> ifelse(roster$score>0.5,print("Passed"),print("Failed"))
[1] "Passed"
[1] "Failed"
[1] "Passed" "Passed" "Failed" "Failed" "Failed" "Failed" "Failed" "Passed" "Passed" "Failed"
> 
> 
> feelings<-c("sad","afraid")
> for (i in feelings)
+ print( switch(i,
+ happy="I am glad you are happy",
+ afraid="There is nothing to fear",
+ sad="Cheer up",
+ angry="Calm down now")
+ )
[1] "Cheer up"
[1] "There is nothing to fear"

 

 


自定义函数
> 
> 
> mystats<-function(x,parametric=TRUE,print=FALSE){ #为形参设置默认值
+ if(parametric){
+ center<-mean(x);spreed<-sd(x)
+ }else{
+ center<-median(x);spread<-mad(x)}
+ if(print $ parametric){
+ cat("Mean=", center, "\n", "SD=", spread, "\n")
+ }else if(print & !parametric){
+ cat("Median=",center,"\n","MAD=",spread,"\n")
+ }
+ result<-list(center=center,spread=spread)
+ return(result)
+ }
> mystats
function(x,parametric=TRUE,print=FALSE){
if(parametric){
center<-mean(x);spreed<-sd(x)
}else{
center<-median(x);spread<-mad(x)}
if(print $ parametric){
cat("Mean=", center, "\n", "SD=", spread, "\n")
}else if(print & !parametric){
cat("Median=",center,"\n","MAD=",spread,"\n")
}
result<-list(center=center,spread=spread)
return(result)
}
> set.seed(1234)
> x<-rnorm(500)

 

 

 

 


> mystats> > 
> function(x,parametric=TRUE,print=FALSE){
+ if(parametric){
+ center<-mean(x);spreed<-sd(x)
+ }else{
+ center<-median(x);spread<-mad(x)}
+ if(print $ parametric){
+ cat("Mean=", center, "\n", "SD=", spread, "\n")
+ }else if(print & !parametric){
+ cat("Median=",center,"\n","MAD=",spread,"\n")
+ }
+ result<-list(center=center,spread=spread)
+ return(result)
+ }
function(x,parametric=TRUE,print=FALSE){
if(parametric){
center<-mean(x);spreed<-sd(x)
}else{
center<-median(x);spread<-mad(x)}
if(print $ parametric){
cat("Mean=", center, "\n", "SD=", spread, "\n")
}else if(print & !parametric){
cat("Median=",center,"\n","MAD=",spread,"\n")
}
result<-list(center=center,spread=spread)
return(result)
}
> 
> mystats<-function(x,parametric=TRUE,print=FALSE){
+ if(parametric){
+ center<-mean(x);spreed<-sd(x)
+ }else{
+ center<-median(x);spread<-mad(x)}
+ if(print & parametric){
+ cat("Mean=", center, "\n", "SD=", spread, "\n")
+ }else if(print & !parametric){
+ cat("Median=",center,"\n","MAD=",spread,"\n")
+ }
+ result<-list(center=center,spread=spread)
+ return(result)
+ }


> mystats
function(x,parametric=TRUE,print=FALSE){
if(parametric){
center<-mean(x);spreed<-sd(x)
}else{
center<-median(x);spread<-mad(x)}
if(print & parametric){
cat("Mean=", center, "\n", "SD=", spread, "\n")
}else if(print & !parametric){
cat("Median=",center,"\n","MAD=",spread,"\n")
}
result<-list(center=center,spread=spread)
return(result)
}
> mystats<-function(x,parametric=TRUE,print=FALSE){
+ if(parametric){
+ center<-mean(x);spread<-sd(x)
+ }else{
+ center<-median(x);spread<-mad(x)}
+ if(print & parametric){
+ cat("Mean=", center, "\n", "SD=", spread, "\n")
+ }else if(print & !parametric){
+ cat("Median=",center,"\n","MAD=",spread,"\n")
+ }
+ result<-list(center=center,spread=spread)
+ return(result)
+ }
> mystats(x)
$center
[1] 0.0018

 


$spread
[1] 1

 


> x

 


转置 - 行转列，列转行
> cars<-mtcars[1:5,1:4]
> cars
mpg cyl disp hp
Mazda RX4 21 6 160 110
Mazda RX4 Wag 21 6 160 110
Datsun 710 23 4 108 93
Hornet 4 Drive 21 6 258 110
Hornet Sportabout 19 8 360 175
> t(cars)
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
mpg 21 21 23 21 19
cyl 6 6 4 6 8
disp 160 160 108 258 360
hp 110 110 93 110 175
> cars
mpg cyl disp hp
Mazda RX4 21 6 160 110
Mazda RX4 Wag 21 6 160 110
Datsun 710 23 4 108 93
Hornet 4 Drive 21 6 258 110
Hornet Sportabout 19 8 360 175
> cars2<-t(cars)
> cars2
Mazda RX4 Mazda RX4 Wag Datsun 710 Hornet 4 Drive Hornet Sportabout
mpg 21 21 23 21 19
cyl 6 6 4 6 8
disp 160 160 108 258 360
hp 110 110 93 110 175

 


aggregate -- 分组计算
aggdata<-aggregate(mtcars,by=list(cyl,gear),FUN=mean,na.rm=TRUE)
> aggdata
Group.1 Group.2 mpg cyl disp hp drat wt qsec vs am gear carb
1 4 3 21.5 4 120 97 3.70 2.46 20.0 1.0 0.00 3 1.00
2 6 3 19.8 6 242 108 2.92 3.34 19.8 1.0 0.00 3 1.00
3 8 3 15.1 8 358 194 3.12 4.10 17.1 0.0 0.00 3 3.08
4 4 4 26.9 4 103 76 4.11 2.38 19.6 1.0 0.75 4 1.50
5 6 4 19.8 6 164 116 3.91 3.09 17.7 0.5 0.50 4 4.00
6 4 5 28.2 4 108 102 4.10 1.83 16.8 0.5 1.00 5 2.00
7 6 5 19.7 6 145 175 3.62 2.77 15.5 0.0 1.00 5 6.00
8 8 5 15.4 8 326 300 3.88 3.37 14.6 0.0 1.00 5 6.00

 

 


融合 & 重铸 -- 融合：将包含多个变量的数据打散为只具有唯一变量的数据集合，重铸：将打散后的数据安装要求组合为包含多个变量的数据集合

#融合
install.packages("reshape2")
library(reshape2)
> ID<-c("1","1","2","2")
> Time<-c("1","2","1","2")
> X1<-c("5","3","6","2")
> X2<-c("6","5","1","4")
> mydata<-data.frame(ID,Time,X1,X2)
> mydata
ID Time X1 X2
1 1 1 5 6
2 1 2 3 5
3 2 1 6 1
4 2 2 2 4
> md<-melt(mydata,id=c("ID","Time"))
Warning message:
attributes are not identical across measure variables; they will be dropped 
> md
ID Time variable value
1 1 1 X1 5
2 1 2 X1 3
3 2 1 X1 6
4 2 2 X1 2
5 1 1 X2 6
6 1 2 X2 5
7 2 1 X2 1
8 2 2 X2 4
> md2<-melt(mydata,id=c("ID"))
Warning message:
attributes are not identical across measure variables; they will be dropped 
> md2
ID variable value
1 1 Time 1
2 1 Time 2
3 2 Time 1
4 2 Time 2
5 1 X1 5
6 1 X1 3
7 2 X1 6
8 2 X1 2
9 1 X2 6
10 1 X2 5
11 2 X2 1
12 2 X2 4
> 
> #重铸
> dcast(mydata,ID+Time~variable)
Using X2 as value column: use value.var to override.
Error in eval(expr, envir, enclos) : 找不到对象'variable'
> dcast(md,ID+Time~variable)
ID Time X1 X2
1 1 1 5 6
2 1 2 3 5
3 2 1 6 1
4 2 2 2 4
> dcast(md,ID~variable)
Aggregation function missing: defaulting to length
ID X1 X2
1 1 2 2
2 2 2 2
> dcast(md,ID~variable+time)
Error in unique.default(x) : unique()只适用于矢量
> dcast(md,ID~variable+Time)
ID X1_1 X1_2 X2_1 X2_2
1 1 5 3 6 5
2 2 6 2 1 4
posted @ 2017-09-25 16:03 aifans2019 阅读(1674) 评论(0) 编辑收藏举报
会员力量，点亮园子希望
刷新页面返回顶部
AI Fans

Data Science门徒

R语言学习笔记（三）

公告