R语言实战 - 高级数据管理(1)
1. 一个数据处理难题

a. 确定一个单一的成绩衡量指标
b. 前20%的学生评定为A,接下来20%评定为B,以此类推
c. 按字母顺序对学生排序
2. 数值和字符处理函数
数值函数(数学,统计,概率)
字符处理函数
2.1 数学函数
> abs(-4)
[1] 4
>
> sqrt(25)
[1] 5
>
> ceiling(3.475)
[1] 4
>
> floor(3.475)
[1] 3
>
> trunc(5.99)
[1] 5
>
> round(3.475, digits=2)
[1] 3.48
>
> signif(3.475, digits=2)
[1] 3.5
>
> cos(3.1415926)
[1] -1
>
> sin(3.1415926)
[1] 5.358979e-08
>
> acos(-0.416)
[1] 1.999839
>
> sinh(2)
[1] 3.62686
>
> asinh(3.627)
[1] 2.000037
>
> log(10)
[1] 2.302585
>
> log10(10)
[1] 1
>
> exp(2.3026)
[1] 10.00015
>
2.2 统计函数
> mean(c(1, 2, 3, 4))
[1] 2.5
>
> median(c(1, 2, 3, 4))
[1] 2.5
>
> sd(c(1, 2, 3, 4))
[1] 1.290994
>
> var(c(1, 2, 3, 4))
[1] 1.666667
>
> mad(c(1, 2, 3, 4))
[1] 1.4826
>
> x <- c(1, 2, 3, 4)
> y <- quantile(x, c(.3, .84))
> y
30% 84%
1.90 3.52
>
> range(x)
[1] 1 4
>
> diff(range(x))
[1] 3
>
> sum(x)
[1] 10
>
> x <- c(1, 5, 23, 29)
> diff(x)
[1] 4 18 6
>
> min(x)
[1] 1
>
> max(x)
[1] 29
>
> scale(x, center=TRUE, scale=TRUE)
[,1]
[1,] -0.9925397
[2,] -0.6984539
[3,] 0.6249324
[4,] 1.0660612
attr(,"scaled:center")
[1] 14.5
attr(,"scaled:scale")
[1] 13.60147
>
> x <- c(1, 2, 3, 4, 5, 6, 7, 8)
> mean(x)
[1] 4.5
> sd(x)
[1] 2.44949
>
> n <- length(x)
> meanx <- sum(x)/n
> css <- sum((x - meanx)^2)
> sdx <- sqrt(css / (n-1))
> meanx
[1] 4.5
> sdx
[1] 2.44949
>
2.3 概率函数

> x <- pretty(c(-3, 3), 30)
> x
[1] -3.0 -2.8 -2.6 -2.4 -2.2 -2.0 -1.8 -1.6 -1.4 -1.2 -1.0 -0.8 -0.6 -0.4 -0.2
[16] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2 2.4 2.6 2.8
[31] 3.0
> y <- dnorm(x)
> y
[1] 0.004431848 0.007915452 0.013582969 0.022394530 0.035474593 0.053990967
[7] 0.078950158 0.110920835 0.149727466 0.194186055 0.241970725 0.289691553
[13] 0.333224603 0.368270140 0.391042694 0.398942280 0.391042694 0.368270140
[19] 0.333224603 0.289691553 0.241970725 0.194186055 0.149727466 0.110920835
[25] 0.078950158 0.053990967 0.035474593 0.022394530 0.013582969 0.007915452
[31] 0.004431848
> plot(x, y, type="l", xlab="NormalDeviate", ylab="Density", yaxs="i")
>

> pnorm(1.96)
[1] 0.9750021
> qnorm(.9, mean=500, sd=100)
[1] 628.1552
> rnorm(50, mean=50, sd=10)
[1] 67.26521 54.63231 42.90968 48.38989 73.67308 49.74476 57.81742 67.75197
[9] 66.51772 48.48707 39.37449 35.09612 59.43735 58.02651 40.43783 51.18190
[17] 63.75237 39.67564 42.67555 50.88800 43.47265 58.69022 64.55702 34.35042
[25] 63.23016 45.81644 43.31544 54.58287 50.46310 31.72297 40.34214 55.06260
[33] 42.25432 45.63078 56.23651 53.27949 47.83063 53.69351 56.68358 46.04020
[41] 57.20872 52.52052 49.20011 47.71317 55.79194 42.20664 48.22365 43.57350
[49] 33.02280 36.45630
>
2.3.1 设定随机数种子
> runif(5)
[1] 0.8650632 0.2548104 0.7736314 0.9595250 0.5731663
> runif(5)
[1] 0.3458971 0.8683841 0.7487097 0.1382841 0.1953431
> set.seed(1234)
> runif(5)
[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
> set.seed(1234)
> runif(5)
[1] 0.1137034 0.6222994 0.6092747 0.6233794 0.8609154
>
2.3.2 生成多元正态数据
> library(MASS)
> options(digits=3)
# 设定随机数种子,使结果可重现
> set.seed(1234)
# 指定均值向量、协方差阵
> mean <- c(230.7, 146.7, 3.6)
> sigma <- matrix(c(15360.8, 6721.2, -47.1,
+ 6721.2, 4700.9, -16.5,
+ -47.1, -16.5, 0.3), nrow=3, ncol=3)
# 生成500个伪随机观测数据
> mydata <- mvrnorm(500, mean, sigma)
# 为方便起见,将结果从矩阵转换为数据框
> mydata <- as.data.frame(mydata)
# 为变量指定了名称
> names(mydata) <- c("y", "x1", "x2")
# 确认拥有500个观测和3个变量
> dim(mydata)
[1] 500 3
# 输出前10个观测
> head(mydata, n=10)
y x1 x2
1 98.8 41.3 3.43
2 244.5 205.2 3.80
3 375.7 186.7 2.51
4 -59.2 11.2 4.71
5 313.0 111.0 3.45
6 288.8 185.1 2.72
7 134.8 165.0 4.39
8 171.7 97.4 3.64
9 167.2 101.0 3.50
10 121.1 94.5 4.10
>
2.4 字符处理函数
posted on 2017-09-10 15:20 你的踏板车要滑向哪里 阅读(304) 评论(0) 收藏 举报
浙公网安备 33010602011771号