No.11 异常值的识别与处理
1.什么是异常值
异常值是指明显偏离其余大多数观测值的数据点。
# Open the help page for the built-in mtcars data set
?mtcars
# Work on a copy so the original data set stays untouched
mydata <- mtcars
# Drop columns 8 and 9
mydata1 <- mydata[, -c(8, 9)]
mydata1
运行结果:
> #删除第8,9列
> mydata1 <- mydata[,-c(8,9)]
> mydata1
mpg cyl disp hp drat wt qsec gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2
2.识别异常值
2.1 range 查看第一列的取值范围(最大最小值)
# range() gives the minimum and maximum of the first column
range(mydata1[, 1])
结果:
> range(mydata1[,1])
[1] 10.4 33.9
2.2 sapply 查看每一列的取值范围(最大最小值)
# sapply() applies range() to every column: row 1 is the minimum,
# row 2 is the maximum
sapply(mydata1, range)
结果:
> sapply(mydata1, range)
mpg cyl disp hp drat wt qsec gear carb
[1,] 10.4 4 71.1 52 2.76 1.513 14.5 3 1
[2,] 33.9 8 472.0 335 4.93 5.424 22.9 5 8
# Add a column "A" to mydata1, filled with the value 10
mydata1["A"] <- 10
mydata1
# Drop column 10 (the column that was just added)
mydata1 <- mydata1[, -10]
mydata1
结果:
> #给mydata1添加一列“A” ,全部赋值10
> mydata1["A"] <- 10
> mydata1
mpg cyl disp hp drat wt qsec gear carb A
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4 10
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4 10
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1 10
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1 10
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2 10
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1 10
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4 10
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2 10
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2 10
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4 10
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4 10
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3 10
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3 10
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3 10
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4 10
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4 10
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4 10
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1 10
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2 10
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1 10
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1 10
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2 10
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2 10
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4 10
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2 10
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1 10
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2 10
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2 10
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4 10
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6 10
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8 10
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2 10
> mydata1 <- mydata1[,-10]
> mydata1
mpg cyl disp hp drat wt qsec gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2
2.3 rnorm
# Draw 100 random numbers from the standard normal distribution
rnorm(100)
# Plot the estimated density of a second, independent sample of 100 draws
plot(density(rnorm(100)))
结果:
> rnorm(100)
[1] 1.16812126 0.09121299 0.83385038 -0.65646582 -0.29552396 -0.23749601
[7] -1.79031884 -1.11282169 0.19235486 -0.52366489 0.13862362 1.62990541
[13] -1.11707451 -0.49513437 0.79154736 -1.09576397 -0.10932575 -0.01755107
[19] -0.09119959 -1.24834172 1.83782360 0.02472749 -1.56179781 0.04525409
[25] -0.72421202 -0.46173428 0.96779545 -0.38041404 -0.12286390 1.17213849
[31] -1.78280748 -0.76331701 -2.29322659 0.71971508 0.77251970 1.14962475
[37] 2.10222194 -0.91887320 0.43631280 0.29946445 0.06919667 -0.88430606
[43] 0.08755868 0.10892476 -0.45061797 -0.15228235 -1.06998692 0.70049225
[49] 0.99716511 -0.48687140 0.25055817 1.63833960 0.51469283 0.16183685
[55] 1.11752290 0.56776107 0.40052308 1.03374586 0.58174528 0.41615202
[61] -0.54214179 0.71509075 -0.81845411 -1.42947307 0.02506982 0.00746341
[67] 0.89657339 -0.22376070 0.81466225 -0.00293954 -0.89429036 -0.54005125
[73] -0.55493918 -0.48971420 1.00472584 -1.55206657 -1.49556127 0.20762548
[79] -0.31241761 1.49858090 0.38622734 -0.17299046 1.95234367 -0.79700794
[85] -1.13437636 1.07083286 -0.03549790 -1.40575778 0.26609714 1.05151806
[91] -0.56646006 1.53527908 -0.84863214 0.33583422 -0.79183892 -0.16254438
[97] -0.41619916 0.12012751 1.03583217 -1.06057864

# Build a vector of 100 standard-normal draws plus one outlier, 200
d <- c(rnorm(100), 200)
d
结果:
> #构造一个向量,包含正态矩阵和一个异常值200
> d <- c(rnorm(100),200)
> d
[1] -0.56001355 1.05238153 0.12994937 0.97432264 -0.60238840
[6] 0.42358562 0.81800149 0.72166480 0.28844043 -1.37637747
[11] 1.57283573 -0.07490066 -0.19501345 -0.38519336 -1.99926797
[16] 0.06252341 0.03724015 -1.01916856 -1.15813221 -0.77049628
[21] 1.03625025 -0.62089178 0.37124752 -0.91627128 -2.11080250
[26] -1.50630384 -0.42341748 -0.02465207 -0.45395521 -0.52911020
[31] -0.89918862 1.80961574 0.49575298 1.11614184 1.44507961
[36] 0.86854770 0.62513437 0.63165574 -0.72413959 -0.19831873
[41] -1.08628031 1.68811785 -1.65809492 -1.02777044 -0.80751298
[46] 0.07285811 -0.84382591 -0.11219811 1.08828834 0.90835285
[51] 0.23240490 0.34246963 0.55302456 1.08317735 -0.86765258
[56] 0.52689518 0.06547722 -1.16916802 0.20016424 1.24468497
[61] 0.63140325 0.76889757 2.16373627 0.01097345 -1.21209642
[66] -1.14192094 1.20751949 -1.21909596 -0.51250581 0.17740712
[71] 0.47884778 -1.54210797 -1.71087851 -1.02963780 -1.70329772
[76] -0.44682489 0.87068263 0.28908129 -2.21313570 -0.04418836
[81] 0.85362812 1.45023516 -1.19358314 0.22766356 -0.46518512
[86] 0.05028882 -2.07874394 -2.06102070 1.25449825 -2.69988603
[91] 0.04762420 -0.25764206 0.27635400 -0.82097771 0.40923734
[96] -0.99378444 2.16360257 -0.31332890 0.69996468 -0.66460736
[101] 200.00000000
2.4 3σ方法找异常值
# Three-sigma rule for finding outliers
# Mean of d
m <- mean(d)
# Standard deviation of d
s <- sd(d)
# Bounds at three standard deviations from the mean
# Upper bound
u <- m + 3 * s
# Lower bound
l <- m - 3 * s
# Values below the lower bound l or above the upper bound u are outliers
d < l | d > u
结果:
> #3δ方法找异常值
> #找到数据d的均值
> m <- mean(d)
> #找到数据d的标准差
> s <- sd(d)
> #计算3倍的标准差
> #上限
> u <- m+3*s
> #下限
> l <- m-3*s
> #d小于下限l或者d大于上限u,则为异常值
> d<l|d>u
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[97] FALSE FALSE FALSE FALSE TRUE(异常值)
# Show the outlying values
d[d < l | d > u]
# Show the positions of the outliers
which(d < l | d > u)
结果:
> #查看异常值
> d[d<l|d>u]
[1] 200
> #查看异常值的位置
> which(d<l|d>u)
[1] 101
将代码提取为函数:在 RStudio 中选中代码片段,然后点击菜单栏 Code → Extract Function。

# NOTE(review): `sigma` is presumably the function produced by the
# "Extract Function" step from the three-sigma code; its definition is not
# shown here, and without it these calls resolve to stats::sigma() instead.
# Apply it to the first column
sigma(mydata1[, 1])
# Apply it to every column
sapply(mydata1, sigma)
2.5 箱线图分析寻找异常值
# Look for outliers with a box plot of the first column
boxplot(mydata1[, 1])
mydata1
结果:
> #箱线图分析寻找异常值
> boxplot(mydata1[,1])
> mydata1
mpg cyl disp hp drat wt qsec gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 3 4
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 4 2
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 4 2
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 4 4
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 4 4
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 3 3
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 3 3
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 3 3
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 3 4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 3 4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 3 4
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 4 1
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 4 2
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 4 1
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 3 1
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 3 2
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 3 2
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 3 4
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 3 2
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 4 1
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 5 2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 5 2
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 5 4
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 5 6
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 5 8
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 4 2

#' Locate outliers with the box-plot (1.5 * IQR) rule
#'
#' @param x A numeric vector.
#' @return Integer vector of the positions in `x` whose values lie below
#'   Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR; `integer(0)` when none do.
boxout <- function(x) {
  # Lower and upper quartiles. na.rm = TRUE lets vectors with missing values
  # through (quantile() stops on NA otherwise); names = FALSE drops the
  # "25%"/"75%" labels so they are not carried into later results.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Interquartile range
  iqr <- quartiles[2] - quartiles[1]
  # Upper fence
  upper <- quartiles[2] + 1.5 * iqr
  # Lower fence
  lower <- quartiles[1] - 1.5 * iqr
  # Positions of the values outside the fences. The original also evaluated
  # x[x < l | x > u] here, but its value was discarded, so it is removed.
  # which() ignores NA comparisons, so missing values are never reported.
  which(x < lower | x > upper)
}
# Positions of the outliers in column 9 of mydata1 (carb)
boxout(mydata1[,9])
运行结果:
[1] 31
3. 异常值的处理
3.1 删除异常值所在的整行
# Positions of the rows whose 9th column is flagged by boxout()
out_rows <- boxout(mydata1[, 9])
out_rows
# Drop the rows that contain an outlier. Indexing with the positions
# themselves (mydata1[out_rows, ]) would keep only the outlier rows, and
# mydata1[-out_rows, ] would return zero rows when nothing is flagged,
# so a logical mask is used instead.
mydata1[!(seq_len(nrow(mydata1)) %in% out_rows), ]
3.2 对数变换
使用 log() 对变量取对数,可以压缩较大的取值、减弱异常值的影响,例如 log(mydata1[,9])。

浙公网安备 33010602011771号