R语言dplyr包初探
昨天学了一下R语言dplyr包,处理数据框还是很好用的。记录一下免得我忘记了... 先写一篇入门的,以后有空再写一篇详细的用法。
# dplyr learning notes ----
# A tour of the core single-table dplyr verbs. Every call below just prints
# its result; only `by_cyl`, `iris` and `df` are assigned.
library(dplyr)

# filter() ----
# Keep the rows of a data frame that satisfy the conditions; returns a data
# frame.
# Usage:
#   filter(.data, ...)   # `...` are the filtering conditions
filter(starwars, species == "Human")
filter(starwars, mass > 1000)

# Multiple criteria
filter(starwars, hair_color == "none" & eye_color == "black")
filter(starwars, hair_color == "none" | eye_color == "black")

# Multiple arguments are equivalent to `&` (logical AND is the default)
filter(starwars, hair_color == "none", eye_color == "black")

# arrange() ----
# Sort the rows of a data frame.
# Usage:
#   arrange(.data, ...)
#   ## S3 method for class 'grouped_df'
#   arrange(.data, ..., .by_group = FALSE)
arrange(mtcars, cyl, disp)   # sort by cyl first, then by disp
arrange(mtcars, desc(disp))  # desc() sorts in descending order

# Grouped arrange ignores groups.
# %>% is the pipe: it passes the left-hand value as the first argument of the
# right-hand function.
by_cyl <- mtcars %>%
  group_by(cyl)

# Ignores the grouping and sorts all rows together
by_cyl %>%
  arrange(desc(wt))

# Unless you specifically ask: sort within each group
by_cyl %>%
  arrange(desc(wt), .by_group = TRUE)

# select() ----
# Pick columns by name or with selection helpers.
iris <- as_tibble(iris)  # so it prints a little nicer
select(iris, starts_with("Petal"))  # columns whose names start with "Petal"
select(iris, ends_with("Width"))    # columns whose names end with "Width"

# Move Species variable to the front
select(iris, Species, everything())

# Ten random columns V1..V10, deliberately shuffled out of order.
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
df <- as.data.frame(matrix(runif(100), nrow = 10))
df <- as_tibble(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])

# `V4:V6` is a positional range: every column from V4 through V6 in the
# current (shuffled) column order.
select(df, V4:V6)
# num_range() matches by name instead: exactly V4, V5 and V6. Quite handy.
select(df, num_range("V", 4:6))

# Drop variables with -
select(iris, -starts_with("Petal"))  # drop columns starting with "Petal"

# Using the `.data` pronoun inside select() (e.g. `.data$cyl`) is deprecated
# since tidyselect 1.2.0; bare column names select the same columns.
select(mtcars, cyl)
select(mtcars, mpg:disp)

# Renaming ----
# * select() keeps only the variables you specify
select(iris, petal_length = Petal.Length)
# * rename() keeps all variables (rename, then return every column)
rename(iris, petal_length = Petal.Length)

# mutate() ----
# Add new columns; later expressions may use columns created earlier in the
# same call (cyl4 is computed from cyl2).
mtcars %>%
  as_tibble() %>%
  mutate(
    cyl2 = cyl * 2,
    cyl4 = cyl2 * 2
  )

mtcars %>%
  as_tibble() %>%
  mutate(
    mpg = NULL,               # NULL removes a column, like `-` in select()
    disp = disp * 0.0163871   # apply arithmetic to an existing column
  )

# mutate() vs transmute() ----
# mutate() keeps all existing variables
mtcars %>%
  mutate(displ_l = disp / 61.0237)

# transmute() keeps only the variables you create
mtcars %>%
  transmute(displ_l = disp / 61.0237)

# summarise() ----
# Compute summary statistics, typically on data prepared with group_by();
# the mean is used as the example here.
mtcars %>%
  summarise(mean = mean(disp), n = n())

mtcars %>%
  group_by(cyl) %>%
  summarise(mean = mean(disp), n = n())

# Grouped summaries are very powerful. summarise() removes only the last
# grouping level, so this result is still grouped by cyl.
mtcars %>%
  group_by(cyl, vs) %>%
  summarise(cyl_n = n(), mean_disp = mean(disp))
Talk is cheap, show me your code!