R语言dplyr包初探

昨天学了一下 R 语言的 dplyr 包，用它处理数据框非常方便。在此记录一下，以免日后忘记。先写一篇入门，以后有空再写一篇详细的用法。

#dplyr learning
library(dplyr)

# filter() ----
# Keep the rows of a data frame that satisfy the given conditions; the
# result is again a data frame.
#
# Usage:
#   filter(.data, ...)   # `...` holds the filtering conditions

# Examples
starwars %>% filter(species == "Human")
starwars %>% filter(mass > 1000)

# Several conditions combined explicitly with `&` (and) or `|` (or)
starwars %>% filter(hair_color == "none" & eye_color == "black")
starwars %>% filter(hair_color == "none" | eye_color == "black")

# Conditions passed as separate arguments are combined with a logical AND
starwars %>% filter(hair_color == "none", eye_color == "black")

# arrange() ----
# Sort the rows of a data frame.
#
# Usage:
#   arrange(.data, ...)
#
#   ## S3 method for class 'grouped_df'
#   arrange(.data, ..., .by_group = FALSE)

# Examples
# `%>%` is the pipe: it passes its left-hand side as the first argument of
# the function call on its right-hand side.
mtcars %>% arrange(cyl, disp)     # sort by cyl first, then by disp
mtcars %>% arrange(desc(disp))    # desc() sorts in descending order

# Arranging a grouped data frame ignores the grouping by default
by_cyl <- group_by(mtcars, cyl)
arrange(by_cyl, desc(wt))         # grouping ignored, plain sort by wt
# Unless you specifically ask for it:
# sort by the grouping variable first, then by wt
arrange(by_cyl, desc(wt), .by_group = TRUE)

# select() ----
# Pick columns of a data frame by name or with selection helpers.

# Examples
iris <- as_tibble(iris)   # a tibble prints a little nicer
select(iris, starts_with("Petal"))   # columns whose names start with "Petal"
select(iris, ends_with("Width"))     # columns whose names end with "Width"

# Move the Species variable to the front
select(iris, Species, everything())

# Ten columns of random numbers (V1..V10), stored in a shuffled order so
# that positional and name-based selection give different results.
df <- as.data.frame(matrix(runif(100), nrow = 10))
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
df <- as_tibble(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])
# Positional range: every column from V4 to V6 in the current column order
select(df, V4:V6)
# Name-based range: exactly V4, V5 and V6
select(df, num_range("V", 4:6))

# Drop variables with -
select(iris, -starts_with("Petal"))   # all columns except the "Petal" ones


# Select columns by bare name.
# `.data$cyl` and `.data$mpg : .data$disp` select the same columns, but
# using the `.data` pronoun inside select() is deprecated since
# tidyselect 1.2.0 and emits a warning, so bare names are used here.
select(mtcars, cyl)
select(mtcars, mpg:disp)


# Renaming ----
# select() with `new = old` renames and keeps only the variables you list
iris %>% select(petal_length = Petal.Length)

# rename() renames the variable and keeps all the others as well
iris %>% rename(petal_length = Petal.Length)

# mutate() ----
# Add new columns. A column created earlier in the same call can be used by
# later expressions (cyl4 is computed from cyl2).
mutate(as_tibble(mtcars), cyl2 = cyl * 2, cyl4 = cyl2 * 2)

mutate(
  as_tibble(mtcars),
  mpg = NULL,                # NULL drops the column, like `-` in select()
  disp = disp * 0.0163871    # overwrite an existing column with a computation
)

# mutate() vs transmute() ----
# mutate() keeps all existing variables
mutate(mtcars, displ_l = disp / 61.0237)

# transmute() keeps only the variables you create
transmute(mtcars, displ_l = disp / 61.0237)


# summarise() ----
# Collapse a (possibly grouped) data frame into summary rows. The examples
# compute the mean of disp and the number of rows.

# No grouping: a single summary row for the whole data frame
mtcars %>%
  summarise(mean = mean(disp), n = n())

# One summary row per value of cyl
mtcars %>%
  group_by(cyl) %>%
  summarise(mean = mean(disp), n = n())

# One summary row per (cyl, vs) combination.
# Arguments must be separated by an ASCII comma; a full-width (CJK) comma
# here is a parse error.
mtcars %>%
  group_by(cyl, vs) %>%
  summarise(cyl_n = n(), mean_disp = mean(disp))

posted @ 2017-10-14 11:37 orange1002 阅读(394) 评论(0) 编辑收藏举报

会员力量，点亮园子希望

刷新页面返回顶部

orange1002

R语言dplyr包初探

公告