Data Manipulation with dplyr in R


Data Manipulation with dplyr in R

select

select(data,变量名)

The filter and arrange verbs

arrange

counties_selected <- counties %>%
  select(state, county, population, private_work, public_work, self_employed)

# Add a verb to sort in descending order of public_work
counties_selected %>%arrange(desc(public_work))

filter

counties_selected <- counties %>%
  select(state, county, population)

# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
  filter(state == "California",
         population > 1000000)
#筛选多个变量
filter(id %in% c("a","b","c"...)) 存在
filter(id %in% c("a","b","c"...)) 不存在

fct_relevel

Reorder factor levels by hand
排序,order不好使的时候

f <- factor(c("a", "b", "c", "d"), levels = c("b", "c", "d", "a"))
fct_relevel(f)
fct_relevel(f, "a")
fct_relevel(f, "b", "a")

# Move to the third position
fct_relevel(f, "a", after = 2)

# Relevel to the end
fct_relevel(f, "a", after = Inf)
fct_relevel(f, "a", after = 3)

# Revel with a function
fct_relevel(f, sort)
fct_relevel(f, sample)
fct_relevel(f, rev)

Filtering and arranging

 counties_selected <- counties %>%
    select(state, county, population, private_work, public_work, self_employed)
> 
> # Filter for Texas and more than 10000 people; sort in descending order of private_work
> counties_selected %>%filter(state=='Texas',population>10000)%>%arrange(desc(private_work))
# A tibble: 169 x 6
   state county  population private_work public_work self_employed
   <chr> <chr>        <dbl>        <dbl>       <dbl>         <dbl>
 1 Texas Gregg       123178         84.7         9.8           5.4
 2 Texas Collin      862215         84.1        10             5.8
 3 Texas Dallas     2485003         83.9         9.5           6.4
 4 Texas Harris     4356362         83.4        10.1           6.3
 5 Texas Andrews      16775         83.1         9.6           6.8
 6 Texas Tarrant    1914526         83.1        11.4           5.4
 7 Texas Titus        32553         82.5        10             7.4
 8 Texas Denton      731851         82.2        11.9           5.7
 9 Texas Ector       149557         82          11.2           6.7
10 Texas Moore        22281         82          11.7           5.9
# ... with 159 more rows

Mutate

counties_selected <- counties %>%
  select(state, county, population, public_work)

# Sort in descending order of the public_workers column
counties_selected %>%
  mutate(public_workers = public_work * population / 100) %>%arrange(desc(public_workers))
counties %>%
  # Select the five columns 
  select(state, county, population, men, women) %>%
  # Add the proportion_men variable
  mutate(proportion_men = men / population) %>%
  # Filter for population of at least 10,000
  filter(population >= 10000) %>% 
  # Arrange proportion of men in descending order 
  arrange(desc(proportion_men))

The count verb

counties_selected %>%count(region,sort=TRUE)
counties_selected %>%count(state,wt=citizens,sort=TRUE)

Summarizing

# Summarize to find minimum population, maximum unemployment, and average income
counties_selected %>%summarize(
min_population=min(population),
max_unemployment=max(unemployment),
average_income=mean(income)
)
# Add a density column, then sort in descending order
counties_selected %>%
  group_by(state) %>%
  summarize(total_area = sum(land_area),
            total_population = sum(population),
            density=total_population/total_area) %>%arrange(desc(density))

发现了,归根到底是一种函数关系,看看该怎样处理这个函数比较简单,如果写不出来,可能和小学的时候应用题写不出来有关系

top_n

按照优先级来筛选

# Extract the most populated row for each state
counties_selected %>%
  group_by(state, metro) %>%
  summarize(total_pop = sum(population)) %>%
  top_n(1, total_pop)

Selecting

Using the select verb, we can answer interesting questions about our dataset by focusing in on related groups of verbs.
The colon (😃 is useful for getting many columns at a time.

In the video you learned about the select helper starts_with(). Another select helper is ends_with(), which finds the columns that end with a particular string.

counties %>%
  # Select the state, county, population, and those ending with "work"
  select(state, county, population, ends_with("work")) %>%
  # Filter for counties that have at least 50% of people engaged in public work
  filter(public_work >= 50)

我觉得这种简单的逻辑关系不应该出错,但是老是出错。。是我真的不太适合做编程这一行嘛?

rename

rename()进行重命名

# Rename the n column to num_counties
counties %>%
  count(state)%>%rename(num_counties=n)

也可以在select的时候直接重命名

 # Select state, county, and poverty as poverty_rate
> counties %>%select(state,county,poverty_rate=poverty)
# A tibble: 3,138 x 3
   state   county   poverty_rate
   <chr>   <chr>           <dbl>
 1 Alabama Autauga          12.9
 2 Alabama Baldwin          13.4
 3 Alabama Barbour          26.7
 4 Alabama Bibb             16.8
 5 Alabama Blount           16.7
 6 Alabama Bullock          24.6
 7 Alabama Butler           25.4
 8 Alabama Calhoun          20.5
 9 Alabama Chambers         21.6
10 Alabama Cherokee         19.2
# ... with 3,128 more rows

transmute

combination select & mutate
类似于mutate,添加新列但是只保留新列,删掉旧列
官方解释: use to calculate new columns while dropping other columns

counties %>%
  # Keep the state, county, and populations columns, and add a density column
  transmute(state, county, population, density = population / land_area) %>%
  # Filter for counties with a population greater than one million 
  filter(population > 1000000) %>%
  # Sort density in ascending order 
  arrange(density

这个解释挺好的

给出一个综合的例子

> # Change the name of the unemployment column
> counties %>%
    rename(unemployment_rate = unemployment)
# A tibble: 3,138 x 40
   census_id state county region metro population   men women hispanic white
   <chr>     <chr> <chr>  <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl>
 1 1001      Alab~ Autau~ South  Metro      55221 26745 28476      2.6  75.8
 2 1003      Alab~ Baldw~ South  Metro     195121 95314 99807      4.5  83.1
 3 1005      Alab~ Barbo~ South  Nonm~      26932 14497 12435      4.6  46.2
 4 1007      Alab~ Bibb   South  Metro      22604 12073 10531      2.2  74.5
 5 1009      Alab~ Blount South  Metro      57710 28512 29198      8.6  87.9
 6 1011      Alab~ Bullo~ South  Nonm~      10678  5660  5018      4.4  22.2
 7 1013      Alab~ Butler South  Nonm~      20354  9502 10852      1.2  53.3
 8 1015      Alab~ Calho~ South  Metro     116648 56274 60374      3.5  73  
 9 1017      Alab~ Chamb~ South  Nonm~      34079 16258 17821      0.4  57.3
10 1019      Alab~ Chero~ South  Nonm~      26008 12975 13033      1.5  91.7
# ... with 3,128 more rows, and 30 more variables: black <dbl>, native <dbl>,
#   asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
#   income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
#   child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
#   construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
#   transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
#   mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
#   self_employed <dbl>, family_work <dbl>, unemployment_rate <dbl>,
#   land_area <dbl>
> 
> # Keep the state and county columns, and the columns containing poverty
> counties %>%
    select(state, county, contains("poverty"))
# A tibble: 3,138 x 4
   state   county   poverty child_poverty
   <chr>   <chr>      <dbl>         <dbl>
 1 Alabama Autauga     12.9          18.6
 2 Alabama Baldwin     13.4          19.2
 3 Alabama Barbour     26.7          45.3
 4 Alabama Bibb        16.8          27.9
 5 Alabama Blount      16.7          27.2
 6 Alabama Bullock     24.6          38.4
 7 Alabama Butler      25.4          39.2
 8 Alabama Calhoun     20.5          31.6
 9 Alabama Chambers    21.6          37.2
10 Alabama Cherokee    19.2          30.1
# ... with 3,128 more rows
> 
> # Calculate the fraction_women column without dropping the other columns
> counties %>%
    mutate(fraction_women = women / population)
# A tibble: 3,138 x 41
   census_id state county region metro population   men women hispanic white
   <chr>     <chr> <chr>  <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl>
 1 1001      Alab~ Autau~ South  Metro      55221 26745 28476      2.6  75.8
 2 1003      Alab~ Baldw~ South  Metro     195121 95314 99807      4.5  83.1
 3 1005      Alab~ Barbo~ South  Nonm~      26932 14497 12435      4.6  46.2
 4 1007      Alab~ Bibb   South  Metro      22604 12073 10531      2.2  74.5
 5 1009      Alab~ Blount South  Metro      57710 28512 29198      8.6  87.9
 6 1011      Alab~ Bullo~ South  Nonm~      10678  5660  5018      4.4  22.2
 7 1013      Alab~ Butler South  Nonm~      20354  9502 10852      1.2  53.3
 8 1015      Alab~ Calho~ South  Metro     116648 56274 60374      3.5  73  
 9 1017      Alab~ Chamb~ South  Nonm~      34079 16258 17821      0.4  57.3
10 1019      Alab~ Chero~ South  Nonm~      26008 12975 13033      1.5  91.7
# ... with 3,128 more rows, and 31 more variables: black <dbl>, native <dbl>,
#   asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
#   income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
#   child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
#   construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
#   transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
#   mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
#   self_employed <dbl>, family_work <dbl>, unemployment <dbl>,
#   land_area <dbl>, fraction_women <dbl>
> 
> # Keep only the state, county, and employment_rate columns
> counties %>%
    transmute(state, county, employment_rate = employed / population)
# A tibble: 3,138 x 3
   state   county   employment_rate
   <chr>   <chr>              <dbl>
 1 Alabama Autauga            0.434
 2 Alabama Baldwin            0.441
 3 Alabama Barbour            0.319
 4 Alabama Bibb               0.367
 5 Alabama Blount             0.384
 6 Alabama Bullock            0.362
 7 Alabama Butler             0.384
 8 Alabama Calhoun            0.406
 9 Alabama Chambers           0.402
10 Alabama Cherokee           0.390
# ... with 3,128 more rows

貌似忘记%in%符号的使用了,复习一下啊

# Filter for the names Steven, Thomas, and Matthew 
selected_names <- babynames %>%
  filter(name %in% c("Steven", "Thomas", "Matthew"))

Grouped mutates

这个就是两两组合之前的例子中有的

Window functions

posted @ 2019-12-29 21:16  高文星星  阅读(1266)  评论(0编辑  收藏  举报