R-dplyr相关函数介绍

install.packages('dplyr')
library(dplyr)
> order=read.csv('order.csv',header = T)
> head(order)
  orderid customerid campaignid  orderdate         city state zipcode
1 1002854      45978       2141 2009/10/13       NEWTON    MA    2459
2 1002855     125381       2173 2009/10/13 NEW ROCHELLE    NY   10804
3 1002856     103122       2141   2011/6/2        MIAMI    FL   33137
4 1002857     130980       2173 2009/10/14 E RUTHERFORD    NJ    7073
5 1002886      48553       2141 2010/11/19    BALTIMORE    MD   21218
6 1002887     106150       2173 2009/10/15     ROWAYTON    CT    6853
  paymenttype totalprice numorderlines numunits
1          VI     190.00             3        3
2          VI      10.00             1        1
3          AE      35.22             2        2
4          AE      10.00             1        1
5          VI      10.00             1        1
6          AE      10.00             1        1

 1、去重distinct

> order<-rbind(order,order[1,])#合并一行
> nrow(order)
[1] 100001
> order<-distinct(order)#去重
> nrow(order)
[1] 100000

2、抽样

> nrow(sample_frac(order,0.5))#按照百分之五十抽样
[1] 50000
> nrow(sample_n(order,5))#按照记录数抽样
[1] 5
注意:caret包里的createDataPartition分层抽样
> createDataPartition(order$orderid,p = 0.75,list = F)

3、返回最高记录top_n

> top_n(order,3,totalprice)#数据框、返回最高的3条、按照总价
orderid customerid campaignid orderdate city state zipcode
1 1063586 82645 2141 2012/12/12 NEW YORK NY 10012
2 1054380 88800 2141 2012/11/12 NEW YORK NY 10123
3 1061106 72141 2237 2012/12/12 BIRMINGHAM AL 35203
paymenttype totalprice numorderlines numunits
1 AE 6568.62 14 28
2 AE 6606.00 15 15
3 DB 6780.00 1 400

4、筛选

> head(filter(order,city=='NEWTON'))#等价于order[order$city=='NEWTON',]
  orderid customerid campaignid  orderdate   city state zipcode paymenttype
1 1002854      45978       2141 2009/10/13 NEWTON    MA    2459          VI
2 1007294     163109       2141 2010/11/29 NEWTON    MA    2464          MC
3 1003820      43366       2173   2010/4/9 NEWTON    MA    2465          VI
4 1003183      43834       2141  2011/1/12 NEWTON    MA    2465          MC
5 1005403     108758       2141  2011/1/17 NEWTON    MA    2468          VI
6 1013951      25677       2141  2010/8/16 NEWTON    MA    2459          VI
  totalprice numorderlines numunits
1     190.00             3        3
2      67.50             1        1
3      19.68             1        1
4      67.50             4        4
5      20.25             1        1
6     175.00             1        1
> head(filter(order,city %in% c('NEWTON','MIAMI')))#A%in%B 
  orderid customerid campaignid  orderdate   city state zipcode paymenttype
1 1002854      45978       2141 2009/10/13 NEWTON    MA    2459          VI
2 1002856     103122       2141   2011/6/2  MIAMI    FL   33137          AE
3 1003630      42957       2141 2010/11/23  MIAMI    FL   33156          AE
4 1003173     123834       2141  2011/1/13  MIAMI    FL   33131          AE
5 1004168     112887       2236  2011/1/14  MIAMI    FL   33143          VI
6 1005959     125317       2173 2010/11/24  MIAMI    FL   33143          AE
补充一点:A%in%B 
> c('a','b') %in% c('a','c','b')
[1] TRUE TRUE
> c('a', 'c', 'b') %in%  c('a', 'b')
[1]  TRUE FALSE  TRUE

5、select子集

> head(select(order,city))#也可用来重命名select(order,campid=campaigid)
          city
1       NEWTON
2 NEW ROCHELLE
3        MIAMI
4 E RUTHERFORD
5    BALTIMORE
6     ROWAYTON

选择以某些字符串开头的列

> head(select(order,starts_with('order')))
  orderid  orderdate
1 1002854 2009/10/13
2 1002855 2009/10/13
3 1002856   2011/6/2
4 1002857 2009/10/14
5 1002886 2010/11/19
6 1002887 2009/10/15

同样的用法有:ends_with(‘id’),contains(‘date’)

6、arrange排序

 

> head(arrange(order,orderdate))#按日期升序排列
> head(arrange(order,desc(orderdate)))#按日期降序排列

7、敲黑板:summarize

%>% 将前一步的结果传到下一步summarise中以.来指代上一步结果,实现在R中的数据透视需求
> group_by(order, state) %>% summarise(., n = n(), max = max(totalprice), min = min(totalprice), avg = mean(totalprice))
# A tibble: 81 × 5
    state     n     max   min       avg
   <fctr> <int>   <dbl> <dbl>     <dbl>
1           381 2025.00     0  75.64039
2      AA     7  195.00    10  73.79571
3      AB    54  509.35     0  39.26037
4      AE    20  173.55    10  42.04650
5      AK    54  195.00     0  45.52481
6      AL   269 6780.00     0 103.23814
7      AP    10  204.93    10  52.33800
8      AR   115 1350.00     0  59.08183
9      AZ  1063 2795.00     0  60.68234
10     BC   115  625.00     0  50.39348
# ... with 71 more rows

 

 

 

 
posted @ 2017-07-13 16:42  积水成渊数据分析  阅读(544)  评论(0编辑  收藏  举报