pig笔记

请原谅我只是把这里当作笔记来写,下面是最近常用的几个脚本。

1.基本使用

-- Per-account processing of raw card-swipe records with the StandartTrjn UDF.
-- Input : CSV rows of (account, des, jntime), all read as chararray.
-- Output: the tuples StandartTrjn emits for each account's ordered swipes,
--         written as comma-separated text.
-- NOTE(review): the UDF source is not part of this script; the original note
-- describes its output as "one card-swipe record" -- confirm against the UDF.
REGISTER /home/vlab/ykt/StandartTrjn.jar;
DEFINE StandartTrjn com.zhangdan.pig.StandartTrjn();
data_load = load 'yktdata/pid06/pid06_09.csv' using PigStorage(',') as (account:chararray,des:chararray,jntime:chararray);
-- Collect every swipe of the same account into one bag.
data_group = group data_load by account;
-- One card-swipe record per output row (per the original note).
data_seq = foreach data_group{
    -- jntime is a chararray, so this is a string ordering; assumes the
    -- timestamp format sorts chronologically as text -- TODO confirm.
    sorted = order data_load by jntime;
    -- The UDF receives the whole ordered bag; its result is flattened.
    generate flatten(StandartTrjn(sorted));
};
store data_seq into 'yktdata/pid06_result09/standart' using PigStorage(',');


---------- Generate all relationship pairs
-- Reads the standardized records, groups them by calendar day and location
-- (year, month, day, des), and hands each group's de-duplicated records,
-- ordered by jnstart, to the GetConnection UDF.
-- NOTE(review): what GetConnection emits is not visible in this script;
-- presumably one row per pair of accounts found in the same group -- confirm.
REGISTER /home/vlab/ykt/GetConnection.jar;
DEFINE GetConnection com.zhangdan.ykt.GetConnection();
data_load = load 'yktdata/pid06_result09/standart/part-r-*' using PigStorage(',') as (account:chararray,des:chararray,year:chararray,month:chararray,day:chararray,jnstart:chararray,jnend:chararray,times:int);
data_group = group data_load by (year,month,day,des);
data_seq = foreach data_group{
   -- Drop exact duplicate records inside the group before pairing.
   valid = distinct data_load;
   -- jnstart is a chararray, so this is a string ordering.
   sorted = order valid by jnstart;
   generate flatten(GetConnection(sorted));
};
-- Optional de-duplication of the generated pairs (left disabled by the author).
--data_dis = distinct data_seq;
store data_seq into 'yktdata/pid06_result09/connection' using PigStorage(',');
-------- Count how many times each pair of accounts met
-- Reads the (account1, account2, jntime1, jntime2) connection rows, counts the
-- rows for every (account1, account2) pair, and stores the pairs ordered from
-- the highest count to the lowest.
connections = LOAD 'yktdata/pid06_result09/connection/part-r-*' USING PigStorage(',')
    AS (account1:chararray,account2:chararray,jntime1:chararray,jntime2:chararray);
by_pair = GROUP connections BY (account1,account2);
-- FLATTEN(group) expands the grouping key back into its two account columns.
pair_counts = FOREACH by_pair GENERATE FLATTEN(group), COUNT(connections) AS cc;
ranked = ORDER pair_counts BY cc DESC;
STORE ranked INTO 'yktdata/pid06_result09/connectioncount' USING PigStorage(',');

2.这个是大师姐给我提供的,将两条相连记录合并

-- Joins each card swipe with the next swipe of the same account.
-- Input : CSV rows of (account, des, jntime), all read as chararray.
-- Output: account, des of the first swipe, its time, des of the next swipe,
--         its time, and the value returned by the getSecondtime UDF.
REGISTER /home/vlab/markovPairsjar/datafu-1.2.0.jar;
-- Key piece: the DataFu UDF that pairs adjacent tuples of an ordered bag.
DEFINE MarkovPairs datafu.pig.stats.MarkovPairs();
REGISTER /home/vlab/ykt/gettimebysecond.jar;
DEFINE getSecondtime com.zhangdan.pig.GetTimebySecond();
data_load = load 'yktdata/pid06_10.csv' using PigStorage(',') as (account:chararray,des:chararray,jntime:chararray);
data_group = group data_load by account;
-- Join two consecutive swipes: card number, location, swipe time, next swipe time.
data_seq = foreach data_group{
    -- jntime is a chararray, so this is a string ordering.
    sorted = order data_load by jntime;
    -- Key line: turn the ordered bag into (elem1, elem2) pairs of neighbours.
    pair = MarkovPairs(sorted);
    generate flatten(pair) as (elem1:TUPLE(account:chararray,des:chararray,jntime:chararray),elem2:TUPLE(account:chararray,des:chararray,jntime:chararray));
};  -- end of joining consecutive swipes

-- Card number, location, swipe time, next swipe time, time difference.
-- NOTE(review): getSecondtime presumably returns the gap in seconds (the
-- disabled filter below compares it with 5*60) -- confirm against the UDF.
-- It receives the untrimmed jntime values, while the stored ones are RTRIM'd.
data_long = foreach data_seq{
     generate elem1.account as account,elem1.des as des1,RTRIM(elem1.jntime) as jnstart,elem2.des as des2,RTRIM(elem2.jntime) as jnen,getSecondtime(elem1.jntime,elem2.jntime) as resu;
};
-- Optional filter keeping only pairs whose difference is below 5*60 (disabled).
--data_result = filter data_long by resu<5*60;


-- Alternative output without the time difference (disabled).
--data_result = foreach data_seq generate flatten(elem1),flatten(elem2);
store data_long into 'yktdata/combine' using PigStorage(',');

Pig 对于刚开始处理大量数据的人来讲真的方便很多,对于不擅长写代码的同学更是容易不少。

讲真,掌握一门语言(如 Java 或者 Python)应该可以帮我们得到任意形式的数据,千万不要仅仅依赖 Pig。

posted @ 2016-03-16 09:25  寻影4_2  阅读(162)  评论(0)  编辑  收藏  举报