我有一个包含3个字段(id、name和post_num)的包(bag),我想删除post_num小于该名称平均post_num的元组。例如,我有4条数据如下:
1, Dav, 5
2, Dav, 6
3, Dav, 4
4, Ed, 1
然后第三个数据应该被删除,因为Dav的平均post_num是5。
我可以不使用UDF吗?
答案 0 :(得分:1)
-- ## Suppose you have
-- 1000,SMITH,123
-- 1001,JOHN,452
-- 1002,TWAIN,125
-- 1003,HARDY,124
-- 1004,CHILD,785
-- 1005,CHILD,639
-- 1006,DAVIS,89
-- 1007,DAVIS,173
-- 1008,MIKE,420
-- 1009,DENNIS,562
-- 1010,CHILD,638
-- ## Then try this on Pig CLI:
-- Goal: keep only the rows whose post_num is at or above the average post_num
-- of all rows sharing the same name, using built-in operators only (no UDF).

-- Read the comma-separated rows with an explicit (id, name, post_num) schema.
data = LOAD '/mnt/e_drive/temp/csdata.csv'
       USING PigStorage(',')
       AS (id:int, name:chararray, post_num:int);
-- data: {id: int,name: chararray,post_num: int}

-- Collect the rows of each name into one bag so a per-name average can be taken.
byName = GROUP data BY name;
-- byName: {group: chararray,data: {(id: int,name: chararray,post_num: int)}}

-- Un-nest every bag back into individual rows, attaching the bag's average
-- post_num to each of them.
avgData = FOREACH byName GENERATE
              FLATTEN(data),
              AVG(data.post_num) AS avg_post_num;
-- avgData: {data::id: int,data::name: chararray,data::post_num: int,avg_post_num: double}

-- Drop the rows that fall below their name's average; post_num is cast to
-- double so both sides of the comparison have the same type.
atOrAboveAvg = FILTER avgData BY avg_post_num <= (double) data::post_num;
-- atOrAboveAvg: {data::id: int,data::name: chararray,data::post_num: int,avg_post_num: double}

-- Project away the helper average column and restore the plain field names.
requiredData = FOREACH atOrAboveAvg GENERATE
                   data::id       AS id,
                   data::name     AS name,
                   data::post_num AS post_num;
-- requiredData: {id: int,name: chararray,post_num: int}

-- Debugging aid: every row together with the average of its name group.
DUMP avgData;
-- Output recorded for the sample data:
-- (1001,JOHN,452,452.0)
-- (1008,MIKE,420,420.0)
-- (1010,CHILD,638,687.3333333333334)
-- (1005,CHILD,639,687.3333333333334)
-- (1004,CHILD,785,687.3333333333334)
-- (1007,DAVIS,173,131.0)
-- (1006,DAVIS,89,131.0)
-- (1003,HARDY,124,124.0)
-- (1000,SMITH,123,123.0)
-- (1002,TWAIN,125,125.0)
-- (1009,DENNIS,562,562.0)

-- Final result: only the rows at or above their name's average remain.
DUMP requiredData;
-- Output recorded for the sample data:
-- (1001,JOHN,452)
-- (1008,MIKE,420)
-- (1004,CHILD,785)
-- (1007,DAVIS,173)
-- (1003,HARDY,124)
-- (1000,SMITH,123)
-- (1002,TWAIN,125)
-- (1009,DENNIS,562)