COUNT 聚合在 Pig 中不起作用

时间:2018-04-09 08:26:42

标签: apache-pig

我是 Apache Pig 的新手,在编写 Pig 脚本时遇到了问题。我想统计向每位经理汇报的员工人数,但我认为下面这段代码没有给出正确的输出。希望能得到你们的帮助。

此处是源数据文件:

7369,SMITH,CLERK,7902,1980-12-17,800.00,NULL,20
7499,ALLEN,SALESMAN,7698,1981-02-20,1600.00,300.00,30
7521,WARD,SALESMAN,7698,1981-02-22,1250.00,500.00,30
7566,JONES,MANAGER,7839,1981-04-02,2975.00,NULL,20
7654,MARTIN,SALESMAN,7698,1981-09-28,1250.00,1400.00,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850.00,NULL,30
7782,CLARK,MANAGER,7839,1981-06-09,2450.00,NULL,10
7788,SCOTT,ANALYST,7566,1982-12-09,3000.00,NULL,20
7839,KING,PRESIDENT,NULL,1981-11-17,5000.00,NULL,10
7844,TURNER,SALESMAN,7698,1981-09-08,1500.00,0.00,30
7876,ADAMS,CLERK,7788,1983-01-12,1100.00,NULL,20
7900,JAMES,CLERK,7698,1981-12-03,950.00,NULL,30
7902,FORD,ANALYST,7566,1981-12-03,3000.00,NULL,20
7934,MILLER,CLERK,7782,1982-01-23,1300.00,NULL,10

这里是代码:

-- Question script (kept as posted): self-joins the employee file and counts
-- the joined rows per group, then prints the groups ordered by that count.

-- First copy of the employee file; its mgr column is the left join key below.
data_mgr = load '/users/Desktop/Employees.rtf' using 
PigStorage(',') as (empno:int, empname:chararray, job:chararray, 
mgr:int, hiredate:chararray, sal:float, comm:float, dept:int);

-- Second copy of the same file; its empno column is the right join key below.
data_emp = load '/users/Desktop/Employees.rtf' using 
PigStorage(',') as
(empno:int, empname:chararray, job:chararray, mgr:int, 
hiredate:chararray, sal:float, comm:float, dept:int);

-- Pairs every data_mgr row with the data_emp row whose empno equals its mgr.
-- NOTE(review): with these keys the data_mgr side is the reporting employee
-- and the data_emp side is that employee's manager, the opposite of what the
-- alias names suggest.
joined = join data_mgr by mgr, data_emp by empno;

-- NOTE(review): because of the join direction above, the columns labelled
-- mgrid/mgrname carry the reporting employee's number and name, and the
-- column labelled empno carries the manager's number.
select1 = foreach joined generate data_mgr::empno as mgrid, 
data_mgr::empname as mgrname, data_emp::empno as empno;

-- Groups by the first two projected columns (mgrid, mgrname).
-- NOTE(review): those columns identify the employee, and empno is unique in
-- the sample data, so each group presumably holds a single row.
grouped = group select1 by ($0, $1);

-- Emits the group key together with the number of rows in the group.
select2 = foreach grouped generate group, COUNT(select1) as 
no_of_reportees;

-- Largest counts first.
ordered = order select2 by no_of_reportees desc;

dump ordered;

1 个答案:

答案 0 :(得分:0)

试试这个,

 -- Counts the direct reports of every manager id found in the employee file
 -- and prints (mgr_id, no_of_reportees) rows, largest count first.
 emp_data = LOAD '/users/Desktop/Employees.rtf' USING PigStorage(',') AS (empno:int, empname:chararray, job:chararray, mgrid:int, hiredate:chararray, sal:float, comm:float, dept:int);
 -- The row without a manager (KING, whose mgr column is the text NULL) loads
 -- with a null mgrid; drop it so it does not form a spurious group of its own.
 emp_with_mgr = FILTER emp_data BY mgrid IS NOT NULL;
 mgr_group = GROUP emp_with_mgr BY mgrid;
 -- The alias used here must match the GROUP alias defined above (mgr_group),
 -- and the bag inside each group is named after the grouped relation.
 emp_count = FOREACH mgr_group GENERATE group AS mgr_id, COUNT(emp_with_mgr) AS no_of_reportees;
 emp_count_ordered = ORDER emp_count BY no_of_reportees DESC;
 DUMP emp_count_ordered;

注意:您可以再将结果与初始数据集做一次 JOIN,以获取经理的姓名。

你是指这样吗?(不过我还没有测试过)

-- Self-joins the employee file to print, for every manager, the manager's
-- id, the manager's name and the number of employees reporting to them.

-- Employee side of the self-join: one row per reporting employee.
data_emp = load '/users/Desktop/Employees.rtf' using PigStorage(',') as (empno:int, empname:chararray, job:chararray, mgrid:int, hiredate:chararray, sal:float, comm:float, dept:int);

-- Manager side of the self-join: the same file, looked up by empno.
data_mgr = load '/users/Desktop/Employees.rtf' using PigStorage(',') as (empno:int, empname:chararray, job:chararray, mgrid:int, hiredate:chararray, sal:float, comm:float, dept:int);

-- Match each employee's mgrid with the empno of the manager's own row, so
-- that data_mgr::* really describes the manager. Employees with a null mgrid
-- have no match in this inner join and are left out.
emp_mgr_join = join data_emp by mgrid, data_mgr by empno;

-- Keep the manager's id and name plus the reporting employee's id.
emp_mgr_join_sub = foreach emp_mgr_join generate data_mgr::empno as mgrid, data_mgr::empname as mgrname, data_emp::empno as empno;

-- Group on id and name together so the name is available as part of the key
-- instead of as a bag with one entry per reportee.
emp_mgr_grouped = group emp_mgr_join_sub by (mgrid, mgrname);

-- FLATTEN turns the (mgrid, mgrname) key tuple into two plain columns.
emp_mgr_count = foreach emp_mgr_grouped generate FLATTEN(group) as (mgr_id, mgr_name), COUNT(emp_mgr_join_sub) as no_of_reportees;

-- Largest teams first.
ordered = order emp_mgr_count by no_of_reportees desc;

dump ordered;