使用Apache Pig计算滚动日期值

时间:2014-11-07 04:48:05

标签: apache-pig

我们如何使用Apache Pig实现:

File :

A  2014/10/01
A  2014/09/01
A  2014/08/01
A  2014/02/01

结果应该是 A 的计数为 3,因为我想按组统计 A:以 30 天的滚动窗口为准,统计落在窗口内的记录数。

1 个答案:

答案 0 :(得分:0)

请找到解决方案,我希望您可以根据需要进行进一步的改进。尝试使用您的输入执行,并让我知道它是如何工作的。

**input.txt**

A 2014/12/01
A 2014/11/01
A 2014/10/01
A 2014/07/01
A 2014/05/01
A 2014/04/01
B 2014/09/01
B 2014/07/01
B 2014/06/01
B 2014/02/01
C 2014/09/01
C 2014/07/01
C 2014/05/01

预期输出

A 5
B 2
C 0

**PigScript:**

-- Make the mypackage.ROLLINGCOUNT UDF available to this script.
REGISTER rollingCount.jar;
-- Each input line is "<id> <yyyy/MM/dd>", separated by a single space.
A = LOAD 'input.txt' Using PigStorage(' ') AS (f1:chararray,f2:chararray);
B = GROUP A BY f1;
C = FOREACH B {
    -- The UDF compares neighbouring dates, and a bag has no guaranteed order,
    -- so sort explicitly. yyyy/MM/dd strings sort chronologically as text.
    sorted = ORDER A BY f2 DESC;
    -- Emit the group key together with the count so each result row is (id, count).
    -- BagToString joins every field of every tuple with its default '_' delimiter.
    GENERATE group, mypackage.ROLLINGCOUNT(BagToString(sorted)) AS rollingCnt;
};
DUMP C;

脚本的输出:

(A,5)
(B,2)
(C,0)

Java代码:
1. 编译以下 Java 代码,并打包成名为 rollingCount.jar 的 jar 文件。
我刚刚写了代码,你可以根据需要进行优化。

**ROLLINGCOUNT.java**

package mypackage;

import java.io.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import java.text.SimpleDateFormat;
import java.util.concurrent.TimeUnit;
import java.util.*;

/**
 * Pig UDF that counts, for one group, how many records belong to runs of
 * dates that follow each other at roughly one-month intervals.
 *
 * <p>The single argument is the string produced by {@code BagToString} on the
 * grouped bag, i.e. {@code id_date_id_date_...} with dates formatted as
 * {@code yyyy/MM/dd}. Two neighbouring dates form a "step" when they are
 * between {@value #MIN_STEP_DAYS} and {@value #MAX_STEP_DAYS} days apart;
 * a run of {@code k} consecutive steps contributes {@code k + 1} records
 * to the result.
 */
public class ROLLINGCOUNT extends EvalFunc<Integer> {

    /** Separator between fields in the flattened bag string. */
    private static final String FIELD_DELIMITER = "_";

    /** Format of the date field in the input. */
    private static final String DATE_PATTERN = "yyyy/MM/dd";

    /** Shortest gap (in days) still treated as a one-month step (February). */
    private static final long MIN_STEP_DAYS = 28;

    /** Longest gap (in days) still treated as a one-month step. */
    private static final long MAX_STEP_DAYS = 31;

    /**
     * Computes the rolling count for one flattened group.
     *
     * @param input tuple whose first field is the flattened bag string
     * @return the rolling count, or {@code null} when the tuple or its first
     *         field is missing
     * @throws IOException if the field cannot be read from the tuple
     */
    public Integer exec(Tuple input) throws IOException {
        if (input == null || input.size() == 0 || input.get(0) == null) {
            return null;
        }

        String inputString = (String) input.get(0);
        Date[] arrayOfDates = getArrayOfDate(inputString);

        // Only neighbouring dates are compared, so put them in chronological
        // order rather than trusting the order in which the bag was flattened.
        Arrays.sort(arrayOfDates);

        long[] diffDays = getDaysBetweenList(arrayOfDates);
        return getRollingCount(diffDays);
    }

    /**
     * Extracts the dates from a flattened {@code id_date_id_date_...} string.
     *
     * <p>Assumes every tuple contributed exactly two fields (id, then date),
     * so the dates sit at the odd positions. Tokens that are not valid
     * {@code yyyy/MM/dd} dates are skipped, so the result never contains
     * {@code null} elements.
     *
     * @param inputString flattened bag string; may be {@code null}
     * @return the parsed dates in input order, possibly empty
     */
    static protected Date[] getArrayOfDate(String inputString)
    {
        if (inputString == null) {
            return new Date[0];
        }

        String[] tokens = inputString.split(FIELD_DELIMITER);

        SimpleDateFormat dFormat = new SimpleDateFormat(DATE_PATTERN);
        // Reject impossible dates such as 2014/02/31 instead of rolling them over.
        dFormat.setLenient(false);
        // Parse in UTC so every calendar day is exactly 24 hours long; with a
        // DST-observing default zone a gap could lose an hour and be
        // truncated to one day less.
        dFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

        List<Date> dates = new ArrayList<Date>(tokens.length / 2);
        // Picking the date by position (not by stripping "id_") keeps ids that
        // happen to match the tail of a date, e.g. "1", from corrupting dates.
        for (int pos = 1; pos < tokens.length; pos += 2) {
            try {
                dates.add(dFormat.parse(tokens[pos]));
            } catch (java.text.ParseException ignored) {
                // Best effort: a malformed date is left out of the calculation.
            }
        }
        return dates.toArray(new Date[dates.size()]);
    }

    /**
     * Returns the absolute number of whole days between each pair of
     * neighbouring dates.
     *
     * @param arrayOfDate dates to compare; elements must be non-null
     * @return an array with one entry per neighbouring pair; empty when fewer
     *         than two dates are given
     */
    static protected long[] getDaysBetweenList(Date[] arrayOfDate)
    {
        if (arrayOfDate == null || arrayOfDate.length < 2) {
            return new long[0];
        }

        long[] diffDays = new long[arrayOfDate.length - 1];
        for (int i = 0; i < diffDays.length; i++) {
            long diffMillis = Math.abs(arrayOfDate[i + 1].getTime() - arrayOfDate[i].getTime());
            diffDays[i] = TimeUnit.MILLISECONDS.toDays(diffMillis);
        }
        return diffDays;
    }

    /**
     * Sums the sizes of all runs of consecutive one-month steps.
     *
     * <p>A run of {@code k} qualifying gaps spans {@code k + 1} records, so
     * each run adds {@code k + 1} to the total.
     *
     * @param diffDays day gaps between neighbouring dates; may be {@code null}
     * @return total number of records that are part of such runs
     */
    static protected int getRollingCount(long diffDays[])
    {
        if (diffDays == null) {
            return 0;
        }

        int result = 0;
        int runLength = 0;
        for (long gap : diffDays) {
            if (gap >= MIN_STEP_DAYS && gap <= MAX_STEP_DAYS) {
                runLength++;
                continue;
            }
            // The run (if any) ended at the previous gap; account for it.
            if (runLength > 0) {
                result += runLength + 1;
            }
            runLength = 0;
        }
        // Account for a run that reaches the end of the array.
        if (runLength > 0) {
            result += runLength + 1;
        }
        return result;
    }
}