优化Pig请求

时间:2013-08-09 15:00:14

标签: optimization hadoop apache-pig

我想在嵌入式java程序中执行pig命令。暂时,我在本地模式下尝试Pig。我的数据文件大小约为15MB,但执行此命令的时间很长,所以我认为我的脚本需要优化...

我的脚本:

-- Load the input records through the custom PigPrismeLoader UDF
-- ('data.xml' is passed to the loader as its constructor argument).
A = LOAD 'data' USING PigPrismeLoader('data.xml');
-- Split A into three contiguous response-time buckets:
--   [.., 1000.0), [1000.0, 2000.0), [2000.0, ..)
-- NOTE(review): the thresholds are presumably milliseconds (the alias names
-- speak of 1 s and 2 s) -- confirm against the loader's schema.
filter_response_time_less_than_1_s = FILTER A BY (response_time < 1000.0);
-- The upper bound is 2000.0 (previously 1999.0) so that rows with a
-- response_time in [1999.0, 2000.0) -- e.g. exactly 1999 -- are no longer
-- left out of every bucket while still being counted in nb_hit.
filter_response_time_between_1_s_and_2_s = FILTER A BY (response_time >= 1000.0 AND response_time < 2000.0);
filter_response_time_between_greater_than_2_s = FILTER A BY (response_time >= 2000.0);
-- Finest-grained aggregate: for each (date_day, url, date_minute, ret_code, serveur)
-- key, emit the summed response_time of A, the number of rows in each
-- response-time bucket relation, and the number of rows of A (nb_hit).
-- The flattened group key is renamed to the zne_* output column names.
star__zne_asfo_access_log = FOREACH (
        COGROUP
                A BY (date_day,url,date_minute,ret_code,serveur),
                filter_response_time_between_greater_than_2_s BY (date_day,url,date_minute,ret_code,serveur),
                filter_response_time_less_than_1_s BY (date_day,url,date_minute,ret_code,serveur),
                filter_response_time_between_1_s_and_2_s BY (date_day,url,date_minute,ret_code,serveur)
) GENERATE
        FLATTEN(group) AS (date_day,zne_asfo_url,date_minute,zne_http_code,zne_asfo_server),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, date_year, date_month): summed response_time
-- of A, row count of each response-time bucket relation, and row count of A.
agg__zne_asfo_access_log_ymd = FOREACH (
        COGROUP
                A BY (date_day,date_year,date_month),
                filter_response_time_between_greater_than_2_s BY (date_day,date_year,date_month),
                filter_response_time_less_than_1_s BY (date_day,date_year,date_month),
                filter_response_time_between_1_s_and_2_s BY (date_day,date_year,date_month)
) GENERATE
        FLATTEN(group) AS (date_day,date_year,date_month),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, url, date_year, date_month): summed
-- response_time of A, row count of each response-time bucket relation, and
-- row count of A. The url key is emitted as zne_asfo_url.
agg__zne_asfo_access_log_ymd_ret_url = FOREACH (
        COGROUP
                A BY (date_day,url,date_year,date_month),
                filter_response_time_between_greater_than_2_s BY (date_day,url,date_year,date_month),
                filter_response_time_less_than_1_s BY (date_day,url,date_year,date_month),
                filter_response_time_between_1_s_and_2_s BY (date_day,url,date_year,date_month)
) GENERATE
        FLATTEN(group) AS (date_day,zne_asfo_url,date_year,date_month),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, ret_code, date_year, date_month): summed
-- response_time of A, row count of each response-time bucket relation, and
-- row count of A. The ret_code key is emitted as zne_http_code.
agg__zne_asfo_access_log_ymd_ret_code = FOREACH (
        COGROUP
                A BY (date_day,ret_code,date_year,date_month),
                filter_response_time_between_greater_than_2_s BY (date_day,ret_code,date_year,date_month),
                filter_response_time_less_than_1_s BY (date_day,ret_code,date_year,date_month),
                filter_response_time_between_1_s_and_2_s BY (date_day,ret_code,date_year,date_month)
) GENERATE
        FLATTEN(group) AS (date_day,zne_http_code,date_year,date_month),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, url, date_year, date_month, serveur): summed
-- response_time of A, row count of each response-time bucket relation, and
-- row count of A. url / serveur are emitted as zne_asfo_url / zne_asfo_server.
agg__zne_asfo_access_log_ymd_ret_url_server = FOREACH (
        COGROUP
                A BY (date_day,url,date_year,date_month,serveur),
                filter_response_time_between_greater_than_2_s BY (date_day,url,date_year,date_month,serveur),
                filter_response_time_less_than_1_s BY (date_day,url,date_year,date_month,serveur),
                filter_response_time_between_1_s_and_2_s BY (date_day,url,date_year,date_month,serveur)
) GENERATE
        FLATTEN(group) AS (date_day,zne_asfo_url,date_year,date_month,zne_asfo_server),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, ret_code, date_year, date_month, serveur):
-- summed response_time of A, row count of each response-time bucket relation,
-- and row count of A. ret_code / serveur are emitted as zne_http_code /
-- zne_asfo_server.
agg__zne_asfo_access_log_ymd_ret_code_server = FOREACH (
        COGROUP
                A BY (date_day,ret_code,date_year,date_month,serveur),
                filter_response_time_between_greater_than_2_s BY (date_day,ret_code,date_year,date_month,serveur),
                filter_response_time_less_than_1_s BY (date_day,ret_code,date_year,date_month,serveur),
                filter_response_time_between_1_s_and_2_s BY (date_day,ret_code,date_year,date_month,serveur)
) GENERATE
        FLATTEN(group) AS (date_day,zne_http_code,date_year,date_month,zne_asfo_server),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, date_minute, date_year, date_month, serveur):
-- summed response_time of A, row count of each response-time bucket relation,
-- and row count of A. The serveur key is emitted as zne_asfo_server.
agg__zne_asfo_access_log_ymdi_server = FOREACH (
        COGROUP
                A BY (date_day,date_minute,date_year,date_month,serveur),
                filter_response_time_between_greater_than_2_s BY (date_day,date_minute,date_year,date_month,serveur),
                filter_response_time_less_than_1_s BY (date_day,date_minute,date_year,date_month,serveur),
                filter_response_time_between_1_s_and_2_s BY (date_day,date_minute,date_year,date_month,serveur)
) GENERATE
        FLATTEN(group) AS (date_day,date_minute,date_year,date_month,zne_asfo_server),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, url, date_minute, date_year, date_month):
-- summed response_time of A, row count of each response-time bucket relation,
-- and row count of A. The url key is emitted as zne_asfo_url.
agg__zne_asfo_access_log_ymdhi_url = FOREACH (
        COGROUP
                A BY (date_day,url,date_minute,date_year,date_month),
                filter_response_time_between_greater_than_2_s BY (date_day,url,date_minute,date_year,date_month),
                filter_response_time_less_than_1_s BY (date_day,url,date_minute,date_year,date_month),
                filter_response_time_between_1_s_and_2_s BY (date_day,url,date_minute,date_year,date_month)
) GENERATE
        FLATTEN(group) AS (date_day,zne_asfo_url,date_minute,date_year,date_month),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Aggregate keyed by (date_day, date_minute, date_year, date_month): summed
-- response_time of A, row count of each response-time bucket relation, and
-- row count of A.
agg__zne_asfo_access_log_ymdhi = FOREACH (
        COGROUP
                A BY (date_day,date_minute,date_year,date_month),
                filter_response_time_between_greater_than_2_s BY (date_day,date_minute,date_year,date_month),
                filter_response_time_less_than_1_s BY (date_day,date_minute,date_year,date_month),
                filter_response_time_between_1_s_and_2_s BY (date_day,date_minute,date_year,date_month)
) GENERATE
        FLATTEN(group) AS (date_day,date_minute,date_year,date_month),
        (long)SUM((bag{tuple(long)})A.response_time) AS response_time,
        COUNT(filter_response_time_less_than_1_s) AS response_time_less_than_1_s,
        COUNT(filter_response_time_between_1_s_and_2_s) AS response_time_between_1_s_and_2_s,
        COUNT(filter_response_time_between_greater_than_2_s) AS response_time_between_greater_than_2_s,
        COUNT(A) AS nb_hit;
-- Write each aggregate relation to an output location named after its alias,
-- using PigStorage with a tab ('\t') field delimiter and the '-schema' option.
STORE star__zne_asfo_access_log INTO 'star__zne_asfo_access_log' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymd INTO 'agg__zne_asfo_access_log_ymd' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymd_ret_url INTO 'agg__zne_asfo_access_log_ymd_ret_url' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymd_ret_code INTO 'agg__zne_asfo_access_log_ymd_ret_code' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymd_ret_url_server INTO 'agg__zne_asfo_access_log_ymd_ret_url_server' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymd_ret_code_server INTO 'agg__zne_asfo_access_log_ymd_ret_code_server' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymdi_server INTO 'agg__zne_asfo_access_log_ymdi_server' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymdhi_url INTO 'agg__zne_asfo_access_log_ymdhi_url' USING PigStorage('\t', '-schema');
STORE agg__zne_asfo_access_log_ymdhi INTO 'agg__zne_asfo_access_log_ymdhi' USING PigStorage('\t', '-schema');

有什么想法吗?

1 个答案:

答案 0 :(得分:1)

您的脚本也许确实需要优化,但正如评论中所说,对Hadoop来说,这只是很小的一点数据。

Hadoop对于如此小的数据(甚至高达千兆字节)表现不佳。

这是因为Hadoop旨在处理大量数据,涉及一个复杂的处理框架,需要时间来设置。如果您考虑使用大型数据集,则此设置时间可以忽略不计,但如果您使用15MB数据,则设置框架所需的时间比实际处理该数据要长得多。