HIVE: How to sum the values of key/value strings by key

Time: 2016-08-23 13:15:24

Tags: string hive key-value

I have two key/value columns containing string pairs, like this:

    column1          column2
    a:1,b:2,c:3      a:5,c:3
    a:12,b:4         a:9,b:3,d:5

How can I sum these values by key (in real life I don't know in advance how many keys there are, and some keys may appear in only one of the columns) to get this:

    column12
    a:6,b:2,c:6,d:0
    a:21,b:7,c:0,d:5

Or this:

    a   b   c   d
    6   2   6   0
    21  7   0   5

Thanks for your help!

3 Answers:

Answer 0 (score: 2)

Assuming each row has a unique identifier "id", the following query should work. Note that str_to_map returns a map<string,string>, so the values are cast to int before summing, and each map is exploded separately and combined with UNION ALL so that keys appearing in only one column are still counted.

select id, collect_list(CONCAT(key,':',val)) as column12
from
(
  select id, key, SUM(val) as val
  from
  (
    select id, k as key, CAST(v AS int) as val
    from
    (
      select id, str_to_map(col1,',',':') as c1
      from table
    )x
    LATERAL VIEW explode(c1) e1 as k,v
    UNION ALL
    select id, k as key, CAST(v AS int) as val
    from
    (
      select id, str_to_map(col2,',',':') as c2
      from table
    )y
    LATERAL VIEW explode(c2) e2 as k,v
  )z
  group by id, key
)w
group by id
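
As a sanity check, the per-row logic the query implements is just a keyed sum over both parsed maps. Here is a minimal Python sketch of that logic, run on the first sample row from the question (an illustration only, not part of the query):

from collections import Counter

def sum_by_key(col1, col2):
    # Parse "a:1,b:2" style strings and sum the values per key,
    # mirroring the explode + GROUP BY id, key steps in the query above.
    total = Counter()
    for col in (col1, col2):
        for pair in col.split(','):
            k, v = pair.split(':')
            total[k] += int(v)
    return total

print(sum_by_key('a:1,b:2,c:3', 'a:5,c:3'))   # Counter({'a': 6, 'c': 6, 'b': 2})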

Answer 1 (score: 0)

One approach is to write custom MapReduce code using HCatalog.

The following MR job reads the input columns from table1 and writes the result column to table2 (no reducer is needed, since all the logic is handled in the mapper):

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hcatalog.common.*;
import org.apache.hcatalog.mapreduce.*;
import org.apache.hcatalog.data.*;
import org.apache.hcatalog.data.schema.*;
import org.apache.commons.lang3.ArrayUtils;

public class HCatSum extends Configured implements Tool {

    public static class Map
            extends
            Mapper<WritableComparable, HCatRecord, WritableComparable, HCatRecord> {
        String column1;
        String column2;
        String val;

        @Override
        protected void map(
                WritableComparable key,
                HCatRecord value,
                org.apache.hadoop.mapreduce.Mapper<WritableComparable, HCatRecord, WritableComparable, HCatRecord>.Context context)
                throws IOException, InterruptedException {

            column1 = (String) value.get(0); // a:1,b:2,c:3
            column2 = (String) value.get(1); // a:5,c:3

            String colArray[] = (String[]) ArrayUtils.addAll(
                    column1.split(","), column2.split(","));
            HashMap<String, Integer> hs = new HashMap<String, Integer>();
            for (String token : colArray) {
                String tokensplit[] = token.split(":");
                String k = tokensplit[0];
                int v = Integer.parseInt(tokensplit[1]);
                if (hs.containsKey(k)) {
                    int prev = hs.get(k);
                    hs.put(k, prev + v);
                } else {
                    hs.put(k, v);                       
                }
            }

            val = Arrays.toString(hs.entrySet().toArray()); // [a=6,b=2,c=6]
            HCatRecord record = new DefaultHCatRecord(1);
            record.set(0, val);
            context.write(null, record);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Get the input and output table names as arguments
        String inputTableName = args[0];
        String outputTableName = args[1];
        // Assume the default database
        String dbName = null;

        Job job = new Job(conf, "HCatsum");
        HCatInputFormat.setInput(job,
                InputJobInfo.create(dbName, inputTableName, null));
        job.setJarByClass(HCatSum.class);
        job.setMapperClass(Map.class);

        // An HCatalog record as input
        job.setInputFormatClass(HCatInputFormat.class);

        // Mapper emits a null key and an HCatRecord as value
        job.setMapOutputKeyClass(WritableComparable.class);
        job.setMapOutputValueClass(DefaultHCatRecord.class);

        // Map-only job: the output key is ignored; an HCatalog record is
        // emitted as the value

        job.setOutputKeyClass(WritableComparable.class);
        job.setOutputValueClass(DefaultHCatRecord.class);
        job.setOutputFormatClass(HCatOutputFormat.class);

        HCatOutputFormat.setOutput(job,
                OutputJobInfo.create(dbName, outputTableName, null));
        HCatSchema s = HCatOutputFormat.getTableSchema(job);
        System.err.println("INFO: output schema explicitly set for writing:"
                + s);
        HCatOutputFormat.setSchema(job, s);
        return (job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new HCatSum(), args);
        System.exit(exitCode);
    }
}

How to run an MR job with HCatalog (reference link): Hive Manual

Hope this is useful.

Answer 2 (score: 0)

Here is a solution. It's a bit of a hack, but it works correctly no matter how many keys you have.

udf0.py

#!/usr/bin/python

import sys
from collections import Counter

for line in sys.stdin:
    words = line.strip().split('\t')
    c = Counter()

    for word in words:
        d = {}
        s = word.split(',')
        for ss in s:
            k,v = ss.split(':')
            d[k] = int(v)

        c.update(d)

    print ','.join([str(k)+':'+str(v) for k,v in dict(c).iteritems()])
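
One subtlety that makes this work: unlike dict.update, Counter.update adds the counts for matching keys instead of replacing them. For example:

from collections import Counter

c = Counter({'a': 1, 'b': 2, 'c': 3})
c.update({'a': 5, 'c': 3})   # adds to existing counts, does not overwrite
print(c)   # Counter({'a': 6, 'c': 6, 'b': 2}) (key order may vary)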

udf1.py

#!/usr/bin/python

import sys

for line in sys.stdin:
    w0, w1 = line.strip().split('\t')

    out = {}
    d = {}
    l = []

    s0 = w0.strip().split(',')
    s1 = w1.strip().split(',')

    for ss in s0:
        k,v = ss.split(':')
        d[k] = int(v)

    for ss in s1:
        l.append(ss)

    for keys in l:
        if d.get(keys, None) is not None:
            out[keys] = d[keys]
        else:
            out[keys] = 0

    print ','.join([str(k)+':'+str(v) for k,v in out.iteritems()])
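
The key-filling loop above is just a lookup with a default of zero; an equivalent, more compact sketch (not the original code):

d = {'a': 6, 'b': 2, 'c': 6}          # per-row sums produced by udf0.py
unique_keys = ['a', 'b', 'c', 'd']    # the key set collected across all rows
out = {k: d.get(k, 0) for k in unique_keys}
print(out)   # {'a': 6, 'b': 2, 'c': 6, 'd': 0}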

Hive query

add file /home/username/udf0.py;
add file /home/username/udf1.py;

SELECT TRANSFORM(dict, unique_keys)
       USING 'python udf1.py'
       AS (final_map STRING)
FROM (
  SELECT DISTINCT dict
    , CONCAT_WS(',', unique_keys) as unique_keys
  FROM (
    SELECT dict
      , COLLECT_SET(keys) OVER () AS unique_keys
    FROM (
      SELECT dict
        , keys
      FROM (
        SELECT dict
          , map_keys(str_to_map(dict)) AS key_arr
        FROM (
          SELECT TRANSFORM (col1, col2)
                 USING 'python udf0.py'
                 AS (dict STRING)
          FROM db.tbl ) x ) z
      LATERAL VIEW EXPLODE(key_arr) exptbl AS keys ) a ) b ) c
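
To see how the stages fit together, here is a compact Python equivalent of the whole pipeline, run over the sample rows from the question (illustrative only; the real key order depends on dictionary iteration):

from collections import Counter

rows = [('a:1,b:2,c:3', 'a:5,c:3'),
        ('a:12,b:4', 'a:9,b:3,d:5')]

def parse(s):
    # "a:1,b:2" -> {'a': 1, 'b': 2}
    return {k: int(v) for k, v in (p.split(':') for p in s.split(','))}

# udf0.py step: sum the two maps of each row (Counter addition adds counts)
sums = [Counter(parse(c1)) + Counter(parse(c2)) for c1, c2 in rows]

# COLLECT_SET(...) OVER () step: the union of keys across all rows
all_keys = sorted(set().union(*sums))

# udf1.py step: fill in the missing keys with zero
for s in sums:
    print(','.join('%s:%d' % (k, s.get(k, 0)) for k in all_keys))
# a:6,b:2,c:6,d:0
# a:21,b:7,c:0,d:5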

Output:

a:6,b:2,c:6,d:0
a:21,b:7,c:0,d:5

Explanation:

The first UDF takes your strings, converts them into Python dictionaries, and updates the counters (i.e., adds together the values of matching keys). Since you don't know the actual keys in advance, you need to extract the keys from each dictionary (map_keys() in the Hive query), explode the table, and then collect the keys back into a single unique set. At that point you have every key that appears in any dictionary. From there, the second UDF reads in the dictionary created by the first UDF, checks whether each key exists, and sets its value to zero if it does not.