I have two columns that contain key/value pairs as strings, like this:

column1        column2
a:1,b:2,c:3    a:5,c:3
a:12,b:4       a:9,b:3,d:5

How can I sum these values by key (in real life I don't know how many keys there are, and some keys appear in only one column) to get this:

column12
a:6,b:2,c:6,d:0
a:21,b:7,c:0,d:5

or this:

a   b   c   d
6   2   6   0
21  7   0   5

Thanks for your help!
Answer 0 (score: 2)
Assuming each row has a unique identifier "id", the following query should work.
select id, collect_list(concat(key, ':', val)) as column12
from
(
    -- sum the values of each key across both columns, per id
    select id, key, sum(cast(val as int)) as val
    from
    (
        select id, k1 as key, v1 as val
        from (select id, str_to_map(col1, ',', ':') as c1 from table) x1
        LATERAL VIEW explode(c1) e1 as k1, v1

        union all

        select id, k2 as key, v2 as val
        from (select id, str_to_map(col2, ',', ':') as c2 from table) x2
        LATERAL VIEW explode(c2) e2 as k2, v2
    ) y
    group by id, key
) z
group by id
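As a quick sanity check outside Hive, here is a minimal Python 3 sketch of the same per-key summing on the sample rows from the question (the function name is only for illustration):

from collections import Counter

def merge_columns(col1, col2):
    # Parse strings like "a:1,b:2,c:3" and sum the values of matching keys.
    totals = Counter()
    for col in (col1, col2):
        for pair in col.split(','):
            k, v = pair.split(':')
            totals[k] += int(v)
    return dict(totals)

print(merge_columns('a:1,b:2,c:3', 'a:5,c:3'))   # {'a': 6, 'b': 2, 'c': 6}
print(merge_columns('a:12,b:4', 'a:9,b:3,d:5'))  # {'a': 21, 'b': 7, 'd': 5}

Note that, like the query above, this only emits keys that actually occur in a given row; the zero-filled entries (d:0, c:0) in the desired output need the full key set across all rows, which Answer 2 below handles.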
Answer 1 (score: 0)
One approach is to write custom MapReduce code using HCatalog.
The following MR job reads the input columns from table 1 and writes the result column to table 2 (no reducer is needed, since the logic is handled in the mapper).
import java.io.IOException;
import java.util.*;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.util.*;
import org.apache.hcatalog.common.*;
import org.apache.hcatalog.mapreduce.*;
import org.apache.hcatalog.data.*;
import org.apache.hcatalog.data.schema.*;
import org.apache.commons.lang3.ArrayUtils;

public class HCatSum extends Configured implements Tool {

    public static class Map extends
            Mapper<WritableComparable, HCatRecord, WritableComparable, HCatRecord> {

        String column1;
        String column2;
        String val;

        @Override
        protected void map(WritableComparable key, HCatRecord value, Context context)
                throws IOException, InterruptedException {
            column1 = (String) value.get(0); // a:1,b:2,c:3
            column2 = (String) value.get(1); // a:5,c:3

            // Concatenate the key:value tokens of both columns and sum the
            // values of matching keys in a HashMap.
            String[] colArray = (String[]) ArrayUtils.addAll(
                    column1.split(","), column2.split(","));
            HashMap<String, Integer> hs = new HashMap<String, Integer>();
            for (String token : colArray) {
                String[] tokensplit = token.split(":");
                String k = tokensplit[0];
                int v = Integer.parseInt(tokensplit[1]);
                if (hs.containsKey(k)) {
                    int prev = hs.get(k);
                    hs.put(k, prev + v);
                } else {
                    hs.put(k, v);
                }
            }
            val = Arrays.toString(hs.entrySet().toArray()); // [a=6, b=2, c=6]

            HCatRecord record = new DefaultHCatRecord(1);
            record.set(0, val);
            context.write(null, record);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        args = new GenericOptionsParser(conf, args).getRemainingArgs();

        // Get the input and output table names as arguments
        String inputTableName = args[0];
        String outputTableName = args[1];
        // Assume the default database
        String dbName = null;

        Job job = new Job(conf, "HCatsum");
        HCatInputFormat.setInput(job,
                InputJobInfo.create(dbName, inputTableName, null));
        job.setJarByClass(HCatSum.class);
        job.setMapperClass(Map.class);

        // An HCatalog record as input
        job.setInputFormatClass(HCatInputFormat.class);

        // The mapper ignores the key and emits an HCatalog record as the value
        job.setMapOutputKeyClass(WritableComparable.class);
        job.setMapOutputValueClass(DefaultHCatRecord.class);

        // No reducer: the job output is the mapper output, an HCatalog record
        job.setOutputKeyClass(WritableComparable.class);
        job.setOutputValueClass(DefaultHCatRecord.class);
        job.setOutputFormatClass(HCatOutputFormat.class);

        HCatOutputFormat.setOutput(job,
                OutputJobInfo.create(dbName, outputTableName, null));
        HCatSchema s = HCatOutputFormat.getTableSchema(job);
        System.err.println("INFO: output schema explicitly set for writing: " + s);
        HCatOutputFormat.setSchema(job, s);

        return (job.waitForCompletion(true) ? 0 : 1);
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new HCatSum(), args);
        System.exit(exitCode);
    }
}
How to run an MR job with HCatalog (reference link): Hive Manual
Hope this is useful.
Answer 2 (score: 0)
Here is a solution. It is a bit of a hack, but it works no matter how many keys you have.
udf0.py
#!/usr/bin/python
import sys
from collections import Counter

# Each input line holds the two map-like columns, tab-separated.
# Sum the values of matching keys across both columns with a Counter.
for line in sys.stdin:
    words = line.strip().split('\t')
    c = Counter()
    for word in words:
        d = {}
        s = word.split(',')
        for ss in s:
            k, v = ss.split(':')
            d[k] = int(v)
        c.update(d)
    print ','.join([str(k) + ':' + str(v) for k, v in dict(c).iteritems()])
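For the first sample row, the TRANSFORM step feeds udf0.py a line like "a:1,b:2,c:3" and "a:5,c:3" separated by a tab, and it prints "a:6,b:2,c:6" (key order is not guaranteed, since plain Python 2 dicts are unordered).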
udf1.py
#!/usr/bin/python
import sys

# Each input line holds the merged dict string from udf0.py and the
# comma-separated list of all keys, tab-separated. Emit every key,
# filling in 0 for keys missing from this row.
for line in sys.stdin:
    w0, w1 = line.strip().split('\t')
    out = {}
    d = {}
    l = []
    s0 = w0.strip().split(',')
    s1 = w1.strip().split(',')
    for ss in s0:
        k, v = ss.split(':')
        d[k] = int(v)
    for ss in s1:
        l.append(ss)
    for keys in l:
        if d.get(keys, None) is not None:
            out[keys] = d[keys]
        else:
            out[keys] = 0
    print ','.join([str(k) + ':' + str(v) for k, v in out.iteritems()])
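For the first sample row, udf1.py receives the merged dict "a:6,b:2,c:6" and the full key list "a,b,c,d" separated by a tab, and prints "a:6,b:2,c:6,d:0" (again, key order depends on dict iteration order).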
Hive query:
add file /home/username/udf0.py;
add file /home/username/udf1.py;

SELECT TRANSFORM(dict, unique_keys)
       USING 'python udf1.py'
       AS (final_map STRING)
FROM (
    SELECT DISTINCT dict
         , CONCAT_WS(',', unique_keys) AS unique_keys
    FROM (
        SELECT dict
             , COLLECT_SET(keys) OVER () AS unique_keys
        FROM (
            SELECT dict
                 , keys
            FROM (
                SELECT dict
                     , map_keys(str_to_map(dict)) AS key_arr
                FROM (
                    SELECT TRANSFORM(col1, col2)
                           USING 'python udf0.py'
                           AS (dict STRING)
                    FROM db.tbl ) x ) z
            LATERAL VIEW EXPLODE(key_arr) exptbl AS keys ) a ) b ) c
Output:
a:6,b:2,c:6,d:0
a:21,b:7,c:0,d:5
Explanation:
The first UDF takes your strings, turns them into a Python dictionary, and updates it (i.e. adds together the values of matching keys). Since you don't know the actual keys in advance, you then need to extract the keys from each dictionary (map_keys() in the Hive query), explode the table, and collect them back into a unique set. At that point you have every possible key from any of the dictionaries. From there, the second UDF takes the dictionaries created by the first UDF, checks whether each key is present, and sets its value to zero if it is not.
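To see the whole pipeline end to end outside Hive, here is a rough Python 3 sketch of the same three steps on the question's sample rows (the names are illustrative only):

from collections import Counter

rows = [('a:1,b:2,c:3', 'a:5,c:3'),
        ('a:12,b:4', 'a:9,b:3,d:5')]

# Step 1 (udf0.py): merge both columns of each row, summing values per key.
merged = []
for col1, col2 in rows:
    c = Counter()
    for col in (col1, col2):
        c.update({k: int(v) for k, v in (p.split(':') for p in col.split(','))})
    merged.append(c)

# Step 2 (the COLLECT_SET/EXPLODE part of the query): the union of all keys seen.
all_keys = sorted(set().union(*merged))  # ['a', 'b', 'c', 'd']

# Step 3 (udf1.py): zero-fill the missing keys for every row.
for c in merged:
    print(','.join('%s:%d' % (k, c.get(k, 0)) for k in all_keys))
# a:6,b:2,c:6,d:0
# a:21,b:7,c:0,d:5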