目前我正在处理大量输入数据(Twitter),并尝试使用 Apache Hive 做一些基本的情感分析。但是,我不知道该如何对 tweetid 和 body 字符串进行比较。下面我会尽量解释清楚:
我有两个外部存储表,分别名为 TwitterLoc 和 TwitterNo;此外还有下面这些表:
-- Sentiment dictionary: one row per word with its integer sentiment score.
-- Backed by Cassandra through the Hive storage handler.
-- Hive has no `text` type; `string` is the unbounded character type.
CREATE EXTERNAL TABLE dict (word string, score int)
STORED BY 'org.apache.hadoop.hive.cassandra.CassandraStorageHandler'
-- SERDEPROPERTIES values must be quoted string literals, including the port.
WITH SERDEPROPERTIES ("cassandra.ks.name" = "myKeyspace", "cassandra.port" = "9160");
-- Target table: receives the final per-tweet sentiment score.
DROP TABLE IF EXISTS results;
CREATE EXTERNAL TABLE results (tweetid string, score int)
STORED BY 'org.apache.hadoop.hive.cassandra.CassandraStorageHandler'
-- NOTE(review): this keyspace ("twitterverse") differs from the one used by
-- the dict table ("myKeyspace") -- confirm that is intentional.
WITH SERDEPROPERTIES ("cassandra.ks.name" = "twitterverse");
-- Combine both source tables (twitterLoc and twitterNo) into one table
-- holding only the columns needed for scoring.
DROP TABLE IF EXISTS twitter;
CREATE TABLE twitter (tweetid string, body string)
STORED AS SEQUENCEFILE;

-- OVERWRITE first so the table starts empty on every run ...
INSERT OVERWRITE TABLE twitter
SELECT tweetid, body
FROM twitterLoc;

-- ... then append the second source with INTO so the rows above are kept.
INSERT INTO TABLE twitter
SELECT tweetid, body
FROM twitterNo;
从这里开始,我想完成以下任务:
按 tweetid 对分数重新分组。
以下是我尝试的做法:
-- Compare to dictionary, step 1: break every tweet body into one row per word
-- so that each word can later be matched against the dictionary.
DROP TABLE IF EXISTS twitterSplit;
CREATE TABLE twitterSplit (tweetid string, word string)
STORED AS SEQUENCEFILE;

-- split() returns array<string>, which cannot be stored in a scalar string
-- column; LATERAL VIEW explode() turns each array element into its own row.
INSERT OVERWRITE TABLE twitterSplit
SELECT t.tweetid, w.word
FROM twitter t
LATERAL VIEW explode(split(t.body, ' ')) w AS word;
-- Compare to dictionary, step 2: attach the dictionary score to every tweet word.
DROP TABLE IF EXISTS scoreTable;
CREATE TABLE scoreTable (tweetid string, word string, score int)
STORED AS SEQUENCEFILE;

-- The join condition belongs in ON and must qualify both sides; a bare
-- `word = word` is ambiguous and leaves the join without a condition.
-- Inner join: words that have no dictionary entry produce no row.
INSERT OVERWRITE TABLE scoreTable
SELECT ts.tweetid, ts.word, d.score
FROM twitterSplit ts
INNER JOIN dict d
    ON ts.word = d.word;
-- Report scores: total sentiment per tweet, one row per tweetid.
-- SUM() widens its result type, so cast back to the int column declared
-- on the results table.
INSERT OVERWRITE TABLE results
SELECT tweetid, CAST(SUM(score) AS int) AS score
FROM scoreTable
GROUP BY tweetid;