In my HDFS I have some Twitter data for the movie Deadpool, and I am trying to display the most popular hashtags in a bar chart. I wrote my MapReduce in Python.
So here is my mapper code:
from nltk.corpus import stopwords
import string
import re

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

# ------- words to ignore when counting ------------------
punctuation = list(string.punctuation)
stop = (stopwords.words('english') + punctuation + ['rt', 'via', 'RT']
        + stopwords.words('spanish') + stopwords.words('italian')
        + stopwords.words('russian') + stopwords.words('german')
        + stopwords.words('french') + stopwords.words('swedish')
        + stopwords.words('turkish') + stopwords.words('dutch'))

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        # keep emoticons as-is, lowercase everything else
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
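For the streaming job, the mapper has to read the tweets from stdin and print one hashtag per line for the reducer below. A minimal sketch of that driver loop (it assumes each input line is plain tweet text; JSON records would need their text field extracted first) looks like this:

import sys

# read tweet text line by line from stdin and emit one hashtag per line
for line in sys.stdin:
    tokens = preprocess(line.strip(), lowercase=True)
    for token in tokens:
        if token.startswith('#') and token not in stop:
            sys.stdout.write(token + '\n')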
And this is my reducer code:
#!/usr/bin/env python
import sys
from collections import Counter
import vincent

count_all = Counter()
lists = []

# collect one hashtag per input line
for line in sys.stdin:
    line = line.strip()
    lists.append(line)

# count the hashtags and keep the ten most frequent
count_all.update(lists)
word_freq = count_all.most_common(10)
labels, frequency = zip(*word_freq)

# build the bar chart with vincent
data = {'data': frequency, 'x': labels}
bar = vincent.Bar(data, iter_idx='x')
bar.axis_titles(x='HashTag', y='frequencies')
bar.to_json('term_freq.json', html_out=True, html_path='chart.html')
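The two scripts can also be checked outside Hadoop by piping a small local sample through them, for example (sample_tweets.txt is a hypothetical test file; the sort step stands in for the shuffle phase):

cat sample_tweets.txt | python hash_tag_mapper.py | sort | python hash_tag_reducer.py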
I run the job with the following command:
bin/hadoop jar ./share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar -file ~/Desktop/hadoop_ex/hash_tag_mapper.py -mapper ~/Desktop/hadoop_ex/hash_tag_mapper.py -file ~/Desktop/hadoop_ex/hash_tag_reducer.py -reducer ~/Desktop/hadoop_ex/hash_tag_reducer.py -input /TwitterSample/* -output /thesis-output
But it fails with the following error:
16/04/03 06:24:26 ERROR streaming.StreamJob: Job not successful!
Streaming Command Failed!
What am I doing wrong here?