In my HDFS I have some Twitter data for the movie Deadpool, and I am trying to display the most popular hashtags in a bar chart. I wrote my MapReduce in Python.
So here is my mapper code:
from nltk.corpus import stopwords
import string
import re

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)

# ------- words to ignore when counting ------------------
punctuation = list(string.punctuation)
stop = (stopwords.words('english') + punctuation + ['rt', 'via', 'RT']
        + stopwords.words('spanish') + stopwords.words('italian')
        + stopwords.words('russian') + stopwords.words('german')
        + stopwords.words('french') + stopwords.words('swedish')
        + stopwords.words('turkish') + stopwords.words('dutch'))

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        # keep emoticons as-is, lowercase everything else
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
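For the streaming job, the mapper has to read the tweets from stdin and print one hashtag per line for the reducer below. A minimal sketch of that driver loop (it assumes each input line is plain tweet text; JSON records would need their text field extracted first) looks like this:

import sys

# read tweet text line by line from stdin and emit one hashtag per line
for line in sys.stdin:
    tokens = preprocess(line.strip(), lowercase=True)
    for token in tokens:
        if token.startswith('#') and token not in stop:
            sys.stdout.write(token + '\n')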
And this is my reducer code:
#!/usr/bin/env python
import sys
from collections import Counter
import vincent

count_all = Counter()
lists = []

# collect one hashtag per input line
for line in sys.stdin:
    line = line.strip()
    lists.append(line)

# count the hashtags and keep the ten most frequent
count_all.update(lists)
word_freq = count_all.most_common(10)
labels, frequency = zip(*word_freq)

# build the bar chart with vincent
data = {'data': frequency, 'x': labels}
bar = vincent.Bar(data, iter_idx='x')
bar.axis_titles(x='HashTag', y='frequencies')
bar.to_json('term_freq.json', html_out=True, html_path='chart.html')
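The two scripts can also be checked outside Hadoop by piping a small local sample through them, for example (sample_tweets.txt is a hypothetical test file; the sort step stands in for the shuffle phase):

cat sample_tweets.txt | python hash_tag_mapper.py | sort | python hash_tag_reducer.py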
I run the job with the following command:
bin/hadoop jar ./share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar -file ~/Desktop/hadoop_ex/hash_tag_mapper.py -mapper ~/Desktop/hadoop_ex/hash_tag_mapper.py -file ~/Desktop/hadoop_ex/hash_tag_reducer.py -reducer ~/Desktop/hadoop_ex/hash_tag_reducer.py -input /TwitterSample/* -output /thesis-output
But it fails with the following error:
16/04/03 06:24:26 ERROR streaming.StreamJob: Job not successful!
Streaming Command Failed!
What am I doing wrong here?