我正在尝试运行Hadoop流式Python作业:
/home/hduser/hadoop/bin/hadoop jar /home/hduser/hadoop/share/hadoop/tools/lib/hadoop-*streaming*.jar -file audio.py -cacheFile hdfs://localhost:54310/user/hduser/fpcalc#fpcalc -input /user/hduser/audio/input -output /user/hduser/audio/output -mapper $cwd/audio.py -cmdenv AUDIO_DIR=/user/hduser/audio/input/ -verbose
运行时报错"语法无效",指向audio.py的第183行(但该文件只有182行)。
似乎Hadoop没有按我期望的方式处理audio.py文件(我猜的!)。我尝试过去掉上面命令中的$cwd、给文件名加上引号、改为指向pyc文件,以及改用"python audio.py",但都没有任何帮助。我已将fpcalc复制到Hadoop。
非常感谢任何帮助!
audio.py:
#!/usr/bin/env python
# Adapted from http://www.randombytes.org/audio_comparison.html
import os
dir_a=os.environ["AUDIO_DIR"]
# directories to compare files between
# set to the same directory to compare files between themselves
dir_b=dir_a
# file to write matches to
match_file = 'matches.txt'
# seconds to sample audio file for
sample_length = 30
# number of points to crop from beginning of fingerprint
# 4096 / 11025 Hz / 3 = 0.124 seconds per point
crop = 3
# number of points to scan cross correlation over
span = 100
# step size (in points) of cross correlation
step = 1
# report match when cross correlation has a peak exceeding threshold
threshold = 0
################################################################################
# import modules
################################################################################
import re
import commands
import numpy
import math
################################################################################
# function definitions
################################################################################
# adds escape characters in front of Bash special characters
def esc_bash_chars(string):
    """Return string with a backslash inserted before each Bash special character.

    Characters that are not special are copied through unchanged, so a
    string without special characters (or an empty string) is returned as is.
    """
    # match any of the following characters between the capital A's
    # A`!$^&*()=[]{}\|;:'",<>? A
    # The pattern is a raw string so its backslashes reach the regex engine
    # untouched; inside the character class only ], \ and the quote that
    # delimits the literal need an escape.
    specialchars = re.compile(r'[`!$^&*()=[\]{}\\|;:\'",<>? ]')
    # single pass: \g<0> is the matched character, prefixed with one backslash
    return specialchars.sub(r'\\\g<0>', string)
# returns variance of list
def variance(listx):
    """Return the variance of listx, computed as mean(x^2) - mean(x)^2.

    The divisor is len(listx), so an empty list raises ZeroDivisionError.
    """
    average = numpy.mean(listx)
    # mean of the squared values, summed in list order
    average_of_squares = sum(x**2 for x in listx) / float(len(listx))
    return average_of_squares - average**2
# returns correlation between lists
def correlation(listx, listy):
    """Return the correlation between listx and listy.

    The result is the covariance of the two lists divided by the product of
    their standard deviations.  Lists of different lengths are not compared:
    -2 is returned instead.
    """
    size = len(listx)
    if size != len(listy):
        return -2
    mean_x = numpy.mean(listx)
    mean_y = numpy.mean(listy)
    # average product of the paired deviations from each mean
    covariance = sum((x - mean_x) * (y - mean_y) for x, y in zip(listx, listy))
    covariance = covariance / float(size)
    spread = math.sqrt(variance(listx)) * math.sqrt(variance(listy))
    return covariance / spread
# return cross correlation, with listy offset from listx
def cross_correlation(listx, listy, offset):
    """Return the correlation of listx and listy with listy shifted by offset.

    A positive offset drops the first offset items of listx; a negative one
    drops the first -offset items of listy.  The other list is then cut to
    the length of the shortened one.  With an offset of 0 both lists are
    passed to correlation() untouched.
    """
    if offset < 0:
        shift = -offset
        listy = listy[shift:]
        listx = listx[:len(listy)]
    elif offset > 0:
        listx = listx[offset:]
        listy = listy[:len(listx)]
    return correlation(listx, listy)
# cross correlate listx and listy with offsets from -span to span
def compare(listx, listy, span, step):
    """Cross-correlate listx and listy over offsets from -span to span.

    Offsets advance by step.  Returns a list with one cross_correlation()
    result per offset, in increasing offset order.
    """
    offsets = numpy.arange(-span, span + 1, step)
    return [cross_correlation(listx, listy, shift) for shift in offsets]
# return index of maximum value in list
def max_index(listx):
    """Return the position of the largest value in listx.

    When the largest value occurs more than once, the first position wins.
    An empty list raises IndexError.
    """
    best_pos = 0
    best_val = listx[0]
    # start at 1: position 0 is already the current best
    for pos in range(1, len(listx)):
        candidate = listx[pos]
        if candidate > best_val:
            best_pos, best_val = pos, candidate
    return best_pos
# write to a file
def write_string(string, filename):
    """Append string, followed by a newline, to the file named filename.

    The file is opened in binary append mode ('ab'), so anything already in
    it is kept.
    """
    # the with-block closes the file even when the write raises
    with open(filename, 'ab') as file_out:
        file_out.write(string + '\n')
################################################################################
# main code
################################################################################
# escape Bash special characters
# escape Bash special characters in every path that is handed to the shell
dir_a, dir_b, match_file = map(esc_bash_chars, (dir_a, dir_b, match_file))
# list the files to compare in each directory, one entry per line of ls output
filelist_a = commands.getoutput('ls ' + dir_a + '*.*').split('\n')
filelist_b = commands.getoutput('ls ' + dir_b + '*.*').split('\n')
# identical listings mean the files are compared among themselves; the flag
# lets the loop avoid correlating a pair twice or a file with itself
intra_correlating = (filelist_a == filelist_b)
for i, file_a in enumerate(filelist_a):
# if correlating between files within a directory, set filelist_b such that
# cross-correlations are not repeated, and files are not correlated with
# themselves
if intra_correlating:
# remove files already correlated with from filelist_b, along with
# current file
filelist_b = filelist_a[i+1:]
if len(filelist_b) == 0:
# nothing left to check!
break
file_a = esc_bash_chars(file_a)
# calculate fingerprint
# ewd qqq
# fpcalc_out = commands.getoutput('./fpcalc -raw -length ' \
# + str(sample_length) + ' ' + file_a)
# Run fpcalc without a shell and keep the text it prints on stdout.
# fpcalc_out must be that text, not the Popen handle, because the
# fingerprint parsing that follows calls .find() and slices it.
import shlex
from subprocess import Popen, PIPE
cli = './fpcalc -raw -length ' + str(sample_length) + ' ' + file_a
# shlex.split resolves the backslash escapes added by esc_bash_chars
cli_parts = shlex.split(cli)
fpcalc_proc = Popen(cli_parts, stdin=PIPE, stderr=PIPE, stdout=PIPE)
# communicate() waits for the process to exit and returns (stdout, stderr)
fpcalc_out = fpcalc_proc.communicate()[0]
# ewd qqq end
fingerprint_index = fpcalc_out.find('FINGERPRINT=') + 12
# convert fingerprint to list of integers
fingerprint_a = map(int, fpcalc_out[fingerprint_index:].split(','))
for file_b in filelist_b:
file_b = esc_bash_chars(file_b)
# calculate fingerprint
fpcalc_out = commands.getoutput('./fpcalc -raw -length ' \
+ str(sample_length) + ' ' + file_b)
fingerprint_index = fpcalc_out.find('FINGERPRINT=') + 12
# convert fingerprint to list of integers
fingerprint_b = map(int, fpcalc_out[fingerprint_index:].split(','))
# cross correlation between fingerprints
corr_ab = compare(fingerprint_a[crop:], fingerprint_b[crop:], span, step)
max_corr_index = max_index(corr_ab)
max_corr_offset = -span + max_corr_index * step
# report matches
if corr_ab[max_corr_index] > threshold: #qqqewd 0:
# print(file_a + ' and ' + file_b + ' match with correlation of ' \
# + str(corr_ab[max_corr_index]) + ' at offset ' \
# + str(max_corr_offset))
write_string(file_a + ' ' + file_b + '\t' \
+ str(corr_ab[max_corr_index])