Question

我正在尝试运行Hadoop流式Python作业：

/home/hduser/hadoop/bin/hadoop jar /home/hduser/hadoop/share/hadoop/tools/lib/hadoop-*streaming*.jar -file audio.py -cacheFile hdfs://localhost:54310/user/hduser/fpcalc#fpcalc -input /user/hduser/audio/input -output /user/hduser/audio/output -mapper $cwd/audio.py -cmdenv AUDIO_DIR=/user/hduser/audio/input/ -verbose

我在 audio.py 的第 183 行得到“语法无效”（invalid syntax）错误，但该文件只有 182 行。

似乎 Hadoop 没有以我想要的方式处理 audio.py 文件（我猜是这样！）。我尝试过去掉上面的 $cwd、把文件名放在引号中、指向 .pyc 文件，以及改用 "python audio.py" 作为 mapper，但都没有帮助。我已将 fpcalc 复制到 Hadoop。

非常感谢任何帮助！

audio.py：

 #!/usr/bin/env python
# Adapted from http://www.randombytes.org/audio_comparison.html


import os
dir_a=os.environ["AUDIO_DIR"]

# directories to compare files between
# set to the same directory to compare files between themselves
dir_b=dir_a
# file to write matches to
match_file = 'matches.txt'
# seconds to sample audio file for
sample_length = 30
# number of points to crop from beginning of fingerprint
# 4096 / 11025 Hz / 3 = 0.124 seconds per point
crop = 3
# number of points to scan cross correlation over
span = 100
# step size (in points) of cross correlation
step = 1
# report match when cross correlation has a peak exceeding threshold
threshold = 0

################################################################################
# import modules
################################################################################

import re
import commands
import numpy
import math

################################################################################
# function definitions
################################################################################

# matches every character that esc_bash_chars() prefixes with a backslash:
#   `!$^&*()=[]{}\|;:'",<>?  and the space character
# Written as a raw string so that the regex escapes for [ ] and \ reach the
# regex engine unchanged, instead of relying on Python passing unknown string
# escapes through (which is deprecated).
_BASH_SPECIAL_CHARS = re.compile(r"""[`!$^&*()=\[\]{}\\|;:'",<>? ]""")


# adds escape characters in front of Bash special characters
def esc_bash_chars(string):
    """Return *string* with a backslash inserted before each special character.

    The characters treated as special are the ones listed next to
    ``_BASH_SPECIAL_CHARS``; every other character is copied unchanged.
    An empty string is returned unchanged.
    """
    # a single substitution pass; the replacement template is a literal
    # backslash followed by the matched character itself (group 0)
    return _BASH_SPECIAL_CHARS.sub(r'\\\g<0>', string)

# returns variance of list
def variance(listx):
    """Return the population variance of *listx* (mean squared deviation).

    The value is computed by numpy.var from the deviations around the mean.
    The previous ``mean(x**2) - mean(x)**2`` formulation subtracts two nearly
    equal numbers when the spread is small compared to the magnitude of the
    values, which loses precision and can even produce a slightly negative
    result that math.sqrt() then rejects.

    An empty input yields nan (numpy emits a RuntimeWarning).
    """
    return numpy.var(listx)

# returns correlation between lists
def correlation(listx, listy):
    """Return the correlation coefficient of two equally long sequences.

    The covariance of the two sequences is divided by the product of their
    standard deviations (square roots of variance()).  When the sequences
    differ in length, -2 is returned instead of a coefficient.
    """
    n_points = len(listx)
    if n_points != len(listy):
        # marker value for "lengths do not match"
        return -2

    mean_x = numpy.mean(listx)
    mean_y = numpy.mean(listy)

    # accumulate the products of the paired deviations from the means
    cov_sum = 0
    for x_val, y_val in zip(listx, listy):
        cov_sum += (x_val - mean_x) * (y_val - mean_y)
    cov = cov_sum / float(n_points)

    std_product = math.sqrt(variance(listx)) * math.sqrt(variance(listy))
    return cov / std_product

# return cross correlation, with listy offset from listx
def cross_correlation(listx, listy, offset):
    """Correlate *listx* with *listy* after shifting one against the other.

    A positive offset drops the first ``offset`` points of listx and cuts
    listy to the length of what remains; a negative offset does the same with
    the roles swapped.  An offset of zero correlates the inputs as given.
    """
    if offset > 0:
        shifted_x = listx[offset:]
        return correlation(shifted_x, listy[:len(shifted_x)])
    if offset < 0:
        shifted_y = listy[-offset:]
        return correlation(listx[:len(shifted_y)], shifted_y)
    return correlation(listx, listy)

# cross correlate listx and listy with offsets from -span to span
def compare(listx, listy, span, step):
    """Return the cross correlations for every offset in the scanned range.

    Offsets run from -span up to and including span (when step divides the
    range evenly) in increments of step; the result holds one
    cross_correlation() value per offset, in that order.
    """
    offsets = numpy.arange(-span, span + 1, step)
    return [cross_correlation(listx, listy, shift) for shift in offsets]

# return index of maximum value in list
def max_index(listx):
    """Return the position of the largest element of *listx*.

    When the maximum occurs more than once, the first occurrence wins.
    Indexing an empty sequence raises IndexError.
    """
    best_pos = 0
    best_val = listx[0]
    # the first element is the starting candidate, so scanning begins at 1
    for pos in range(1, len(listx)):
        candidate = listx[pos]
        if candidate > best_val:
            best_pos, best_val = pos, candidate
    return best_pos

# write to a file
def write_string(string, filename):
    """Append *string* followed by a newline to the file *filename*.

    The file is created if it does not exist.  It is opened in text append
    mode so that a text payload can be written, and the ``with`` block closes
    the handle even when the write raises.
    """
    with open(filename, 'a') as file_out:
        file_out.write(string + '\n')

################################################################################
# main code
################################################################################

# escape Bash special characters in everything that is pasted into a command
dir_a, dir_b, match_file = (
    esc_bash_chars(value) for value in (dir_a, dir_b, match_file)
)

# get list of files to compare from each directory: one entry per line that
# `ls` prints for names containing a dot
listing_a = commands.getoutput('ls ' + dir_a + '*.*')
filelist_a = listing_a.split('\n')
listing_b = commands.getoutput('ls ' + dir_b + '*.*')
filelist_b = listing_b.split('\n')

# if cross-correlating between files within a directory, don't correlate files
# twice, or correlate files with themselves
intra_correlating = (filelist_a == filelist_b)

for i, file_a in enumerate(filelist_a):
    # if correlating between files within a directory, set filelist_b such that
    # cross-correlations are not repeated, and files are not correlated with
    # themselves
    if intra_correlating:
        # remove files already correlated with from filelist_b, along with
        # current file
        filelist_b = filelist_a[i+1:]
        if len(filelist_b) == 0:
            # nothing left to check!
            break

    file_a = esc_bash_chars(file_a)
    # calculate fingerprint
    # ewd qqq
#     fpcalc_out = commands.getoutput('./fpcalc -raw -length ' \
#                                      + str(sample_length) + ' ' + file_a)
    import shlex
    cli = './fpcalc -raw -length ' + str(sample_length) + ' ' + file_a
    from subprocess import Popen, PIPE

    cli_parts = shlex.split(cli)
    fpcalc_out = Popen(cli_parts, stdin=PIPE, stderr=PIPE, stdout=PIPE)
    fpcalc_out.communicate()[0]
    # ewd qqq end


    fingerprint_index = fpcalc_out.find('FINGERPRINT=') + 12
    # convert fingerprint to list of integers
    fingerprint_a = map(int, fpcalc_out[fingerprint_index:].split(','))

    for file_b in filelist_b:
        file_b = esc_bash_chars(file_b)
        # calculate fingerprint
        fpcalc_out = commands.getoutput('./fpcalc -raw -length ' \
                                         + str(sample_length) + ' ' + file_b)
        fingerprint_index = fpcalc_out.find('FINGERPRINT=') + 12
        # convert fingerprint to list of integers
        fingerprint_b = map(int, fpcalc_out[fingerprint_index:].split(','))

        # cross correlation between fingerprints
        corr_ab = compare(fingerprint_a[crop:], fingerprint_b[crop:], span, step)
        max_corr_index = max_index(corr_ab)
        max_corr_offset = -span + max_corr_index * step

        # report matches
        if corr_ab[max_corr_index] > threshold: #qqqewd 0:
#             print(file_a + ' and ' + file_b + ' match with correlation of ' \
#                   + str(corr_ab[max_corr_index]) + ' at offset ' \
#                   + str(max_corr_offset))
            write_string(file_a + ' ' + file_b + '\t' \
                         + str(corr_ab[max_corr_index])

无效语法hadoop流错误

0 个答案: