我的 Python 脚本:
#!/usr/bin/python
import sys
import dumbo
import cgi,urlparse
#from dumbo.lib import JoinReducer
#from dumbo.decor import primary,secondary
def mapper(key, value):
    """Parse one tab-separated log record and emit it as a (key, value) pair.

    The record is split on tabs. Fields 1, 3, 4 and 7 are converted to
    numbers, and field 6 (a URL) is replaced by the dict of its parsed query
    string unless the URL path is one of the excluded test/log pages.

    Args:
        key: Input key supplied by the caller; not used here.
        value: One log line with at least eight tab-separated fields.

    Yields:
        A ``(fields[0], fields[1:])`` pair: the first field as the key and
        the list of the remaining (converted) fields as one value object.

    Raises:
        IndexError: If the line has fewer than eight fields.
        ValueError: If a numeric field cannot be converted.
    """
    fields = value.split('\t')
    fields[1] = float(fields[1])  # Unix timestamp
    fields[3] = int(fields[3])  # Port
    fields[4] = float(fields[4])  # Delay
    url_part = urlparse.urlsplit(fields[6])
    # NOTE(review): the indentation of this block was lost in the source.
    # This assumes only the query-string replacement is conditional and that
    # every record is still emitted -- confirm whether records for the
    # excluded pages were meant to be dropped instead.
    if url_part.path not in ('/android-test.htm',
                             '/iphone-test.htm',
                             '/symbian-test.htm',
                             '/meego-test.htm',
                             '/mobile-test.htm',
                             '/showlog.php',
                             '/showlognew.php',):
        fields[6] = cgi.parse_qs(url_part.query)
    fields[7] = int(fields[7])  # HTTP status code
    # Emit an explicit (key, value) pair rather than the bare list: the
    # output must be a key and a single value object, so the remaining
    # fields are wrapped together in one list.
    yield fields[0], fields[1:]
def reducer(key, values):
    """Identity reducer: emit the key together with its values, unchanged.

    Args:
        key: The key being reduced.
        values: The values received for ``key``; passed through as-is,
            without being iterated or copied.

    Yields:
        A single ``(key, values)`` pair.
    """
    yield key, values
if __name__ == "__main__":
    # Hand the mapper to dumbo's runner when executed as a script. No
    # reducer is passed in this call, so reducer() is not used by it.
    dumbo.run(mapper)
执行命令:
dumbo start logparser.py -input analytics.log-20111209 -output analytics-log -python \ python2.6 -hadoop /usr/local/cloudera/hadoop-0.20.2-cdh3u3/
jobtracker日志中的hadoop错误
2012-03-15 09:53:18,931 INFO org.apache.hadoop.mapred.TaskInProgress: Error from attempt_201203131446_0006_m_000003_2: java.lang.RuntimeException: java.lang.NullPointerException
at org.apache.hadoop.streaming.PipeMapRed.waitOutputThreads(PipeMapRed.java:376)
at org.apache.hadoop.streaming.PipeMapRed.mapRedFinished(PipeMapRed.java:572)
at org.apache.hadoop.streaming.PipeMapper.close(PipeMapper.java:136)
at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:57)
at org.apache.hadoop.streaming.PipeMapRunner.run(PipeMapRunner.java:34)
at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:391)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:325)
at org.apache.hadoop.mapred.Child$4.run(Child.java:270)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1157)
at org.apache.hadoop.mapred.Child.main(Child.java:264)
Caused by: java.lang.NullPointerException
at org.apache.hadoop.streaming.io.TypedBytesOutputReader.readKeyValue(TypedBytesOutputReader.java:57)
at org…

提前感谢所有帮助。
答案 0 :(得分:0)
您必须在 map 阶段把键和值分开产出(yield),而不是产出整个列表。
def mapper(key, value):
    ...
    yield line[0], line[1:]
另外,请确保 map 输出的值是单个对象;因此,如果有多个值,必须把它们包装到一个元组(tuple)或列表中。