In Hadoop 2.2.0, I have some values that the combiner can emit directly instead of sending them on to the reducer.

I tried using MultipleOutputs to write from the combiner to a specific file. Unfortunately, when several combiner instances are created for a single mapper, I get an exception because they all try to create the same file.

Is there a way to create a different partition (file) for each combiner?

Simplified code:
public class Combiner1 extends Reducer {

    String FILENAME = "tmp_round1_comb.txt";
    protected MultipleOutputs mos;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        String s = String.format("Combiner-%05d-%d",
                context.getTaskAttemptID().getTaskID().getId(),
                context.getTaskAttemptID().getId());
        LOG.info(s);
        mos = new MultipleOutputs(context);
    }

    @Override
    protected void reduce(Text key, Iterable values, Context context)
            throws IOException, InterruptedException {
        if (specialCase) {
            ...
            // every combiner instance writes to the same base path
            mos.write(out1, out2, FILENAME);
        } else {
            ...
            // normal path: pass the data on to the reducer
            context.write(key, out);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close();
    }
}
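One direction I have been toying with is making the base output path unique per combiner instance, since Hadoop can run the combiner several times within one map task (once per spill, as the stack trace below shows), so every run otherwise tries to create the same file inside the same attempt directory. Below is a minimal, untested sketch of that idea; the UniquePathCombiner name, the Text/Text generic types, and the static AtomicInteger counter are my own additions for illustration, not part of the real job:

import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

public class UniquePathCombiner extends Reducer<Text, Text, Text, Text> {

    // Per-JVM counter: successive combiner instances created within the
    // same task attempt (one per spill) get distinct values.
    private static final AtomicInteger INSTANCE_COUNTER = new AtomicInteger();

    private MultipleOutputs<Text, Text> mos;
    private String basePath; // unique base output path for this instance

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        // Combine the task id with the instance counter; MultipleOutputs
        // appends the task suffix, yielding e.g. tmp_round1_comb-00000-3.txt-m-00000
        basePath = String.format("tmp_round1_comb-%05d-%d.txt",
                context.getTaskAttemptID().getTaskID().getId(),
                INSTANCE_COUNTER.getAndIncrement());
        mos = new MultipleOutputs<Text, Text>(context);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            // side output goes to the instance-specific file instead of
            // the shared tmp_round1_comb.txt that triggers the collision
            mos.write(key, value, basePath);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        mos.close(); // flush and close this instance's record writers
    }
}

Even if this sidesteps the name clash, it feels fragile (the counter is only unique per JVM, and the combiner may also run on the reduce side), so I would still like to know whether there is a supported way to do this.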
The exception (note the nested FileAlreadyExistsException: the combiner runs once per spill, via MapOutputBuffer.sortAndSpill → Task$NewCombinerRunner.combine, and every run tries to create the same tmp_round1_comb.txt-m-00000):
2014-10-13 14:53:00,045 INFO [main] org.apache.hadoop.mapred.MapTask: Starting flush of map output
2014-10-13 14:53:00,045 INFO [main] org.apache.hadoop.mapred.MapTask: Ignoring exception during close for org.apache.hadoop.mapred.MapTask$NewOutputCollector@61f2bf35
java.io.IOException: Spill failed
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.checkSpillException(MapTask.java:1535)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.flush(MapTask.java:1444)
    at org.apache.hadoop.mapred.MapTask$NewOutputCollector.close(MapTask.java:700)
    at org.apache.hadoop.mapred.MapTask.closeQuietly(MapTask.java:1990)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:774)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:340)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:167)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1554)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:162)
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: failed to create file /user/cloudera/scratch/Profiling_q4/20141013_145034/tmp/TMP_0_Out1/_temporary/1/_temporary/attempt_1412109710756_0057_m_000000_0/tmp_round1_comb.txt-m-00000 on client 127.0.0.1 because the file exists
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInternal(FSNamesystem.java:2307)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInt(FSNamesystem.java:2235)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFile(FSNamesystem.java:2188)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.create(NameNodeRpcServer.java:505)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.create(ClientNamenodeProtocolServerSideTranslatorPB.java:354)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1986)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1982)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1554)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1980)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
    at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
    at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:73)
    at org.apache.hadoop.hdfs.DFSOutputStream.newStreamForCreate(DFSOutputStream.java:1603)
    at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1461)
    at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1386)
    at org.apache.hadoop.hdfs.DistributedFileSystem$6.doCall(DistributedFileSystem.java:394)
    at org.apache.hadoop.hdfs.DistributedFileSystem$6.doCall(DistributedFileSystem.java:390)
    at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
    at org.apache.hadoop.hdfs.DistributedFileSystem.create(DistributedFileSystem.java:390)
    at org.apache.hadoop.hdfs.DistributedFileSystem.create(DistributedFileSystem.java:334)
    at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:906)
    at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:887)
    at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:784)
    at org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.getRecordWriter(TextOutputFormat.java:132)
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.getRecordWriter(MultipleOutputs.java:475)
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.write(MultipleOutputs.java:457)
    at mapreduce.guardedfragment.executor.hadoop.combiners.GFCombiner1.reduce(GFCombiner1.java:139)
    at mapreduce.guardedfragment.executor.hadoop.combiners.GFCombiner1.reduce(GFCombiner1.java:31)
    at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
    at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1645)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1611)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$900(MapTask.java:853)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1505)
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.fs.FileAlreadyExistsException): failed to create file /user/cloudera/scratch/Profiling_q4/20141013_145034/tmp/TMP_0_Out1/_temporary/1/_temporary/attempt_1412109710756_0057_m_000000_0/tmp_round1_comb.txt-m-00000 on client 127.0.0.1 because the file exists
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInternal(FSNamesystem.java:2307)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInt(FSNamesystem.java:2235)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFile(FSNamesystem.java:2188)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.create(NameNodeRpcServer.java:505)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.create(ClientNamenodeProtocolServerSideTranslatorPB.java:354)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1986)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1982)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1554)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1980)
    at org.apache.hadoop.ipc.Client.call(Client.java:1409)
    at org.apache.hadoop.ipc.Client.call(Client.java:1362)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)
    at com.sun.proxy.$Proxy10.create(Unknown Source)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
    at com.sun.proxy.$Proxy10.create(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.create(ClientNamenodeProtocolTranslatorPB.java:258)
    at org.apache.hadoop.hdfs.DFSOutputStream.newStreamForCreate(DFSOutputStream.java:1599)
    ... 20 more
2014-10-13 14:53:00,048 WARN [main] org.apache.hadoop.security.UserGroupInformation: PriviledgedActionException as:cloudera (auth:SIMPLE) cause:java.io.IOException: Spill failed
2014-10-13 14:53:00,049 WARN [main] org.apache.hadoop.mapred.YarnChild: Exception running child : java.io.IOException: Spill failed
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.checkSpillException(MapTask.java:1535)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$300(MapTask.java:853)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$Buffer.write(MapTask.java:1349)
    at java.io.DataOutputStream.write(DataOutputStream.java:107)
    at org.apache.hadoop.io.Text.write(Text.java:324)
    at org.apache.hadoop.io.serializer.WritableSerialization$WritableSerializer.serialize(WritableSerialization.java:98)
    at org.apache.hadoop.io.serializer.WritableSerialization$WritableSerializer.serialize(WritableSerialization.java:82)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1126)
    at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:692)
    at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
    at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
    at mapreduce.guardedfragment.executor.hadoop.mappers.GFMapper1Guard.map(GFMapper1Guard.java:98)
    at mapreduce.guardedfragment.executor.hadoop.mappers.GFMapper1Guard.map(GFMapper1Guard.java:37)
    at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:145)
    at org.apache.hadoop.mapreduce.lib.input.DelegatingMapper.run(DelegatingMapper.java:55)
    at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:340)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:167)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1554)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:162)
Caused by: org.apache.hadoop.fs.FileAlreadyExistsException: failed to create file /user/cloudera/scratch/Profiling_q4/20141013_145034/tmp/TMP_0_Out1/_temporary/1/_temporary/attempt_1412109710756_0057_m_000000_0/tmp_round1_comb.txt-m-00000 on client 127.0.0.1 because the file exists
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInternal(FSNamesystem.java:2307)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInt(FSNamesystem.java:2235)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFile(FSNamesystem.java:2188)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.create(NameNodeRpcServer.java:505)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.create(ClientNamenodeProtocolServerSideTranslatorPB.java:354)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1986)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1982)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1554)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1980)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
    at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)
    at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
    at java.lang.reflect.Constructor.newInstance(Constructor.java:526)
    at org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106)
    at org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:73)
    at org.apache.hadoop.hdfs.DFSOutputStream.newStreamForCreate(DFSOutputStream.java:1603)
    at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1461)
    at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1386)
    at org.apache.hadoop.hdfs.DistributedFileSystem$6.doCall(DistributedFileSystem.java:394)
    at org.apache.hadoop.hdfs.DistributedFileSystem$6.doCall(DistributedFileSystem.java:390)
    at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
    at org.apache.hadoop.hdfs.DistributedFileSystem.create(DistributedFileSystem.java:390)
    at org.apache.hadoop.hdfs.DistributedFileSystem.create(DistributedFileSystem.java:334)
    at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:906)
    at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:887)
    at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:784)
    at org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.getRecordWriter(TextOutputFormat.java:132)
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.getRecordWriter(MultipleOutputs.java:475)
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.write(MultipleOutputs.java:457)
    at mapreduce.guardedfragment.executor.hadoop.combiners.GFCombiner1.reduce(GFCombiner1.java:139)
    at mapreduce.guardedfragment.executor.hadoop.combiners.GFCombiner1.reduce(GFCombiner1.java:31)
    at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
    at org.apache.hadoop.mapred.Task$NewCombinerRunner.combine(Task.java:1645)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.sortAndSpill(MapTask.java:1611)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.access$900(MapTask.java:853)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$SpillThread.run(MapTask.java:1505)
Caused by: org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.fs.FileAlreadyExistsException): failed to create file /user/cloudera/scratch/Profiling_q4/20141013_145034/tmp/TMP_0_Out1/_temporary/1/_temporary/attempt_1412109710756_0057_m_000000_0/tmp_round1_comb.txt-m-00000 on client 127.0.0.1 because the file exists
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInternal(FSNamesystem.java:2307)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFileInt(FSNamesystem.java:2235)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.startFile(FSNamesystem.java:2188)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.create(NameNodeRpcServer.java:505)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.create(ClientNamenodeProtocolServerSideTranslatorPB.java:354)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:585)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1026)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1986)
    at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:1982)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:415)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1554)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:1980)
    at org.apache.hadoop.ipc.Client.call(Client.java:1409)
    at org.apache.hadoop.ipc.Client.call(Client.java:1362)
    at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)
    at com.sun.proxy.$Proxy10.create(Unknown Source)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
    at com.sun.proxy.$Proxy10.create(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.create(ClientNamenodeProtocolTranslatorPB.java:258)
    at org.apache.hadoop.hdfs.DFSOutputStream.newStreamForCreate(DFSOutputStream.java:1599)
    ... 20 more
2014-10-13 14:53:00,065 INFO [main] org.apache.hadoop.mapred.Task: Runnning cleanup for the task