我的群集中有100个nodemanagers,1000个内核和4.8T内存。 但是,最近有些事情让我在nodemanager上发疯了,偶尔会发生一次,而不是每天都发生。
在Cloudera Management中,健康测试显示" GC持续时间未知"和#34; Web服务器状态错误":
然后,我在群集中的应用程序将出现超时或线程中断错误。
和nodemanager错误日志如下所示:
java.io.IOException: Failed on local exception: java.io.InterruptedIOException: Interrupted: action=RetryAction(action=RETRY, delayMillis=1000, reason=null), retry policy=RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS); Host Details : local host is: "lfh-R720-20/10.1.0.20"; destination host is: "lfh-R720-20":8040;
at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:772)
at org.apache.hadoop.ipc.Client.call(Client.java:1472)
at org.apache.hadoop.ipc.Client.call(Client.java:1399)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:232)
at com.sun.proxy.$Proxy40.heartbeat(Unknown Source)
at org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:62)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:235)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:129)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1132)
Caused by: java.io.InterruptedIOException: Interrupted: action=RetryAction(action=RETRY, delayMillis=1000, reason=null), retry policy=RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
at org.apache.hadoop.ipc.Client$Connection.handleConnectionFailure(Client.java:855)
at org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:626)
at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:705)
at org.apache.hadoop.ipc.Client$Connection.access$2800(Client.java:368)
at org.apache.hadoop.ipc.Client.getConnection(Client.java:1521)
at org.apache.hadoop.ipc.Client.call(Client.java:1438)
... 8 more
Caused by: java.lang.InterruptedException: sleep interrupted
at java.lang.Thread.sleep(Native Method)
at org.apache.hadoop.ipc.Client$Connection.handleConnectionFailure(Client.java:853)
... 13 more
java.io.IOException: Failed on local exception: java.io.InterruptedIOException: Interrupted: action=RetryAction(action=RETRY, delayMillis=1000, reason=null), retry policy=RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS); Host Details : local host is: "lfh-R720-20/10.1.0.20"; destination host is: "lfh-R720-20":8040;
at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:772)
at org.apache.hadoop.ipc.Client.call(Client.java:1472)
at org.apache.hadoop.ipc.Client.call(Client.java:1399)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:232)
at com.sun.proxy.$Proxy40.heartbeat(Unknown Source)
at org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:62)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:235)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:129)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1132)
Caused by: java.io.InterruptedIOException: Interrupted: action=RetryAction(action=RETRY, delayMillis=1000, reason=null), retry policy=RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
at org.apache.hadoop.ipc.Client$Connection.handleConnectionFailure(Client.java:855)
at org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:626)
at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:705)
at org.apache.hadoop.ipc.Client$Connection.access$2800(Client.java:368)
at org.apache.hadoop.ipc.Client.getConnection(Client.java:1521)
at org.apache.hadoop.ipc.Client.call(Client.java:1438)
... 8 more
Caused by: java.lang.InterruptedException: sleep interrupted
at java.lang.Thread.sleep(Native Method)
at org.apache.hadoop.ipc.Client$Connection.handleConnectionFailure(Client.java:853)
... 13 more
java.io.IOException: Failed on local exception: java.io.InterruptedIOException: Interrupted: action=RetryAction(action=RETRY, delayMillis=1000, reason=null), retry policy=RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS); Host Details : local host is: "lfh-R720-20/10.1.0.20"; destination host is: "lfh-R720-20":8040;
at org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:772)
at org.apache.hadoop.ipc.Client.call(Client.java:1472)
at org.apache.hadoop.ipc.Client.call(Client.java:1399)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:232)
at com.sun.proxy.$Proxy40.heartbeat(Unknown Source)
at org.apache.hadoop.yarn.server.nodemanager.api.impl.pb.client.LocalizationProtocolPBClientImpl.heartbeat(LocalizationProtocolPBClientImpl.java:62)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.localizeFiles(ContainerLocalizer.java:235)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ContainerLocalizer.runLocalization(ContainerLocalizer.java:169)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.startLocalizer(DefaultContainerExecutor.java:129)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.localizer.ResourceLocalizationService$LocalizerRunner.run(ResourceLocalizationService.java:1132)
Caused by: java.io.InterruptedIOException: Interrupted: action=RetryAction(action=RETRY, delayMillis=1000, reason=null), retry policy=RetryUpToMaximumCountWithFixedSleep(maxRetries=10, sleepTime=1000 MILLISECONDS)
at org.apache.hadoop.ipc.Client$Connection.handleConnectionFailure(Client.java:855)
at org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:626)
at org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:705)
at org.apache.hadoop.ipc.Client$Connection.access$2800(Client.java:368)
at org.apache.hadoop.ipc.Client.getConnection(Client.java:1521)
at org.apache.hadoop.ipc.Client.call(Client.java:1438)
... 8 more
Caused by: java.lang.InterruptedException: sleep interrupted
at java.lang.Thread.sleep(Native Method)
at org.apache.hadoop.ipc.Client$Connection.handleConnectionFailure(Client.java:853)
... 13 more
谢谢你!