我有一个docker容器,在其中使用可执行jar运行我的java应用程序。我把容器的内存大小设置为 2 GB,java应用的堆内存为 1 GB(-Xmx1024m -Xss256k -Xms256m)。偶尔我的应用程序会以退出代码137退出(即128 + 9,表示docker因为容器使用的内存超过了分配的内存而终止了容器进程)。
日志中没有任何OOM错误(堆或元空间)。我也集成了NewRelic,它同样显示JVM指标处于可控范围。(我知道NR会持续聚合JVM指标,但每1分钟才上传一次;我认为应用程序在这1分钟内需要了300-400 MB内存,而在NR报告这一增长之前,容器就崩溃了)
我执行了 docker inspect <容器ID>,下面是输出(可以看到其中 "OOMKilled": true)
[
{
"Id": "85f8dc2d2f18956e5fa0cb573e2f6f115348ffb1b6fa6c67a1c35cd0cf0fc069",
"Created": "2018-03-09T22:08:23.220466103Z",
"Path": "/bin/sh",
"Args": [
"-c",
"exec java -javaagent:/app/newrelic/newrelic.jar -Dnewrelic.environment=${RUNENV} -jar -server -Xmx1024m -Xss256k -Xms256m -XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=150m -XX:CompressedClassSpaceSize=32m -XX:ReservedCodeCacheSize=16m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:+UseStringDeduplication -Dspring.profiles.active=${RUNENV} ${APPNAME}-${SNAPSHOT}.jar",
"/bin/sh",
"-c",
"[\"java\" \"-jar\" \"newrelic.jar\" \"install\"]"
],
"State": {
"Status": "exited",
"Running": false,
"Paused": false,
"Restarting": false,
"OOMKilled": true,
"Dead": false,
"Pid": 0,
"ExitCode": 137,
"Error": "",
"StartedAt": "2018-03-09T22:08:24.00385629Z",
"FinishedAt": "2018-03-11T11:59:00.847375256Z"
},
"Image": "sha256:ffc04450b7b1c2df258210cf85b5183414db46562f819668958e3d6cbda64aef",
"ResolvConfPath": "/var/lib/docker/containers/85f8dc2d2f18956e5fa0cb573e2f6f115348ffb1b6fa6c67a1c35cd0cf0fc069/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/85f8dc2d2f18956e5fa0cb573e2f6f115348ffb1b6fa6c67a1c35cd0cf0fc069/hostname",
"HostsPath": "/var/lib/docker/containers/85f8dc2d2f18956e5fa0cb573e2f6f115348ffb1b6fa6c67a1c35cd0cf0fc069/hosts",
"LogPath": "/var/lib/docker/containers/85f8dc2d2f18956e5fa0cb573e2f6f115348ffb1b6fa6c67a1c35cd0cf0fc069/85f8dc2d2f18956e5fa0cb573e2f6f115348ffb1b6fa6c67a1c35cd0cf0fc069-json.log",
"Name": "/prod_ats_1",
"RestartCount": 0,
"Driver": "devicemapper",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "",
"ExecIDs": null,
"HostConfig": {
"Binds": [],
"ContainerIDFile": "",
"LogConfig": {
"Type": "json-file",
"Config": {}
},
"NetworkMode": "prod_default",
"PortBindings": {
"8120/tcp": [
{
"HostIp": "",
"HostPort": ""
}
]
},
"RestartPolicy": {
"Name": "",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "",
"VolumesFrom": [],
"CapAdd": null,
"CapDrop": null,
"Dns": null,
"DnsOptions": null,
"DnsSearch": null,
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": false,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": null,
"StorageOpt": null,
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"ConsoleSize": [
0,
0
],
"Isolation": "",
"CpuShares": 0,
"Memory": 2147483648,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": null,
"BlkioDeviceReadBps": null,
"BlkioDeviceWriteBps": null,
"BlkioDeviceReadIOps": null,
"BlkioDeviceWriteIOps": null,
"CpuPeriod": 0,
"CpuQuota": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": null,
"DiskQuota": 0,
"KernelMemory": 0,
"MemoryReservation": 0,
"MemorySwap": 4294967296,
"MemorySwappiness": -1,
"OomKillDisable": false,
"PidsLimit": 0,
"Ulimits": null,
"CpuCount": 0,
"CpuPercent": 0,
"BlkioIOps": 0,
"BlkioBps": 0,
"SandboxSize": 0
},
"GraphDriver": {
"Name": "devicemapper",
"Data": {
"DeviceId": "6652",
"DeviceName": "docker-202:1-398151-e96af3795ec923bbdac4e2c7ff5e045e6985d94b2e36e05214fc3a71213eee59",
"DeviceSize": "10737418240"
}
},
"Mounts": [],
"Config": {
"Hostname": "85f8dc2d2f18",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"ExposedPorts": {
"8120/tcp": {}
},
"Tty": false,
"OpenStdin": false,
"StdinOnce": false,
"Env": [
"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/jvm/java-1.8-openjdk/jre/bin:/usr/lib/jvm/java-1.8-openjdk/bin",
"LANG=C.UTF-8",
"JAVA_HOME=/usr/lib/jvm/java-1.8-openjdk",
"JAVA_VERSION=8u111",
"JAVA_ALPINE_VERSION=8.111.14-r0",
"RUNENV=prod",
"APPNAME=ats",
"SNAPSHOT=1.0.0-SNAPSHOT",
"NEW_RELIC_ENVIRONMENT=prod"
],
"Cmd": [
"/bin/sh",
"-c",
"[\"java\" \"-jar\" \"newrelic.jar\" \"install\"]"
],
"Image": "nethum/ats:1.0.0-SNAPSHOT-prod",
"Volumes": null,
"WorkingDir": "/app",
"Entrypoint": [
"/bin/sh",
"-c",
"exec java -javaagent:/app/newrelic/newrelic.jar -Dnewrelic.environment=${RUNENV} -jar -server -Xmx1024m -Xss256k -Xms256m -XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=150m -XX:CompressedClassSpaceSize=32m -XX:ReservedCodeCacheSize=16m -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:+UseStringDeduplication -Dspring.profiles.active=${RUNENV} ${APPNAME}-${SNAPSHOT}.jar"
],
"OnBuild": null,
"Labels": {
"com.docker.compose.config-hash": "6e1c97f2951d45feeb07eed99ea87734e65b5df6d52ab5f5e39eb31c77d66104",
"com.docker.compose.container-number": "1",
"com.docker.compose.oneoff": "False",
"com.docker.compose.project": "prod",
"com.docker.compose.service": "ats",
"com.docker.compose.version": "1.9.0"
}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "cd3792031b973869dba7178372f0cce8ed040486cf81a91d762d365fdcb96a53",
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"Ports": null,
"SandboxKey": "/var/run/docker/netns/cd3792031b97",
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"prod_default": {
"IPAMConfig": null,
"Links": null,
"Aliases": [
"ats",
"85f8dc2d2f18"
],
"NetworkID": "7d6ec63949a05c0122643a05aedf4b5ca114ecd9f9e71aaa6c7d80bb1c0ca2ff",
"EndpointID": "",
"Gateway": "",
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"MacAddress": ""
}
}
}
}]
docker stats 在启动容器后立即显示内存使用为1.7GB。
**CONTAINER CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O PIDS**
85f8dc2d2f18 3.39% 1.594 GB / 2.147 GB 74.23% 556.6 kB / 660.9 kB 171.3 MB / 0 B 0
我尝试进行堆转储,但应用是以进程ID 1运行的,我无法连接到它,即使通过JMX设置也不行。
我已经通过NewRelic 进行了线程转储,但这并没有显示任何问题。
我已经走投无路了,不知道如何调试并修复此问题。
我在 New Relic 中设置了堆内存警报,当堆使用率达到80%时触发;我每天会收到2-3次警报,但在GC运行后警报就会解除(可以在NR的JVM指标部分对照验证)。而在容器被杀死之前,并没有出现堆内存(memory)使用警报。
这是在容器内执行 ps -aef 的输出:
PID USER TIME COMMAND
1 root 8:40 java -javaagent:/app/newrelic/newrelic.jar -Dnewrelic.environment=prod -jar -server -Xmx1024m -Xss256k -Xms256m -XX:MetaspaceSize=64m -XX:MaxMetaspaceSize=150
181 root 0:00 sh
210 root 0:00 ps -aef
很少有人不知道答案,
答案 0 :(得分:0)
1GB的内存非常少。另外,请将-Xmx和-Xms设置为相同的值。这样,JVM就不需要在运行时调整堆大小,也就不会因为调整失败而导致程序崩溃。
docker exec -it <container> /bin/bash
这样你就在容器中得到了一个shell。然后你可以用 ps -ef 或其他命令来查找进程。不过你的Dockerfile中肯定有 FROM 声明,所以你应该知道容器里会有什么。既然你已经在机器上有了一个shell,就可以用 jmap 进行堆转储(如果你足够快的话!)
vmstat 是开始分析内存使用情况、交换以及页面换入/换出的好工具。top 也能提供有用的信息。
也许你甚至不需要1GB的堆。运行 jmap -histo:live 来查看存活对象的大小(这些对象很可能永远不会被垃圾回收)。把初始堆大小设置为该大小的约3到4倍,可以作为一个起点。在此基础上,如果在存活数据对象之外新创建的对象都没有被晋升到老年代,那就是理想的状态……