描述
我在使用docker swarm模式的覆盖网络时遇到问题(重要:是swarm模式,而不是swarm)。我有一个名为“internal”的覆盖网络。我有一个名为“datacollector”的服务,它被扩展到12个实例。我通过docker exec
进入在同一swarm集群中运行(并位于同一个覆盖网络上)的另一个服务,并运行curl http://datacollector
12次。但是,其中4个请求导致超时。然后我运行dig tasks.datacollector
并获取12个IP地址的列表。果然,8个IP地址可以工作,但每次都有4个超时。
我尝试将服务缩减为1个实例,然后重新调整为12个,但结果相同。
然后我使用docker service ps datacollector
查找我的服务的每个运行实例。我在每个节点上使用docker kill xxxx
来手动终止所有实例并让swarm重新创建它们。然后我再次检查dig的输出,并验证各任务的IP地址列表已与之前不同。在此之后,我又运行了curl http://datacollector
12次。现在只有3个请求有效,剩下的9个超时!
这是过去两周左右发生的第二次。上一次我必须删除所有服务,删除覆盖网络,重新创建覆盖网络,并重新创建所有服务以解决问题。显然,这不是一个可行的长期解决方案:(
docker service inspect datacollector
的输出:
[
{
"ID": "2uevc4ouakk6k3dirhgqxexz9",
"Version": {
"Index": 72152
},
"CreatedAt": "2016-11-12T20:38:51.137043037Z",
"UpdatedAt": "2016-11-17T15:22:34.402801678Z",
"Spec": {
"Name": "datacollector",
"TaskTemplate": {
"ContainerSpec": {
"Image": "507452836298.dkr.ecr.us-east-1.amazonaws.com/swarm/api:61d7931f583742cca91b368bc6d9e15314545093",
"Args": [
"node",
".",
"api/dataCollector"
],
"Env": [
"ENVIRONMENT=stage",
"MONGODB_URI=mongodb://mongodb:27017/liveearth",
"RABBITMQ_URL=amqp://rabbitmq",
"ELASTICSEARCH_URL=http://elasticsearch"
]
},
"Resources": {
"Limits": {},
"Reservations": {}
},
"RestartPolicy": {
"Condition": "any",
"MaxAttempts": 0
},
"Placement": {
"Constraints": [
"node.labels.role.api==true",
"node.labels.role.api==true",
"node.labels.role.api==true",
"node.labels.role.api==true",
"node.labels.role.api==true"
]
}
},
"Mode": {
"Replicated": {
"Replicas": 12
}
},
"UpdateConfig": {
"Parallelism": 1,
"FailureAction": "pause"
},
"Networks": [
{
"Target": "88e9fd9715o5v1hqu6dnkg3vp"
}
],
"EndpointSpec": {
"Mode": "vip"
}
},
"Endpoint": {
"Spec": {
"Mode": "vip"
},
"VirtualIPs": [
{
"NetworkID": "88e9fd9715o5v1hqu6dnkg3vp",
"Addr": "192.168.1.23/24"
}
]
},
"UpdateStatus": {
"State": "completed",
"StartedAt": "2016-11-17T15:19:34.471292948Z",
"CompletedAt": "2016-11-17T15:22:34.402794312Z",
"Message": "update completed"
}
}
]
docker network inspect internal
的输出:
[
{
"Name": "internal",
"Id": "88e9fd9715o5v1hqu6dnkg3vp",
"Scope": "swarm",
"Driver": "overlay",
"EnableIPv6": false,
"IPAM": {
"Driver": "default",
"Options": null,
"Config": [
{
"Subnet": "192.168.1.0/24",
"Gateway": "192.168.1.1"
}
]
},
"Internal": false,
"Containers": {
"03ac1e71139ff2140f93c80d9e6b1d69abf442a0c2362610bee3e116e84ef434": {
"Name": "datacollector.5.cxmvk7p1hwznautresir94m3s",
"EndpointID": "22445be80ba55b67d7cfcfbc75f2c15586bace5f317be8ba9b59c5f9f338525c",
"MacAddress": "02:42:c0:a8:01:72",
"IPv4Address": "192.168.1.114/24",
"IPv6Address": ""
},
"08ae84c7cb6e57583baf12c2a9082c1d17f1e65261cfa93346aaa9bda1244875": {
"Name": "auth.10.aasw00k7teq4knxibctlrrj7e",
"EndpointID": "c3506c851f4c9f0d06d684a9f023e7ba529d0149d70fa7834180a87ad733c678",
"MacAddress": "02:42:c0:a8:01:44",
"IPv4Address": "192.168.1.68/24",
"IPv6Address": ""
},
"192203a127d6831c3f4a41eabdd8df5282e33c3e92b99c3baaf1f213042f5418": {
"Name": "parkingcollector.1.8yrm6d831wrfsrkzhal7cf2pm",
"EndpointID": "34de6e9621ef54f7d963db942a7a7b6e0013ac6db6c9f17b384de689b1f1b187",
"MacAddress": "02:42:c0:a8:01:9a",
"IPv4Address": "192.168.1.154/24",
"IPv6Address": ""
},
"24258109e16c1a5b15dcc84a41d99a4a6617bcadecc9b35279c721c0d2855141": {
"Name": "stream.8.38npsusmpa1pf8fbnmaux57rx",
"EndpointID": "b675991ffbd5c0d051a4b68790a33307b03b48582fd1b37ba531cf5e964af0ce",
"MacAddress": "02:42:c0:a8:01:74",
"IPv4Address": "192.168.1.116/24",
"IPv6Address": ""
},
"33063b988473b73be2cbc51e912e165112de3d01bc00ee2107aa635e30a36335": {
"Name": "billing.2.ca41k2h44zkn9wfbsif0lfupf",
"EndpointID": "77c576929d5e82f1075b4cc6fcb4128ce959281d4b9c1c22d9dcd1e42eed8b5e",
"MacAddress": "02:42:c0:a8:01:87",
"IPv4Address": "192.168.1.135/24",
"IPv6Address": ""
},
"8b0929e66e6c284206ea713f7c92f1207244667d3ff02815d4bab617c349b220": {
"Name": "shotspottercollector.2.328408tiyy8aryr0g1ipmm5xm",
"EndpointID": "f2a0558ec67745f5d1601375c2090f5cd141303bf0d54bec717e3463f26ed74d",
"MacAddress": "02:42:c0:a8:01:90",
"IPv4Address": "192.168.1.144/24",
"IPv6Address": ""
},
"938fe5f6f9bb893862e8c06becd76c1a7fe5f2d3b791fc55d7d8164e67ee3553": {
"Name": "inrixproxy.2.ed77crvat0waw41phjknhhm6v",
"EndpointID": "88f550fecd60f0bdb0dfc9d5bf0c74716a91d009bcc27dc4392b113ab1215038",
"MacAddress": "02:42:c0:a8:01:96",
"IPv4Address": "192.168.1.150/24",
"IPv6Address": ""
},
"970f9d4c6ae6cc4de54a1d501408720b7d95114c28a6615d8e4e650b7e69bc40": {
"Name": "rabbitmq.1.e7j721g6hfhs8r7p3phih4g9v",
"EndpointID": "c04a4a5650ee6e10b87884004aa2cb1ec6b1c7036af15c31579462b6621436a2",
"MacAddress": "02:42:c0:a8:01:1e",
"IPv4Address": "192.168.1.30/24",
"IPv6Address": ""
},
"b1f676e6d38eec026583943dc0abff1163d21e6be9c5901539c46288f8941638": {
"Name": "logspout.0.51j8juw8aj0rjjccp2am0rib5",
"EndpointID": "98a93153abd6897c58276340df2eeec5c0ceb77fbe17d1ce8c465febb06776c7",
"MacAddress": "02:42:c0:a8:01:10",
"IPv4Address": "192.168.1.16/24",
"IPv6Address": ""
},
"bab4d80be830fa3b3fefe501c66e3640907a2cbb2addc925a0eb6967a771a172": {
"Name": "auth.2.8fduvrn5ayk024b0lkhyz50of",
"EndpointID": "7e81d41fa04ec14263a2423d8ef003d6d431a8c3ff319963197f8a8d73b4e361",
"MacAddress": "02:42:c0:a8:01:3a",
"IPv4Address": "192.168.1.58/24",
"IPv6Address": ""
},
"bc3c75a7c2d8c078eb7cc1555833ff0d374d82045dd9fb24ccfc37868615bb5e": {
"Name": "reverseproxy.6.2g20zphn5j1r2feylzcplyorg",
"EndpointID": "6c2138966ebcd144b47229a94ee603d264f3954a96ccd024d9e96501b7ffd5c0",
"MacAddress": "02:42:c0:a8:01:6c",
"IPv4Address": "192.168.1.108/24",
"IPv6Address": ""
},
"cd59d61b16ac0325336121a8558e8215e42aa5300f75054df17a70bf1f3e6c0c": {
"Name": "usgscollector.1.0h0afyw8va8maoa4tjd5qz588",
"EndpointID": "952073efc6a567ebd3f80d26811222c675183e8c76005fbf12388725a97b1bee",
"MacAddress": "02:42:c0:a8:01:48",
"IPv4Address": "192.168.1.72/24",
"IPv6Address": ""
},
"d40476e56b91762b0609acd637a4f70e42c88d266f8ebb7d9511050a8fc1df17": {
"Name": "kibana.1.6hxu5b97hfykuqr5yb9i9sn5r",
"EndpointID": "08c5188076f9b8038d864d570e7084433a8d97d4c8809d27debf71cb5d652cd7",
"MacAddress": "02:42:c0:a8:01:06",
"IPv4Address": "192.168.1.6/24",
"IPv6Address": ""
},
"e29369ad8ee5b12fb0c6f9bcb899514ab092f7da291a7c05eea758b0c19bfb65": {
"Name": "weatherbugcollector.1.crpub0hf85cewxm0qt6annsra",
"EndpointID": "afa1ddbad8ab8fdab69505ddb5342ac89c0d17bc75a11e9ac0ac8829e5885997",
"MacAddress": "02:42:c0:a8:01:2e",
"IPv4Address": "192.168.1.46/24",
"IPv6Address": ""
},
"f1bf0a656ecb9d7ef9b837efa94a050d9c98586f7312435e48b9a129c5e92e46": {
"Name": "socratacollector.1.627icslq6kdb4syaha6tzkb19",
"EndpointID": "14bea0d9ec3f94b04b32f36b7172c60316ee703651d0d920126a49dd0fa99cf5",
"MacAddress": "02:42:c0:a8:01:1b",
"IPv4Address": "192.168.1.27/24",
"IPv6Address": ""
}
},
"Options": {
"com.docker.network.driver.overlay.vxlanid_list": "257"
},
"Labels": {}
}
]
dig datacollector
的输出:
; <<>> DiG 9.9.5-9+deb8u8-Debian <<>> datacollector
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 38227
;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 0
;; QUESTION SECTION:
;datacollector. IN A
;; ANSWER SECTION:
datacollector. 600 IN A 192.168.1.23
;; Query time: 0 msec
;; SERVER: 127.0.0.11#53(127.0.0.11)
;; WHEN: Thu Nov 17 16:11:57 UTC 2016
;; MSG SIZE rcvd: 60
dig tasks.datacollector
的输出:
; <<>> DiG 9.9.5-9+deb8u8-Debian <<>> tasks.datacollector
;; global options: +cmd
;; Got answer:
;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 9810
;; flags: qr rd ra; QUERY: 1, ANSWER: 12, AUTHORITY: 0, ADDITIONAL: 0
;; QUESTION SECTION:
;tasks.datacollector. IN A
;; ANSWER SECTION:
tasks.datacollector. 600 IN A 192.168.1.115
tasks.datacollector. 600 IN A 192.168.1.66
tasks.datacollector. 600 IN A 192.168.1.22
tasks.datacollector. 600 IN A 192.168.1.114
tasks.datacollector. 600 IN A 192.168.1.37
tasks.datacollector. 600 IN A 192.168.1.139
tasks.datacollector. 600 IN A 192.168.1.148
tasks.datacollector. 600 IN A 192.168.1.110
tasks.datacollector. 600 IN A 192.168.1.112
tasks.datacollector. 600 IN A 192.168.1.100
tasks.datacollector. 600 IN A 192.168.1.39
tasks.datacollector. 600 IN A 192.168.1.106
;; Query time: 0 msec
;; SERVER: 127.0.0.11#53(127.0.0.11)
;; WHEN: Thu Nov 17 16:08:54 UTC 2016
;; MSG SIZE rcvd: 457
docker version
的输出:
Client:
Version: 1.12.3
API version: 1.24
Go version: go1.6.3
Git commit: 6b644ec
Built: Wed Oct 26 23:26:11 2016
OS/Arch: darwin/amd64
Server:
Version: 1.12.3
API version: 1.24
Go version: go1.6.3
Git commit: 6b644ec
Built: Wed Oct 26 21:44:32 2016
OS/Arch: linux/amd64
docker info
的输出:
Containers: 58
Running: 15
Paused: 0
Stopped: 43
Images: 123
Server Version: 1.12.3
Storage Driver: aufs
Root Dir: /var/lib/docker/aufs
Backing Filesystem: extfs
Dirs: 430
Dirperm1 Supported: false
Logging Driver: json-file
Cgroup Driver: cgroupfs
Plugins:
Volume: local
Network: host null overlay bridge
Swarm: active
NodeID: 8uxexr2uz3qpn5x1km9k4le9s
Is Manager: true
ClusterID: 2kd4md2qyu67szx4y6q2npnet
Managers: 3
Nodes: 8
Orchestration:
Task History Retention Limit: 5
Raft:
Snapshot Interval: 10000
Heartbeat Tick: 1
Election Tick: 3
Dispatcher:
Heartbeat Period: 5 seconds
CA Configuration:
Expiry Duration: 3 months
Node Address: 10.10.44.201
Runtimes: runc
Default Runtime: runc
Security Options: apparmor
Kernel Version: 3.13.0-91-generic
Operating System: Ubuntu 14.04.4 LTS
OSType: linux
Architecture: x86_64
CPUs: 2
Total Memory: 3.676 GiB
Name: stage-0
ID: 76Z2:GN43:RQND:BBAJ:AGUU:S3F7:JWBC:CCCK:I4VH:PKYC:UHQT:IR2U
Docker Root Dir: /var/lib/docker
Debug Mode (client): false
Debug Mode (server): false
Username: herbrandson
Registry: https://index.docker.io/v1/
WARNING: No swap limit support
Labels:
provider=generic
Insecure Registries:
127.0.0.0/8
其他环境详情: Docker swarm模式(不是swarm)。所有节点都在AWS上运行。该集群有8个节点(3个管理节点和5个工作节点)
更新: 根据评论,以下是swarm主节点上docker守护进程日志的片段:
time="2016-11-17T15:19:45.890158968Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=ch6w74b3cu78y8r2ugkmfmu8a
time="2016-11-17T15:19:48.929507277Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=exb6dfc067nxudzr8uo1eyj4e
time="2016-11-17T15:19:50.104962867Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=6mbbfkilj9gslfi33w7sursb9
time="2016-11-17T15:19:50.877223204Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=drd8o0yn1cg5t3k76frxgukaq
time="2016-11-17T15:19:54.680427504Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=9lwl5v0f2v6p52shg6gixs3j7
time="2016-11-17T15:19:54.949118806Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=51q1eeilfspsm4cx79nfkl4r0
time="2016-11-17T15:19:56.485909146Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=3vjzfjjdrjio2gx45q9c3j6qd
time="2016-11-17T15:19:56.934070026Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:00.000614497Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:00.163458802Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=4xa2ub5npxyxpyx3vd5n1gsuy
time="2016-11-17T15:20:01.463407652Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:01.949087337Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:02.942094926Z" level=error msg="Failed to create real server 192.168.1.150 for vip 192.168.1.32 fwmark 947 in sb 938fe5f6f9bb893862e8c06becd76c1a7fe5f2d3b791fc55d7d8164e67ee3553: no such process"
time="2016-11-17T15:20:03.319168359Z" level=error msg="Failed to delete a new service for vip 192.168.1.61 fwmark 2133: no such process"
time="2016-11-17T15:20:03.363775880Z" level=error msg="Failed to add firewall mark rule in sbox /var/run/docker/netns/5de57ee133a5: reexec failed: exit status 5"
time="2016-11-17T15:20:05.772683092Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:06.059212643Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:07.335686642Z" level=error msg="Failed to delete a new service for vip 192.168.1.67 fwmark 2134: no such process"
time="2016-11-17T15:20:07.385135664Z" level=error msg="Failed to add firewall mark rule in sbox /var/run/docker/netns/6699e7c03bbd: reexec failed: exit status 5"
time="2016-11-17T15:20:07.604064777Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:07.673852364Z" level=error msg="Failed to delete a new service for vip 192.168.1.75 fwmark 2097: no such process"
time="2016-11-17T15:20:07.766525370Z" level=error msg="Failed to add firewall mark rule in sbox /var/run/docker/netns/6699e7c03bbd: reexec failed: exit status 5"
time="2016-11-17T15:20:09.080101131Z" level=error msg="Failed to create real server 192.168.1.155 for vip 192.168.1.35 fwmark 904 in sb 192203a127d6831c3f4a41eabdd8df5282e33c3e92b99c3baaf1f213042f5418: no such process"
time="2016-11-17T15:20:11.516338629Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:20:11.729274237Z" level=error msg="Failed to delete a new service for vip 192.168.1.83 fwmark 2124: no such process"
time="2016-11-17T15:20:11.887572806Z" level=error msg="Failed to add firewall mark rule in sbox /var/run/docker/netns/5b810132057e: reexec failed: exit status 5"
time="2016-11-17T15:20:12.281481060Z" level=error msg="Failed to delete a new service for vip 192.168.1.73 fwmark 2136: no such process"
time="2016-11-17T15:20:12.395326864Z" level=error msg="Failed to add firewall mark rule in sbox /var/run/docker/netns/5b810132057e: reexec failed: exit status 5"
time="2016-11-17T15:20:20.263565036Z" level=error msg="Failed to create real server 192.168.1.72 for vip 192.168.1.91 fwmark 2163 in sb cd59d61b16ac0325336121a8558e8215e42aa5300f75054df17a70bf1f3e6c0c: no such process"
time="2016-11-17T15:20:20.410996971Z" level=error msg="Failed to delete a new service for vip 192.168.1.95 fwmark 2144: no such process"
time="2016-11-17T15:20:20.456710211Z" level=error msg="Failed to add firewall mark rule in sbox /var/run/docker/netns/88d38a2bfb77: reexec failed: exit status 5"
time="2016-11-17T15:20:21.389253510Z" level=error msg="Failed to create real server 192.168.1.46 for vip 192.168.1.99 fwmark 2145 in sb cd59d61b16ac0325336121a8558e8215e42aa5300f75054df17a70bf1f3e6c0c: no such process"
time="2016-11-17T15:20:22.208965378Z" level=error msg="Failed to create real server 192.168.1.46 for vip 192.168.1.99 fwmark 2145 in sb e29369ad8ee5b12fb0c6f9bcb899514ab092f7da291a7c05eea758b0c19bfb65: no such process"
time="2016-11-17T15:20:23.334582312Z" level=error msg="Failed to create a new service for vip 192.168.1.97 fwmark 2166: file exists"
time="2016-11-17T15:20:23.495873232Z" level=error msg="Failed to create real server 192.168.1.48 for vip 192.168.1.17 fwmark 552 in sb e29369ad8ee5b12fb0c6f9bcb899514ab092f7da291a7c05eea758b0c19bfb65: no such process"
time="2016-11-17T15:20:25.831988014Z" level=error msg="Failed to create real server 192.168.1.116 for vip 192.168.1.41 fwmark 566 in sb 03ac1e71139ff2140f93c80d9e6b1d69abf442a0c2362610bee3e116e84ef434: no such process"
time="2016-11-17T15:20:25.850904011Z" level=error msg="Failed to create real server 192.168.1.116 for vip 192.168.1.41 fwmark 566 in sb 03ac1e71139ff2140f93c80d9e6b1d69abf442a0c2362610bee3e116e84ef434: no such process"
time="2016-11-17T15:20:37.159637665Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=6yhu3glre4tbz6d08lk2pq9eb
time="2016-11-17T15:20:48.229343512Z" level=error msg="Error closing logger: invalid argument"
time="2016-11-17T15:51:16.027686909Z" level=error msg="Error getting service internal: service internal not found"
time="2016-11-17T15:51:16.027708795Z" level=error msg="Handler for GET /v1.24/services/internal returned error: service internal not found"
time="2016-11-17T16:15:50.946921655Z" level=error msg="container status unavailable" error="context canceled" module=taskmanager task.id=cxmvk7p1hwznautresir94m3s
time="2016-11-17T16:16:01.994494784Z" level=error msg="Error closing logger: invalid argument"
更新2: 我尝试删除该服务并重新创建它,但这并没有解决问题。
更新3: 我逐个重新启动了集群中的每个节点。之后一切似乎恢复了正常。但是,我仍然不知道问题是由什么造成的。更重要的是,如何防止将来再次发生这种情况?