AWS Fargate 运行状况检查失败

时间:2021-07-10 09:47:58

标签: amazon-web-services amazon-ecs aws-fargate

我有一个想在 AWS 上通过 Fargate 启动的任务定义。目前没有配置负载均衡之类的东西,我只想让任务运行起来。定义如下:

{
  "ipcMode": null,
  "executionRoleArn": "arn:aws:iam::941606308749:role/ecsTaskExecutionRole",
  "containerDefinitions": [
    {
      "dnsSearchDomains": null,
      "environmentFiles": null,
      "logConfiguration": {
        "logDriver": "awslogs",
        "secretOptions": null,
        "options": {
          "awslogs-group": "/ecs/web",
          "awslogs-region": "eu-central-1",
          "awslogs-stream-prefix": "ecs"
        }
      },
      "entryPoint": null,
      "portMappings": [
        {
          "hostPort": 8000,
          "protocol": "tcp",
          "containerPort": 8000
        }
      ],
      "command": null,
      "linuxParameters": null,
      "cpu": 512,
      "environment": [
        {
          "name": "AWS_STORAGE_BUCKET_NAME",
          "value": "blacksheep-dev2"
        },
        {
          "name": "CELERY_BROKER_HOST",
          "value": "https://sqs.eu-central-1.amazonaws.com/941606308749/BlackSheepLearnsBroker"
        },
        {
          "name": "POSTGRES_DB",
          "value": "postgres"
        },
        {
          "name": "POSTGRES_HOST",
          "value": "blacksheeplearnsdb.c9a9ehc0s9ms.eu-central-1.rds.amazonaws.com"
        },
        {
          "name": "POSTGRES_USER",
          "value": "postgres"
        },
        {
          "name": "ROLLBAR_ENABLED",
          "value": "True"
        }
      ],
      "resourceRequirements": null,
      "ulimits": null,
      "dnsServers": null,
      "mountPoints": [],
      "workingDirectory": null,
      "secrets": null,
      "dockerSecurityOptions": null,
      "memory": null,
      "memoryReservation": 1024,
      "volumesFrom": [],
      "stopTimeout": null,
      "image": "941606308749.dkr.ecr.eu-central-1.amazonaws.com/blacksheeplearns:latest",
      "startTimeout": null,
      "firelensConfiguration": null,
      "dependsOn": null,
      "disableNetworking": null,
      "interactive": null,
      "healthCheck": {
        "retries": 3,
        "command": [
          "CMD-SHELL",
          "curl -f http://localhost:8000/health/ || exit 1"
        ],
        "timeout": 5,
        "interval": 30,
        "startPeriod": 30
      },
      "essential": true,
      "links": null,
      "hostname": null,
      "extraHosts": null,
      "pseudoTerminal": null,
      "user": null,
      "readonlyRootFilesystem": null,
      "dockerLabels": {
        "project": "BlackSheepLearns"
      },
      "systemControls": null,
      "privileged": null,
      "name": "web"
    }
  ],
  "placementConstraints": [],
  "memory": "1024",
  "taskRoleArn": "arn:aws:iam::941606308749:role/ecsTaskRole",
  "compatibilities": [
    "EC2",
    "FARGATE"
  ],
  "taskDefinitionArn": "arn:aws:ecs:eu-central-1:941606308749:task-definition/web:14",
  "family": "web",
  "requiresAttributes": [
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.logging-driver.awslogs"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "ecs.capability.execution-role-awslogs"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.ecr-auth"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.19"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.21"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.task-iam-role"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "ecs.capability.container-health-check"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "ecs.capability.execution-role-ecr-pull"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.18"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "ecs.capability.task-eni"
    },
    {
      "targetId": null,
      "targetType": null,
      "value": null,
      "name": "com.amazonaws.ecs.capability.docker-remote-api.1.29"
    }
  ],
  "pidMode": null,
  "requiresCompatibilities": [
    "FARGATE"
  ],
  "networkMode": "awsvpc",
  "cpu": "512",
  "revision": 14,
  "status": "ACTIVE",
  "inferenceAccelerators": null,
  "proxyConfiguration": null,
  "volumes": []
}

但是,当我启动它时,它只会运行大约 1.5 分钟,然后就被终止了。我怀疑这与健康检查有关。

在某个时刻,它会收到一个终止信号并停止。这里没有配置任何目标组或负载均衡器:

2021-07-10 10:48:40
[2021-07-10 08:48:40 +0000] [1] [INFO] Shutting down: Master
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:48:39
[2021-07-10 08:48:39 +0000] [13] [INFO] Worker exiting (pid: 13)
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:48:39
[2021-07-10 08:48:39 +0000] [14] [INFO] Worker exiting (pid: 14)
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:48:39
[2021-07-10 08:48:39 +0000] [1] [INFO] Handling signal: term
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:33
WARNING:rollbar:Rollbar already initialized. Ignoring re-init.
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:33
WARNING:rollbar:Rollbar already initialized. Ignoring re-init.
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:32
INFO:root:Retrieving secret: ROLLBAR_KEY
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:32
INFO:root:Retrieving secret: ROLLBAR_KEY
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:32
INFO:root:Retrieving secret: POSTGRES_PASSWORD
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:32
INFO:root:Retrieving secret: POSTGRES_PASSWORD
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:32
INFO:root:Retrieving secret: SECRET_KEY
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:32
INFO:root:Retrieving secret: SECRET_KEY
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:29
[2021-07-10 08:46:29 +0000] [14] [INFO] Booting worker with pid: 14
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:29
[2021-07-10 08:46:29 +0000] [13] [INFO] Booting worker with pid: 13
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:29
[2021-07-10 08:46:29 +0000] [1] [INFO] Listening at: http://0.0.0.0:8000 (1)
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:29
[2021-07-10 08:46:29 +0000] [1] [INFO] Using worker: sync
ccb94e999e294bdbaadc3f941b786603
2021-07-10 10:46:29
[2021-07-10 08:46:29 +0000] [1] [INFO] Starting gunicorn 20.0.4
ccb94e999e294bdbaadc3f941b786603

因此,就我所见,服务已经启动:gunicorn(用于 Python Web 应用程序的服务器)正常启动并监听 8000 端口,这个端口我也已经做了映射。我还在应用程序中公开了 /health/ 端点,用于简单、轻量级的健康检查(它只返回 HTTP 200 状态码)。然而在服务控制台中,我不断收到:

> 3a52c067-63bd-4d58-a092-a69d29380962 2021-07-10 11:46:08 +0200 服务 web(任务 7982151a4a904a82b077fc48410dd672)未通过容器健康检查。

我做错了什么?

1 个答案:

答案 0 :(得分:0)

你能做 2 个与健康检查相关的检查并写下你的发现吗?

  • 当你增大健康检查的超时时间(timeout)时,它能通过健康检查吗?
  • 当你在上面的任务定义中增加 CPU 或内存时,会发生什么?