Question

我正在尝试在我的集群中创建一个带有 GPU 的 GKE 节点池。当我执行 make apply 时，我没有看到应用了 guest_accelerator 块。创建节点池时，其中没有 GPU。

我在下面的配置中遗漏了什么吗？

谢谢

# GKE cluster managed through the google-beta provider.
# `location` is a region and `node_locations` lists three zones inside it.
resource "google_container_cluster" "default" {
  provider       = google-beta
  project        = var.project_id
  name           = var.name
  location       = "us-central1"
  node_locations = ["us-central1-a", "us-central1-c", "us-central1-f"]
  network        = var.network
  subnetwork     = var.subnetwork

  # Pod and service IP ranges are supplied by the caller through var.ip_cidr.
  ip_allocation_policy {
    cluster_ipv4_cidr_block  = var.ip_cidr.cluster
    services_ipv4_cidr_block = var.ip_cidr.services
  }

  # The cluster is created with a one-node default pool which is then removed;
  # node pools are declared as separate google_container_node_pool resources.
  remove_default_node_pool = true
  initial_node_count       = 1

  # Empty username/password and no client certificate.
  # NOTE(review): presumably intended to disable basic auth — confirm against
  # the provider version in use.
  master_auth {
    username = ""
    password = ""

    client_certificate_config {
      issue_client_certificate = false
    }
  }

  # Both add-ons are explicitly left enabled (disabled = false).
  addons_config {
    horizontal_pod_autoscaling {
      disabled = false
    }

    http_load_balancing {
      disabled = false
    }
  }

  private_cluster_config {
    enable_private_nodes    = true
    # NOTE(review): the three "." lines below are an elision of the remaining
    # settings, not valid HCL; the real values are not shown here.
    .
    .
    .
  }

  logging_service    = "logging.googleapis.com/kubernetes"
  monitoring_service = "monitoring.googleapis.com/kubernetes"

  # Workload identity namespace is derived from the project id.
  workload_identity_config {
    identity_namespace = "${var.project_id}.svc.id.goog"
  }
}

# Node pool attached to google_container_cluster.default, intended to carry
# one GPU per node via the guest_accelerator block in node_config.
resource "google_container_node_pool" "default" {
  provider = google-beta

  # Project, location and cluster name are taken from the cluster resource so
  # the pool always follows it.
  project            = google_container_cluster.default.project
  name               = var.pool_name
  location           = google_container_cluster.default.location
  # Only two of the cluster's three zones are used for this pool.
  node_locations     = ["us-central1-a", "us-central1-c"]
  cluster            = google_container_cluster.default.name
  # NOTE(review): presumably a per-zone count for a regional cluster — confirm
  # against the provider documentation.
  initial_node_count = 2

  autoscaling {
    min_node_count = 1
    max_node_count = 4
  }

  node_config {
    machine_type    = "n1-standard-4"
    image_type      = "COS"
    disk_size_gb    = 100
    disk_type       = "pd-standard"
    local_ssd_count = 0
    preemptible     = false
    service_account = var.node_service_account
    # Requests one nvidia-tesla-p4 accelerator.
    # NOTE(review): the plan output accompanying this config shows
    # guest_accelerator as "(known after apply)" and the captured create
    # request has no accelerator entry — verify that the plan is produced from
    # this exact config, and that this GPU type is offered in the zones listed
    # in node_locations.
    guest_accelerator {
      type  = "nvidia-tesla-p4"
      count = 1
    }
    metadata = {
      disable-legacy-endpoints = "true"
    }
    
    # Shielded-node options: secure boot and integrity monitoring both on.
    shielded_instance_config {
      enable_secure_boot          = true
      enable_integrity_monitoring = true
    }
  }

  # Automatic node repair and upgrade are both enabled.
  management {
    auto_repair  = true
    auto_upgrade = true
  }
}

下面是我执行 terraform plan 时的输出

  + resource "google_container_node_pool" "default" {
      + cluster             = "my-cluster"
      + id                  = (known after apply)
      + initial_node_count  = 1
      + instance_group_urls = (known after apply)
      + location            = "us-central1"
      + max_pods_per_node   = (known after apply)
      + name                = "my-gpu-nodes"
      + name_prefix         = (known after apply)
      + node_count          = (known after apply)
      + node_locations      = [
          + "us-central1-a",
          + "us-central1-c",
        ]
      + operation           = (known after apply)
      + project             = "my-project"
      + version             = (known after apply)

      + autoscaling {
          + max_node_count = 4
          + min_node_count = 1
        }

      + management {
          + auto_repair  = true
          + auto_upgrade = true
        }

      + node_config {
          + disk_size_gb      = 100
          + disk_type         = "pd-standard"
          + guest_accelerator = (known after apply)
          + image_type        = "COS"
          + local_ssd_count   = 0
          + machine_type      = "n1-standard-4"
          + metadata          = {
              + "disable-legacy-endpoints" = "true"
            }
          + oauth_scopes      = [
              + "https://www.googleapis.com/auth/cloud-platform",
            ]
          + preemptible       = false
          + tags              = []
          + taint             = (known after apply)

          + shielded_instance_config {
              + enable_integrity_monitoring = true
              + enable_secure_boot          = true
            }

          + workload_metadata_config {
              + node_metadata = "GKE_METADATA_SERVER"
            }
        }

      + upgrade_settings {
          + max_surge       = (known after apply)
          + max_unavailable = (known after apply)
        }
    }

以下是应用配置时 Terraform 的调试输出

---[ REQUEST ]---------------------------------------
POST /v1beta1/projects/verily-surgical-cloud-dev/locations/us-central1/clusters/verily-surgical/nodePools?alt=json&prettyPrint=false HTTP/1.1
Host: container.googleapis.com
User-Agent: google-api-go-client/0.5 Terraform/0.13.3 (+https://www.terraform.io) Terraform-Plugin-SDK/2.4.4 terraform-provider-google-beta/dev
Content-Length: 703
Content-Type: application/json
X-Goog-Api-Client: gl-go/1.14.5 gdcl/20210308
Accept-Encoding: gzip

{
 "nodePool": {
  "autoscaling": {
   "enabled": true,
   "maxNodeCount": 4,
   "minNodeCount": 1
  },
  "config": {
   "diskSizeGb": 100,
   "diskType": "pd-standard",
   "imageType": "COS",
   "machineType": "n1-standard-4",
   "metadata": {
    "disable-legacy-endpoints": "true"
   },
   "shieldedInstanceConfig": {
    "enableIntegrityMonitoring": true,
    "enableSecureBoot": true
   },
   "workloadMetadataConfig": {
    "nodeMetadata": "GKE_METADATA_SERVER"
   }
  },
  "initialNodeCount": 2,
  "locations": [
   "us-central1-a",
   "us-central1-c"
  ],
  "management": {
   "autoRepair": true,
   "autoUpgrade": true
  },
  "name": "my-gpu-nodes"
 }
}

Answer 1

有时某些型号的 GPU 在特定位置（区域或可用区）不可用。在这种情况下，即使从控制台创建，也会提示“GPU 不可用 - 请选择其他位置”。建议在通过 Terraform 应用之前，先在控制台中尝试配置一次，这样可以获得更明确的错误信息和上下文。

GPU 未添加到 GKE 节点池

1 个答案: