我正在尝试在集群中创建一个带 GPU 的 GKE 节点池。执行 make apply 时,guest_accelerator 块似乎没有生效:节点池创建成功,但其中的节点没有 GPU。
下面的配置中是否遗漏了什么?
谢谢
# GKE cluster definition. The default node pool is removed after creation
# (remove_default_node_pool = true); worker nodes come from the separate
# google_container_node_pool resource declared later in this file.
resource "google_container_cluster" "default" {
provider = google-beta
project = var.project_id
name = var.name
# Cluster location plus the explicit list of zones nodes may run in.
location = "us-central1"
node_locations = ["us-central1-a", "us-central1-c", "us-central1-f"]
network = var.network
subnetwork = var.subnetwork
# Cluster and services CIDR blocks are taken from var.ip_cidr.
ip_allocation_policy {
cluster_ipv4_cidr_block = var.ip_cidr.cluster
services_ipv4_cidr_block = var.ip_cidr.services
}
# initial_node_count is still set even though the default pool is removed.
remove_default_node_pool = true
initial_node_count = 1
# Empty username/password and no client certificate.
# NOTE(review): presumably intended to disable basic auth; confirm this is
# still the supported form for the provider version in use.
master_auth {
username = ""
password = ""
client_certificate_config {
issue_client_certificate = false
}
}
# Both add-ons are explicitly left enabled (disabled = false).
addons_config {
horizontal_pod_autoscaling {
disabled = false
}
http_load_balancing {
disabled = false
}
}
private_cluster_config {
enable_private_nodes = true
# NOTE(review): the three "." lines below look like a placeholder for
# settings elided from the post; they are not valid HCL as written.
.
.
.
}
logging_service = "logging.googleapis.com/kubernetes"
monitoring_service = "monitoring.googleapis.com/kubernetes"
# Workload identity namespace is derived from the project id.
workload_identity_config {
identity_namespace = "${var.project_id}.svc.id.goog"
}
}
# Node pool attached to google_container_cluster.default that requests an
# NVIDIA accelerator for its nodes. Project, location and cluster name are
# read from the cluster resource so the two cannot drift apart.
resource "google_container_node_pool" "default" {
  provider = google-beta

  # Identity and placement.
  name     = var.pool_name
  project  = google_container_cluster.default.project
  cluster  = google_container_cluster.default.name
  location = google_container_cluster.default.location

  # Only two of the zones listed on the cluster are used for this pool.
  node_locations = ["us-central1-a", "us-central1-c"]

  # Sizing: starting node count and the autoscaler bounds.
  initial_node_count = 2

  autoscaling {
    max_node_count = 4
    min_node_count = 1
  }

  management {
    auto_upgrade = true
    auto_repair  = true
  }

  node_config {
    # Machine shape and image.
    image_type      = "COS"
    machine_type    = "n1-standard-4"
    preemptible     = false
    service_account = var.node_service_account

    # Storage.
    disk_type       = "pd-standard"
    disk_size_gb    = 100
    local_ssd_count = 0

    # Accelerator request: one nvidia-tesla-p4.
    guest_accelerator {
      count = 1
      type  = "nvidia-tesla-p4"
    }

    shielded_instance_config {
      enable_integrity_monitoring = true
      enable_secure_boot          = true
    }

    metadata = {
      disable-legacy-endpoints = "true"
    }
  }
}
下面是我执行 terraform plan 时的输出:
+ resource "google_container_node_pool" "default" {
+ cluster = "my-cluster"
+ id = (known after apply)
+ initial_node_count = 1
+ instance_group_urls = (known after apply)
+ location = "us-central1"
+ max_pods_per_node = (known after apply)
+ name = "my-gpu-nodes"
+ name_prefix = (known after apply)
+ node_count = (known after apply)
+ node_locations = [
+ "us-central1-a",
+ "us-central1-c",
]
+ operation = (known after apply)
+ project = "my-project"
+ version = (known after apply)
+ autoscaling {
+ max_node_count = 4
+ min_node_count = 1
}
+ management {
+ auto_repair = true
+ auto_upgrade = true
}
+ node_config {
+ disk_size_gb = 100
+ disk_type = "pd-standard"
+ guest_accelerator = (known after apply)
+ image_type = "COS"
+ local_ssd_count = 0
+ machine_type = "n1-standard-4"
+ metadata = {
+ "disable-legacy-endpoints" = "true"
}
+ oauth_scopes = [
+ "https://www.googleapis.com/auth/cloud-platform",
]
+ preemptible = false
+ tags = []
+ taint = (known after apply)
+ shielded_instance_config {
+ enable_integrity_monitoring = true
+ enable_secure_boot = true
}
+ workload_metadata_config {
+ node_metadata = "GKE_METADATA_SERVER"
}
}
+ upgrade_settings {
+ max_surge = (known after apply)
+ max_unavailable = (known after apply)
}
}
以下是应用配置时 Terraform 调试日志中发送给 GKE API 的请求:
---[ REQUEST ]--------------------------------------- [37647/43374]
POST /v1beta1/projects/verily-surgical-cloud-dev/locations/us-central1/clusters/verily-surgical/nodePools?alt=json&prettyPrint=false HTTP/1.1
Host: container.googleapis.com
User-Agent: google-api-go-client/0.5 Terraform/0.13.3 (+https://www.terraform.io) Terraform-Plugin-SDK/2.4.4 terraform-provider-google-beta/dev
Content-Length: 703
Content-Type: application/json
X-Goog-Api-Client: gl-go/1.14.5 gdcl/20210308
Accept-Encoding: gzip
{
"nodePool": {
"autoscaling": {
"enabled": true,
"maxNodeCount": 4,
"minNodeCount": 1
},
"config": {
"diskSizeGb": 100,
"diskType": "pd-standard",
"imageType": "COS",
"machineType": "n1-standard-4",
"metadata": {
"disable-legacy-endpoints": "true"
},
"shieldedInstanceConfig": {
"enableIntegrityMonitoring": true,
"enableSecureBoot": true
},
"workloadMetadataConfig": {
"nodeMetadata": "GKE_METADATA_SERVER"
}
},
"initialNodeCount": 2,
"locations": [
"us-central1-a",
"us-central1-c"
],
"management": {
"autoRepair": true,
"autoUpgrade": true
},
"name": "my-gpu-nodes"
}
}
答案 0 :(得分:0)
某些 GPU 型号在特定位置(区域/可用区)可能不可用。即使从控制台创建,也会提示“GPU 不可用 - 选择另一个位置”。在通过 Terraform 应用之前,建议先在控制台中尝试配置一次,这样可以获得更明确的错误信息和上下文。