无法在一个特定的k8s节点上获取指标

时间:2020-04-12 20:28:51

标签: kubernetes metrics-server

今天,我重启了其中一个k8s工作节点。现在无法获取在该节点上启动的任何Pod的指标,而kubectl top nodes工作正常。

$ kubectl top pods
W0413 03:16:04.917900  596110 top_pod.go:266] Metrics not available for pod default/cluster-registry-84f8b6b45c-xmzr4, age: 1h32m29.917882167s
error: Metrics not available for pod default/cluster-registry-84f8b6b45c-xmzr4, age: 1h32m29.917882167s
$ kubectl logs -f metrics-server-596fcd4bcd-fgk86 -n kube-system

E0412 20:16:07.413028       1 reststorage.go:160] unable to fetch pod metrics for pod default/runner-registry-74bdcf4f9b-8kkzn: no metrics known for pod
E0412 20:17:07.413399       1 reststorage.go:160] unable to fetch pod metrics for pod default/runner-registry-74bdcf4f9b-8kkzn: no metrics known for pod

我尝试使用--v=4参数启动metrics-server,但没有发现任何有用的信息。其他节点上的Pod指标正常。

k8s-v1.17.4

metrics-server-amd64:v0.3.6的启动参数如下:

--kubelet-insecure-tls
--kubelet-preferred-address-types=InternalIP

更新: 节点名称为sms-crm-stg-2。以下是kubectl get --raw /api/v1/nodes/sms-crm-stg-2/proxy/stats/summary的输出:

$ kubectl get --raw /api/v1/nodes/sms-crm-stg-2/proxy/stats/summary
{
 "node": {
  "nodeName": "sms-crm-stg-2",
  "systemContainers": [
   {
    "name": "pods",
    "startTime": "2020-04-12T17:50:25Z",
    "cpu": {
     "time": "2020-04-14T10:53:20Z",
     "usageNanoCores": 12877941,
     "usageCoreNanoSeconds": 4387476849484
    },
    "memory": {
     "time": "2020-04-14T10:53:20Z",
     "availableBytes": 16520691712,
     "usageBytes": 154824704,
     "workingSetBytes": 136818688,
     "rssBytes": 68583424,
     "pageFaults": 0,
     "majorPageFaults": 0
    }
   },
   {
    "name": "kubelet",
    "startTime": "2020-04-12T17:49:18Z",
    "cpu": {
     "time": "2020-04-14T10:53:05Z",
     "usageNanoCores": 18983004,
     "usageCoreNanoSeconds": 2979656573959
    },
    "memory": {
     "time": "2020-04-14T10:53:05Z",
     "usageBytes": 374534144,
     "workingSetBytes": 353353728,
     "rssBytes": 325005312,
     "pageFaults": 133278612,
     "majorPageFaults": 536505
    }
   },
   {
    "name": "runtime",
    "startTime": "2020-04-12T17:48:35Z",
    "cpu": {
     "time": "2020-04-14T10:53:03Z",
     "usageNanoCores": 15982086,
     "usageCoreNanoSeconds": 1522750008369
    },
    "memory": {
     "time": "2020-04-14T10:53:03Z",
     "usageBytes": 306790400,
     "workingSetBytes": 297889792,
     "rssBytes": 280047616,
     "pageFaults": 53437788,
     "majorPageFaults": 255703
    }
   }
  ],
  "startTime": "2020-04-12T17:48:19Z",
  "cpu": {
   "time": "2020-04-14T10:53:20Z",
   "usageNanoCores": 110654764,
   "usageCoreNanoSeconds": 29602969518334
  },
  "memory": {
   "time": "2020-04-14T10:53:20Z",
   "availableBytes": 1377738752,
   "usageBytes": 15835013120,
   "workingSetBytes": 15279771648,
   "rssBytes": 14585233408,
   "pageFaults": 3309653,
   "majorPageFaults": 16969
  },
  "network": {
   "time": "2020-04-14T10:53:20Z",
   "name": "",
   "interfaces": [
    {
     "name": "br-6edcec7930f0",
     "rxBytes": 0,
     "rxErrors": 0,
     "txBytes": 0,
     "txErrors": 0
    },
    {
     "name": "cali63387897a01",
     "rxBytes": 131540393,
     "rxErrors": 0,
     "txBytes": 71581241,
     "txErrors": 0
    },
    {
     "name": "cali75b3a97cfc0",
     "rxBytes": 194967,
     "rxErrors": 0,
     "txBytes": 54249,
     "txErrors": 0
    },
    {
     "name": "cali382d1538876",
     "rxBytes": 666667,
     "rxErrors": 0,
     "txBytes": 780072,
     "txErrors": 0
    },
    {
     "name": "br-0b3d0a271eb2",
     "rxBytes": 0,
     "rxErrors": 0,
     "txBytes": 0,
     "txErrors": 0
    },
    {
     "name": "cali7c48479e916",
     "rxBytes": 139682733,
     "rxErrors": 0,
     "txBytes": 205172367,
     "txErrors": 0
    },
    {
     "name": "cali346a5d86923",
     "rxBytes": 112517660,
     "rxErrors": 0,
     "txBytes": 232383,
     "txErrors": 0
    },
    {
     "name": "br-5d30bcdbc231",
     "rxBytes": 0,
     "rxErrors": 0,
     "txBytes": 0,
     "txErrors": 0
    },
    {
     "name": "tunl0",
     "rxBytes": 195091257,
     "rxErrors": 0,
     "txBytes": 215334849,
     "txErrors": 0
    },
    {
     "name": "ens160",
     "rxBytes": 3241985272,
     "rxErrors": 0,
     "txBytes": 3548616264,
     "txErrors": 0
    }
   ]
  },
  "fs": {
   "time": "2020-04-14T10:53:20Z",
   "availableBytes": 9231872000,
   "capacityBytes": 24109666304,
   "usedBytes": 14877794304,
   "inodesFree": 23363080,
   "inodes": 23556096,
   "inodesUsed": 193016
  },
  "runtime": {
   "imageFs": {
    "time": "2020-04-14T10:53:20Z",
    "availableBytes": 9231872000,
    "capacityBytes": 24109666304,
    "usedBytes": 6145920764,
    "inodesFree": 23363080,
    "inodes": 23556096,
    "inodesUsed": 193016
   }
  },
  "rlimit": {
   "time": "2020-04-14T10:53:22Z",
   "maxpid": 32768,
   "curproc": 1608
  }
 },
 "pods": []
}

"pods": []为空,因此看起来是节点问题,而不是度量服务器。

1 个答案:

答案 0 :(得分:1)

OP确认metrics-server的问题是由故障节点引起的。添加一个新节点后问题得以解决。