应该针对 Weave Net 的哪些指标发出警报?

时间:2020-02-14 19:37:32

标签: kubernetes prometheus weave

Weave Net 公开了以下 Prometheus 指标:

enter image description here

以下警报规则看起来是否合理?为了监控 Weave Net 的运行状况,我们应该在这些指标达到哪些值时发出警报?

  • WeaveNoFastDP weave_flows [5m]> 0
  • WeaveIPAMUnreachable weave_ipam_unreachable_percentage> 0
  • WeaveIPAMPendingAllocates weave_ipam_pending_allocates> 0
  • WeavePendingClaims weave_ipam_pending_claims> 0
  • WeaveConnecTerm weave_connection_terminations_total> 300

1 个答案:

答案 0 :(得分:0)

我基于 Weave Net 的指标制作了 Grafana 仪表板。仪表板如下:

  1. WeaveNet https://grafana.com/grafana/dashboards/11789
  2. WeaveNet(群集)https://grafana.com/grafana/dashboards/11804

以下是监控 Weave Net 时比较有用的指标。下面的警报规则以 JSON 格式给出。

enter image description here

{
  "groups": [
    {
      "name": "nodeagent",
      "rules": [
        {
          "alert": "UnhealthyNodes",
          "expr": "changes(central_nodeagent:node_route_unhealthy_count[3m]) > 0",
          "for": "1m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "Unhealthy nodes in the cluster. Go to the below prometheus link for details.",
            "description": "Actionable: Find why the node(s) are unhealthy and fix it."
          }
        }
      ]
    },
    {
      "name": "weave-net",
      "rules": [
        {
          "alert": "WeaveNetIPAMSPlitBrain",
          "expr": "max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNetIPAM has a split brain. Go to the below prometheus link for details.",
            "description": "Actionable: Every node should see same unreachability percentage. Please check and fix why it is not so."
          }
        },
        {
          "alert": "WeaveNetIPAMUnreachable",
          "expr": "weave_ipam_unreachable_percentage > 25",
          "for": "10m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNetIPAM unreachability percentage is above threshold. Go to the below prometheus link for details.",
            "description": "Actionable: Find why the unreachability percentage has risen above the threshold and fix it. WeaveNet is responsible to keep it under control. Weave rm peer deployment can help clean things."
          }
        },
        {
          "alert": "WeaveNetIPAMPendingAllocates",
          "expr": "sum(weave_ipam_pending_allocates) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet IPAM has pending allocates. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for IPAM allocates to be in pending state and fix it."
          }
        },
        {
          "alert": "WeaveNetIPAMPendingClaims",
          "expr": "sum(weave_ipam_pending_claims) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet IPAM has pending claims. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for IPAM claims to be in pending state and fix it."
          }
        },
        {
          "alert": "WeaveNetFastDPFlowsLow",
          "expr": "sum(weave_flows) < 15000",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet total FastDP flows is below threshold. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for fast dp flows dropping below the threshold."
          }
        },
        {
          "alert": "WeaveNetFastDPFlowsOff",
          "expr": "sum(weave_flows == bool 0) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "WeaveNet FastDP flows is not happening in some or all nodes. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for fast dp being off."
          }
        },
        {
          "alert": "WeaveNetHighConnectionTerminationRate",
          "expr": "rate(weave_connection_terminations_total[5m]) > 0.1",
          "for": "5m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are getting terminated. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason for high connection termination rate and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsConnecting",
          "expr": "sum(weave_connections{state='connecting'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in connecting state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsRetying",
          "expr": "sum(weave_connections{state='retrying'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in retrying state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsPending",
          "expr": "sum(weave_connections{state='pending'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in pending state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        },
        {
          "alert": "WeaveNetConnectionsFailed",
          "expr": "sum(weave_connections{state='failed'}) > 0",
          "for": "3m",
          "labels": {
            "severity": "critical"
          },
          "annotations": {
            "summary": "A lot of connections are in failed state. Go to the below prometheus link for details.",
            "description": "Actionable: Find the reason and fix it."
          }
        }
      ]
    }
  ]
}