What did you do?
I ran Prometheus v2.0.0 on Kubernetes v1.8.5.
What did you expect to see?
That everything would keep running well.
What did you see instead? Under which circumstances?
Everything went well at first, but after a few hours the pods' status changed to CrashLoopBackOff and all of Prometheus became unavailable. Nothing was changed after the Prometheus pods were created.
[root@k8s-1 prometheus]# kubectl get all -n monitoring
NAME                          DESIRED   CURRENT   AGE
statefulsets/prometheus-k8s   0         2         16h

NAME                  READY     STATUS             RESTARTS   AGE
po/prometheus-k8s-0   0/1       CrashLoopBackOff   81         16h
po/prometheus-k8s-1   0/1       CrashLoopBackOff   22         16h
Environment
[root@k8s-1 prometheus]# kubectl version --short
Client Version: v1.8.5
Server Version: v1.8.5
[root@k8s-1 prometheus]# docker images | grep -i prometheus
quay.io/prometheus/alertmanager    v0.12.0   f87cbd5f1360   5 weeks ago    31.2 MB
quay.io/prometheus/node_exporter   v0.15.2   ff5ecdcfc4a2   6 weeks ago    22.8 MB
quay.io/prometheus/prometheus      v2.0.0    67141fa03496   2 months ago   80.2 MB
System information:
[root@k8s-1 prometheus]# uname -srm
Linux 3.10.0-229.el7.x86_64 x86_64
Prometheus version:
v2.0.0
Prometheus configuration file:
[root@k8s-1 prometheus]# cat prometheus-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-config
  namespace: monitoring
data:
  prometheus.yaml: |
    global:
      scrape_interval: 10s
      scrape_timeout: 10s
      evaluation_interval: 10s
    rule_files:
    - "/etc/prometheus-rules/*.rules"
    scrape_configs:
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      relabel_configs:
      - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: default;kubernetes;https
    - job_name: 'kubernetes-nodes'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics
    - job_name: 'kubernetes-cadvisor'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      - target_label: __address__
        replacement: kubernetes.default.svc:443
      - source_labels: [__meta_kubernetes_node_name]
        regex: (.+)
        target_label: __metrics_path__
        replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    - job_name: 'kubernetes-services'
      metrics_path: /probe
      params:
        module: [http_2xx]
      kubernetes_sd_configs:
      - role: service
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__address__]
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        target_label: kubernetes_name
    - job_name: 'kubernetes-ingresses'
      metrics_path: /probe
      params:
        module: [http_2xx]
      kubernetes_sd_configs:
      - role: ingress
      relabel_configs:
      - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
        regex: (.+);(.+);(.+)
        replacement: ${1}://${2}${3}
        target_label: __param_target
      - target_label: __address__
        replacement: blackbox-exporter.example.com:9115
      - source_labels: [__param_target]
        target_label: instance
      - action: labelmap
        regex: __meta_kubernetes_ingress_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_ingress_name]
        target_label: kubernetes_name
    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name
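(As a sanity check, the scrape configuration itself can be validated before being mounted; a minimal sketch, assuming the bare prometheus.yaml above — the value of the data key — has been extracted to a local file and that promtool from the Prometheus 2.0 release is on the PATH:)
[root@k8s-1 prometheus]# promtool check config ./prometheus.yaml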
prometheus-all-together.yaml:
[root@k8s-1 prometheus]# cat prometheus-all-together.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    prometheus: k8s
  name: prometheus-k8s
  namespace: monitoring
  annotations:
    prometheus.io/scrape: "true"
spec:
  ports:
  - name: web
    nodePort: 30900
    port: 9090
    protocol: TCP
    targetPort: web
  selector:
    prometheus: k8s
  sessionAffinity: None
  type: NodePort
---
apiVersion: apps/v1beta1
kind: StatefulSet
metadata:
  labels:
    prometheus: k8s
  name: prometheus-k8s
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: prometheus
      prometheus: k8s
  serviceName: prometheus-k8s
  replicas: 2
  template:
    metadata:
      labels:
        app: prometheus
        prometheus: k8s
    spec:
      securityContext:
        runAsUser: 65534
        fsGroup: 65534
        runAsNonRoot: true
      containers:
      - args:
        - --config.file=/etc/prometheus/config/prometheus.yaml
        - --storage.tsdb.path=/cephfs/prometheus/data
        - --storage.tsdb.retention=180d
        - --web.route-prefix=/
        - --web.enable-lifecycle
        - --web.enable-admin-api
        image: quay.io/prometheus/prometheus:v2.0.0
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 10
          httpGet:
            path: /status
            port: web
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 3
        name: prometheus
        ports:
        - containerPort: 9090
          name: web
          protocol: TCP
        readinessProbe:
          failureThreshold: 6
          httpGet:
            path: /status
            port: web
            scheme: HTTP
          periodSeconds: 5
          successThreshold: 1
          timeoutSeconds: 3
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
          limits:
            cpu: 500m
            memory: 500Mi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/prometheus/config
          name: config
          readOnly: false
        - mountPath: /etc/prometheus/rules
          name: rules
          readOnly: false
        - mountPath: /cephfs/prometheus/data
          name: data
          subPath: prometheus-data
          readOnly: false
      serviceAccount: prometheus-k8s
      serviceAccountName: prometheus-k8s
      terminationGracePeriodSeconds: 60
      volumes:
      - configMap:
          defaultMode: 511
          name: prometheus-k8s-config
        name: config
      - configMap:
          defaultMode: 511
          name: prometheus-k8s-rules
        name: rules
      - name: data
        persistentVolumeClaim:
          claimName: cephfs-pvc
  updateStrategy:
    type: RollingUpdate
Logs:
[root@k8s-1 prometheus]# kubectl logs prometheus-k8s-0 -n monitoring
level=info ts=2018-01-20T03:16:32.966070249Z caller=main.go:215 msg="Starting Prometheus" version="(version=2.0.0, branch=HEAD, revision=0a74f98628a0463dddc90528220c94de5032d1a0)"
level=info ts=2018-01-20T03:16:32.966225361Z caller=main.go:216 build_context="(go=go1.9.2, user=root@615b82cb36b6, date=20171108-07:11:59)"
level=info ts=2018-01-20T03:16:32.966252185Z caller=main.go:217 host_details="(Linux 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64 prometheus-k8s-0 (none))"
level=info ts=2018-01-20T03:16:32.969789371Z caller=web.go:380 component=web msg="Start listening for connections" address=0.0.0.0:9090
level=info ts=2018-01-20T03:16:32.971388907Z caller=main.go:314 msg="Starting TSDB"
level=info ts=2018-01-20T03:16:32.971596811Z caller=targetmanager.go:71 component="target manager" msg="Starting target manager..."
level=error ts=2018-01-20T03:16:59.781338012Z caller=main.go:323 msg="Opening storage failed" err="invalid block sequence: block time ranges overlap (1516348800000, 1516356000000)"
[root@k8s-1 prometheus]#
[root@k8s-1 prometheus]# kubectl logs prometheus-k8s-1 -n monitoring
level=info ts=2018-01-20T03:15:22.701351679Z caller=main.go:215 msg="Starting Prometheus" version="(version=2.0.0, branch=HEAD, revision=0a74f98628a0463dddc90528220c94de5032d1a0)"
level=info ts=2018-01-20T03:15:22.70148418Z caller=main.go:216 build_context="(go=go1.9.2, user=root@615b82cb36b6, date=20171108-07:11:59)"
level=info ts=2018-01-20T03:15:22.701512333Z caller=main.go:217 host_details="(Linux 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64 prometheus-k8s-1 (none))"
level=info ts=2018-01-20T03:15:22.705824203Z caller=web.go:380 component=web msg="Start listening for connections" address=0.0.0.0:9090
level=info ts=2018-01-20T03:15:22.707629775Z caller=main.go:314 msg="Starting TSDB"
level=info ts=2018-01-20T03:15:22.707837323Z caller=targetmanager.go:71 component="target manager" msg="Starting target manager..."
level=error ts=2018-01-20T03:15:54.775639791Z caller=main.go:323 msg="Opening storage failed" err="invalid block sequence: block time ranges overlap (1516348800000, 1516356000000)"
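(For reference, a minimal sketch of how the overlapping blocks could be inspected on the shared volume; it assumes jq is available on the node, and takes the data path from the --storage.tsdb.path flag above. Each TSDB block directory carries a meta.json with its minTime/maxTime in milliseconds:)
# Print each block's time range, sorted, so overlaps stand out
for meta in /cephfs/prometheus/data/*/meta.json; do
  printf '%s %s\n' "$meta" "$(jq -r '"\(.minTime)-\(.maxTime)"' "$meta")"
done | sort -t' ' -k2
(Two blocks printing the same or overlapping ranges, e.g. 1516348800000-1516356000000, would match the error in the logs.)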
[root@k8s-1 prometheus]# kubectl describe po/prometheus-k8s-0 -n monitoring
Name: prometheus-k8s-0
Namespace: monitoring
Node: k8s-3/172.16.1.8
Start Time: Fri, 19 Jan 2018 17:59:38 +0800
Labels: app=prometheus
controller-revision-hash=prometheus-k8s-7d86dfbd86
prometheus=k8s
Annotations: kubernetes.io/created-by={"kind":"SerializedReference","apiVersion":"v1","reference":{"kind":"StatefulSet","namespace":"monitoring","name":"prometheus-k8s","uid":"7593d8ac-fcff-11e7-9333-fa163e48f857"...
Status: Running
IP: 10.244.2.54
Created By: StatefulSet/prometheus-k8s
Controlled By: StatefulSet/prometheus-k8s
Containers:
prometheus:
Container ID: docker://98faabe55fb71050aacd776d349a6567c25c339117159356eedc10cbc19ef02a
Image: quay.io/prometheus/prometheus:v2.0.0
Image ID: docker-pullable://quay.io/prometheus/prometheus@sha256:53afe934a8d497bb703dbbf7db273681a56677775c462833da8d85015471f7a3
Port: 9090/TCP
Args:
--config.file=/etc/prometheus/config/prometheus.yaml
--storage.tsdb.path=/cephfs/prometheus/data
--storage.tsdb.retention=180d
--web.route-prefix=/
--web.enable-lifecycle
--web.enable-admin-api
State: Waiting
Reason: CrashLoopBackOff
Last State: Terminated
Reason: Error
Exit Code: 1
Started: Sat, 20 Jan 2018 11:11:00 +0800
Finished: Sat, 20 Jan 2018 11:11:29 +0800
Ready: False
Restart Count: 84
Limits:
cpu: 500m
memory: 500Mi
Requests:
cpu: 100m
memory: 200Mi
Liveness: http-get http://:web/status delay=30s timeout=3s period=5s #success=1 #failure=10
Readiness: http-get http://:web/status delay=0s timeout=3s period=5s #success=1 #failure=6
Environment: <none>
Mounts:
/cephfs/prometheus/data from data (rw)
/etc/prometheus/config from config (rw)
/etc/prometheus/rules from rules (rw)
/var/run/secrets/kubernetes.io/serviceaccount from prometheus-k8s-token-x8xzh (ro)
Conditions:
Type Status
Initialized True
Ready False
PodScheduled True
Volumes:
config:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: prometheus-k8s-config
Optional: false
rules:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: prometheus-k8s-rules
Optional: false
data:
Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
ClaimName: cephfs-pvc
ReadOnly: false
prometheus-k8s-token-x8xzh:
Type: Secret (a volume populated by a Secret)
SecretName: prometheus-k8s-token-x8xzh
Optional: false
QoS Class: Burstable
Node-Selectors: <none>
Tolerations: node.alpha.kubernetes.io/notReady:NoExecute for 300s
node.alpha.kubernetes.io/unreachable:NoExecute for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal Pulled 15m (x83 over 17h) kubelet, k8s-3 Container image "quay.io/prometheus/prometheus:v2.0.0" already present on machine
Warning FailedSync 23s (x1801 over 7h) kubelet, k8s-3 Error syncing pod
Logged in to the Kubernetes node:
[root@k8s-3 01C48JAGH1QCGKGCG72E0B2Y8R]# journalctl -xeu kubelet --no-pager
Jan 20 11:21:54 k8s-3 kubelet[14306]: I0120 11:21:54.619924 14306 kuberuntime_manager.go:749] Back-off 5m0s restarting failed container=prometheus pod=prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)
Jan 20 11:21:54 k8s-3 kubelet[14306]: E0120 11:21:54.620042 14306 pod_workers.go:182] Error syncing pod 7598959a-fcff-11e7-9333-fa163e48f857 ("prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)"), skipping: failed to "StartContainer" for "prometheus" with CrashLoopBackOff: "Back-off 5m0s restarting failed container=prometheus pod=prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)"
Jan 20 11:22:08 k8s-3 kubelet[14306]: I0120 11:22:08.615438 14306 kuberuntime_manager.go:500] Container {Name:prometheus Image:quay.io/prometheus/prometheus:v2.0.0 Command:[] Args:[--config.file=/etc/prometheus/config/prometheus.yaml --storage.tsdb.path=/cephfs/prometheus/data --storage.tsdb.retention=180d --web.route-prefix=/ --web.enable-lifecycle --web.enable-admin-api] WorkingDir: Ports:[{Name:web HostPort:0 ContainerPort:9090 Protocol:TCP HostIP:}] EnvFrom:[] Env:[] Resources:{Limits:map[cpu:{i:{value:500 scale:-3} d:{Dec:<nil>} s:500m Format:DecimalSI} memory:{i:{value:524288000 scale:0} d:{Dec:<nil>} s:500Mi Format:BinarySI}] Requests:map[cpu:{i:{value:100 scale:-3} d:{Dec:<nil>} s:100m Format:DecimalSI} memory:{i:{value:209715200 scale:0} d:{Dec:<nil>} s: Format:BinarySI}]} VolumeMounts:[{Name:config ReadOnly:false MountPath:/etc/prometheus/config SubPath: MountPropagation:<nil>} {Name:rules ReadOnly:false MountPath:/etc/prometheus/rules SubPath: MountPropagation:<nil>} {Name:data ReadOnly:false MountPath:/cephfs/prometheus/data SubPath:prometheus-data MountPropagation:<nil>} {Name:prometheus-k8s-token-x8xzh ReadOnly:true MountPath:/var/run/secrets/kubernetes.io/serviceaccount SubPath: MountPropagation:<nil>}] LivenessProbe:&Probe{Handler:Handler{Exec:nil,HTTPGet:&HTTPGetAction{Path:/status,Port:web,Host:,Scheme:HTTP,HTTPHeaders:[],},TCPSocket:nil,},InitialDelaySeconds:30,TimeoutSeconds:3,PeriodSeconds:5,SuccessThreshold:1,FailureThreshold:10,} ReadinessProbe:&Probe{Handler:Handler{Exec:nil,HTTPGet:&HTTPGetAction{Path:/status,Port:web,Host:,Scheme:HTTP,HTTPHeaders:[],},TCPSocket:nil,},InitialDelaySeconds:0,TimeoutSeconds:3,PeriodSeconds:5,SuccessThreshold:1,FailureThreshold:6,} Lifecycle:nil TerminationMessagePath:/dev/termination-log TerminationMessagePolicy:File ImagePullPolicy:IfNotPresent SecurityContext:nil Stdin:false StdinOnce:false TTY:false} is dead, but RestartPolicy says that we should restart it.
Jan 20 11:22:08 k8s-3 kubelet[14306]: I0120 11:22:08.615662 14306 kuberuntime_manager.go:739] checking backoff for container "prometheus" in pod "prometheus-k8s-0_monitoring(7598959a-fcff-11e7-9333-fa163e48f857)"
Any suggestions? Thanks.
Answer 0 (score: 1)
Two Prometheus servers can't share the same storage directory; you should have gotten a locking error.
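(A minimal sketch of one way to fix this: drop the shared cephfs-pvc volume and its fixed subPath from the StatefulSet, and let each replica get its own claim through volumeClaimTemplates instead. The storageClassName and size below are assumptions to adapt to the cluster:)
# Remove the "data" entry from .spec.template.spec.volumes and add under .spec:
  volumeClaimTemplates:
  - metadata:
      name: data                 # matches the existing "data" volumeMount
    spec:
      accessModes: ["ReadWriteOnce"]
      storageClassName: cephfs   # assumption: use your cluster's storage class
      resources:
        requests:
          storage: 100Gi         # assumption: size for the 180d retention
(With this, prometheus-k8s-0 and prometheus-k8s-1 each write to their own volume — data-prometheus-k8s-0 and data-prometheus-k8s-1 — so the two TSDBs no longer overwrite each other's blocks.)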