I've been trying to get zero-downtime deploys working with Kubernetes. I've tried Services of type LoadBalancer on AWS, I've tried the nginx ingress, and I've tried NodePort directly (I've also tried IPVS).
For the most part it works. Most requests go through fine. To test it, I keep redeploying an app that serves a serial number and checking that no requests get dropped. The monitoring app runs on the same Kubernetes cluster, but its requests go back in through an AWS ELB to the dummy app.
After it runs for a few hours there are always a handful of requests that get cut off, returning 504s or connection timeouts from the ELB. If I make the requests take longer to respond, say 500 ms, there are even more failed requests.
It looks like Kubernetes isn't draining connections from the old pods; it's just pulling the plug.
I've tried digging through the Kubernetes codebase to see whether any connection draining happens, but I've had no luck. In pkg/proxy/iptables/proxier.go, under syncProxyRules, all the iptables rules seem to get set up, but there doesn't appear to be any awareness of connection draining, at least at that level.
I can't get Kubernetes to deploy a new version without dropping connections. Am I missing some connection-draining option, or does Kubernetes simply not support it?
package main
import (
"os"
"os/signal"
"syscall"
"context"
"fmt"
"net/http"
"io/ioutil"
"github.com/DataDog/datadog-go/statsd"
"time"
apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/util/retry"
"strconv"
)
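// ApiCheck repeatedly bumps the dummy Deployment's serial-number argument through the Kubernetes API
// and measures how long it takes for the new value to actually be served.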
func ApiCheck(address string, ddclient *statsd.Client) {
// creates the in-cluster config
config, err := rest.InClusterConfig()
if err != nil {
panic(err.Error())
}
// creates the clientset
clientset, err := kubernetes.NewForConfig(config)
if err != nil {
panic(err.Error())
}
deploymentsClient := clientset.AppsV1().Deployments(apiv1.NamespaceDefault)
// if we just came up, wait for other app to go down before updating deployment...
time.Sleep(30 * time.Second)
for {
fmt.Println("Starting deployment")
start := time.Now()
var prevValue int
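// retry the whole get-modify-update closure if the API returns a conflict (someone else updated the Deployment first)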
retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
// get latest version
result, getErr := deploymentsClient.Get("k8s-qos-dummy", metav1.GetOptions{})
if getErr != nil {
fmt.Printf("Failed to get latest version of Deployment: %v", getErr)
return getErr
}
var perr error
prevValue, perr = strconv.Atoi(result.Spec.Template.Spec.Containers[0].Args[1])
if perr != nil {
fmt.Printf("Cannot parse previous value %s, using 0 instead\n", result.Spec.Template.Spec.Containers[0].Args[1])
prevValue = 0
}
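// bump the serial number passed as the container's second argument; changing the pod template is what triggers a new rollout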
valstring := fmt.Sprintf("%d", prevValue + 1)
result.Spec.Template.Spec.Containers[0].Args[1] = valstring
fmt.Printf("Trying to update to %s\n", valstring)
_, updateErr := deploymentsClient.Update(result)
return updateErr
})
if retryErr != nil {
fmt.Println("Update failed: %v", retryErr)
CheckDDError(ddclient.Incr("qos.k8s.deploy.update_error", nil, 1))
continue
}
fmt.Printf("Updated successfully\n")
// now wait for server to respond properly
for {
client := &http.Client{
Timeout: time.Second * 5,
}
response, err := client.Get(address)
if err != nil {
fmt.Printf("Failed qos deploy with http error: %s\n", err)
CheckDDError(ddclient.Incr("qos.k8s.deploy.http_error", nil, 1))
} else {
defer response.Body.Close()
contents, err := ioutil.ReadAll(response.Body)
if err != nil {
fmt.Printf("Failed qos deploy with io error: %s\n", err)
CheckDDError(ddclient.Incr("qos.k8s.deploy.io_error", nil, 1))
} else {
if response.StatusCode >= 200 && response.StatusCode <= 299 {
// let's check the value
new_value, perr := strconv.Atoi(string(contents))
if perr != nil {
fmt.Printf("Failed to parse response for deploy %s\n", perr.Error())
} else {
if new_value == prevValue + 1 {
fmt.Println("Deployment confirmed!")
elapsed := time.Since(start)
CheckDDError(ddclient.Timing("qos.k8s.deploy.time", elapsed, nil, 1))
time.Sleep(30 * time.Second)
break
} else {
fmt.Printf("Got bad value: %d, wanted %d\n", new_value, prevValue + 1)
elapsed := time.Since(start)
if elapsed > time.Second * 80 {
CheckDDError(ddclient.Incr("qos.k8s.deploy.timeout_err", nil, 1))
CheckDDError(ddclient.Timing("qos.k8s.deploy.time", elapsed, nil, 1))
time.Sleep(30 * time.Second)
break
}
}
}
} else {
fmt.Printf("Failed qos deploy with http status error: %d %s\n", response.StatusCode, string(contents))
CheckDDError(ddclient.Incr("qos.k8s.deploy.http_status_error", nil, 1))
}
}
}
time.Sleep(1 * time.Second)
}
}
}
func CheckDDError(derr error) {
if derr != nil {
fmt.Println("datadogs not working, got: %s", derr.Error())
}
}
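// DummyCheck polls the dummy app once per second and reports success or failure to Datadog.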
func DummyCheck(address string, ddclient *statsd.Client) {
for {
client := &http.Client{
Timeout: time.Second * 5,
}
response, err := client.Get(address)
if err != nil {
fmt.Printf("Failed qos check with http error: %s\n", err)CheckDDError(ddclient.Gauge("qos.k8s.check.dummy_response", 0, nil, 1))
CheckDDError(ddclient.Incr("qos.k8s.check.dummy_http_error", nil, 1))
} else {
defer response.Body.Close()
contents, err := ioutil.ReadAll(response.Body)
if err != nil {
fmt.Printf("Failed qos check with io error: %s\n", err)
CheckDDError(ddclient.Gauge("qos.k8s.check.dummy_response", 0, nil, 1))
CheckDDError(ddclient.Incr("qos.k8s.check.dummy_io_error", nil, 1))
} else {
if response.StatusCode >= 200 && response.StatusCode <= 299 {
fmt.Printf("Passed qos check with status: %d received: %s\n", response.StatusCode, string(contents))
CheckDDError(ddclient.Gauge("qos.k8s.check.dummy_response", 1, nil, 1))
} else {
fmt.Printf("Failed qos check with http status error: %d %s\n", response.StatusCode, string(contents))
CheckDDError(ddclient.Gauge("qos.k8s.check.dummy_response", 0, nil, 1))
CheckDDError(ddclient.Incr("qos.k8s.check.dummy_http_status_error", nil, 1))
}
}
}
time.Sleep(1 * time.Second)
}
}
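// WebServer serves the given response string on :7070 and shuts the server down when it receives SIGTERM.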
func WebServer(resp string, ddclient *statsd.Client) {
srv := &http.Server{
Addr: ":7070",
IdleTimeout: 61 * time.Second, // ELB default idle timeout is 60 seconds
}
http.HandleFunc("/", func (w http.ResponseWriter, r *http.Request) {
fmt.Fprint(w, resp)
})
fmt.Printf("current idle timeout: %v\n", srv.IdleTimeout)
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt, syscall.SIGTERM)
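// on SIGTERM (sent by the kubelet when the pod is terminated) stop accepting new connections and exit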
go func() {
<-c
fmt.Printf("Got sigterm shutting down\n")
srv.Shutdown(context.Background())
os.Exit(1)
}()
srv.ListenAndServe()
}
func main() {
if len(os.Args) < 2 {
fmt.Println("usage: k8s-qos [deploy|dummy]")
return
}
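// statsd metrics go to port 8125 on the node's IP, which is injected as HOST_IP via the downward API (see the manifests below)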
ddSock := fmt.Sprintf("%s:8125", os.Getenv("HOST_IP"))
ddc, err := statsd.New(ddSock)
if err != nil {
return
}
if os.Args[1] == "deploy" {
if len(os.Args) < 3 {
fmt.Println("usage: k8s-qos dummy [address]")
return
}
pingAddress := os.Args[2]
go WebServer(
fmt.Sprintf(
"Hey this is the deployer for qos, this app pings %s to make sure it works",
pingAddress),
ddc)
go ApiCheck(pingAddress, ddc)
DummyCheck(pingAddress, ddc)
return
}
if os.Args[1] == "dummy" {
if len(os.Args) < 3 {
fmt.Println("usage: k8s-qos dummy [response-string]")
return
}
WebServer(os.Args[2], ddc)
return
}
fmt.Println("no usage specified")
return
}
Setup is on AWS with kops version 1.9.0.
kops cluster config:
apiVersion: kops/v1alpha2
kind: Cluster
metadata:
name: test-k8s.example.com
spec:
additionalPolicies:
node: |
[
{
"Effect": "Allow",
"Action": ["sts:AssumeRole"],
"Resource": ["*"]
}
]
api:
loadBalancer:
type: Internal
authorization:
rbac: {}
channel: stable
cloudProvider: aws
configBase: s3://test-k8s/test-k8s.example.com
etcdClusters:
- etcdMembers:
- instanceGroup: master-us-west-2a
name: a
- instanceGroup: master-us-west-2b
name: b
- instanceGroup: master-us-west-2c
name: c
name: main
- etcdMembers:
- instanceGroup: master-us-west-2a
name: a
- instanceGroup: master-us-west-2b
name: b
- instanceGroup: master-us-west-2c
name: c
name: events
iam:
allowContainerRegistry: true
legacy: false
kubernetesApiAccess:
- 0.0.0.0/0
kubernetesVersion: 1.9.6
masterInternalName: api.internal.test-k8s.example.com
masterPublicName: api.test-k8s.example.com
networkCIDR: XX.XX.0.0/16
networkID: vpc-XXXXXXX
networking:
weave:
mtu: 8912
nonMasqueradeCIDR: XX.XX.0.0/10
sshAccess:
- XX.XX.XX.XX/32
subnets:
- cidr: XX.XX.XX.XX/24
id: subnet-XXXXXXX
name: us-west-2a
type: Private
zone: us-west-2a
- cidr: XX.XX.XX.XX/24
id: subnet-XXXXXXX
name: us-west-2b
type: Private
zone: us-west-2b
- cidr: XX.XX.XX.XX/24
id: subnet-XXXXXXX
name: us-west-2c
type: Private
zone: us-west-2c
- cidr: XX.XX.XX.XX/24
id: subnet-XXXXXXX
name: utility-us-west-2a
type: Utility
zone: us-west-2a
- cidr: XX.XX.XX.XX/24
id: subnet-XXXXXXX
name: utility-us-west-2b
type: Utility
zone: us-west-2b
- cidr: XX.XX.XX.XX/24
id: subnet-XXXXXXX
name: utility-us-west-2c
type: Utility
zone: us-west-2c
topology:
dns:
type: Private
masters: private
nodes: private
kops node instance group config:
apiVersion: kops/v1alpha2
kind: InstanceGroup
metadata:
labels:
kops.k8s.io/cluster: test-k8s.example.com
name: nodes
spec:
image: XXXXXXXX/normal-kops-image-but-with-portmap-cni
machineType: t2.medium
maxSize: 3
minSize: 3
nodeLabels:
kops.k8s.io/instancegroup: nodes
role: Node
subnets:
- us-west-2a
- us-west-2b
- us-west-2c
The "dummy" app config, i.e. the app that gets redeployed:
apiVersion: apps/v1
kind: Deployment
metadata:
name: k8s-qos-dummy
spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
maxSurge: 1
minReadySeconds: 1
replicas: 3
selector:
matchLabels:
app: k8s-qos-dummy
template:
metadata:
name: k8s-qos-dummy
labels:
app: k8s-qos-dummy
spec:
containers:
- name: k8s-qos-dummy
image: XXXXXX
command: ["k8s-qos"]
args: [ "dummy", "1" ]
env:
- name: HOST_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
ports:
- containerPort: 7070
livenessProbe:
httpGet:
path: /
port: 7070
initialDelaySeconds: 20
periodSeconds: 2
readinessProbe:
httpGet:
path: /
port: 7070
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 1
lifecycle:
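# preStop runs before SIGTERM is delivered, so the container keeps serving for 61s (just past the ELB's 60s idle timeout) while the pod is taken out of rotation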
preStop:
exec:
command: ["/bin/sleep", "61"]
resources:
limits:
memory: "200Mi"
cpu: ".25"
requests:
cpu: ".25"
memory: "200Mi"
---
apiVersion: v1
kind: Service
metadata:
name: k8s-qos-dummy
annotations:
service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
service.beta.kubernetes.io/aws-load-balancer-ssl-cert: XXXXXX
service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"
service.beta.kubernetes.io/aws-load-balancer-internal: 0.0.0.0/0
service.beta.kubernetes.io/aws-load-balancer-extra-security-groups: "sg-XXXXX"
spec:
ports:
- port: 80
name: http
targetPort: 7070
- port: 443
name: https
targetPort: 7070
selector:
app: k8s-qos-dummy
type: LoadBalancer
loadBalancerSourceRanges:
- 127.0.0.0/32
---
#when testing with ingress
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: k8s-qos-dummy-ingress
spec:
rules:
- host: k8s-qos-dummy.example.com
http:
paths:
- backend:
serviceName: k8s-qos-dummy
servicePort: 443
Redeployer / monitoring app config:
apiVersion: v1
kind: ServiceAccount
metadata:
name: k8s-qos-role
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
namespace: default
name: k8s-qos-role
rules:
- apiGroups: ["apps"]
resources: ["deployments"]
verbs: ["get", "watch", "list", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: k8s-qos-role
subjects:
- kind: ServiceAccount
namespace: default
name: k8s-qos-role
roleRef:
kind: Role
name: k8s-qos-role
apiGroup: rbac.authorization.k8s.io
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: k8s-qos
spec:
strategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 0
maxSurge: 1
minReadySeconds: 5
replicas: 1
selector:
matchLabels:
app: k8s-qos
template:
metadata:
name: k8s-qos
labels:
app: k8s-qos
spec:
serviceAccountName: k8s-qos-role
containers:
- name: k8s-qos
image: XXXXXX
command: ["k8s-qos"]
args: [ "deploy", "https://k8s-qos-dummy.example.com/"]
env:
- name: HOST_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
ports:
- containerPort: 7070
livenessProbe:
httpGet:
path: /
port: 7070
initialDelaySeconds: 20
periodSeconds: 2
readinessProbe:
httpGet:
path: /
port: 7070
initialDelaySeconds: 0
periodSeconds: 2
resources:
limits:
memory: "400Mi"
cpu: ".5"
requests:
cpu: ".25"
memory: "200Mi"
Answer 0 (score: 0)
Kubernetes does support connection draining - it is called graceful termination.
In this Stack Overflow question you will find a comprehensive answer on what it is and how it works.
To be clear, this is expected behavior, as described in this GitHub issue: after a pod is deleted, Kubernetes waits the "grace period" seconds before killing it. The pod only has to catch SIGTERM, and it should start failing its readiness probes. At that point the load balancer stops sending traffic to the pod. If the pod is not taken out of rotation "in time" before it dies, it will cut off all of its current connections.
I think in your case you will have to look for a solution inside the application, or try some external tooling - Istio, if I am not mistaken, has features that could help. But I do not have enough experience with it to point you in the right direction.
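As an illustration, here is a minimal sketch of the in-application approach, assuming a /healthz readiness endpoint on :7070 and a 70-second drain delay (both placeholders; the delay must fit inside the pod's terminationGracePeriodSeconds): on SIGTERM the readiness endpoint starts returning 503 so the Service and ELB stop sending new traffic, the server keeps running while the pod is deregistered, and only then drains in-flight requests with http.Server.Shutdown.

package main

import (
	"context"
	"fmt"
	"net/http"
	"os"
	"os/signal"
	"sync/atomic"
	"syscall"
	"time"
)

func main() {
	var shuttingDown int32 // set to 1 once SIGTERM arrives

	mux := http.NewServeMux()
	// Readiness endpoint: starts failing as soon as termination begins,
	// so the Service / ELB stops routing new connections to this pod.
	mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
		if atomic.LoadInt32(&shuttingDown) == 1 {
			http.Error(w, "shutting down", http.StatusServiceUnavailable)
			return
		}
		fmt.Fprint(w, "ok")
	})
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, "hello")
	})

	srv := &http.Server{Addr: ":7070", Handler: mux}

	done := make(chan struct{})
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGTERM, os.Interrupt)
	go func() {
		<-sigs
		atomic.StoreInt32(&shuttingDown, 1)
		// Keep serving while the readiness probe fails and the load balancer
		// takes the pod out of rotation; the 70s is a placeholder and must be
		// shorter than the pod's terminationGracePeriodSeconds.
		time.Sleep(70 * time.Second)
		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()
		srv.Shutdown(ctx) // finish in-flight requests, then stop
		close(done)
	}()

	err := srv.ListenAndServe()
	if err != nil && err != http.ErrServerClosed {
		fmt.Println("server error:", err)
		return
	}
	<-done // wait for Shutdown to finish draining before the process exits
}

Combined with a readinessProbe pointing at /healthz, this does explicitly what the preStop sleep in the dummy deployment above is trying to achieve.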