...// WaitSidecar waits until sidecar is health
funcWaitSidecar(){//判断是否运行在k8s集群中
iflen(os.Getenv("KUBERNETES_SERVICE_HOST"))>0&&len(os.Getenv("NO_SIDECAR"))<=0{tic:=time.NewTicker(1*time.Second)defertic.Stop()after:=time.After(30*time.Second)for{select{case<-tic.C:logger.Infof("[main] sidecar health checking")//sidercar健康检查
ifsidecarHealthCheck(){return}case<-after:logger.Warn("[main] sidecar health check timeout after 30 seconds")return}}}return}...funcsidecarHealthCheck()bool{cli:=http.Client{Timeout:1*time.Second,}// envoy健康检查接口
resp,err:=cli.Get("http://127.0.0.1:15021/healthz/ready")iferr!=nil{logger.Warn("[main] sidecarHealthCheck failed,err",err)returnfalse}deferresp.Body.Close()ifresp.StatusCode==200{returntrue}returnfalse}
加上这段逻辑,基本上解决了我们服务部署时启动异常的问题.这么做,虽然达到了目的,但是所有的服务都需要重新打包部署,一定程度上也违背了 service mesh 的理念,实在是不够优雅,但是有没有更好的实现方式呢?
Istio 1.7 的实现
容器的启动顺序
在出现这个问题的时候,我们都会下意识地认为,这是因为同一个 pod 中的容器都是同时开始启动的,而应用容器的启动时间比 sidecar 容器的启动时间短导致的。但是根据 kubelet 的源码,你会发现容器确实是按顺序启动的:
Hook handler calls are synchronous within the context of the Pod containing the Container. This means that for a PostStart hook, the Container ENTRYPOINT and hook fire asynchronously. However, if the hook takes too long to run or hangs, the Container cannot reach a running state.
var(timeoutSecondsintrequestTimeoutMillisintperiodMillisinturlstringwaitCmd=&cobra.Command{Use:"wait",Short:"Waits until the Envoy proxy is ready",RunE:func(c*cobra.Command,args[]string)error{client:=&http.Client{Timeout:time.Duration(requestTimeoutMillis)*time.Millisecond,}log.Infof("Waiting for Envoy proxy to be ready (timeout: %d seconds)...",timeoutSeconds)varerrerrortimeoutAt:=time.Now().Add(time.Duration(timeoutSeconds)*time.Second)fortime.Now().Before(timeoutAt){err=checkIfReady(client,url)iferr==nil{log.Infof("Envoy is ready!")returnnil}log.Debugf("Not ready yet: %v",err)time.Sleep(time.Duration(periodMillis)*time.Millisecond)}returnfmt.Errorf("timeout waiting for Envoy proxy to become ready. Last error: %v",err)},})funccheckIfReady(client*http.Client,urlstring)error{req,err:=http.NewRequest(http.MethodGet,url,nil)iferr!=nil{returnerr}resp,err:=client.Do(req)iferr!=nil{returnerr}deferfunc(){_=resp.Body.Close()}()_,err=ioutil.ReadAll(resp.Body)iferr!=nil{returnerr}ifresp.StatusCode!=200{returnfmt.Errorf("HTTP status code %v",resp.StatusCode)}returnnil}
funcIntoObject(sidecarTemplatestring,valuesConfigstring,revisionstring,meshconfig*meshconfig.MeshConfig,inruntime.Object)(interface{},error){...podSpec.InitContainers=append(podSpec.InitContainers,spec.InitContainers...)podSpec.Containers=injectContainers(podSpec.Containers,spec)podSpec.Volumes=append(podSpec.Volumes,spec.Volumes...)...}funcinjectContainers(target[]corev1.Container,sic*SidecarInjectionSpec)[]corev1.Container{containersToInject:=sic.Containersifsic.HoldApplicationUntilProxyStarts{// inject sidecar at start of spec.containers
proxyIndex:=-1fori,c:=rangecontainersToInject{ifc.Name==ProxyContainerName{proxyIndex=ibreak}}ifproxyIndex!=-1{result:=make([]corev1.Container,1,len(target)+len(containersToInject))result[0]=containersToInject[proxyIndex]result=append(result,target...)result=append(result,containersToInject[:proxyIndex]...)result=append(result,containersToInject[proxyIndex+1:]...)returnresult}}returnappend(target,containersToInject...)}
还有部分修改就是修改 sidecar pod 模板,添加values.global.proxy.holdApplicationUntilProxyStarts的判断,如果为true,则在对应的 lifecycle 的postStart 添加 pilot-agent wait命令.