API Server Handling
Whether a pod is deleted with the kubectl command or programmatically through the REST API, the request is ultimately handled by the API Server.
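For reference, here is a minimal client-go sketch of the programmatic path; the kubeconfig location, the "default" namespace, and the pod name "demo" are assumptions for illustration only.

package main

import (
    "context"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a client from the default kubeconfig (~/.kube/config).
    cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    // Equivalent to `kubectl delete pod demo`: an explicit 30s grace period.
    // If GracePeriodSeconds is omitted, the pod spec's
    // terminationGracePeriodSeconds is used instead.
    grace := int64(30)
    if err := client.CoreV1().Pods("default").Delete(context.TODO(), "demo",
        metav1.DeleteOptions{GracePeriodSeconds: &grace}); err != nil {
        panic(err)
    }
}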
registerResourceHandlers
The API Server exposes RESTful endpoints; the DELETE verb is handled as follows, in the registerResourceHandlers function in k8s.io/apiserver/pkg/endpoints/installer.go:
case "DELETE": // Delete a resource.
article := GetArticleForNoun(kind, " ")
doc := "delete" + article + kind
if isSubresource {
doc = "delete " + subresource + " of" + article + kind
}
deleteReturnType := versionedStatus
if deleteReturnsDeletedObject {
deleteReturnType = producedObject
}
handler := metrics.InstrumentRouteFunc(action.Verb, group, version, resource, subresource, requestScope, metrics.APIServerComponent, deprecated, removedRelease, restfulDeleteResource(gracefulDeleter, isGracefulDeleter, reqScope, admit))
if enableWarningHeaders {
handler = utilwarning.AddWarningsHandler(handler, warnings)
}
...
The restfulDeleteResource method wired in here calls DeleteResource, which in turn calls Delete to perform the deletion:
// Called twice: the first time triggered by the API call itself (the kubectl
// command or a program hitting the REST API), the second time triggered by
// the kubelet's statusManager module.
func DeleteResource(r rest.GracefulDeleter, allowsOptions bool, scope *RequestScope, admit admission.Interface) http.HandlerFunc {
    return func(w http.ResponseWriter, req *http.Request) {
        trace := utiltrace.New("Delete " + req.URL.Path)
        /*
            other code
        */
        options := &metav1.DeleteOptions{}
        trace.Step("About to delete object from database")
        result, err := finishRequest(timeout, func() (runtime.Object, error) {
            // The key call is r.Delete(...)
            obj, deleted, err := r.Delete(ctx, name, rest.AdmissionToValidateObjectDeleteFunc(admit, staticAdmissionAttrs, scope), options)
            /*
                other code
            */
            return obj, err
        })
        /*
            checks
        */
        trace.Step("Object deleted from database")
        status := http.StatusOK
        /*
            other code
        */
        // Return the response to the client.
        transformResponseObject(ctx, scope, trace, req, w, status, outputMediaType, result)
    }
}
Delete
The Delete function is implemented in k8s.io/kubernetes/staging/src/k8s.io/apiserver/pkg/registry/generic/registry/store.go:
func (e *Store) Delete(ctx context.Context, name string, deleteValidation rest.ValidateObjectFunc, options *metav1.DeleteOptions) (runtime.Object, bool, error) {
    ...
    graceful, pendingGraceful, err := rest.BeforeDelete(e.DeleteStrategy, ctx, obj, options)
    if err != nil {
        return nil, false, err
    }
    // this means finalizers cannot be updated via DeleteOptions if a deletion is already pending
    if pendingGraceful {
        out, err := e.finalizeDelete(ctx, obj, false)
        return out, false, err
    }
    // check if obj has pending finalizers
    accessor, err := meta.Accessor(obj)
    if err != nil {
        return nil, false, apierrors.NewInternalError(err)
    }
    pendingFinalizers := len(accessor.GetFinalizers()) != 0
    var ignoreNotFound bool
    var deleteImmediately bool = true
    var lastExisting, out runtime.Object
    // Handle combinations of graceful deletion and finalization by issuing
    // the correct updates.
    shouldUpdateFinalizers, _ := deletionFinalizersForGarbageCollection(ctx, e, accessor, options)
    // TODO: remove the check, because we support no-op updates now.
    // By default the grace period is 30s.
    if graceful || pendingFinalizers || shouldUpdateFinalizers {
        err, ignoreNotFound, deleteImmediately, out, lastExisting = e.updateForGracefulDeletionAndFinalizers(ctx, name, key, options, preconditions, deleteValidation, obj)
        /*
            checks
        */
    }
    // On the first call, a graceful deletion leaves deleteImmediately false, so we return here.
    // !deleteImmediately covers all cases where err != nil. We keep both to be future-proof.
    if !deleteImmediately || err != nil {
        return out, false, err
    }
    ...
    // Only the second call reaches this point, where the object is removed from storage for good.
    out = e.NewFunc()
    if err := e.Storage.Delete(ctx, key, out, &preconditions, storage.ValidateObjectFunc(deleteValidation), dryrun.IsDryRun(options.DryRun)); err != nil {
        // Please refer to the place where we set ignoreNotFound for the reason
        // why we ignore the NotFound error.
        if storage.IsNotFound(err) && ignoreNotFound && lastExisting != nil {
            // The lastExisting object may not be the last state of the object
            // before its deletion, but it's the best approximation.
            out, err := e.finalizeDelete(ctx, lastExisting, true)
            return out, true, err
        }
        return nil, false, storeerr.InterpretDeleteError(err, qualifiedResource, name)
    }
    out, err = e.finalizeDelete(ctx, out, true)
    return out, true, err
}
BeforeDelete
Calling BeforeDelete updates the pod's metadata, chiefly the DeletionTimestamp and DeletionGracePeriodSeconds fields:
now := metav1.NewTime(metav1.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds)))
objectMeta.SetDeletionTimestamp(&now)
objectMeta.SetDeletionGracePeriodSeconds(options.GracePeriodSeconds)
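A simplified sketch of how the grace period is resolved; effectiveGracePeriod is a hypothetical helper, not apiserver code. The real resolution happens inside rest.BeforeDelete through the delete strategy's CheckGracefulDelete.

import v1 "k8s.io/api/core/v1"

// effectiveGracePeriod mirrors the fallback order: an explicit
// DeleteOptions.GracePeriodSeconds wins; otherwise the pod spec's
// terminationGracePeriodSeconds applies, whose API default is 30 seconds.
func effectiveGracePeriod(requested *int64, pod *v1.Pod) int64 {
    if requested != nil {
        return *requested
    }
    if pod.Spec.TerminationGracePeriodSeconds != nil {
        return *pod.Spec.TerminationGracePeriodSeconds
    }
    return 30 // assumed default, matching the 30s mentioned above
}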
Kubelet Handling
Processing Flow
A delete request is issued for the pod --> the apiserver updates the pod (setting DeletionTimestamp and DeletionGracePeriodSeconds) --> the kubelet gracefully releases the pod's resources --> the kubelet cleans up the pod's resources (canBeDeleted --> PodResourcesAreReclaimed) --> the kubelet calls the apiserver's delete API again, now with the grace period set to 0 --> the apiserver deletes the pod record from etcd (deleteImmediately is now true) --> the kubelet finishes the pod's final cleanup (the remove operation).
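Both phases are observable from a client. Below is a hedged watch-based sketch; the pod name "demo", the "default" namespace, and the in-cluster config are assumptions. Phase one shows up as a MODIFIED event with DeletionTimestamp set, phase two as a DELETED event once etcd drops the object.

package main

import (
    "context"
    "fmt"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/watch"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/rest"
)

func main() {
    cfg, err := rest.InClusterConfig()
    if err != nil {
        panic(err)
    }
    client := kubernetes.NewForConfigOrDie(cfg)

    w, err := client.CoreV1().Pods("default").Watch(context.TODO(),
        metav1.ListOptions{FieldSelector: "metadata.name=demo"})
    if err != nil {
        panic(err)
    }
    for ev := range w.ResultChan() {
        pod, ok := ev.Object.(*v1.Pod)
        if !ok {
            continue
        }
        switch ev.Type {
        case watch.Modified:
            if pod.DeletionTimestamp != nil {
                // Phase 1: the apiserver has only marked the pod for deletion.
                fmt.Println("graceful deletion started, grace period:",
                    *pod.DeletionGracePeriodSeconds)
            }
        case watch.Deleted:
            // Phase 2: the kubelet confirmed cleanup and the object is gone.
            fmt.Println("pod removed from etcd")
            return
        }
    }
}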
Code Analysis
Event Handling
The main loop syncLoopIteration handles podUpdate events by calling HandlePodUpdates. The other delete-related path is the housekeeping event from housekeepingCh, fired every 2 seconds, which cleans up pods via the HandlePodCleanups function.
func (kl *Kubelet) syncLoopIteration(configCh <-chan kubetypes.PodUpdate, handler SyncHandler,
    syncCh <-chan time.Time, housekeepingCh <-chan time.Time, plegCh <-chan *pleg.PodLifecycleEvent) bool {
    select {
    case u, open := <-configCh:
        // Update from a config source; dispatch it to the right handler
        // callback.
        if !open {
            klog.Errorf("Update channel is closed. Exiting the sync loop.")
            return false
        }
        switch u.Op {
        case kubetypes.ADD:
            klog.V(2).Infof("SyncLoop (ADD, %q): %q", u.Source, format.Pods(u.Pods))
            // After restarting, kubelet will get all existing pods through
            // ADD as if they are new pods. These pods will then go through the
            // admission process and *may* be rejected. This can be resolved
            // once we have checkpointing.
            handler.HandlePodAdditions(u.Pods)
        case kubetypes.UPDATE:
            klog.V(2).Infof("SyncLoop (UPDATE, %q): %q", u.Source, format.PodsWithDeletionTimestamps(u.Pods))
            handler.HandlePodUpdates(u.Pods)
        ...
    case <-housekeepingCh:
        if !kl.sourcesReady.AllReady() {
            // If the sources aren't ready or volume manager has not yet synced the states,
            // skip housekeeping, as we may accidentally delete pods from unready sources.
            klog.V(4).Infof("SyncLoop (housekeeping, skipped): sources aren't ready yet.")
        } else {
            klog.V(4).Infof("SyncLoop (housekeeping)")
            if err := handler.HandlePodCleanups(); err != nil {
                klog.Errorf("Failed cleaning pods: %v", err)
            }
        }
    ...
    }
HandlePodUpdates then calls dispatchWork:
func (kl *Kubelet) dispatchWork(pod *v1.Pod, syncType kubetypes.SyncPodType, mirrorPod *v1.Pod, start time.Time) {
    if kl.podIsTerminated(pod) {
        if pod.DeletionTimestamp != nil {
            kl.statusManager.TerminatePod(pod)
        }
        return
    }
    // Run the sync in an async worker.
    kl.podWorkers.UpdatePod(&UpdatePodOptions{
        Pod:        pod,
        MirrorPod:  mirrorPod,
        UpdateType: syncType,
        OnCompleteFunc: func(err error) {
    ...
dispatchWork first checks kl.podIsTerminated(pod): if the pod is already in a terminated state, it sets the pod status and returns immediately; otherwise it executes UpdatePod. A pod counts as terminated when its phase is Failed or Succeeded, or when DeletionTimestamp is set and none of its containers are still running:
func (kl *Kubelet) podIsTerminated(pod *v1.Pod) bool {
    status, ok := kl.statusManager.GetPodStatus(pod.UID)
    if !ok {
        status = pod.Status
    }
    return status.Phase == v1.PodFailed || status.Phase == v1.PodSucceeded || (pod.DeletionTimestamp != nil && notRunning(status.ContainerStatuses))
}
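The notRunning helper referenced above is roughly the following: a container counts as not running when its state is either Waiting or Terminated.

func notRunning(statuses []v1.ContainerStatus) bool {
    for _, status := range statuses {
        // A container with neither a Terminated nor a Waiting state still
        // has a live Running state.
        if status.State.Terminated == nil && status.State.Waiting == nil {
            return false
        }
    }
    return true
}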
podWorkers.UpdatePod then starts the goroutine p.managePodLoop(podUpdates), which hands the work off to syncPodFn. syncPodFn is assigned during kubelet initialization:
klet.podWorkers = newPodWorkers(klet.syncPod, kubeDeps.Recorder, klet.workQueue, klet.resyncInterval, backOffPeriod, klet.podCache)
That is, syncPodFn is set to syncPod, whose main logic is:
func (kl *Kubelet) syncPod(o syncPodOptions) error {
    /*
        other code
    */
    // A pod object carrying a DeletionTimestamp enters this branch.
    if !runnable.Admit || pod.DeletionTimestamp != nil || apiPodStatus.Phase == v1.PodFailed {
        // killPod(...) asks the container runtime to stop the pod's containers.
        if err := kl.killPod(pod, nil, podStatus, nil); err != nil {
            /*
                other code
            */
        } else {
            /*
                other code
            */
        }
        return syncErr
    }
}
killPod
The main logic of killPod:
func (kl *Kubelet) killPod(pod *v1.Pod, runningPod *kubecontainer.Pod, status *kubecontainer.PodStatus, gracePeriodOverride *int64) error {
    var p kubecontainer.Pod
    /*
        other code
    */
    // Ask the container runtime to stop the pod's containers.
    if err := kl.containerRuntime.KillPod(pod, p, gracePeriodOverride); err != nil {
        return err
    }
    if err := kl.containerManager.UpdateQOSCgroups(); err != nil {
        klog.V(2).Infof("Failed to update QoS cgroups while killing pod: %v", err)
    }
    return nil
}
The actual stopping of the pod happens in the killPodWithSyncResult function:
func (m *kubeGenericRuntimeManager) killPodWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (result kubecontainer.PodSyncResult) {
    killContainerResults := m.killContainersWithSyncResult(pod, runningPod, gracePeriodOverride)
    ...
    for _, podSandbox := range runningPod.Sandboxes {
        if err := m.runtimeService.StopPodSandbox(podSandbox.ID.ID); err != nil {
        ...
killPodWithSyncResult does two things: killContainersWithSyncResult stops the containers in the pod, and then StopPodSandbox is executed:
func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, reason string, gracePeriodOverride *int64) error {
    ...
    if err := m.internalLifecycle.PreStopContainer(containerID.ID); err != nil {
        return err
    }
    // Run the PreStop hook; the time it takes is subtracted from the grace period.
    if containerSpec.Lifecycle != nil && containerSpec.Lifecycle.PreStop != nil && gracePeriod > 0 {
        gracePeriod = gracePeriod - m.executePreStopHook(pod, containerID, containerSpec, gracePeriod)
    }
    ...
    err := m.runtimeService.StopContainer(containerID.ID, gracePeriod)
The work of killContainersWithSyncResult is carried out in killContainer. As the excerpt shows, its two main steps are running the PreStop hook inside the container and, once that succeeds, stopping the container itself, with the hook's runtime deducted from the grace period (for example, with a 30s grace period and a PreStop hook that takes 10s, StopContainer gets roughly 20s). At this point every application container has stopped. The next step is to stop the pause (sandbox) container, which is done by StopPodSandbox; that call is executed in dockershim:
func (ds *dockerService) StopPodSandbox(ctx context.Context, r *runtimeapi.StopPodSandboxRequest) (*runtimeapi.StopPodSandboxResponse, error) {
    ...
    if !hostNetwork && (ready || !ok) {
        ...
        err := ds.network.TearDownPod(namespace, name, cID, annotations)
        ...
    }
    if err := ds.client.StopContainer(podSandboxID, defaultSandboxGracePeriod); err != nil {
The main job of StopPodSandbox is to tear down the pod's network first and then stop the sandbox container. Once StopPodSandbox completes, all of the pod's containers have been stopped; the stopped containers are reclaimed by the garbage collector after the pod is fully cleaned up.
HandlePodCleanups
If none of the containers are running, dispatchWork simply marks the pod Terminated and returns; the resources are then reclaimed by HandlePodCleanups, which syncLoopIteration runs every 2 seconds. A container that has crashed and is therefore not running, for example, is handled by this path. HandlePodCleanups does more than clean up pods that are not running: it also covers pods whose records were already force-deleted from the apiserver, and pods that for any other reason still have unfinished cleanup on this node.
func (kl *Kubelet) HandlePodCleanups() error {
    ...
    for _, pod := range runningPods {
        if _, found := desiredPods[pod.ID]; !found {
            kl.podKillingCh <- &kubecontainer.PodPair{APIPod: nil, RunningPod: pod}
        }
    }
runningPods is the set of pods currently present on the node, taken from the cache, while desiredPods is the set of pods that should exist on the node and have not been stopped. Any pod that appears in runningPods but not in desiredPods ought to be stopped, so it is sent to podKillingCh.
func (kl *Kubelet) podKiller() {
    ...
    for podPair := range kl.podKillingCh {
        ...
        if !exists {
            go func(apiPod *v1.Pod, runningPod *kubecontainer.Pod) {
                glog.V(2).Infof("Killing unwanted pod %q", runningPod.Name)
                err := kl.killPod(apiPod, runningPod, nil, nil)
                ...
            }(apiPod, runningPod)
        }
    }
}
The podKiller loop receives these pods from podKillingCh and calls killPod on each of them.
Status Synchronization
The Kubelet struct contains a statusManager module, which calls its syncPod() method in a loop:
type Kubelet struct {
    ...
    // Syncs pods statuses with apiserver; also used as a cache of statuses.
    statusManager status.Manager
    ...
}
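For orientation, an abridged sketch of the status.Manager interface, showing only the methods relevant here (the real interface in pkg/kubelet/status has more):

type Manager interface {
    // Start the apiserver status sync loop shown below.
    Start()
    // GetPodStatus returns the cached status for the pod with the given UID.
    GetPodStatus(uid types.UID) (v1.PodStatus, bool)
    // SetPodStatus caches the status and queues a sync with the apiserver.
    SetPodStatus(pod *v1.Pod, status v1.PodStatus)
    // TerminatePod resets the container statuses to terminated so the pod
    // can be deleted; called from dispatchWork above.
    TerminatePod(pod *v1.Pod)
}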
The manager's Start function is as follows:
func (m *manager) Start() {
    // Don't start the status manager if we don't have a client. This will happen
    // on the master, where the kubelet is responsible for bootstrapping the pods
    // of the master components.
    if m.kubeClient == nil {
        klog.Infof("Kubernetes client is nil, not starting status manager.")
        return
    }
    klog.Info("Starting to sync pod status with apiserver")
    //lint:ignore SA1015 Ticker can leak since this is only called once and doesn't handle termination.
    syncTicker := time.Tick(syncPeriod)
    // syncPod and syncBatch share the same go routine to avoid sync races.
    go wait.Forever(func() {
        for {
            select {
            case syncRequest := <-m.podStatusChannel:
                klog.V(5).Infof("Status Manager: syncing pod: %q, with status: (%d, %v) from podStatusChannel",
                    syncRequest.podUID, syncRequest.status.version, syncRequest.status.status)
                m.syncPod(syncRequest.podUID, syncRequest.status)
            case <-syncTicker:
                klog.V(5).Infof("Status Manager: syncing batch")
                // remove any entries in the status channel since the batch will handle them
                for i := len(m.podStatusChannel); i > 0; i-- {
                    <-m.podStatusChannel
                }
                m.syncBatch()
            }
        }
    }, 0)
}
The syncPod method contains the following logic:
// We don't handle graceful deletion of mirror pods.
// canBeDeleted calls PodResourcesAreReclaimed to check whether the pod's
// resources have been fully released; the actual reclamation is done in
// cgc.evictContainers.
if m.canBeDeleted(pod, status.status) {
    deleteOptions := metav1.DeleteOptions{
        GracePeriodSeconds: new(int64),
        // Use the pod UID as the precondition for deletion to prevent deleting a
        // newly created pod with the same name and namespace.
        Preconditions: metav1.NewUIDPreconditions(string(pod.UID)),
    }
    // Call the delete API a second time; GracePeriodSeconds is now 0, so the
    // apiserver performs the immediate deletion.
    err = m.kubeClient.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, deleteOptions)
    if err != nil {
        klog.Warningf("Failed to delete status for pod %q: %v", format.Pod(pod), err)
        return
    }
    klog.V(3).Infof("Pod %q fully terminated and removed from etcd", format.Pod(pod))
    m.deletePodStatus(uid)
}
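Put together, this second call amounts to a force delete. A minimal sketch of the equivalent client-go request (forceDelete is a hypothetical helper; client, ns, name, and podUID are illustrative parameters):

import (
    "context"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/client-go/kubernetes"
)

// forceDelete issues the same request as the statusManager: grace period 0
// plus a UID precondition, comparable to
// `kubectl delete pod --grace-period=0 --force`.
func forceDelete(client kubernetes.Interface, ns, name string, podUID types.UID) error {
    grace := int64(0)
    return client.CoreV1().Pods(ns).Delete(context.TODO(), name, metav1.DeleteOptions{
        GracePeriodSeconds: &grace,
        // Guard against deleting a recreated pod that reused the name.
        Preconditions: metav1.NewUIDPreconditions(string(podUID)),
    })
}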