k8s notes -- a quick read of the descheduler source code

Startup:

// Build the startup command via cobra
    cmd := &cobra.Command{
        Use:   "descheduler",
        Short: "descheduler",
        Long:  `The descheduler evicts pods which may be bound to less desired nodes`,
        Run: func(cmd *cobra.Command, args []string) {
            // s.Logs.Config.Format = s.Logging.Format
 
            // LoopbackClientConfig is a config for a privileged loopback connection
            var LoopbackClientConfig *restclient.Config
            var SecureServing *apiserver.SecureServingInfo
            if err := s.SecureServing.ApplyTo(&SecureServing, &LoopbackClientConfig); err != nil {
                klog.ErrorS(err, "failed to apply secure server configuration")
                return
            }
 
            factory, _ := registry.LogRegistry.Get(s.Logging.Format)
            if factory == nil {
                klog.ClearLogger()
            } else {
                log, logrFlush := factory.Create(config.FormatOptions{})
                defer logrFlush()
                klog.SetLogger(log)
            }
 
            ctx, done := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
            defer done()
            pathRecorderMux := mux.NewPathRecorderMux("descheduler")
            // metrics can be enabled or disabled via configuration
            if !s.DisableMetrics {
                pathRecorderMux.Handle("/metrics", legacyregistry.HandlerWithReset())
            }
 
            healthz.InstallHandler(pathRecorderMux, healthz.NamedCheck("Descheduler", healthz.PingHealthz.Check))
 
            if _, err := SecureServing.Serve(pathRecorderMux, 0, ctx.Done()); err != nil {
                klog.Fatalf("failed to start secure server: %v", err)
                return
            }
 
            err := Run(ctx, s)
            if err != nil {
                klog.ErrorS(err, "descheduler server")
            }
        },
    }
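For orientation, here is a minimal, self-contained sketch of how a cobra command like the one above is typically wired into a main function and executed. This is not the descheduler's actual entrypoint; the print statement stands in for the secure-serving, logging and metrics setup shown above, which ends in Run(ctx, s).

package main

import (
    "fmt"
    "os"

    "github.com/spf13/cobra"
)

func main() {
    // Stand-in for the command built above; the real Run closure sets up
    // secure serving, logging and metrics before calling Run(ctx, s).
    cmd := &cobra.Command{
        Use:   "descheduler",
        Short: "descheduler",
        Run: func(cmd *cobra.Command, args []string) {
            fmt.Println("descheduler loop would start here")
        },
    }
    if err := cmd.Execute(); err != nil {
        fmt.Fprintln(os.Stderr, err)
        os.Exit(1)
    }
}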

Initialization and the periodic run loop (descheduling interval):

func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer, deschedulerPolicy *api.DeschedulerPolicy, evictionPolicyGroupVersion string, stopChannel chan struct{}) error {
    // Gather the cluster information needed for descheduling
    sharedInformerFactory := informers.NewSharedInformerFactory(rs.Client, 0)
    nodeInformer := sharedInformerFactory.Core().V1().Nodes()
    podInformer := sharedInformerFactory.Core().V1().Pods()
    namespaceInformer := sharedInformerFactory.Core().V1().Namespaces()
    priorityClassInformer := sharedInformerFactory.Scheduling().V1().PriorityClasses()
 
    // create the informers
    namespaceInformer.Informer()
    priorityClassInformer.Informer()
 
    // Note: this actually builds a func that takes a nodeName and a podFilter and returns all pods on that node that pass the filter
    getPodsAssignedToNode, err := podutil.BuildGetPodsAssignedToNodeFunc(podInformer)
    if err != nil {
        return fmt.Errorf("build get pods assigned to node function error: %v", err)
    }
 
    sharedInformerFactory.Start(stopChannel)
    sharedInformerFactory.WaitForCacheSync(stopChannel)
 
    // All rescheduling strategies predefined by the descheduler that may be used; each value is the function implementing that strategy
    // Every strategy implementation has this type:
    // type strategyFunction func(ctx context.Context, client clientset.Interface, strategy api.DeschedulerStrategy, nodes []*v1.Node, podEvictor *evictions.PodEvictor, getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc)
    strategyFuncs := map[api.StrategyName]strategyFunction{
        // Remove duplicate pods on the same node
        "RemoveDuplicates":                            strategies.RemoveDuplicatePods,
        // Nodes with low resource utilization
        "LowNodeUtilization":                          nodeutilization.LowNodeUtilization,
        // Nodes with high resource utilization
        "HighNodeUtilization":                         nodeutilization.HighNodeUtilization,
        "RemovePodsViolatingInterPodAntiAffinity":     strategies.RemovePodsViolatingInterPodAntiAffinity,
        "RemovePodsViolatingNodeAffinity":             strategies.RemovePodsViolatingNodeAffinity,
        "RemovePodsViolatingNodeTaints":               strategies.RemovePodsViolatingNodeTaints,
        "RemovePodsHavingTooManyRestarts":             strategies.RemovePodsHavingTooManyRestarts,
        "PodLifeTime":                                 strategies.PodLifeTime,
        "RemovePodsViolatingTopologySpreadConstraint": strategies.RemovePodsViolatingTopologySpreadConstraint,
        "RemoveFailedPods":                            strategies.RemoveFailedPods,
    }
 
    // Prepare some related configuration below
    var nodeSelector string
    if deschedulerPolicy.NodeSelector != nil {
        nodeSelector = *deschedulerPolicy.NodeSelector
    }
 
    var evictLocalStoragePods bool
    if deschedulerPolicy.EvictLocalStoragePods != nil {
        evictLocalStoragePods = *deschedulerPolicy.EvictLocalStoragePods
    }
 
    evictBarePods := false
    if deschedulerPolicy.EvictFailedBarePods != nil {
        evictBarePods = *deschedulerPolicy.EvictFailedBarePods
        if evictBarePods {
            klog.V(1).InfoS("Warning: EvictFailedBarePods is set to True. This could cause eviction of pods without ownerReferences.")
        }
    }
 
    evictSystemCriticalPods := false
    if deschedulerPolicy.EvictSystemCriticalPods != nil {
        evictSystemCriticalPods = *deschedulerPolicy.EvictSystemCriticalPods
        if evictSystemCriticalPods {
            klog.V(1).InfoS("Warning: EvictSystemCriticalPods is set to True. This could cause eviction of Kubernetes system pods.")
        }
    }
 
    ignorePvcPods := false
    if deschedulerPolicy.IgnorePVCPods != nil {
        ignorePvcPods = *deschedulerPolicy.IgnorePVCPods
    }
 
    // Run the descheduler's monitoring and eviction strategies periodically via wait.NonSlidingUntil, with period rs.DeschedulingInterval
    wait.NonSlidingUntil(func() {
        // Find the set of ready nodes
        nodes, err := nodeutil.ReadyNodes(ctx, rs.Client, nodeInformer, nodeSelector)
        if err != nil {
            klog.V(1).InfoS("Unable to get ready nodes""err", err)
            close(stopChannel)
            return
        }
 
        // With <= 1 node, eviction would only cause disruption, so abort
        if len(nodes) <= 1 {
            klog.V(1).InfoS("The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting..")
            close(stopChannel)
            return
        }
 
        var podEvictorClient clientset.Interface
        // When dry-run mode is enabled, collect all the relevant objects (mostly pods) under a fake client,
        // so that evicting pods while running multiple strategies in a row has the same cumulative effect
        // as when evicting pods for real.
        // In DryRun mode a fake podEvictorClient is built, so no pods are actually evicted
        if rs.DryRun {
            klog.V(3).Infof("Building a cached client from the cluster for the dry run")
            // Create a new cache so we start from scratch without any leftovers
            fakeClient, err := cachedClient(rs.Client, podInformer, nodeInformer, namespaceInformer, priorityClassInformer)
            if err != nil {
                klog.Error(err)
                return
            }
 
            fakeSharedInformerFactory := informers.NewSharedInformerFactory(fakeClient, 0)
            getPodsAssignedToNode, err = podutil.BuildGetPodsAssignedToNodeFunc(fakeSharedInformerFactory.Core().V1().Pods())
            if err != nil {
                klog.Errorf("build get pods assigned to node function error: %v", err)
                return
            }
 
            fakeCtx, cncl := context.WithCancel(context.TODO())
            defer cncl()
            fakeSharedInformerFactory.Start(fakeCtx.Done())
            fakeSharedInformerFactory.WaitForCacheSync(fakeCtx.Done())
 
            podEvictorClient = fakeClient
        } else {
            podEvictorClient = rs.Client
        }
 
        klog.V(3).Infof("Building a pod evictor")
        // Build the pod evictor
        podEvictor := evictions.NewPodEvictor(
            // client; in DryRun mode this is the fakeClient
            podEvictorClient,
            // the policy group version supported by the cluster for the eviction subresource
            evictionPolicyGroupVersion,
            // whether this is a dry run
            rs.DryRun,
            // maximum number of pods to evict per node, configurable
            deschedulerPolicy.MaxNoOfPodsToEvictPerNode,
            // maximum number of pods to evict per namespace, configurable
            deschedulerPolicy.MaxNoOfPodsToEvictPerNamespace,
            // the set of all ready nodes
            nodes,
            // whether to evict pods using local storage, configurable
            evictLocalStoragePods,
            // whether to evict system-critical pods, configurable
            evictSystemCriticalPods,
            // whether to ignore pods with PVCs, configurable
            ignorePvcPods,
            evictBarePods,
            !rs.DisableMetrics,
        )
 
        // Iterate over the strategies specified in the user's policy; if the descheduler supports a strategy and it is enabled, run it via f(ctx, rs.Client, strategy, nodes, podEvictor, getPodsAssignedToNode).
        for name, strategy := range deschedulerPolicy.Strategies {
            if f, ok := strategyFuncs[name]; ok {
                if strategy.Enabled {
                    f(ctx, rs.Client, strategy, nodes, podEvictor, getPodsAssignedToNode)
                }
            } else {
                klog.ErrorS(fmt.Errorf("unknown strategy name"), "skipping strategy", "strategy", name)
            }
        }
 
        klog.V(1).InfoS("Number of evicted pods""totalEvicted", podEvictor.TotalEvicted())
 
        // If there was no interval specified, send a signal to the stopChannel to end the wait.Until loop after 1 iteration
        if rs.DeschedulingInterval.Seconds() == 0 {
            close(stopChannel)
        }
    }, rs.DeschedulingInterval, stopChannel)
 
    return nil
}
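For context on where deschedulerPolicy.Strategies comes from: it is parsed from the policy file passed to the descheduler. As an illustration only (the exact schema depends on the descheduler version; this mirrors the v1alpha1 format), a policy enabling the RemoveDuplicates strategy looks roughly like this:

apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
  "RemoveDuplicates":
    enabled: true
    params:
      removeDuplicates:
        excludeOwnerKinds:
          - "ReplicaSet"

Any strategy name in the policy that is not a key of strategyFuncs is skipped, with the "unknown strategy name" error logged in the loop above.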

Implementation of the strategy that removes duplicate pods on the same node (RemoveDuplicates):

// RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node.
// A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same
// namespace, and have at least one container with the same image.
// As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages.
func RemoveDuplicatePods(
    ctx context.Context,
    client clientset.Interface,
    strategy api.DeschedulerStrategy,
    nodes []*v1.Node,
    podEvictor *evictions.PodEvictor,
    getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc,
) {
    // Basic validation of the strategy's parameters
    if err := validateRemoveDuplicatePodsParams(strategy.Params); err != nil {
        klog.ErrorS(err, "Invalid RemoveDuplicatePods parameters")
        return
    }
    thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params)
    if err != nil {
        klog.ErrorS(err, "Failed to get threshold priority from strategy's params")
        return
    }
 
    var includedNamespaces, excludedNamespaces sets.String
    if strategy.Params != nil && strategy.Params.Namespaces != nil {
        includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...)
        excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...)
    }
 
    nodeFit := false
    if strategy.Params != nil {
        nodeFit = strategy.Params.NodeFit
    }
 
    evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit))
 
    duplicatePods := make(map[podOwner]map[string][]*v1.Pod)
    ownerKeyOccurence := make(map[podOwner]int32)
    nodeCount := 0
    nodeMap := make(map[string]*v1.Node)
 
    // Build the podFilter
    podFilter, err := podutil.NewOptions().
        WithFilter(evictable.IsEvictable).
        WithNamespaces(includedNamespaces).
        WithoutNamespaces(excludedNamespaces).
        BuildFilterFunc()
    if err != nil {
        klog.ErrorS(err, "Error initializing pod filter function")
        return
    }
 
    // Iterate over all nodes
    for _, node := range nodes {
        klog.V(1).InfoS("Processing node", "node", klog.KObj(node))
        // Note: getPodsAssignedToNode here is the func built in the initialization section; it takes a node name and a podFilter and returns all pods on that node that pass the filter
        pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter)
        if err != nil {
            klog.ErrorS(err, "Error listing evictable pods on node", "node", klog.KObj(node))
            continue
        }
        nodeMap[node.Name] = node
        nodeCount++
        // Each pod has a list of owners and a list of containers, and each container has 1 image spec.
        // For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string.
        // All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness.
        // This list of keys representing a single pod is then sorted alphabetically.
        // If any other pod has a list that matches that pod's list, those pods are undeniably duplicates for the following reasons:
        //   - The 2 pods have the exact same ownerrefs
        //   - The 2 pods have the exact same container images
        //
        // duplicateKeysMap maps the first Namespace/Kind/Name/Image in a pod's list to a 2D-slice of all the other lists where that is the first key
        // (Since we sort each pod's list, we only need to key the map on the first entry in each list. Any pod that doesn't have
        // the same first entry is clearly not a duplicate. This makes lookup quick and minimizes storage needed).
        // If any of the existing lists for that first key matches the current pod's list, the current pod is a duplicate.
        // If not, then we add this pod's list to the list of lists for that key.
 
        // The key logic for detecting duplicate pods follows; the descheduler uses a purpose-built design here, explained below (see also the English comment above)
        // First, duplicateKeysMap: its key is built by joining the pod's Namespace/Kind/Name/Image, and its value is a two-dimensional string slice ([][]string)
        duplicateKeysMap := map[string][][]string{}
        for _, pod := range pods {
            ownerRefList := podutil.OwnerRef(pod)
            if hasExcludedOwnerRefKind(ownerRefList, strategy) || len(ownerRefList) == 0 {
                continue
            }
            // podContainerKeys is created with its capacity preallocated (M ownerRefs * N containers) to avoid slice growth
            podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers))
            imageList := []string{}
            for _, container := range pod.Spec.Containers {
                imageList = append(imageList, container.Image)
            }
            // Sort the pod's images so they appear in a deterministic order
            sort.Strings(imageList)
            imagesHash := strings.Join(imageList, "#")
            for _, ownerRef := range ownerRefList {
                ownerKey := podOwner{
                    namespace:  pod.ObjectMeta.Namespace,
                    kind:       ownerRef.Kind,
                    name:       ownerRef.Name,
                    imagesHash: imagesHash,
                }
                ownerKeyOccurence[ownerKey] = ownerKeyOccurence[ownerKey] + 1
                for _, image := range imageList {
                    // Namespace/Kind/Name should be unique for the cluster.
                    // We also consider the image, as 2 pods could have the same owner but serve different purposes
                    // So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod.
                    // A stricter key is used here by also including the image
                    s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, image}, "/")
                    podContainerKeys = append(podContainerKeys, s)
                }
            }
            // Sort podContainerKeys as well
            sort.Strings(podContainerKeys)
 
            // If there have been any other pods with the same first "key", look through all the lists to see if any match
            // Thanks to the sorting above, duplicates must share the same element at index 0 of podContainerKeys; any pod whose first key differs is considered not a duplicate
            if existing, ok := duplicateKeysMap[podContainerKeys[0]]; ok {
                matched := false
                for _, keys := range existing {
                    if reflect.DeepEqual(keys, podContainerKeys) {
                        matched = true
                        klog.V(3).InfoS("Duplicate found""pod", klog.KObj(pod))
                        for _, ownerRef := range ownerRefList {
                            ownerKey := podOwner{
                                namespace:  pod.ObjectMeta.Namespace,
                                kind:       ownerRef.Kind,
                                name:       ownerRef.Name,
                                imagesHash: imagesHash,
                            }
                            // Record it in duplicatePods[ownerKey]: keyed by ownerKey, then by node name, the value is the slice of duplicate pods to evict
                            if _, ok := duplicatePods[ownerKey]; !ok {
                                duplicatePods[ownerKey] = make(map[string][]*v1.Pod)
                            }
                            duplicatePods[ownerKey][node.Name] = append(duplicatePods[ownerKey][node.Name], pod)
                        }
                        break
                    }
                }
                if !matched {
                    // Found no matches, add this list of keys to the list of lists that have the same first key
                    duplicateKeysMap[podContainerKeys[0]] = append(duplicateKeysMap[podContainerKeys[0]], podContainerKeys)
                }
            } else {
                // This is the first pod we've seen that has this first "key" entry
                // When the map does not yet contain the key generated for this pod (e.g. the first pod of this group), add it
                duplicateKeysMap[podContainerKeys[0]] = [][]string{podContainerKeys}
            }
        }
    }
 
    // 1. how many pods can be evicted to respect uniform placement of pods among viable nodes?
    for ownerKey, podNodes := range duplicatePods {
        // Check which other nodes the pods to be evicted could still run on
        targetNodes := getTargetNodes(podNodes, nodes)

        klog.V(2).InfoS("Adjusting feasible nodes", "owner", ownerKey, "from", nodeCount, "to", len(targetNodes))
        // If there are not enough feasible nodes for the evicted pods to land on, skip this eviction
        if len(targetNodes) < 2 {
            klog.V(1).InfoS("Less than two feasible nodes for duplicates to land, skipping eviction", "owner", ownerKey)
            continue
        }
 
        upperAvg := int(math.Ceil(float64(ownerKeyOccurence[ownerKey]) / float64(len(targetNodes))))
        for nodeName, pods := range podNodes {
            klog.V(2).InfoS("Average occurrence per node""node", klog.KObj(nodeMap[nodeName]), "ownerKey", ownerKey, "avg", upperAvg)
            // list of duplicated pods does not contain the original referential pod
            if len(pods)+1 > upperAvg {
                // It's assumed all duplicated pods are in the same priority class
                // TODO(jchaloup): check if the pod has a different node to lend to
                for _, pod := range pods[upperAvg-1:] {
                    if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods"); err != nil {
                        klog.ErrorS(err, "Error evicting pod""pod", klog.KObj(pod))
                        break
                    }
                }
            }
        }
    }
}
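To make the balancing math at the end concrete, here is a small self-contained sketch with hypothetical numbers (one owner with 6 pods in total, 4 on node A and 2 on node B, and 2 feasible target nodes), mirroring the upperAvg calculation and the pods[upperAvg-1:] slice above:

package main

import (
    "fmt"
    "math"
)

func main() {
    // Hypothetical numbers: one owner (e.g. a Deployment) with 6 pods total,
    // 4 on node A and 2 on node B, and 2 feasible target nodes.
    occurrence := 6  // ownerKeyOccurence[ownerKey]: total pods for this owner
    targetNodes := 2 // len(targetNodes): nodes the duplicates could land on

    upperAvg := int(math.Ceil(float64(occurrence) / float64(targetNodes))) // ceil(6/2) = 3

    // Node A has 4 pods, so its duplicates list holds 3 (the referential pod is not in it).
    duplicatesOnA := 3
    if duplicatesOnA+1 > upperAvg {
        // pods[upperAvg-1:] -> pods[2:] -> 1 pod evicted, leaving 3 pods on node A
        fmt.Println("evict from node A:", duplicatesOnA-(upperAvg-1))
    }

    // Node B has 2 pods, so its duplicates list holds 1; 1+1 is not greater than 3, nothing evicted.
    duplicatesOnB := 1
    if duplicatesOnB+1 > upperAvg {
        fmt.Println("evict from node B:", duplicatesOnB-(upperAvg-1))
    }
}

In this example one pod is evicted from node A; node A keeps 3 pods, node B keeps 2, and the evicted pod is left for the scheduler to place again.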

 

 
posted @ 2022-02-22 17:53  DogTwo