[KEP-2400] Add full cgroup v2 swap support with automatically calculated swap limit for LimitedSwap and Burstable QoS Pods #118764

Merged · 6 commits · Jul 18, 2023
9 changes: 9 additions & 0 deletions hack/local-up-cluster.sh
@@ -47,6 +47,8 @@ CGROUP_DRIVER=${CGROUP_DRIVER:-""}
CGROUP_ROOT=${CGROUP_ROOT:-""}
# owner of client certs, default to current user if not specified
USER=${USER:-$(whoami)}
# if true, limited swap is being used instead of unlimited swap (default)
LIMITED_SWAP=${LIMITED_SWAP:-""}

# required for cni installation
CNI_CONFIG_DIR=${CNI_CONFIG_DIR:-/etc/cni/net.d}
@@ -832,6 +834,13 @@ tracing:
EOF
fi

if [[ "$LIMITED_SWAP" == "true" ]]; then
cat <<EOF >> "${TMP_DIR}"/kubelet.yaml
memorySwap:
swapBehavior: LimitedSwap
EOF
fi

{
# authentication
echo "authentication:"
5 changes: 3 additions & 2 deletions pkg/features/kube_features.go
@@ -566,8 +566,9 @@ const (
// Allow pods to failover to a different node in case of non graceful node shutdown
NodeOutOfServiceVolumeDetach featuregate.Feature = "NodeOutOfServiceVolumeDetach"

// owner: @ehashman
// owner: @iholder101
// alpha: v1.22
// beta1: v1.28. For more info, please look at the KEP: https://kep.k8s.io/2400.
//
// Permits kubelet to run with swap enabled
NodeSwap featuregate.Feature = "NodeSwap"
@@ -1010,7 +1011,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS

NodeOutOfServiceVolumeDetach: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31

NodeSwap: {Default: false, PreRelease: featuregate.Alpha},
NodeSwap: {Default: false, PreRelease: featuregate.Beta},
Member:

we can enable it by default. There is an additional configuration that needs to be tweaked anyway, so there is no need to also ask to change the feature gate.

Member:

for other reviewers: we found that enabling it by default breaks some upgrade scenarios that we will deal with in beta2.


PDBUnhealthyPodEvictionPolicy: {Default: true, PreRelease: featuregate.Beta},

11 changes: 6 additions & 5 deletions pkg/kubelet/cm/cgroup_manager_linux.go
@@ -45,11 +45,12 @@ import (
const (
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
// MemoryMin is memory.min for cgroup v2
MemoryMin string = "memory.min"
// MemoryHigh is memory.high for cgroup v2
MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
// Cgroup2MemoryMin is memory.min for cgroup v2
Cgroup2MemoryMin string = "memory.min"
// Cgroup2MemoryHigh is memory.high for cgroup v2
Cgroup2MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
Cgroup2MaxSwapFilename string = "memory.swap.max"
)

var RootCgroupName = CgroupName([]string{})
2 changes: 1 addition & 1 deletion pkg/kubelet/cm/helpers_linux.go
@@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
}
if memoryMin > 0 {
result.Unified = map[string]string{
MemoryMin: strconv.FormatInt(memoryMin, 10),
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}
2 changes: 1 addition & 1 deletion pkg/kubelet/cm/node_container_manager_linux.go
@@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}

4 changes: 2 additions & 2 deletions pkg/kubelet/cm/qos_container_manager_linux.go
@@ -292,15 +292,15 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
}

if guaranteedMin > 0 {
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
}
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
}
}
118 changes: 104 additions & 14 deletions pkg/kubelet/kuberuntime/kuberuntime_container_linux.go
@@ -20,6 +20,9 @@ limitations under the License.
package kuberuntime

import (
"fmt"
cadvisorv1 "github.com/google/cadvisor/info/v1"
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"math"
"os"
"strconv"
@@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
isCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,

lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)

if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
Member:

why do we need to initialize this object before checking the feature gate? Ideally there should be no change when feature gate is not set

Contributor Author:

As mentioned in another comment, cgroup configures unlimited swap by default. If the feature gate is turned off, there's a need to actively limit the swap usage to 0. This is also consistent with the current logic [1].

[1] https://github.com/kubernetes/kubernetes/blob/v1.28.0-alpha.4/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go#L111

// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
switch m.memorySwapBehavior {
case kubelettypes.UnlimitedSwap:
// -1 = unlimited swap
lcr.MemorySwapLimitInBytes = -1
case kubelettypes.LimitedSwap:
fallthrough
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
default:
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
}
} else {
swapConfigurationHelper.ConfigureNoSwap(lcr)
Member:

Is this what we have today in alpha? If the feature gate is disabled, do we configure no swap? I thought we do not make any configuration in this case.

Contributor Author (@iholder101, Jul 14, 2023):

You're right, today we don't configure anything if the feature gate is disabled.

However, the question is if we want to ensure that swap is not accessible if the feature gate is off, or do we want to let the CRI decide what to do in this case. I know that in cgroups, the default is using max, which is equivalent to UnlimitedSwap. Maybe most CRIs default to NoSwap, but IMHO it's better to actively ensure no swap is accessible if the feature gate is off.

WDYT?

Member:

I am trying to understand whether this will break any customers who don't use the feature gate at all.

Let's say somebody was running with failSwapOn: false before.

If they never enabled nor disabled the feature gate, they get the container runtime's default behavior today. Once upgraded to 1.28, they will have the feature gate enabled by default (the gate's default is true) and will get the unlimited behavior. So instead of the container runtime's default behavior, unlimited swap will be set explicitly.

If they explicitly enabled the feature gate, they will get the same behavior with beta1 as was configured with alpha. So this is good - no surprises.

If they explicitly disabled the feature gate in alpha, after the upgrade they will get an explicit zero instead of the container runtime's default. I think zero is the default behavior for containerd, at least in the latest versions.

So the actual concern is not this code branch; it is more about the new default when no feature gate is set. Maybe we need to rethink the default behavior, from being unlimited to whatever the container runtime's default is?

Member:

If we cannot find a quick answer here, perhaps what we can do:

  • Add a note in this branch describing the change of behavior when the feature gate is disabled explicitly. Relnote it as well
  • Turn the feature gate off by default. Deal with the upgrade scenario in beta2

Contributor:

Yeah, let's keep it off by default and revisit.

Contributor Author:

@SergeyKanzhelev Thanks for elaborating.

Yeah, let's keep it off by default and revisit.

Yup, I agree.
Made the change 👍 PTAL

}

// Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
memoryRequest := container.Resources.Requests.Memory().Value()
memoryLimit := container.Resources.Limits.Memory().Value()
if memoryRequest != 0 {
unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
}

// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
}
}
if memoryHigh != 0 && memoryHigh > memoryRequest {
unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
}
}
if len(unified) > 0 {
@@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont
enforceMemoryQoS := false
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
isCgroup2UnifiedMode() {
enforceMemoryQoS = true
}
return &runtimeapi.ContainerResources{
@@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
}

// runc requires cgroupv2 for unified mode
if libcontainercgroups.IsCgroup2UnifiedMode() {
if isCgroup2UnifiedMode() {
resources.Unified = map[string]string{
// Ask the kernel to kill all processes in the container cgroup in case of OOM.
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
@@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
}
return cStatusResources
}

// Note: this function variable is being added here so it would be possible to mock
// the cgroup version for unit tests by assigning a new mocked function into it. Without it,
// the cgroup version would solely depend on the environment running the test.
var isCgroup2UnifiedMode = func() bool {
return libcontainercgroups.IsCgroup2UnifiedMode()
}
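
To illustrate the mocking pattern described in the comment above, here is a minimal sketch (not part of this PR) of how a test in the same package could stub the probe. The test name and assertions are hypothetical; the import paths and helper names are taken from the diff above.

package kuberuntime

import (
	"testing"

	cadvisorv1 "github.com/google/cadvisor/info/v1"
	runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
	"k8s.io/kubernetes/pkg/kubelet/cm"
)

// Hypothetical test: because isCgroup2UnifiedMode is a package-level function
// variable, a test can replace it to force a cgroup v1 environment regardless
// of the machine the test runs on.
func TestUnlimitedSwapFallsBackOnCgroupV1(t *testing.T) {
	realProbe := isCgroup2UnifiedMode
	defer func() { isCgroup2UnifiedMode = realProbe }() // always restore the real probe

	isCgroup2UnifiedMode = func() bool { return false } // simulate a cgroup v1 node

	lcr := &runtimeapi.LinuxContainerResources{}
	newSwapConfigurationHelper(cadvisorv1.MachineInfo{}).ConfigureUnlimitedSwap(lcr)

	// On cgroup v1 the helper falls back to ConfigureNoSwap, so no cgroup v2
	// swap limit should have been written into the Unified map.
	if _, ok := lcr.Unified[cm.Cgroup2MaxSwapFilename]; ok {
		t.Fatal("did not expect a cgroup v2 swap limit on a cgroup v1 node")
	}
}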

type swapConfigurationHelper struct {
machineInfo cadvisorv1.MachineInfo
}

func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
return &swapConfigurationHelper{machineInfo: machineInfo}
}

func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
podQos := kubeapiqos.GetPodQOS(pod)
containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0

if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
m.ConfigureNoSwap(lcr)
Member:

let's also add the logic on kubelet startup to fail to start if swap is configured with the cgroupv1

Contributor Author:

Good idea! Thanks!

Member:

so will you send a separate PR?

Contributor Author:

Yes, here's the PR: #119327.

It's still WIP. While the logic itself is ready I also need to adjust the tests. I will finalize it shortly.

return
}

containerMemoryRequest := container.Resources.Requests.Memory()
swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))

if err != nil {
klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
m.ConfigureNoSwap(lcr)
return
}

m.configureSwap(lcr, swapLimit)
}

func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
if !isCgroup2UnifiedMode() {
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
// Some swapping is still possible.
// Note that if memory limit is 0, memory swap limit is ignored.
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
Member:

I suspect this line failed some tests without swap enabled, like some Ubuntu.
#119467.

return
}

m.configureSwap(lcr, 0)
}

func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
if !isCgroup2UnifiedMode() {
m.ConfigureNoSwap(lcr)
return
}

if lcr.Unified == nil {
lcr.Unified = map[string]string{}
}

lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
}

func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
if !isCgroup2UnifiedMode() {
klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
return
}

if lcr.Unified == nil {
lcr.Unified = map[string]string{}
}

lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
}

// The swap limit is calculated as (<containerMemoryRequest>/<nodeTotalMemory>)*<totalPodsSwapAvailable>.
// For more info, please look at the following KEP: https://kep.k8s.io/2400
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
Member:

Can we add some comments here to make it more clear?

The logic is defined in KEP https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2400-node-swap/README.md#steps-to-calculate-swap-limit

Contributor Author:

Done

if nodeTotalMemory <= 0 {
return 0, fmt.Errorf("total node memory is 0")
}
if containerMemoryRequest > nodeTotalMemory {
return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
}

containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)

return int64(swapAllocation), nil
}
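
To make the proportional formula concrete, here is a small, self-contained sketch (not part of the PR; the helper name and the example numbers are made up) that restates the calculation for a hypothetical node with 16Gi of memory and 4Gi of swap.

package main

import "fmt"

// swapForBurstable restates the proportional formula used by calcSwapForBurstablePods
// above, without the error handling: limit = (request / nodeMemory) * nodeSwap.
func swapForBurstable(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) int64 {
	proportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	return int64(proportion * float64(totalPodsSwapAvailable))
}

func main() {
	const gi = int64(1) << 30
	// A container requesting 2Gi on a node with 16Gi of memory and 4Gi of swap
	// gets (2/16) * 4Gi = 512Mi of swap.
	fmt.Println(swapForBurstable(2*gi, 16*gi, 4*gi)) // prints 536870912
}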