Skip to content

Commit da2fdf8

Browse files
authored
Merge pull request #118764 from iholder101/Swap/burstableQoS-impl
Add full cgroup v2 swap support with automatically calculated swap limit for LimitedSwap and Burstable QoS Pods
2 parents 1e086cc + 4321d8c commit da2fdf8

10 files changed

+830
-168
lines changed

hack/local-up-cluster.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ CGROUP_DRIVER=${CGROUP_DRIVER:-""}
4747
CGROUP_ROOT=${CGROUP_ROOT:-""}
4848
# owner of client certs, default to current user if not specified
4949
USER=${USER:-$(whoami)}
50+
# if true, limited swap is being used instead of unlimited swap (default)
51+
LIMITED_SWAP=${LIMITED_SWAP:-""}
5052

5153
# required for cni installation
5254
CNI_CONFIG_DIR=${CNI_CONFIG_DIR:-/etc/cni/net.d}
@@ -832,6 +834,13 @@ tracing:
832834
EOF
833835
fi
834836

837+
if [[ "$LIMITED_SWAP" == "true" ]]; then
838+
cat <<EOF >> "${TMP_DIR}"/kubelet.yaml
839+
memorySwap:
840+
swapBehavior: LimitedSwap
841+
EOF
842+
fi
843+
835844
{
836845
# authentication
837846
echo "authentication:"

pkg/features/kube_features.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -596,8 +596,9 @@ const (
596596
// Allow pods to failover to a different node in case of non graceful node shutdown
597597
NodeOutOfServiceVolumeDetach featuregate.Feature = "NodeOutOfServiceVolumeDetach"
598598

599-
// owner: @ehashman
599+
// owner: @iholder101
600600
// alpha: v1.22
601+
// beta1: v1.28. For more info, please look at the KEP: https://kep.k8s.io/2400.
601602
//
602603
// Permits kubelet to run with swap enabled
603604
NodeSwap featuregate.Feature = "NodeSwap"
@@ -1074,7 +1075,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
10741075

10751076
NodeOutOfServiceVolumeDetach: {Default: true, PreRelease: featuregate.GA, LockToDefault: true}, // remove in 1.31
10761077

1077-
NodeSwap: {Default: false, PreRelease: featuregate.Alpha},
1078+
NodeSwap: {Default: false, PreRelease: featuregate.Beta},
10781079

10791080
PDBUnhealthyPodEvictionPolicy: {Default: true, PreRelease: featuregate.Beta},
10801081

pkg/kubelet/cm/cgroup_manager_linux.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,12 @@ import (
4545
const (
4646
// systemdSuffix is the cgroup name suffix for systemd
4747
systemdSuffix string = ".slice"
48-
// MemoryMin is memory.min for cgroup v2
49-
MemoryMin string = "memory.min"
50-
// MemoryHigh is memory.high for cgroup v2
51-
MemoryHigh string = "memory.high"
52-
Cgroup2MaxCpuLimit string = "max"
48+
// Cgroup2MemoryMin is memory.min for cgroup v2
49+
Cgroup2MemoryMin string = "memory.min"
50+
// Cgroup2MemoryHigh is memory.high for cgroup v2
51+
Cgroup2MemoryHigh string = "memory.high"
52+
Cgroup2MaxCpuLimit string = "max"
53+
Cgroup2MaxSwapFilename string = "memory.swap.max"
5354
)
5455

5556
var RootCgroupName = CgroupName([]string{})

pkg/kubelet/cm/helpers_linux.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ func ResourceConfigForPod(pod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64,
196196
}
197197
if memoryMin > 0 {
198198
result.Unified = map[string]string{
199-
MemoryMin: strconv.FormatInt(memoryMin, 10),
199+
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
200200
}
201201
}
202202
}

pkg/kubelet/cm/node_container_manager_linux.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
147147
if rp.Unified == nil {
148148
rp.Unified = make(map[string]string)
149149
}
150-
rp.Unified[MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
150+
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
151151
}
152152
}
153153

pkg/kubelet/cm/qos_container_manager_linux.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -292,15 +292,15 @@ func (m *qosContainerManagerImpl) setMemoryQoS(configs map[v1.PodQOSClass]*Cgrou
292292
if configs[v1.PodQOSBurstable].ResourceParameters.Unified == nil {
293293
configs[v1.PodQOSBurstable].ResourceParameters.Unified = make(map[string]string)
294294
}
295-
configs[v1.PodQOSBurstable].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(burstableMin, 10)
295+
configs[v1.PodQOSBurstable].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(burstableMin, 10)
296296
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSBurstable, "memoryMin", burstableMin)
297297
}
298298

299299
if guaranteedMin > 0 {
300300
if configs[v1.PodQOSGuaranteed].ResourceParameters.Unified == nil {
301301
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified = make(map[string]string)
302302
}
303-
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
303+
configs[v1.PodQOSGuaranteed].ResourceParameters.Unified[Cgroup2MemoryMin] = strconv.FormatInt(guaranteedMin, 10)
304304
klog.V(4).InfoS("MemoryQoS config for qos", "qos", v1.PodQOSGuaranteed, "memoryMin", guaranteedMin)
305305
}
306306
}

pkg/kubelet/kuberuntime/kuberuntime_container_linux.go

Lines changed: 104 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ limitations under the License.
2020
package kuberuntime
2121

2222
import (
23+
"fmt"
24+
cadvisorv1 "github.com/google/cadvisor/info/v1"
25+
kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
2326
"math"
2427
"os"
2528
"strconv"
@@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
4649
enforceMemoryQoS := false
4750
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
4851
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
49-
libcontainercgroups.IsCgroup2UnifiedMode() {
52+
isCgroup2UnifiedMode() {
5053
enforceMemoryQoS = true
5154
}
5255
cl, err := m.generateLinuxContainerConfig(container, pod, uid, username, nsTarget, enforceMemoryQoS)
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
99102

100103
lcr.HugepageLimits = GetHugepageLimitsFromResources(container.Resources)
101104

102-
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
105+
if swapConfigurationHelper := newSwapConfigurationHelper(*m.machineInfo); utilfeature.DefaultFeatureGate.Enabled(kubefeatures.NodeSwap) {
103106
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
104107
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
105108
switch m.memorySwapBehavior {
106-
case kubelettypes.UnlimitedSwap:
107-
// -1 = unlimited swap
108-
lcr.MemorySwapLimitInBytes = -1
109109
case kubelettypes.LimitedSwap:
110-
fallthrough
110+
swapConfigurationHelper.ConfigureLimitedSwap(lcr, pod, container)
111111
default:
112-
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
113-
// Some swapping is still possible.
114-
// Note that if memory limit is 0, memory swap limit is ignored.
115-
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
112+
swapConfigurationHelper.ConfigureUnlimitedSwap(lcr)
116113
}
114+
} else {
115+
swapConfigurationHelper.ConfigureNoSwap(lcr)
117116
}
118117

119118
// Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
122121
memoryRequest := container.Resources.Requests.Memory().Value()
123122
memoryLimit := container.Resources.Limits.Memory().Value()
124123
if memoryRequest != 0 {
125-
unified[cm.MemoryMin] = strconv.FormatInt(memoryRequest, 10)
124+
unified[cm.Cgroup2MemoryMin] = strconv.FormatInt(memoryRequest, 10)
126125
}
127126

128127
// Guaranteed pods by their QoS definition requires that memory request equals memory limit and cpu request must equal cpu limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
148147
}
149148
}
150149
if memoryHigh != 0 && memoryHigh > memoryRequest {
151-
unified[cm.MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
150+
unified[cm.Cgroup2MemoryHigh] = strconv.FormatInt(memoryHigh, 10)
152151
}
153152
}
154153
if len(unified) > 0 {
@@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont
171170
enforceMemoryQoS := false
172171
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
173172
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
174-
libcontainercgroups.IsCgroup2UnifiedMode() {
173+
isCgroup2UnifiedMode() {
175174
enforceMemoryQoS = true
176175
}
177176
return &runtimeapi.ContainerResources{
@@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
216215
}
217216

218217
// runc requires cgroupv2 for unified mode
219-
if libcontainercgroups.IsCgroup2UnifiedMode() {
218+
if isCgroup2UnifiedMode() {
220219
resources.Unified = map[string]string{
221220
// Ask the kernel to kill all processes in the container cgroup in case of OOM.
222221
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
@@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
298297
}
299298
return cStatusResources
300299
}
300+
301+
// Note: this function variable is being added here so it would be possible to mock
302+
// the cgroup version for unit tests by assigning a new mocked function into it. Without it,
303+
// the cgroup version would solely depend on the environment running the test.
304+
var isCgroup2UnifiedMode = func() bool {
305+
return libcontainercgroups.IsCgroup2UnifiedMode()
306+
}
307+
308+
type swapConfigurationHelper struct {
309+
machineInfo cadvisorv1.MachineInfo
310+
}
311+
312+
func newSwapConfigurationHelper(machineInfo cadvisorv1.MachineInfo) *swapConfigurationHelper {
313+
return &swapConfigurationHelper{machineInfo: machineInfo}
314+
}
315+
316+
func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxContainerResources, pod *v1.Pod, container *v1.Container) {
317+
podQos := kubeapiqos.GetPodQOS(pod)
318+
containerDoesNotRequestMemory := container.Resources.Requests.Memory().IsZero() && container.Resources.Limits.Memory().IsZero()
319+
memoryRequestEqualsToLimit := container.Resources.Requests.Memory().Cmp(*container.Resources.Limits.Memory()) == 0
320+
321+
if podQos != v1.PodQOSBurstable || containerDoesNotRequestMemory || !isCgroup2UnifiedMode() || memoryRequestEqualsToLimit {
322+
m.ConfigureNoSwap(lcr)
323+
return
324+
}
325+
326+
containerMemoryRequest := container.Resources.Requests.Memory()
327+
swapLimit, err := calcSwapForBurstablePods(containerMemoryRequest.Value(), int64(m.machineInfo.MemoryCapacity), int64(m.machineInfo.SwapCapacity))
328+
329+
if err != nil {
330+
klog.ErrorS(err, "cannot calculate swap allocation amount; disallowing swap")
331+
m.ConfigureNoSwap(lcr)
332+
return
333+
}
334+
335+
m.configureSwap(lcr, swapLimit)
336+
}
337+
338+
func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) {
339+
if !isCgroup2UnifiedMode() {
340+
// memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
341+
// Some swapping is still possible.
342+
// Note that if memory limit is 0, memory swap limit is ignored.
343+
lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes
344+
return
345+
}
346+
347+
m.configureSwap(lcr, 0)
348+
}
349+
350+
func (m swapConfigurationHelper) ConfigureUnlimitedSwap(lcr *runtimeapi.LinuxContainerResources) {
351+
if !isCgroup2UnifiedMode() {
352+
m.ConfigureNoSwap(lcr)
353+
return
354+
}
355+
356+
if lcr.Unified == nil {
357+
lcr.Unified = map[string]string{}
358+
}
359+
360+
lcr.Unified[cm.Cgroup2MaxSwapFilename] = "max"
361+
}
362+
363+
func (m swapConfigurationHelper) configureSwap(lcr *runtimeapi.LinuxContainerResources, swapMemory int64) {
364+
if !isCgroup2UnifiedMode() {
365+
klog.ErrorS(fmt.Errorf("swap configuration is not supported with cgroup v1"), "swap configuration under cgroup v1 is unexpected")
366+
return
367+
}
368+
369+
if lcr.Unified == nil {
370+
lcr.Unified = map[string]string{}
371+
}
372+
373+
lcr.Unified[cm.Cgroup2MaxSwapFilename] = fmt.Sprintf("%d", swapMemory)
374+
}
375+
376+
// calcSwapForBurstablePods calculates a container's swap limit (in bytes) as
//
//	(<containerMemoryRequest> / <nodeTotalMemory>) * <totalPodsSwapAvailable>
//
// i.e. swap is distributed proportionally to the container's share of the
// node's memory. For more info, please look at the following KEP:
// https://kep.k8s.io/2400
//
// It returns an error when nodeTotalMemory is not positive or when the
// container's request exceeds the node's total memory.
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	if nodeTotalMemory <= 0 {
		// Report the actual value: the guard also rejects negative inputs,
		// for which "is 0" would be a misleading message.
		return 0, fmt.Errorf("total node memory is %d, must be positive", nodeTotalMemory)
	}
	if containerMemoryRequest > nodeTotalMemory {
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	containerMemoryProportion := float64(containerMemoryRequest) / float64(nodeTotalMemory)
	swapAllocation := containerMemoryProportion * float64(totalPodsSwapAvailable)

	// Truncate toward zero: a fractional byte of swap is never granted.
	return int64(swapAllocation), nil
}

0 commit comments

Comments (0)