@@ -20,6 +20,9 @@ limitations under the License.
20
20
package kuberuntime
21
21
22
22
import (
23
+ "fmt"
24
+ cadvisorv1 "github.com/google/cadvisor/info/v1"
25
+ kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
23
26
"math"
24
27
"os"
25
28
"strconv"
@@ -46,7 +49,7 @@ func (m *kubeGenericRuntimeManager) applyPlatformSpecificContainerConfig(config
46
49
enforceMemoryQoS := false
47
50
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
48
51
if utilfeature .DefaultFeatureGate .Enabled (kubefeatures .MemoryQoS ) &&
49
- libcontainercgroups . IsCgroup2UnifiedMode () {
52
+ isCgroup2UnifiedMode () {
50
53
enforceMemoryQoS = true
51
54
}
52
55
cl , err := m .generateLinuxContainerConfig (container , pod , uid , username , nsTarget , enforceMemoryQoS )
@@ -99,21 +102,17 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
99
102
100
103
lcr .HugepageLimits = GetHugepageLimitsFromResources (container .Resources )
101
104
102
- if utilfeature .DefaultFeatureGate .Enabled (kubefeatures .NodeSwap ) {
105
+ if swapConfigurationHelper := newSwapConfigurationHelper ( * m . machineInfo ); utilfeature .DefaultFeatureGate .Enabled (kubefeatures .NodeSwap ) {
103
106
// NOTE(ehashman): Behaviour is defined in the opencontainers runtime spec:
104
107
// https://github.com/opencontainers/runtime-spec/blob/1c3f411f041711bbeecf35ff7e93461ea6789220/config-linux.md#memory
105
108
switch m .memorySwapBehavior {
106
- case kubelettypes .UnlimitedSwap :
107
- // -1 = unlimited swap
108
- lcr .MemorySwapLimitInBytes = - 1
109
109
case kubelettypes .LimitedSwap :
110
- fallthrough
110
+ swapConfigurationHelper . ConfigureLimitedSwap ( lcr , pod , container )
111
111
default :
112
- // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
113
- // Some swapping is still possible.
114
- // Note that if memory limit is 0, memory swap limit is ignored.
115
- lcr .MemorySwapLimitInBytes = lcr .MemoryLimitInBytes
112
+ swapConfigurationHelper .ConfigureUnlimitedSwap (lcr )
116
113
}
114
+ } else {
115
+ swapConfigurationHelper .ConfigureNoSwap (lcr )
117
116
}
118
117
119
118
// Set memory.min and memory.high to enforce MemoryQoS
@@ -122,7 +121,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
122
121
memoryRequest := container .Resources .Requests .Memory ().Value ()
123
122
memoryLimit := container .Resources .Limits .Memory ().Value ()
124
123
if memoryRequest != 0 {
125
- unified [cm .MemoryMin ] = strconv .FormatInt (memoryRequest , 10 )
124
+ unified [cm .Cgroup2MemoryMin ] = strconv .FormatInt (memoryRequest , 10 )
126
125
}
127
126
128
127
// Guaranteed pods, by their QoS definition, require that the memory request equals the memory limit and the cpu request equals the cpu limit.
@@ -148,7 +147,7 @@ func (m *kubeGenericRuntimeManager) generateLinuxContainerResources(pod *v1.Pod,
148
147
}
149
148
}
150
149
if memoryHigh != 0 && memoryHigh > memoryRequest {
151
- unified [cm .MemoryHigh ] = strconv .FormatInt (memoryHigh , 10 )
150
+ unified [cm .Cgroup2MemoryHigh ] = strconv .FormatInt (memoryHigh , 10 )
152
151
}
153
152
}
154
153
if len (unified ) > 0 {
@@ -171,7 +170,7 @@ func (m *kubeGenericRuntimeManager) generateContainerResources(pod *v1.Pod, cont
171
170
enforceMemoryQoS := false
172
171
// Set memory.min and memory.high if MemoryQoS enabled with cgroups v2
173
172
if utilfeature .DefaultFeatureGate .Enabled (kubefeatures .MemoryQoS ) &&
174
- libcontainercgroups . IsCgroup2UnifiedMode () {
173
+ isCgroup2UnifiedMode () {
175
174
enforceMemoryQoS = true
176
175
}
177
176
return & runtimeapi.ContainerResources {
@@ -216,7 +215,7 @@ func (m *kubeGenericRuntimeManager) calculateLinuxResources(cpuRequest, cpuLimit
216
215
}
217
216
218
217
// runc requires cgroupv2 for unified mode
219
- if libcontainercgroups . IsCgroup2UnifiedMode () {
218
+ if isCgroup2UnifiedMode () {
220
219
resources .Unified = map [string ]string {
221
220
// Ask the kernel to kill all processes in the container cgroup in case of OOM.
222
221
// See memory.oom.group in https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html for
@@ -298,3 +297,94 @@ func toKubeContainerResources(statusResources *runtimeapi.ContainerResources) *k
298
297
}
299
298
return cStatusResources
300
299
}
300
+
301
+ // Note: this function variable is being added here so it would be possible to mock
302
+ // the cgroup version for unit tests by assigning a new mocked function into it. Without it,
303
+ // the cgroup version would solely depend on the environment running the test.
304
+ var isCgroup2UnifiedMode = func () bool {
305
+ return libcontainercgroups .IsCgroup2UnifiedMode ()
306
+ }
307
+
308
+ type swapConfigurationHelper struct {
309
+ machineInfo cadvisorv1.MachineInfo
310
+ }
311
+
312
+ func newSwapConfigurationHelper (machineInfo cadvisorv1.MachineInfo ) * swapConfigurationHelper {
313
+ return & swapConfigurationHelper {machineInfo : machineInfo }
314
+ }
315
+
316
+ func (m swapConfigurationHelper ) ConfigureLimitedSwap (lcr * runtimeapi.LinuxContainerResources , pod * v1.Pod , container * v1.Container ) {
317
+ podQos := kubeapiqos .GetPodQOS (pod )
318
+ containerDoesNotRequestMemory := container .Resources .Requests .Memory ().IsZero () && container .Resources .Limits .Memory ().IsZero ()
319
+ memoryRequestEqualsToLimit := container .Resources .Requests .Memory ().Cmp (* container .Resources .Limits .Memory ()) == 0
320
+
321
+ if podQos != v1 .PodQOSBurstable || containerDoesNotRequestMemory || ! isCgroup2UnifiedMode () || memoryRequestEqualsToLimit {
322
+ m .ConfigureNoSwap (lcr )
323
+ return
324
+ }
325
+
326
+ containerMemoryRequest := container .Resources .Requests .Memory ()
327
+ swapLimit , err := calcSwapForBurstablePods (containerMemoryRequest .Value (), int64 (m .machineInfo .MemoryCapacity ), int64 (m .machineInfo .SwapCapacity ))
328
+
329
+ if err != nil {
330
+ klog .ErrorS (err , "cannot calculate swap allocation amount; disallowing swap" )
331
+ m .ConfigureNoSwap (lcr )
332
+ return
333
+ }
334
+
335
+ m .configureSwap (lcr , swapLimit )
336
+ }
337
+
338
+ func (m swapConfigurationHelper ) ConfigureNoSwap (lcr * runtimeapi.LinuxContainerResources ) {
339
+ if ! isCgroup2UnifiedMode () {
340
+ // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit
341
+ // Some swapping is still possible.
342
+ // Note that if memory limit is 0, memory swap limit is ignored.
343
+ lcr .MemorySwapLimitInBytes = lcr .MemoryLimitInBytes
344
+ return
345
+ }
346
+
347
+ m .configureSwap (lcr , 0 )
348
+ }
349
+
350
+ func (m swapConfigurationHelper ) ConfigureUnlimitedSwap (lcr * runtimeapi.LinuxContainerResources ) {
351
+ if ! isCgroup2UnifiedMode () {
352
+ m .ConfigureNoSwap (lcr )
353
+ return
354
+ }
355
+
356
+ if lcr .Unified == nil {
357
+ lcr .Unified = map [string ]string {}
358
+ }
359
+
360
+ lcr .Unified [cm .Cgroup2MaxSwapFilename ] = "max"
361
+ }
362
+
363
+ func (m swapConfigurationHelper ) configureSwap (lcr * runtimeapi.LinuxContainerResources , swapMemory int64 ) {
364
+ if ! isCgroup2UnifiedMode () {
365
+ klog .ErrorS (fmt .Errorf ("swap configuration is not supported with cgroup v1" ), "swap configuration under cgroup v1 is unexpected" )
366
+ return
367
+ }
368
+
369
+ if lcr .Unified == nil {
370
+ lcr .Unified = map [string ]string {}
371
+ }
372
+
373
+ lcr .Unified [cm .Cgroup2MaxSwapFilename ] = fmt .Sprintf ("%d" , swapMemory )
374
+ }
375
+
376
// calcSwapForBurstablePods returns the amount of swap a container may use,
// computed as
//
//	(<containerMemoryRequest> / <nodeTotalMemory>) * <totalPodsSwapAvailable>
//
// The arithmetic is done in float64 and the result is truncated toward zero
// when converted back to int64.
//
// An error is returned when nodeTotalMemory is not positive, or when the
// container requests more memory than the node has in total.
//
// For more info, please look at the following KEP: https://kep.k8s.io/2400
func calcSwapForBurstablePods(containerMemoryRequest, nodeTotalMemory, totalPodsSwapAvailable int64) (int64, error) {
	switch {
	case nodeTotalMemory <= 0:
		return 0, fmt.Errorf("total node memory is 0")
	case containerMemoryRequest > nodeTotalMemory:
		return 0, fmt.Errorf("container request %d is larger than total node memory %d", containerMemoryRequest, nodeTotalMemory)
	}

	// Share of the node's memory requested by this container.
	memoryShare := float64(containerMemoryRequest) / float64(nodeTotalMemory)

	return int64(memoryShare * float64(totalPodsSwapAvailable)), nil
}
0 commit comments