pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.9.patch

commit 280858b0bb3384b9ec06b455e196b453888bd6b8
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Misc preps for cgroup unified hierarchy interface
    
    Make the following changes in preparation for the cpu controller
    interface implementation for the unified hierarchy.  This patch
    doesn't cause any functional differences.
    
    * s/cpu_stats_show()/cpu_cfs_stats_show()/
    
    * s/cpu_files/cpu_legacy_files/
    
    * Separate out cpuacct_stats_read() from cpuacct_stats_show().  While
      at it, make the @val array u64 for consistency.
    
    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd689fe02..57472485b79c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8705,7 +8705,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8745,7 +8745,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -8766,7 +8766,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8791,7 +8791,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.legacy_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_legacy_files,
 	.early_init	= true,
 };
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index bc0b309c3f19..d1e5dd0b3a64 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -276,26 +276,33 @@ static int cpuacct_all_seq_show(struct seq_file *m, void *V)
 	return 0;
 }
 
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
+static void cpuacct_stats_read(struct cpuacct *ca,
+			       u64 (*val)[CPUACCT_STAT_NSTATS])
 {
-	struct cpuacct *ca = css_ca(seq_css(sf));
-	s64 val[CPUACCT_STAT_NSTATS];
 	int cpu;
-	int stat;
 
-	memset(val, 0, sizeof(val));
+	memset(val, 0, sizeof(*val));
+
 	for_each_possible_cpu(cpu) {
 		u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
 
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
-		val[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
-		val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
+		(*val)[CPUACCT_STAT_USER]   += cpustat[CPUTIME_USER];
+		(*val)[CPUACCT_STAT_USER]   += cpustat[CPUTIME_NICE];
+		(*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+		(*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+		(*val)[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
 	}
+}
+
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+	u64 val[CPUACCT_STAT_NSTATS];
+	int stat;
+
+	cpuacct_stats_read(css_ca(seq_css(sf)), &val);
 
 	for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
-		seq_printf(sf, "%s %lld\n",
+		seq_printf(sf, "%s %llu\n",
 			   cpuacct_stat_desc[stat],
 			   cputime64_to_clock_t(val[stat]));
 	}

commit 015cbdcb90034fd566d00de9d3d405613da3cd26
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Implement interface for cgroup unified hierarchy
    
    While the cpu controller doesn't have any functional problems, there
    are a couple interface issues which can be addressed in the v2
    interface.
    
    * cpuacct being a separate controller.  This separation is artificial
      and rather pointless as demonstrated by most use cases co-mounting
      the two controllers.  It also forces certain information to be
      accounted twice.
    
    * Use of different time units.  Writable control knobs use
      microseconds, some stat fields use nanoseconds while other cpuacct
      stat fields use centiseconds.
    
    * Control knobs which can't be used in the root cgroup still show up
      in the root.
    
    * Control knob names and semantics aren't consistent with other
      controllers.
    
    This patchset implements cpu controller's interface on the unified
    hierarchy which adheres to the controller file conventions described
    in Documentation/cgroups/unified-hierarchy.txt.  Overall, the
    following changes are made.
    
    * cpuacct is implicitly enabled and disabled by cpu and its information
      is reported through "cpu.stat" which now uses microseconds for all
      time durations.  All time duration fields now have "_usec" appended
      to them for clarity.  While this doesn't solve the double accounting
      immediately, once majority of users switch to v2, cpu can directly
      account and report the relevant stats and cpuacct can be disabled on
      the unified hierarchy.
    
      Note that cpuacct.usage_percpu is currently not included in
      "cpu.stat".  If this information is actually called for, it can be
      added later.
    
    * "cpu.shares" is replaced with "cpu.weight" and operates on the
      standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
      The weight is scaled to scheduler weight so that 100 maps to 1024
      and the ratio relationship is preserved - if weight is W and its
      scaled value is S, W / 100 == S / 1024.  While the mapped range is a
      bit smaller than the original scheduler weight range, the dead zones
      on both sides are relatively small and it covers a wider range than
      the nice value mappings.  This file doesn't make sense in the root
      cgroup and isn't created on root.
    
    * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
      which contains both quota and period.
    
    * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
      "cpu.rt.max" which contains both runtime and period.
    
    v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
        CFS bandwidth stats and also using raw division for u64.  Use
        CONFIG_CFS_BANDWIDTH and do_div() instead.
    
        The semantics of "cpu.rt.max" is not fully decided yet.  Dropped
        for now.
    
    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 57472485b79c..c0ae869f51c4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8784,6 +8784,139 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* terminate */
 };
 
+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+	cpuacct_cpu_stats_show(sf);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively.  While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* parse "<quota|max> [period]"; caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX: 20 digits + NUL, matches the %20s bound below */
+
+	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)	/* need at least the first token */
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;	/* scale by NSEC_PER_USEC (usecs -> nsecs) */
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stats_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_released	= cpu_cgroup_css_released,
@@ -8792,7 +8925,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.legacy_cftypes	= cpu_legacy_files,
+	.dfl_cftypes	= cpu_files,
 	.early_init	= true,
+#ifdef CONFIG_CGROUP_CPUACCT
+	/*
+	 * cpuacct is enabled together with cpu on the unified hierarchy
+	 * and its stats are reported through "cpu.stat".
+	 */
+	.depends_on	= 1 << cpuacct_cgrp_id,
+#endif
 };
 
 #endif	/* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index d1e5dd0b3a64..57f390514c39 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -347,6 +347,31 @@ static struct cftype files[] = {
 	{ }	/* terminate */
 };
 
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
+void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+	struct cgroup_subsys_state *css;
+	u64 usage, val[CPUACCT_STAT_NSTATS];
+
+	css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
+
+	usage = cpuusage_read(css, seq_cft(sf));
+	cpuacct_stats_read(css_ca(css), &val);
+
+	val[CPUACCT_STAT_USER] *= TICK_NSEC;
+	val[CPUACCT_STAT_SYSTEM] *= TICK_NSEC;
+	do_div(usage, NSEC_PER_USEC);
+	do_div(val[CPUACCT_STAT_USER], NSEC_PER_USEC);
+	do_div(val[CPUACCT_STAT_SYSTEM], NSEC_PER_USEC);
+
+	seq_printf(sf, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, val[CPUACCT_STAT_USER], val[CPUACCT_STAT_SYSTEM]);
+
+	css_put(css);
+}
+
 /*
  * charge this task's execution time to its accounting group.
  *
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ba72807c73d4..ddf7af466d35 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -2,6 +2,7 @@
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
 
 #else
 
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
 {
 }
 
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+}
+
 #endif

commit 5019fe3d7ec456b58d451ef06fe1f81d7d9f28a9
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Aug 5 12:41:01 2016 -0400

    cgroup: add documentation regarding CPU controller cgroup v2 support
    
    Signed-off-by: Tejun Heo <tj@kernel.org>

diff --git a/Documentation/cgroup-v2-cpu.txt b/Documentation/cgroup-v2-cpu.txt
new file mode 100644
index 000000000000..1ed7032d4472
--- /dev/null
+++ b/Documentation/cgroup-v2-cpu.txt
@@ -0,0 +1,368 @@
+
+
+CPU Controller on Control Group v2
+
+August, 2016		Tejun Heo <tj@kernel.org>
+
+
+While most controllers have support for cgroup v2 now, the CPU
+controller support is not upstream yet due to objections from the
+scheduler maintainers on the basic designs of cgroup v2.  This
+document explains the current situation as well as an interim
+solution, and details the disagreements and arguments.  The latest
+version of this document can be found at the following URL.
+
+ https://git.kernel.org/cgit/linux/kernel/git/tj/cgroup.git/tree/Documentation/cgroup-v2-cpu.txt?h=cgroup-v2-cpu
+
+This document was posted to the linux-kernel and cgroup mailing lists.
+Unfortunately, no consensus was reached as of Oct, 2016.  The thread
+can be found at the following URL.
+
+ http://lkml.kernel.org/r/20160805170752.GK2542@mtj.duckdns.org
+
+
+CONTENTS
+
+1. Current Situation and Interim Solution
+2. Disagreements and Arguments
+  2-1. Contentious Restrictions
+    2-1-1. Process Granularity
+    2-1-2. No Internal Process Constraint
+  2-2. Impact on CPU Controller
+    2-2-1. Impact of Process Granularity
+    2-2-2. Impact of No Internal Process Constraint
+  2-3. Arguments for cgroup v2
+3. Way Forward
+4. References
+
+
+1. Current Situation and Interim Solution
+
+All objections from the scheduler maintainers apply to cgroup v2 core
+design, and there are no known objections to the specifics of the CPU
+controller cgroup v2 interface.  The only blocked part is changes to
+expose the CPU controller interface on cgroup v2, which comprises the
+following two patches:
+
+ [1] sched: Misc preps for cgroup unified hierarchy interface
+ [2] sched: Implement interface for cgroup unified hierarchy
+
+The necessary changes are superficial and implement the interface
+files on cgroup v2.  The combined diffstat is as follows.
+
+ kernel/sched/core.c    |  149 +++++++++++++++++++++++++++++++++++++++++++++++--
+ kernel/sched/cpuacct.c |   57 ++++++++++++------
+ kernel/sched/cpuacct.h |    5 +
+ 3 files changed, 189 insertions(+), 22 deletions(-)
+
+The patches are easy to apply and forward-port.  The following git
+branch will always carry the two patches on top of the latest release
+of the upstream kernel.
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu
+
+There also are versioned branches going back to v4.4.
+
+ git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git/cgroup-v2-cpu-$KERNEL_VER
+
+While it's difficult to tell whether the CPU controller support will
+be merged, there are crucial resource control features in cgroup v2
+that are only possible due to the design choices that are being
+objected to, and every effort will be made to ease enabling the CPU
+controller cgroup v2 support out-of-tree for parties which choose to.
+
+
+2. Disagreements and Arguments
+
+There have been several lengthy discussion threads [3][4] on LKML
+around the structural constraints of cgroup v2.  The two that affect
+the CPU controller are process granularity and no internal process
+constraint.  Both arise primarily from the need for common resource
+domain definition across different resources.
+
+The common resource domain is a powerful concept in cgroup v2 that
+allows controllers to make basic assumptions about the structural
+organization of processes and controllers inside the cgroup hierarchy,
+and thus solve problems spanning multiple types of resources.  The
+prime example for this is page cache writeback: dirty page cache is
+regulated through throttling buffered writers based on memory
+availability, and initiating batched write outs to the disk based on
+IO capacity.  Tracking and controlling writeback inside a cgroup thus
+requires the direct cooperation of the memory and the IO controller.
+
+This easily extends to other areas, such as CPU cycles consumed while
+performing memory reclaim or IO encryption.
+
+
+2-1. Contentious Restrictions
+
+For controllers of different resources to work together, they must
+agree on a common organization.  This uniform model across controllers
+imposes two contentious restrictions on the CPU controller: process
+granularity and the no-internal-process constraint.
+
+
+  2-1-1. Process Granularity
+
+  For memory, because an address space is shared between all threads
+  of a process, the terminal consumer is a process, not a thread.
+  Separating the threads of a single process into different memory
+  control domains doesn't make semantical sense.  cgroup v2 ensures
+  that all controllers can agree on the same organization by requiring
+  that threads of the same process belong to the same cgroup.
+
+  There are other reasons to enforce process granularity.  One
+  important one is isolating system-level management operations from
+  in-process application operations.  The cgroup interface, being a
+  virtual filesystem, is very unfit for multiple independent
+  operations taking place at the same time as most operations have to
+  be multi-step and there is no way to synchronize multiple accessors.
+  See also [5] Documentation/cgroup-v2.txt, "R-2. Thread Granularity"
+
+
+  2-1-2. No Internal Process Constraint
+
+  cgroup v2 does not allow processes to belong to any cgroup which has
+  child cgroups when resource controllers are enabled on it (the
+  notable exception being the root cgroup itself).  This is because,
+  for some resources, a resource domain (cgroup) is not directly
+  comparable to the terminal consumer (process/task) of said resource,
+  and so putting the two into a sibling relationship isn't meaningful.
+
+  - Differing Control Parameters and Capabilities
+
+    A cgroup controller has different resource control parameters and
+    capabilities from a terminal consumer, be that a task or process.
+    There are a couple cases where a cgroup control knob can be mapped
+    to a per-task or per-process API but they are exceptions and the
+    mappings aren't obvious even in those cases.
+
+    For example, task priorities (also known as nice values) set
+    through setpriority(2) are mapped to the CPU controller
+    "cpu.shares" values.  However, how exactly the two ranges map and
+    even the fact that they map to each other at all are not obvious.
+
+    The situation gets further muddled when considering other resource
+    types and control knobs.  IO priorities set through ioprio_set(2)
+    cannot be mapped to IO controller weights and most cgroup resource
+    control knobs including the bandwidth control knobs of the CPU
+    controller don't have counterparts in the terminal consumers.
+
+  - Anonymous Resource Consumption
+
+    For CPU, every time slice consumed from inside a cgroup, which
+    comprises most but not all of consumed CPU time for the cgroup,
+    can be clearly attributed to a specific task or process.  Because
+    these two types of entities are directly comparable as consumers
+    of CPU time, it's theoretically possible to mix tasks and cgroups
+    on the same tree levels and let them directly compete for the time
+    quota available to their common ancestor.
+
+    However, the same can't be said for resource types like memory or
+    IO: the memory consumed by the page cache, for example, can be
+    tracked on a per-cgroup level, but due to mismatches in lifetimes
+    of involved objects (page cache can persist long after processes
+    are gone), shared usages and the implementation overhead of
+    tracking persistent state, it can no longer be attributed to
+    individual processes after instantiation.  Consequently, any IO
+    incurred by page cache writeback can be attributed to a cgroup,
+    but not to the individual consumers inside the cgroup.
+
+  For memory and IO, this makes a resource domain (cgroup) an object
+  of a fundamentally different type than a terminal consumer
+  (process).  A process can't be a first class object in the resource
+  distribution graph as its total resource consumption can't be
+  described without the containing resource domain.
+
+  Disallowing processes in internal cgroups avoids competition between
+  cgroups and processes which cannot be meaningfully defined for these
+  resources.  All resource control takes place among cgroups and a
+  terminal consumer interacts with the containing cgroup the same way
+  it would with the system without cgroup.
+
+  Root cgroup is exempt from this constraint, which is in line with
+  how root cgroup is handled in general - it's excluded from cgroup
+  resource accounting and control.
+
+
+Enforcing process granularity and no internal process constraint
+allows all controllers to be on the same footing in terms of resource
+distribution hierarchy.
+
+
+2-2. Impact on CPU Controller
+
+As indicated earlier, the CPU controller's resource distribution graph
+is the simplest.  Every schedulable resource consumption can be
+attributed to a specific task.  In addition, for weight based control,
+the per-task priority set through setpriority(2) can be translated to
+and from a per-cgroup weight.  As such, the CPU controller can treat a
+task and a cgroup symmetrically, allowing support for any tree layout
+of cgroups and tasks.  Both process granularity and the no internal
+process constraint restrict how the CPU controller can be used.
+
+
+  2-2-1. Impact of Process Granularity
+
+  Process granularity prevents tasks belonging to the same process from
+  being assigned to different cgroups.  It was pointed out [6] that this
+  excludes the valid use case of hierarchical CPU distribution within
+  processes.
+
+  To address this issue, the rgroup (resource group) [7][8][9]
+  interface, an extension of the existing setpriority(2) API, was
+  proposed, which is in line with other programmable priority
+  mechanisms and eliminates the risk of in-application configuration
+  and system configuration stepping on each other's toes.
+  Unfortunately, the proposal quickly turned into discussions around
+  cgroup v2 design decisions [4] and no consensus could be reached.
+
+
+  2-2-2. Impact of No Internal Process Constraint
+
+  The no internal process constraint disallows tasks from competing
+  directly against cgroups.  Here is an excerpt from Peter Zijlstra
+  pointing out the issue [10] - R, L and A are cgroups; t1, t2, t3 and
+  t4 are tasks:
+
+
+          R
+        / | \
+       t1 t2 A
+           /   \
+          t3   t4
+
+
+    Is fundamentally different from:
+
+
+               R
+             /   \
+           L       A
+         /   \   /   \
+        t1  t2  t3   t4
+
+
+    Because if in the first hierarchy you add a task (t5) to R, all of
+    its A will run at 1/4th of total bandwidth where before it had
+    1/3rd, whereas with the second example, if you add our t5 to L, A
+    doesn't get any less bandwidth.
+
+
+  It is true that the trees are semantically different from each other
+  and the symmetric handling of tasks and cgroups is aesthetically
+  pleasing.  However, it isn't clear what the practical usefulness of
+  a layout with direct competition between tasks and cgroups would be,
+  considering that number and behavior of tasks are controlled by each
+  application, and cgroups primarily deal with system level resource
+  distribution; changes in the number of active threads would directly
+  impact resource distribution.  Real world use cases of such layouts
+  could not be established during the discussions.
+
+
+2-3. Arguments for cgroup v2
+
+There are strong demands for comprehensive hierarchical resource
+control across all major resources, and establishing a common resource
+hierarchy is an essential step.  As with most engineering decisions,
+common resource hierarchy definition comes with its trade-offs.  With
+cgroup v2, the trade-offs are in the form of structural constraints
+which, among others, restrict the CPU controller's space of possible
+configurations.
+
+However, even with the restrictions, cgroup v2, in combination with
+rgroup, covers most of identified real world use cases while enabling
+new important use cases of resource control across multiple resource
+types that were fundamentally broken previously.
+
+Furthermore, for resource control, treating resource domains as
+objects of a different type from terminal consumers has important
+advantages - it can account for resource consumptions which are not
+tied to any specific terminal consumer, be that a task or process, and
+allows decoupling resource distribution controls from in-application
+APIs.  Even the CPU controller may benefit from it as the kernel can
+consume a significant amount of CPU cycles in interrupt context or tasks
+shared across multiple resource domains (e.g. softirq).
+
+Finally, it's important to note that enabling cgroup v2 support for
+the CPU controller doesn't block use cases which require the features
+which are not available on cgroup v2.  Unlikely, but should anybody
+actually rely on the CPU controller's symmetric handling of tasks and
+cgroups, backward compatibility is and will be maintained by being
+able to disconnect the controller from the cgroup v2 hierarchy and use
+it standalone.  This also holds for cpuset which is often used in
+highly customized configurations which might be a poor fit for common
+resource domains.
+
+The required changes are minimal, the benefits for the target use
+cases are critical and obvious, and use cases which have to use v1 can
+continue to do so.
+
+
+3. Way Forward
+
+cgroup v2 primarily aims to solve the problem of comprehensive
+hierarchical resource control across all major computing resources,
+which is one of the core problems of modern server infrastructure
+engineering.  The trade-offs that cgroup v2 took are results of
+pursuing that goal and gaining a better understanding of the nature of
+resource control in the process.
+
+I believe that real world usages will prove cgroup v2's model right,
+considering the crucial pieces of comprehensive resource control that
+cannot be implemented without common resource domains.  This is not to
+say that cgroup v2 is fixed in stone and can't be updated; if there is
+an approach which better serves both comprehensive resource control
+and the CPU controller's flexibility, we will surely move towards
+that.  It goes without saying that discussions around such an approach
+should consider practical aspects of resource control as a whole
+rather than absolutely focusing on a particular controller.
+
+Until such consensus can be reached, the CPU controller cgroup v2
+support will be maintained out of the mainline kernel in an easily
+accessible form.  If there is anything cgroup developers can do to
+ease the pain, please feel free to contact us on the cgroup mailing
+list at cgroups@vger.kernel.org.
+
+
+4. References
+
+[1]  http://lkml.kernel.org/r/20160105164834.GE5995@mtj.duckdns.org
+     [PATCH 1/2] sched: Misc preps for cgroup unified hierarchy interface
+     Tejun Heo <tj@kernel.org>
+
+[2]  http://lkml.kernel.org/r/20160105164852.GF5995@mtj.duckdns.org
+     [PATCH 2/2] sched: Implement interface for cgroup unified hierarchy
+     Tejun Heo <tj@kernel.org>
+
+[3]  http://lkml.kernel.org/r/1438641689-14655-4-git-send-email-tj@kernel.org
+     [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
+     Tejun Heo <tj@kernel.org>
+
+[4]  http://lkml.kernel.org/r/20160407064549.GH3430@twins.programming.kicks-ass.net
+     Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
+     Peter Zijlstra <peterz@infradead.org>
+
+[5]  https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/cgroup-v2.txt
+     Control Group v2
+     Tejun Heo <tj@kernel.org>
+
+[6]  http://lkml.kernel.org/r/CAPM31RJNy3jgG=DYe6GO=wyL4BPPxwUm1f2S6YXacQmo7viFZA@mail.gmail.com
+     Re: [PATCH 3/3] sched: Implement interface for cgroup unified hierarchy
+     Paul Turner <pjt@google.com>
+
+[7]  http://lkml.kernel.org/r/20160105154503.GC5995@mtj.duckdns.org
+     [RFD] cgroup: thread granularity support for cpu controller
+     Tejun Heo <tj@kernel.org>
+
+[8]  http://lkml.kernel.org/r/1457710888-31182-1-git-send-email-tj@kernel.org
+     [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
+     Tejun Heo <tj@kernel.org>
+
+[9]  http://lkml.kernel.org/r/20160311160522.GA24046@htj.duckdns.org
+     Example program for PRIO_RGRP
+     Tejun Heo <tj@kernel.org>
+
+[10] http://lkml.kernel.org/r/20160407082810.GN3430@twins.programming.kicks-ass.net
+     Re: [PATCHSET RFC cgroup/for-4.6] cgroup, sched: implement resource group and PRIO_RGRP
+     Peter Zijlstra <peterz@infradead.org>