pkgs/os-specific/linux/kernel/cpu-cgroup-v2-patches/4.4.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407

commit e7cae741f6d645ac68fe8823ca6ef45dbbf6891b
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Misc preps for cgroup unified hierarchy interface
    
    Make the following changes in preparation for the cpu controller
    interface implementation for the unified hierarchy.  This patch
    doesn't cause any functional differences.
    
    * s/cpu_stats_show()/cpu_cfs_stats_show()/
    
    * s/cpu_files/cpu_legacy_files/
    
    * Separate out cpuacct_stats_read() from cpuacct_stats_show().  While
      at it, remove pointless cpuacct_stat_desc[] array.
    
    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 732e993..77f3ddd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8512,7 +8512,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stats_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8552,7 +8552,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -8573,7 +8573,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stats_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -8599,7 +8599,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.legacy_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_legacy_files,
 	.early_init	= 1,
 };
 
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index dd7cbb5..42b2dd5 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -177,36 +177,33 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
 	return 0;
 }
 
-static const char * const cpuacct_stat_desc[] = {
-	[CPUACCT_STAT_USER] = "user",
-	[CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct seq_file *sf, void *v)
+static void cpuacct_stats_read(struct cpuacct *ca, u64 *userp, u64 *sysp)
 {
-	struct cpuacct *ca = css_ca(seq_css(sf));
 	int cpu;
-	s64 val = 0;
 
+	*userp = 0;
 	for_each_online_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_USER];
-		val += kcpustat->cpustat[CPUTIME_NICE];
+		*userp += kcpustat->cpustat[CPUTIME_USER];
+		*userp += kcpustat->cpustat[CPUTIME_NICE];
 	}
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
 
-	val = 0;
+	*sysp = 0;
 	for_each_online_cpu(cpu) {
 		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
-		val += kcpustat->cpustat[CPUTIME_SYSTEM];
-		val += kcpustat->cpustat[CPUTIME_IRQ];
-		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SYSTEM];
+		*sysp += kcpustat->cpustat[CPUTIME_IRQ];
+		*sysp += kcpustat->cpustat[CPUTIME_SOFTIRQ];
 	}
+}
 
-	val = cputime64_to_clock_t(val);
-	seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+static int cpuacct_stats_show(struct seq_file *sf, void *v)
+{
+	cputime64_t user, sys;
 
+	cpuacct_stats_read(css_ca(seq_css(sf)), &user, &sys);
+	seq_printf(sf, "user %lld\n", cputime64_to_clock_t(user));
+	seq_printf(sf, "system %lld\n", cputime64_to_clock_t(sys));
 	return 0;
 }
 

commit 1bb33e8a69f089f2d3f58a0e681d4ff352e11c97
Author: Tejun Heo <tj@kernel.org>
Date:   Fri Mar 11 07:31:23 2016 -0500

    sched: Implement interface for cgroup unified hierarchy
    
    While the cpu controller doesn't have any functional problems, there
    are a couple interface issues which can be addressed in the v2
    interface.
    
    * cpuacct being a separate controller.  This separation is artificial
      and rather pointless as demonstrated by most use cases co-mounting
      the two controllers.  It also forces certain information to be
      accounted twice.
    
    * Use of different time units.  Writable control knobs use
      microseconds, some stat fields use nanoseconds while other cpuacct
      stat fields use centiseconds.
    
    * Control knobs which can't be used in the root cgroup still show up
      in the root.
    
    * Control knob names and semantics aren't consistent with other
      controllers.
    
    This patchset implements cpu controller's interface on the unified
    hierarchy which adheres to the controller file conventions described
    in Documentation/cgroups/unified-hierarchy.txt.  Overall, the
    following changes are made.
    
    * cpuacct is implictly enabled and disabled by cpu and its information
      is reported through "cpu.stat" which now uses microseconds for all
      time durations.  All time duration fields now have "_usec" appended
      to them for clarity.  While this doesn't solve the double accounting
      immediately, once majority of users switch to v2, cpu can directly
      account and report the relevant stats and cpuacct can be disabled on
      the unified hierarchy.
    
      Note that cpuacct.usage_percpu is currently not included in
      "cpu.stat".  If this information is actually called for, it can be
      added later.
    
    * "cpu.shares" is replaced with "cpu.weight" and operates on the
      standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
      The weight is scaled to scheduler weight so that 100 maps to 1024
      and the ratio relationship is preserved - if weight is W and its
      scaled value is S, W / 100 == S / 1024.  While the mapped range is a
      bit smaller than the orignal scheduler weight range, the dead zones
      on both sides are relatively small and covers wider range than the
      nice value mappings.  This file doesn't make sense in the root
      cgroup and isn't create on root.
    
    * "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
      which contains both quota and period.
    
    * "cpu.rt_runtime_us" and "cpu.rt_period_us" are replaced by
      "cpu.rt.max" which contains both runtime and period.
    
    v2: cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED for
        CFS bandwidth stats and also using raw division for u64.  Use
        CONFIG_CFS_BANDWITH and do_div() instead.
    
        The semantics of "cpu.rt.max" is not fully decided yet.  Dropped
        for now.
    
    Signed-off-by: Tejun Heo <tj@kernel.org>
    Cc: Ingo Molnar <mingo@redhat.com>
    Cc: Peter Zijlstra <peterz@infradead.org>
    Cc: Li Zefan <lizefan@huawei.com>
    Cc: Johannes Weiner <hannes@cmpxchg.org>

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 77f3ddd..7aafe63 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8591,6 +8591,139 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* terminate */
 };
 
+static int cpu_stats_show(struct seq_file *sf, void *v)
+{
+	cpuacct_cpu_stats_show(sf);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cftype, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively.  While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stats_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_free	= cpu_cgroup_css_free,
@@ -8600,7 +8733,15 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.legacy_cftypes	= cpu_legacy_files,
+	.dfl_cftypes	= cpu_files,
 	.early_init	= 1,
+#ifdef CONFIG_CGROUP_CPUACCT
+	/*
+	 * cpuacct is enabled together with cpu on the unified hierarchy
+	 * and its stats are reported through "cpu.stat".
+	 */
+	.depends_on	= 1 << cpuacct_cgrp_id,
+#endif
 };
 
 #endif	/* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 42b2dd5..b4d32a6 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -224,6 +224,30 @@ static struct cftype files[] = {
 	{ }	/* terminate */
 };
 
+/* used to print cpuacct stats in cpu.stat on the unified hierarchy */
+void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+	struct cgroup_subsys_state *css;
+	u64 usage, user, sys;
+
+	css = cgroup_get_e_css(seq_css(sf)->cgroup, &cpuacct_cgrp_subsys);
+
+	usage = cpuusage_read(css, seq_cft(sf));
+	cpuacct_stats_read(css_ca(css), &user, &sys);
+
+	user *= TICK_NSEC;
+	sys *= TICK_NSEC;
+	do_div(usage, NSEC_PER_USEC);
+	do_div(user, NSEC_PER_USEC);
+	do_div(sys, NSEC_PER_USEC);
+
+	seq_printf(sf, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n", usage, user, sys);
+
+	css_put(css);
+}
+
 /*
  * charge this task's execution time to its accounting group.
  *
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
index ed60562..44eace9 100644
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -2,6 +2,7 @@
 
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+extern void cpuacct_cpu_stats_show(struct seq_file *sf);
 
 #else
 
@@ -14,4 +15,8 @@ cpuacct_account_field(struct task_struct *p, int index, u64 val)
 {
 }
 
+static inline void cpuacct_cpu_stats_show(struct seq_file *sf)
+{
+}
+
 #endif