summaryrefslogtreecommitdiff
path: root/src/basic/cgroup-util.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/basic/cgroup-util.c')
-rw-r--r--src/basic/cgroup-util.c300
1 files changed, 230 insertions, 70 deletions
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c
index 038ece4b06..830a63c185 100644
--- a/src/basic/cgroup-util.c
+++ b/src/basic/cgroup-util.c
@@ -12,6 +12,7 @@
#include <sys/stat.h>
#include <sys/statfs.h>
#include <sys/types.h>
+#include <sys/utsname.h>
#include <sys/xattr.h>
#include <unistd.h>
@@ -129,10 +130,12 @@ bool cg_ns_supported(void) {
if (enabled >= 0)
return enabled;
- if (access("/proc/self/ns/cgroup", F_OK) == 0)
- enabled = 1;
- else
- enabled = 0;
+ if (access("/proc/self/ns/cgroup", F_OK) < 0) {
+ if (errno != ENOENT)
+ log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
+ enabled = false;
+ } else
+ enabled = true;
return enabled;
}
@@ -197,10 +200,8 @@ int cg_rmdir(const char *controller, const char *path) {
return -errno;
r = cg_hybrid_unified();
- if (r < 0)
+ if (r <= 0)
return r;
- if (r == 0)
- return 0;
if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
@@ -817,7 +818,7 @@ int cg_attach(const char *controller, const char *path, pid_t pid) {
xsprintf(c, PID_FMT "\n", pid);
- r = write_string_file(fs, c, 0);
+ r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
@@ -985,10 +986,9 @@ int cg_get_xattr(const char *controller, const char *path, const char *name, voi
int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
_cleanup_fclose_ FILE *f = NULL;
- char line[LINE_MAX];
const char *fs, *controller_str;
+ int unified, r;
size_t cs = 0;
- int unified;
assert(path);
assert(pid >= 0);
@@ -1018,10 +1018,15 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) {
(void) __fsetlocking(f, FSETLOCKING_BYCALLER);
- FOREACH_LINE(line, f, return -errno) {
+ for (;;) {
+ _cleanup_free_ char *line = NULL;
char *e, *p;
- truncate_nl(line);
+ r = read_line(f, LONG_LINE_MAX, &line);
+ if (r < 0)
+ return r;
+ if (r == 0)
+ break;
if (unified) {
e = startswith(line, "0:");
@@ -1095,7 +1100,7 @@ int cg_install_release_agent(const char *controller, const char *agent) {
sc = strstrip(contents);
if (isempty(sc)) {
- r = write_string_file(fs, agent, 0);
+ r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
} else if (!path_equal(sc, agent))
@@ -1113,7 +1118,7 @@ int cg_install_release_agent(const char *controller, const char *agent) {
sc = strstrip(contents);
if (streq(sc, "0")) {
- r = write_string_file(fs, "1", 0);
+ r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
@@ -1140,7 +1145,7 @@ int cg_uninstall_release_agent(const char *controller) {
if (r < 0)
return r;
- r = write_string_file(fs, "0", 0);
+ r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
@@ -1150,7 +1155,7 @@ int cg_uninstall_release_agent(const char *controller) {
if (r < 0)
return r;
- r = write_string_file(fs, "", 0);
+ r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0)
return r;
@@ -1166,7 +1171,7 @@ int cg_is_empty(const char *controller, const char *path) {
r = cg_enumerate_processes(controller, path, &f);
if (r == -ENOENT)
- return 1;
+ return true;
if (r < 0)
return r;
@@ -1196,6 +1201,8 @@ int cg_is_empty_recursive(const char *controller, const char *path) {
* via the "populated" attribute of "cgroup.events". */
r = cg_read_event(controller, path, "populated", &t);
+ if (r == -ENOENT)
+ return true;
if (r < 0)
return r;
@@ -1210,7 +1217,7 @@ int cg_is_empty_recursive(const char *controller, const char *path) {
r = cg_enumerate_subgroups(controller, path, &d);
if (r == -ENOENT)
- return 1;
+ return true;
if (r < 0)
return r;
@@ -1845,9 +1852,7 @@ char *cg_escape(const char *p) {
* needs free()! */
if (IN_SET(p[0], 0, '_', '.') ||
- streq(p, "notify_on_release") ||
- streq(p, "release_agent") ||
- streq(p, "tasks") ||
+ STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
startswith(p, "cgroup."))
need_prefix = true;
else {
@@ -2007,7 +2012,7 @@ int cg_set_attribute(const char *controller, const char *path, const char *attri
if (r < 0)
return r;
- return write_string_file(p, value, 0);
+ return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
}
int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
@@ -2102,6 +2107,7 @@ done:
int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) {
CGroupController c;
+ CGroupMask done;
bool created;
int r;
@@ -2117,7 +2123,7 @@ int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path
r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path);
if (r < 0)
return r;
- created = !!r;
+ created = r;
/* If we are in the unified hierarchy, we are done now */
r = cg_all_unified();
@@ -2126,17 +2132,28 @@ int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path
if (r > 0)
return created;
+ supported &= CGROUP_MASK_V1;
+ mask = CGROUP_MASK_EXTEND_JOINED(mask);
+ done = 0;
+
/* Otherwise, do the same in the other hierarchies */
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
const char *n;
- n = cgroup_controller_to_string(c);
+ if (!FLAGS_SET(supported, bit))
+ continue;
+
+ if (FLAGS_SET(done, bit))
+ continue;
- if (mask & bit)
+ n = cgroup_controller_to_string(c);
+ if (FLAGS_SET(mask, bit))
(void) cg_create(n, path);
- else if (supported & bit)
+ else
(void) cg_trim(n, path, true);
+
+ done |= CGROUP_MASK_EXTEND_JOINED(bit);
}
return created;
@@ -2144,6 +2161,7 @@ int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path
int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t path_callback, void *userdata) {
CGroupController c;
+ CGroupMask done;
int r;
r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid);
@@ -2156,20 +2174,26 @@ int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_m
if (r > 0)
return 0;
+ supported &= CGROUP_MASK_V1;
+ done = 0;
+
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
const char *p = NULL;
- if (!(supported & bit))
+ if (!FLAGS_SET(supported, bit))
+ continue;
+
+ if (FLAGS_SET(done, bit))
continue;
if (path_callback)
p = path_callback(bit, userdata);
-
if (!p)
p = path;
(void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid);
+ done |= CGROUP_MASK_EXTEND_JOINED(bit);
}
return 0;
@@ -2194,6 +2218,7 @@ int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids,
int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) {
CGroupController c;
+ CGroupMask done;
int r = 0, q;
if (!path_equal(from, to)) {
@@ -2208,27 +2233,34 @@ int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to
if (q > 0)
return r;
+ supported &= CGROUP_MASK_V1;
+ done = 0;
+
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
const char *p = NULL;
- if (!(supported & bit))
+ if (!FLAGS_SET(supported, bit))
+ continue;
+
+ if (FLAGS_SET(done, bit))
continue;
if (to_callback)
p = to_callback(bit, userdata);
-
if (!p)
p = to;
(void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0);
+ done |= CGROUP_MASK_EXTEND_JOINED(bit);
}
- return 0;
+ return r;
}
int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) {
CGroupController c;
+ CGroupMask done;
int r, q;
r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root);
@@ -2241,16 +2273,23 @@ int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root)
if (q > 0)
return r;
+ supported &= CGROUP_MASK_V1;
+ done = 0;
+
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
- if (!(supported & bit))
+ if (!FLAGS_SET(supported, bit))
+ continue;
+
+ if (FLAGS_SET(done, bit))
continue;
(void) cg_trim(cgroup_controller_to_string(c), path, delete_root);
+ done |= CGROUP_MASK_EXTEND_JOINED(bit);
}
- return 0;
+ return r;
}
int cg_mask_to_string(CGroupMask mask, char **ret) {
@@ -2270,7 +2309,7 @@ int cg_mask_to_string(CGroupMask mask, char **ret) {
const char *k;
size_t l;
- if (!(mask & CGROUP_CONTROLLER_TO_MASK(c)))
+ if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
continue;
k = cgroup_controller_to_string(c);
@@ -2295,8 +2334,10 @@ int cg_mask_to_string(CGroupMask mask, char **ret) {
return 0;
}
-int cg_mask_from_string(const char *value, CGroupMask *mask) {
- assert(mask);
+int cg_mask_from_string(const char *value, CGroupMask *ret) {
+ CGroupMask m = 0;
+
+ assert(ret);
assert(value);
for (;;) {
@@ -2314,18 +2355,20 @@ int cg_mask_from_string(const char *value, CGroupMask *mask) {
if (v < 0)
continue;
- *mask |= CGROUP_CONTROLLER_TO_MASK(v);
+ m |= CGROUP_CONTROLLER_TO_MASK(v);
}
+
+ *ret = m;
return 0;
}
int cg_mask_supported(CGroupMask *ret) {
- CGroupMask mask = 0;
+ CGroupMask mask;
int r;
- /* Determines the mask of supported cgroup controllers. Only
- * includes controllers we can make sense of and that are
- * actually accessible. */
+ /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that
+ * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz
+ * pseudo-controllers. */
r = cg_all_unified();
if (r < 0)
@@ -2353,23 +2396,26 @@ int cg_mask_supported(CGroupMask *ret) {
if (r < 0)
return r;
- /* Currently, we support the cpu, memory, io and pids
- * controller in the unified hierarchy, mask
+ /* Currently, we support the cpu, memory, io and pids controller in the unified hierarchy, mask
* everything else off. */
- mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS;
+ mask &= CGROUP_MASK_V2;
} else {
CGroupController c;
- /* In the legacy hierarchy, we check whether which
- * hierarchies are mounted. */
+ /* In the legacy hierarchy, we check which hierarchies are mounted. */
+ mask = 0;
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
+ CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
const char *n;
+ if (!FLAGS_SET(CGROUP_MASK_V1, bit))
+ continue;
+
n = cgroup_controller_to_string(c);
if (controller_is_accessible(n) >= 0)
- mask |= CGROUP_CONTROLLER_TO_MASK(c);
+ mask |= bit;
}
}
@@ -2384,10 +2430,9 @@ int cg_kernel_controllers(Set **ret) {
assert(ret);
- /* Determines the full list of kernel-known controllers. Might
- * include controllers we don't actually support, arbitrary
- * named hierarchies and controllers that aren't currently
- * accessible (because not mounted). */
+ /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support
+ * and controllers that aren't currently accessible (because not mounted). This does not include "name="
+ * pseudo-controllers. */
controllers = set_new(&string_hash_ops);
if (!controllers)
@@ -2498,11 +2543,10 @@ static int cg_unified_update(void) {
unified_cache = CGROUP_UNIFIED_NONE;
}
}
- } else {
- log_debug("Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
- (unsigned long long) fs.f_type);
- return -ENOMEDIUM;
- }
+ } else
+ return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
+ "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
+ (unsigned long long)fs.f_type);
return 0;
}
@@ -2549,22 +2593,45 @@ int cg_unified_flush(void) {
return cg_unified_update();
}
-int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
+int cg_enable_everywhere(
+ CGroupMask supported,
+ CGroupMask mask,
+ const char *p,
+ CGroupMask *ret_result_mask) {
+
_cleanup_fclose_ FILE *f = NULL;
_cleanup_free_ char *fs = NULL;
CGroupController c;
+ CGroupMask ret = 0;
int r;
assert(p);
- if (supported == 0)
+ if (supported == 0) {
+ if (ret_result_mask)
+ *ret_result_mask = 0;
return 0;
+ }
r = cg_all_unified();
if (r < 0)
return r;
- if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */
+ if (r == 0) {
+ /* On the legacy hierarchy there's no concept of "enabling" controllers in cgroups defined. Let's claim
+ * complete success right away. (If you wonder why we return the full mask here, rather than zero: the
+ * caller tends to use the returned mask later on to compare if all controllers were properly joined,
+ * and if not requeues realization. This use is the primary purpose of the return value, hence let's
+ * minimize surprises here and reduce triggers for re-realization by always saying we fully
+ * succeeded.) */
+ if (ret_result_mask)
+ *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with
+ * CGROUP_MASK_V2: The 'supported' mask
+ * might contain pure-V1 or BPF
+ * controllers, and we never want to
+ * claim that we could enable those with
+ * cgroup.subtree_control */
return 0;
+ }
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs);
if (r < 0)
@@ -2574,32 +2641,63 @@ int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) {
CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);
const char *n;
- if (!(supported & bit))
+ if (!FLAGS_SET(CGROUP_MASK_V2, bit))
+ continue;
+
+ if (!FLAGS_SET(supported, bit))
continue;
n = cgroup_controller_to_string(c);
{
char s[1 + strlen(n) + 1];
- s[0] = mask & bit ? '+' : '-';
+ s[0] = FLAGS_SET(mask, bit) ? '+' : '-';
strcpy(s + 1, n);
if (!f) {
f = fopen(fs, "we");
- if (!f) {
- log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
- break;
- }
+ if (!f)
+ return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p);
}
- r = write_string_stream(f, s, 0);
+ r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER);
if (r < 0) {
- log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs);
+ log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m",
+ FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs);
clearerr(f);
+
+ /* If we can't turn off a controller, leave it on in the reported resulting mask. This
+ * happens for example when we attempt to turn off a controller up in the tree that is
+ * used down in the tree. */
+ if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY
+ * only here, and not follow the same logic
+ * for other errors such as EINVAL or
+ * EOPNOTSUPP or anything else. That's
+ * because EBUSY indicates that the
+ * controller is currently enabled and
+ * cannot be disabled because something down
+ * the hierarchy is still using it. Any other
+ * error most likely means something like "I
+ * never heard of this controller" or
+ * similar. In the former case it's hence
+ * safe to assume the controller is still on
+ * after the failed operation, while in the
+ * latter case it's safer to assume the
+ * controller is unknown and hence certainly
+ * not enabled. */
+ ret |= bit;
+ } else {
+ /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */
+ if (FLAGS_SET(mask, bit))
+ ret |= bit;
}
}
}
+ /* Let's return the precise set of controllers now enabled for the cgroup. */
+ if (ret_result_mask)
+ *ret_result_mask = ret;
+
return 0;
}
@@ -2608,6 +2706,7 @@ bool cg_is_unified_wanted(void) {
int r;
bool b;
const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL;
+ _cleanup_free_ char *c = NULL;
/* If we have a cached value, return that. */
if (wanted >= 0)
@@ -2618,11 +2717,19 @@ bool cg_is_unified_wanted(void) {
if (cg_unified_flush() >= 0)
return (wanted = unified_cache >= CGROUP_UNIFIED_ALL);
- /* Otherwise, let's see what the kernel command line has to say.
- * Since checking is expensive, cache a non-error result. */
+ /* If we were explicitly passed systemd.unified_cgroup_hierarchy,
+ * respect that. */
r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b);
+ if (r > 0)
+ return (wanted = b);
+
+ /* If we passed cgroup_no_v1=all with no other instructions, it seems
+ * highly unlikely that we want to use hybrid or legacy hierarchy. */
+ r = proc_cmdline_get_key("cgroup_no_v1", 0, &c);
+ if (r > 0 && streq_ptr(c, "all"))
+ return (wanted = true);
- return (wanted = r > 0 ? b : is_default);
+ return (wanted = is_default);
}
bool cg_is_legacy_wanted(void) {
@@ -2768,6 +2875,59 @@ static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
[CGROUP_CONTROLLER_MEMORY] = "memory",
[CGROUP_CONTROLLER_DEVICES] = "devices",
[CGROUP_CONTROLLER_PIDS] = "pids",
+ [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
+ [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
};
DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
+
+/* Returns the mask of cgroup controllers that must be enabled to obtain CPU usage accounting on this
+ * system. The result is computed on first use and cached in a function-local static for all later
+ * calls. */
+CGroupMask get_cpu_accounting_mask(void) {
+        static CGroupMask needed_mask = (CGroupMask) -1; /* (CGroupMask) -1 means "not determined yet" */
+
+        /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
+         * provided externally from the CPU controller, which means we don't
+         * need to enable the CPU controller just to get metrics. This is good,
+         * because enabling the CPU controller comes at a minor performance
+         * hit, especially when it's propagated deep into large hierarchies.
+         * There's also no separate CPU accounting controller available within
+         * a unified hierarchy.
+         *
+         * This combination of factors results in the desired cgroup mask to
+         * enable for CPU accounting varying as follows:
+         *
+         *                   ╔═════════════════════╤═════════════════════╗
+         *                   ║     Linux ≥4.15     │     Linux <4.15     ║
+         *   ╔═══════════════╬═════════════════════╪═════════════════════╣
+         *   ║ Unified       ║ nothing             │ CGROUP_MASK_CPU     ║
+         *   ╟───────────────╫─────────────────────┼─────────────────────╢
+         *   ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
+         *   ╚═══════════════╩═════════════════════╧═════════════════════╝
+         *
+         * We check kernel version here instead of manually checking whether
+         * cpu.stat is present for every cgroup, as that check in itself would
+         * already be fairly expensive.
+         *
+         * Kernels where this patch has been backported will therefore have the
+         * CPU controller enabled unnecessarily. This is more expensive than
+         * necessary, but harmless. ☺️
+         */
+
+        if (needed_mask == (CGroupMask) -1) {
+                /* cg_all_unified() reports failure as a negative value, which must not be mistaken for
+                 * "unified": only a strictly positive result selects the unified column of the table
+                 * above. A failed detection is handled like the hybrid/legacy case. */
+                if (cg_all_unified() > 0) {
+                        struct utsname u;
+                        assert_se(uname(&u) >= 0);
+
+                        if (str_verscmp(u.release, "4.15") < 0)
+                                needed_mask = CGROUP_MASK_CPU;
+                        else
+                                needed_mask = 0;
+                } else
+                        needed_mask = CGROUP_MASK_CPUACCT;
+        }
+
+        return needed_mask;
+}
+
+/* Tells whether CPU accounting can be had without enabling any cgroup controller, i.e. whether
+ * get_cpu_accounting_mask() yields an empty mask. */
+bool cpu_accounting_is_cheap(void) {
+        CGroupMask needed = get_cpu_accounting_mask();
+
+        return needed == 0;
+}