diff options
Diffstat (limited to 'src/basic/cgroup-util.c')
-rw-r--r-- | src/basic/cgroup-util.c | 300 |
1 file changed, 230 insertions(+), 70 deletions(-)
diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index 038ece4b06..830a63c185 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -12,6 +12,7 @@ #include <sys/stat.h> #include <sys/statfs.h> #include <sys/types.h> +#include <sys/utsname.h> #include <sys/xattr.h> #include <unistd.h> @@ -129,10 +130,12 @@ bool cg_ns_supported(void) { if (enabled >= 0) return enabled; - if (access("/proc/self/ns/cgroup", F_OK) == 0) - enabled = 1; - else - enabled = 0; + if (access("/proc/self/ns/cgroup", F_OK) < 0) { + if (errno != ENOENT) + log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m"); + enabled = false; + } else + enabled = true; return enabled; } @@ -197,10 +200,8 @@ int cg_rmdir(const char *controller, const char *path) { return -errno; r = cg_hybrid_unified(); - if (r < 0) + if (r <= 0) return r; - if (r == 0) - return 0; if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) { r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path); @@ -817,7 +818,7 @@ int cg_attach(const char *controller, const char *path, pid_t pid) { xsprintf(c, PID_FMT "\n", pid); - r = write_string_file(fs, c, 0); + r = write_string_file(fs, c, WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return r; @@ -985,10 +986,9 @@ int cg_get_xattr(const char *controller, const char *path, const char *name, voi int cg_pid_get_path(const char *controller, pid_t pid, char **path) { _cleanup_fclose_ FILE *f = NULL; - char line[LINE_MAX]; const char *fs, *controller_str; + int unified, r; size_t cs = 0; - int unified; assert(path); assert(pid >= 0); @@ -1018,10 +1018,15 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path) { (void) __fsetlocking(f, FSETLOCKING_BYCALLER); - FOREACH_LINE(line, f, return -errno) { + for (;;) { + _cleanup_free_ char *line = NULL; char *e, *p; - truncate_nl(line); + r = read_line(f, LONG_LINE_MAX, &line); + if (r < 0) + return r; + if (r == 0) + break; if (unified) { e = startswith(line, "0:"); @@ 
-1095,7 +1100,7 @@ int cg_install_release_agent(const char *controller, const char *agent) { sc = strstrip(contents); if (isempty(sc)) { - r = write_string_file(fs, agent, 0); + r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return r; } else if (!path_equal(sc, agent)) @@ -1113,7 +1118,7 @@ int cg_install_release_agent(const char *controller, const char *agent) { sc = strstrip(contents); if (streq(sc, "0")) { - r = write_string_file(fs, "1", 0); + r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return r; @@ -1140,7 +1145,7 @@ int cg_uninstall_release_agent(const char *controller) { if (r < 0) return r; - r = write_string_file(fs, "0", 0); + r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return r; @@ -1150,7 +1155,7 @@ int cg_uninstall_release_agent(const char *controller) { if (r < 0) return r; - r = write_string_file(fs, "", 0); + r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) return r; @@ -1166,7 +1171,7 @@ int cg_is_empty(const char *controller, const char *path) { r = cg_enumerate_processes(controller, path, &f); if (r == -ENOENT) - return 1; + return true; if (r < 0) return r; @@ -1196,6 +1201,8 @@ int cg_is_empty_recursive(const char *controller, const char *path) { * via the "populated" attribute of "cgroup.events". */ r = cg_read_event(controller, path, "populated", &t); + if (r == -ENOENT) + return true; if (r < 0) return r; @@ -1210,7 +1217,7 @@ int cg_is_empty_recursive(const char *controller, const char *path) { r = cg_enumerate_subgroups(controller, path, &d); if (r == -ENOENT) - return 1; + return true; if (r < 0) return r; @@ -1845,9 +1852,7 @@ char *cg_escape(const char *p) { * needs free()! 
*/ if (IN_SET(p[0], 0, '_', '.') || - streq(p, "notify_on_release") || - streq(p, "release_agent") || - streq(p, "tasks") || + STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") || startswith(p, "cgroup.")) need_prefix = true; else { @@ -2007,7 +2012,7 @@ int cg_set_attribute(const char *controller, const char *path, const char *attri if (r < 0) return r; - return write_string_file(p, value, 0); + return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER); } int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) { @@ -2102,6 +2107,7 @@ done: int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path) { CGroupController c; + CGroupMask done; bool created; int r; @@ -2117,7 +2123,7 @@ int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path r = cg_create(SYSTEMD_CGROUP_CONTROLLER, path); if (r < 0) return r; - created = !!r; + created = r; /* If we are in the unified hierarchy, we are done now */ r = cg_all_unified(); @@ -2126,17 +2132,28 @@ int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path if (r > 0) return created; + supported &= CGROUP_MASK_V1; + mask = CGROUP_MASK_EXTEND_JOINED(mask); + done = 0; + /* Otherwise, do the same in the other hierarchies */ for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); const char *n; - n = cgroup_controller_to_string(c); + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) + continue; - if (mask & bit) + n = cgroup_controller_to_string(c); + if (FLAGS_SET(mask, bit)) (void) cg_create(n, path); - else if (supported & bit) + else (void) cg_trim(n, path, true); + + done |= CGROUP_MASK_EXTEND_JOINED(bit); } return created; @@ -2144,6 +2161,7 @@ int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t 
path_callback, void *userdata) { CGroupController c; + CGroupMask done; int r; r = cg_attach(SYSTEMD_CGROUP_CONTROLLER, path, pid); @@ -2156,20 +2174,26 @@ int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_m if (r > 0) return 0; + supported &= CGROUP_MASK_V1; + done = 0; + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); const char *p = NULL; - if (!(supported & bit)) + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) continue; if (path_callback) p = path_callback(bit, userdata); - if (!p) p = path; (void) cg_attach_fallback(cgroup_controller_to_string(c), p, pid); + done |= CGROUP_MASK_EXTEND_JOINED(bit); } return 0; @@ -2194,6 +2218,7 @@ int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t to_callback, void *userdata) { CGroupController c; + CGroupMask done; int r = 0, q; if (!path_equal(from, to)) { @@ -2208,27 +2233,34 @@ int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to if (q > 0) return r; + supported &= CGROUP_MASK_V1; + done = 0; + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); const char *p = NULL; - if (!(supported & bit)) + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) continue; if (to_callback) p = to_callback(bit, userdata); - if (!p) p = to; (void) cg_migrate_recursive_fallback(SYSTEMD_CGROUP_CONTROLLER, to, cgroup_controller_to_string(c), p, 0); + done |= CGROUP_MASK_EXTEND_JOINED(bit); } - return 0; + return r; } int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) { CGroupController c; + CGroupMask done; int r, q; r = cg_trim(SYSTEMD_CGROUP_CONTROLLER, path, delete_root); @@ -2241,16 +2273,23 @@ int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root) if (q > 0) 
return r; + supported &= CGROUP_MASK_V1; + done = 0; + for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); - if (!(supported & bit)) + if (!FLAGS_SET(supported, bit)) + continue; + + if (FLAGS_SET(done, bit)) continue; (void) cg_trim(cgroup_controller_to_string(c), path, delete_root); + done |= CGROUP_MASK_EXTEND_JOINED(bit); } - return 0; + return r; } int cg_mask_to_string(CGroupMask mask, char **ret) { @@ -2270,7 +2309,7 @@ int cg_mask_to_string(CGroupMask mask, char **ret) { const char *k; size_t l; - if (!(mask & CGROUP_CONTROLLER_TO_MASK(c))) + if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c))) continue; k = cgroup_controller_to_string(c); @@ -2295,8 +2334,10 @@ int cg_mask_to_string(CGroupMask mask, char **ret) { return 0; } -int cg_mask_from_string(const char *value, CGroupMask *mask) { - assert(mask); +int cg_mask_from_string(const char *value, CGroupMask *ret) { + CGroupMask m = 0; + + assert(ret); assert(value); for (;;) { @@ -2314,18 +2355,20 @@ int cg_mask_from_string(const char *value, CGroupMask *mask) { if (v < 0) continue; - *mask |= CGROUP_CONTROLLER_TO_MASK(v); + m |= CGROUP_CONTROLLER_TO_MASK(v); } + + *ret = m; return 0; } int cg_mask_supported(CGroupMask *ret) { - CGroupMask mask = 0; + CGroupMask mask; int r; - /* Determines the mask of supported cgroup controllers. Only - * includes controllers we can make sense of and that are - * actually accessible. */ + /* Determines the mask of supported cgroup controllers. Only includes controllers we can make sense of and that + * are actually accessible. Only covers real controllers, i.e. not the CGROUP_CONTROLLER_BPF_xyz + * pseudo-controllers. 
*/ r = cg_all_unified(); if (r < 0) @@ -2353,23 +2396,26 @@ int cg_mask_supported(CGroupMask *ret) { if (r < 0) return r; - /* Currently, we support the cpu, memory, io and pids - * controller in the unified hierarchy, mask + /* Currently, we support the cpu, memory, io and pids controller in the unified hierarchy, mask * everything else off. */ - mask &= CGROUP_MASK_CPU | CGROUP_MASK_MEMORY | CGROUP_MASK_IO | CGROUP_MASK_PIDS; + mask &= CGROUP_MASK_V2; } else { CGroupController c; - /* In the legacy hierarchy, we check whether which - * hierarchies are mounted. */ + /* In the legacy hierarchy, we check which hierarchies are mounted. */ + mask = 0; for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) { + CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); const char *n; + if (!FLAGS_SET(CGROUP_MASK_V1, bit)) + continue; + n = cgroup_controller_to_string(c); if (controller_is_accessible(n) >= 0) - mask |= CGROUP_CONTROLLER_TO_MASK(c); + mask |= bit; } } @@ -2384,10 +2430,9 @@ int cg_kernel_controllers(Set **ret) { assert(ret); - /* Determines the full list of kernel-known controllers. Might - * include controllers we don't actually support, arbitrary - * named hierarchies and controllers that aren't currently - * accessible (because not mounted). */ + /* Determines the full list of kernel-known controllers. Might include controllers we don't actually support + * and controllers that aren't currently accessible (because not mounted). This does not include "name=" + * pseudo-controllers. 
*/ controllers = set_new(&string_hash_ops); if (!controllers) @@ -2498,11 +2543,10 @@ static int cg_unified_update(void) { unified_cache = CGROUP_UNIFIED_NONE; } } - } else { - log_debug("Unknown filesystem type %llx mounted on /sys/fs/cgroup.", - (unsigned long long) fs.f_type); - return -ENOMEDIUM; - } + } else + return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM), + "Unknown filesystem type %llx mounted on /sys/fs/cgroup.", + (unsigned long long)fs.f_type); return 0; } @@ -2549,22 +2593,45 @@ int cg_unified_flush(void) { return cg_unified_update(); } -int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) { +int cg_enable_everywhere( + CGroupMask supported, + CGroupMask mask, + const char *p, + CGroupMask *ret_result_mask) { + _cleanup_fclose_ FILE *f = NULL; _cleanup_free_ char *fs = NULL; CGroupController c; + CGroupMask ret = 0; int r; assert(p); - if (supported == 0) + if (supported == 0) { + if (ret_result_mask) + *ret_result_mask = 0; return 0; + } r = cg_all_unified(); if (r < 0) return r; - if (r == 0) /* on the legacy hiearchy there's no joining of controllers defined */ + if (r == 0) { + /* On the legacy hiearchy there's no concept of "enabling" controllers in cgroups defined. Let's claim + * complete success right away. (If you wonder why we return the full mask here, rather than zero: the + * caller tends to use the returned mask later on to compare if all controllers where properly joined, + * and if not requeues realization. This use is the primary purpose of the return value, hence let's + * minimize surprises here and reduce triggers for re-realization by always saying we fully + * succeeded.) 
*/ + if (ret_result_mask) + *ret_result_mask = mask & supported & CGROUP_MASK_V2; /* If you wonder why we mask this with + * CGROUP_MASK_V2: The 'supported' mask + * might contain pure-V1 or BPF + * controllers, and we never want to + * claim that we could enable those with + * cgroup.subtree_control */ return 0; + } r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, p, "cgroup.subtree_control", &fs); if (r < 0) @@ -2574,32 +2641,63 @@ int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p) { CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c); const char *n; - if (!(supported & bit)) + if (!FLAGS_SET(CGROUP_MASK_V2, bit)) + continue; + + if (!FLAGS_SET(supported, bit)) continue; n = cgroup_controller_to_string(c); { char s[1 + strlen(n) + 1]; - s[0] = mask & bit ? '+' : '-'; + s[0] = FLAGS_SET(mask, bit) ? '+' : '-'; strcpy(s + 1, n); if (!f) { f = fopen(fs, "we"); - if (!f) { - log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); - break; - } + if (!f) + return log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", p); } - r = write_string_stream(f, s, 0); + r = write_string_stream(f, s, WRITE_STRING_FILE_DISABLE_BUFFER); if (r < 0) { - log_debug_errno(r, "Failed to enable controller %s for %s (%s): %m", n, p, fs); + log_debug_errno(r, "Failed to %s controller %s for %s (%s): %m", + FLAGS_SET(mask, bit) ? "enable" : "disable", n, p, fs); clearerr(f); + + /* If we can't turn off a controller, leave it on in the reported resulting mask. This + * happens for example when we attempt to turn off a controller up in the tree that is + * used down in the tree. */ + if (!FLAGS_SET(mask, bit) && r == -EBUSY) /* You might wonder why we check for EBUSY + * only here, and not follow the same logic + * for other errors such as EINVAL or + * EOPNOTSUPP or anything else. 
That's + * because EBUSY indicates that the + * controllers is currently enabled and + * cannot be disabled because something down + * the hierarchy is still using it. Any other + * error most likely means something like "I + * never heard of this controller" or + * similar. In the former case it's hence + * safe to assume the controller is still on + * after the failed operation, while in the + * latter case it's safer to assume the + * controller is unknown and hence certainly + * not enabled. */ + ret |= bit; + } else { + /* Otherwise, if we managed to turn on a controller, set the bit reflecting that. */ + if (FLAGS_SET(mask, bit)) + ret |= bit; } } } + /* Let's return the precise set of controllers now enabled for the cgroup. */ + if (ret_result_mask) + *ret_result_mask = ret; + return 0; } @@ -2608,6 +2706,7 @@ bool cg_is_unified_wanted(void) { int r; bool b; const bool is_default = DEFAULT_HIERARCHY == CGROUP_UNIFIED_ALL; + _cleanup_free_ char *c = NULL; /* If we have a cached value, return that. */ if (wanted >= 0) @@ -2618,11 +2717,19 @@ bool cg_is_unified_wanted(void) { if (cg_unified_flush() >= 0) return (wanted = unified_cache >= CGROUP_UNIFIED_ALL); - /* Otherwise, let's see what the kernel command line has to say. - * Since checking is expensive, cache a non-error result. */ + /* If we were explicitly passed systemd.unified_cgroup_hierarchy, + * respect that. */ r = proc_cmdline_get_bool("systemd.unified_cgroup_hierarchy", &b); + if (r > 0) + return (wanted = b); + + /* If we passed cgroup_no_v1=all with no other instructions, it seems + * highly unlikely that we want to use hybrid or legacy hierarchy. */ + r = proc_cmdline_get_key("cgroup_no_v1", 0, &c); + if (r > 0 && streq_ptr(c, "all")) + return (wanted = true); - return (wanted = r > 0 ? 
b : is_default); + return (wanted = is_default); } bool cg_is_legacy_wanted(void) { @@ -2768,6 +2875,59 @@ static const char *cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = { [CGROUP_CONTROLLER_MEMORY] = "memory", [CGROUP_CONTROLLER_DEVICES] = "devices", [CGROUP_CONTROLLER_PIDS] = "pids", + [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall", + [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices", }; DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController); + +CGroupMask get_cpu_accounting_mask(void) { + static CGroupMask needed_mask = (CGroupMask) -1; + + /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is + * provided externally from the CPU controller, which means we don't + * need to enable the CPU controller just to get metrics. This is good, + * because enabling the CPU controller comes at a minor performance + * hit, especially when it's propagated deep into large hierarchies. + * There's also no separate CPU accounting controller available within + * a unified hierarchy. + * + * This combination of factors results in the desired cgroup mask to + * enable for CPU accounting varying as follows: + * + * ╔═════════════════════╤═════════════════════╗ + * ║ Linux ≥4.15 │ Linux <4.15 ║ + * ╔═══════════════╬═════════════════════╪═════════════════════╣ + * ║ Unified ║ nothing │ CGROUP_MASK_CPU ║ + * ╟───────────────╫─────────────────────┼─────────────────────╢ + * ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║ + * ╚═══════════════╩═════════════════════╧═════════════════════╝ + * + * We check kernel version here instead of manually checking whether + * cpu.stat is present for every cgroup, as that check in itself would + * already be fairly expensive. + * + * Kernels where this patch has been backported will therefore have the + * CPU controller enabled unnecessarily. This is more expensive than + * necessary, but harmless. 
☺️ + */ + + if (needed_mask == (CGroupMask) -1) { + if (cg_all_unified()) { + struct utsname u; + assert_se(uname(&u) >= 0); + + if (str_verscmp(u.release, "4.15") < 0) + needed_mask = CGROUP_MASK_CPU; + else + needed_mask = 0; + } else + needed_mask = CGROUP_MASK_CPUACCT; + } + + return needed_mask; +} + +bool cpu_accounting_is_cheap(void) { + return get_cpu_accounting_mask() == 0; +} |