diff options
Diffstat (limited to 'src/core/execute.c')
-rw-r--r-- | src/core/execute.c | 498 |
1 files changed, 378 insertions, 120 deletions
diff --git a/src/core/execute.c b/src/core/execute.c index 8ac69d1a0f..595a3c6eca 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -50,12 +50,12 @@ #include "chown-recursive.h" #include "cpu-set-util.h" #include "def.h" +#include "env-file.h" #include "env-util.h" #include "errno-list.h" #include "execute.h" #include "exit-status.h" #include "fd-util.h" -#include "fileio.h" #include "format-util.h" #include "fs-util.h" #include "glob-util.h" @@ -76,7 +76,6 @@ #if HAVE_SECCOMP #include "seccomp-util.h" #endif -#include "securebits.h" #include "securebits-util.h" #include "selinux-util.h" #include "signal-util.h" @@ -89,6 +88,7 @@ #include "strv.h" #include "syslog-util.h" #include "terminal-util.h" +#include "umask-util.h" #include "unit.h" #include "user-util.h" #include "util.h" @@ -147,11 +147,11 @@ static int shift_fds(int fds[], size_t n_fds) { return 0; } -static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) { +static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) { size_t i, n_fds; int r; - n_fds = n_storage_fds + n_socket_fds; + n_fds = n_socket_fds + n_storage_fds; if (n_fds <= 0) return 0; @@ -323,7 +323,8 @@ static int connect_logger_as( uid_t uid, gid_t gid) { - int fd, r; + _cleanup_close_ int fd = -1; + int r; assert(context); assert(params); @@ -339,14 +340,12 @@ static int connect_logger_as( if (r < 0) return r; - if (shutdown(fd, SHUT_RD) < 0) { - safe_close(fd); + if (shutdown(fd, SHUT_RD) < 0) return -errno; - } (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); - dprintf(fd, + if (dprintf(fd, "%s\n" "%s\n" "%i\n" @@ -360,10 +359,12 @@ static int connect_logger_as( !!context->syslog_level_prefix, is_syslog_output(output), is_kmsg_output(output), - is_terminal_output(output)); + is_terminal_output(output)) < 0) + return -errno; - return move_fd(fd, nfd, false); + return move_fd(TAKE_FD(fd), nfd, false); } + static int open_terminal_as(const char *path, int flags, int nfd) { int fd; @@ -378,10 +379,9 @@ static int open_terminal_as(const char *path, int flags, int nfd) { } static int acquire_path(const char *path, int flags, mode_t mode) { - union sockaddr_union sa = { - .sa.sa_family = AF_UNIX, - }; - int fd, r; + union sockaddr_union sa = {}; + _cleanup_close_ int fd = -1; + int r, salen; assert(path); @@ -390,11 +390,11 @@ static int acquire_path(const char *path, int flags, mode_t mode) { fd = open(path, flags|O_NOCTTY, mode); if (fd >= 0) - return fd; + return TAKE_FD(fd); if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */ return -errno; - if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */ + if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */ return -ENXIO; /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */ @@ -403,25 +403,24 @@ static int acquire_path(const char *path, int flags, mode_t mode) { if (fd < 0) return -errno; - strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path)); - if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) { - safe_close(fd); + salen = sockaddr_un_set_path(&sa.un, path); + if (salen < 0) + return salen; + + if (connect(fd, &sa.sa, salen) < 0) return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have * indication that his wasn't an AF_UNIX socket after all */ - } if ((flags & O_ACCMODE) == O_RDONLY) r = shutdown(fd, SHUT_WR); else if ((flags & O_ACCMODE) == O_WRONLY) r = shutdown(fd, SHUT_RD); else - return fd; - if (r < 0) { - safe_close(fd); + return TAKE_FD(fd); + if (r < 0) return -errno; - } - return fd; + return TAKE_FD(fd); } static int fixup_input( @@ -544,6 +543,30 @@ static int setup_input( } } +static bool can_inherit_stderr_from_stdout( + const ExecContext *context, + ExecOutput o, + ExecOutput e) { + + assert(context); + + /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the + * stderr fd */ + + if (e == EXEC_OUTPUT_INHERIT) + return true; + if (e != o) + return false; + + if (e == EXEC_OUTPUT_NAMED_FD) + return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]); + + if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND)) + return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]); + + return true; +} + static int setup_output( const Unit *unit, const ExecContext *context, @@ -602,7 +625,7 @@ static int setup_output( return fileno; /* Duplicate from stdout if possible */ - if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT) + if (can_inherit_stderr_from_stdout(context, o, e)) return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno; o = e; @@ -675,9 +698,10 @@ static int setup_output( (void) fd_nonblock(named_iofds[fileno], false); return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno; - case EXEC_OUTPUT_FILE: { + case EXEC_OUTPUT_FILE: + case EXEC_OUTPUT_FILE_APPEND: { bool rw; - int fd; + int fd, flags; assert(context->stdio_file[fileno]); @@ -687,11 +711,15 @@ static int setup_output( if (rw) return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno; - fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask); + flags = O_WRONLY; + if (o == EXEC_OUTPUT_FILE_APPEND) + flags |= O_APPEND; + + fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask); if (fd < 0) return fd; - return move_fd(fd, fileno, false); + return move_fd(fd, fileno, 0); } default: @@ -914,7 +942,7 @@ static int get_fixed_user(const ExecContext *c, const char **user, * (i.e. are "/" or "/bin/nologin"). */ name = c->user; - r = get_user_creds_clean(&name, uid, gid, home, shell); + r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN); if (r < 0) return r; @@ -932,7 +960,7 @@ static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) return 0; name = c->group; - r = get_group_creds(&name, gid); + r = get_group_creds(&name, gid, 0); if (r < 0) return r; @@ -1004,7 +1032,7 @@ static int get_supplementary_groups(const ExecContext *c, const char *user, return -E2BIG; g = *i; - r = get_group_creds(&g, l_gids+k); + r = get_group_creds(&g, l_gids+k, 0); if (r < 0) return r; @@ -1151,6 +1179,16 @@ static int setup_pam( goto fail; } + if (!tty) { + _cleanup_free_ char *q = NULL; + + /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure + * out if that's the case, and read the TTY off it. */ + + if (getttyname_malloc(STDIN_FILENO, &q) >= 0) + tty = strjoina("/dev/", q); + } + if (tty) { pam_code = pam_set_item(handle, PAM_TTY, tty); if (pam_code != PAM_SUCCESS) @@ -1415,7 +1453,7 @@ static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ return r; } - return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action); + return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false); } static int apply_syscall_archs(const Unit *u, const ExecContext *c) { @@ -1498,7 +1536,7 @@ static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "ProtectKernelModules=")) return 0; - return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM)); + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false); } static int apply_private_devices(const Unit *u, const ExecContext *c) { @@ -1513,7 +1551,7 @@ static int apply_private_devices(const Unit *u, const ExecContext *c) { if (skip_seccomp_unavailable(u, "PrivateDevices=")) return 0; - return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM)); + return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false); } static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) { @@ -1585,6 +1623,8 @@ static void do_idle_pipe_dance(int idle_pipe[4]) { idle_pipe[3] = safe_close(idle_pipe[3]); } +static const char *exec_directory_env_name_to_string(ExecDirectoryType t); + static int build_environment( const Unit *u, const ExecContext *c, @@ -1598,14 +1638,16 @@ static int build_environment( char ***ret) { _cleanup_strv_free_ char **our_env = NULL; + ExecDirectoryType t; size_t n_env = 0; char *x; assert(u); assert(c); + assert(p); assert(ret); - our_env = new0(char*, 14); + our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX); if (!our_env) return -ENOMEM; @@ -1710,8 +1752,37 @@ static int build_environment( our_env[n_env++] = x; } + for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) { + _cleanup_free_ char *pre = NULL, *joined = NULL; + const char *n; + + if (!p->prefix[t]) + continue; + + if (strv_isempty(c->directories[t].paths)) + continue; + + n = exec_directory_env_name_to_string(t); + if (!n) + continue; + + pre = strjoin(p->prefix[t], "/"); + if (!pre) + return -ENOMEM; + + joined = strv_join_prefix(c->directories[t].paths, ":", pre); + if (!joined) + return -ENOMEM; + + x = strjoin(n, "=", joined); + if (!x) + return -ENOMEM; + + our_env[n_env++] = x; + } + our_env[n_env++] = NULL; - assert(n_env <= 12); + assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX); *ret = TAKE_PTR(our_env); @@ -2010,7 +2081,7 @@ static int setup_exec_directory( if (context->dynamic_user && !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) { - _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL; + _cleanup_free_ char *private_root = NULL; /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we * want to avoid leaving a directory around fully accessible that is owned by a dynamic user @@ -2075,18 +2146,8 @@ static int setup_exec_directory( goto fail; } - parent = dirname_malloc(p); - if (!parent) { - r = -ENOMEM; - goto fail; - } - - r = path_make_relative(parent, pp, &relative); - if (r < 0) - goto fail; - /* And link it up from the original place */ - r = symlink_idempotent(relative, p); + r = symlink_idempotent(pp, p, true); if (r < 0) goto fail; @@ -2379,11 +2440,24 @@ static int apply_mount_namespace( bind_mount_free_many(bind_mounts, n_bind_mounts); - /* If we couldn't set up the namespace this is probably due to a - * missing capability. In this case, silently proceeed. */ - if (IN_SET(r, -EPERM, -EACCES)) { - log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m"); - return 0; + /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports + * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively + * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a + * completely different execution environment. */ + if (r == -ENOANO) { + if (n_bind_mounts == 0 && + context->n_temporary_filesystems == 0 && + !root_dir && !root_image && + !context->dynamic_user) { + log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring."); + return 0; + } + + log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n" + "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s", + n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user)); + + return -EOPNOTSUPP; } return r; @@ -2456,9 +2530,6 @@ static int setup_keyring( * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */ - if (!(p->flags & EXEC_NEW_KEYRING)) - return 0; - if (context->keyring_mode == EXEC_KEYRING_INHERIT) return 0; @@ -2566,6 +2637,7 @@ static int close_remaining_fds( const DynamicCreds *dcreds, int user_lookup_fd, int socket_fd, + int exec_fd, int *fds, size_t n_fds) { size_t n_dont_close = 0; @@ -2582,6 +2654,8 @@ static int close_remaining_fds( if (socket_fd >= 0) dont_close[n_dont_close++] = socket_fd; + if (exec_fd >= 0) + dont_close[n_dont_close++] = exec_fd; if (n_fds > 0) { memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds); n_dont_close += n_fds; @@ -2707,6 +2781,37 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p static char *exec_command_line(char **argv); +static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) { + bool using_subcgroup; + char *p; + + assert(params); + assert(ret); + + if (!params->cgroup_path) + return -EINVAL; + + /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated + * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control + * processes started after the main unit's process in the unit's main cgroup because it is now an inner one, + * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process, + * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=, + * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre= + * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP + * flag, which is only passed for the former statements, not for the latter. */ + + using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL); + if (using_subcgroup) + p = strjoin(params->cgroup_path, "/.control"); + else + p = strdup(params->cgroup_path); + if (!p) + return -ENOMEM; + + *ret = p; + return using_subcgroup; +} + static int exec_child( Unit *unit, const ExecCommand *command, @@ -2714,20 +2819,20 @@ static int exec_child( const ExecParameters *params, ExecRuntime *runtime, DynamicCreds *dcreds, - char **argv, int socket_fd, int named_iofds[3], int *fds, - size_t n_storage_fds, size_t n_socket_fds, + size_t n_storage_fds, char **files_env, int user_lookup_fd, int *exit_status) { _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL; - _cleanup_free_ char *home_buffer = NULL; + int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1; _cleanup_free_ gid_t *supplementary_gids = NULL; const char *username = NULL, *groupname = NULL; + _cleanup_free_ char *home_buffer = NULL; const char *home = NULL, *shell = NULL; dev_t journal_stream_dev = 0; ino_t journal_stream_ino = 0; @@ -2747,7 +2852,6 @@ static int exec_child( #endif uid_t uid = UID_INVALID; gid_t gid = GID_INVALID; - int r, ngids = 0; size_t n_fds; ExecDirectoryType dt; int secure_bits; @@ -2791,8 +2895,8 @@ static int exec_child( /* In case anything used libc syslog(), close this here, too */ closelog(); - n_fds = n_storage_fds + n_socket_fds; - r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds); + n_fds = n_socket_fds + n_storage_fds; + r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m"); @@ -2810,7 +2914,7 @@ static int exec_child( const char *vc = params->confirm_spawn; _cleanup_free_ char *cmdline = NULL; - cmdline = exec_command_line(argv); + cmdline = exec_command_line(command->argv); if (!cmdline) { *exit_status = EXIT_MEMORY; return log_oom(); @@ -2828,10 +2932,22 @@ static int exec_child( } } + /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is + * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note + * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS + * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they + * might internally call into other NSS modules that are involved in hostname resolution, we never know. */ + if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 || + setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) { + *exit_status = EXIT_MEMORY; + return log_unit_error_errno(unit, errno, "Failed to update environment: %m"); + } + if (context->dynamic_user && dcreds) { _cleanup_strv_free_ char **suggested_paths = NULL; - /* Make sure we bypass our own NSS module for any NSS checks */ + /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS + * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/ if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) { *exit_status = EXIT_USER; return log_unit_error_errno(unit, errno, "Failed to update environment: %m"); @@ -2909,6 +3025,24 @@ static int exec_child( if (socket_fd >= 0) (void) fd_nonblock(socket_fd, false); + /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields. + * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */ + if (params->cgroup_path) { + _cleanup_free_ char *p = NULL; + + r = exec_parameters_get_cgroup_path(params, &p); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m"); + } + + r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL); + if (r < 0) { + *exit_status = EXIT_CGROUP; + return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p); + } + } + r = setup_input(context, params, socket_fd, named_iofds); if (r < 0) { *exit_status = EXIT_STDIN; @@ -2927,14 +3061,6 @@ static int exec_child( return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m"); } - if (params->cgroup_path) { - r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL); - if (r < 0) { - *exit_status = EXIT_CGROUP; - return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path); - } - } - if (context->oom_score_adjust_set) { /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces * prohibit write access to this file, and we shouldn't trip up over that. */ @@ -3130,11 +3256,6 @@ static int exec_child( } } - /* Apply just after mount namespace setup */ - r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status); - if (r < 0) - return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m"); - /* Drop groups as early as possbile */ if (needs_setuid) { r = enforce_groups(gid, supplementary_gids, ngids); @@ -3165,18 +3286,59 @@ static int exec_child( } /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are - * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd - * was needed to upload the policy and can now be closed as well. */ - r = close_all_fds(fds, n_fds); + * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd + * however if we have it as we want to keep it open until the final execve(). */ + + if (params->exec_fd >= 0) { + exec_fd = params->exec_fd; + + if (exec_fd < 3 + (int) n_fds) { + int moved_fd; + + /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the + * process we are about to execute. */ + + moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds); + if (moved_fd < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m"); + } + + safe_close(exec_fd); + exec_fd = moved_fd; + } else { + /* This fd should be FD_CLOEXEC already, but let's make sure. */ + r = fd_cloexec(exec_fd, true); + if (r < 0) { + *exit_status = EXIT_FDS; + return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m"); + } + } + + fds_with_exec_fd = newa(int, n_fds + 1); + memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int)); + fds_with_exec_fd[n_fds] = exec_fd; + n_fds_with_exec_fd = n_fds + 1; + } else { + fds_with_exec_fd = fds; + n_fds_with_exec_fd = n_fds; + } + + r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd); if (r >= 0) r = shift_fds(fds, n_fds); if (r >= 0) - r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking); + r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking); if (r < 0) { *exit_status = EXIT_FDS; return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m"); } + /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off + * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined, + * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we + * came this far. */ + secure_bits = context->secure_bits; if (needs_sandboxing) { @@ -3268,6 +3430,12 @@ static int exec_child( } } + /* Apply working directory here, because the working directory might be on NFS and only the user running + * this service might have the correct privilege to change to the working directory */ + r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status); + if (r < 0) + return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m"); + if (needs_sandboxing) { /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires @@ -3389,7 +3557,7 @@ static int exec_child( strv_free_and_replace(accum_env, ee); } - final_argv = replace_env_argv(argv, accum_env); + final_argv = replace_env_argv(command->argv, accum_env); if (!final_argv) { *exit_status = EXIT_MEMORY; return log_oom(); @@ -3407,10 +3575,35 @@ static int exec_child( LOG_UNIT_INVOCATION_ID(unit)); } + if (exec_fd >= 0) { + uint8_t hot = 1; + + /* We have finished with all our initializations. Let's now let the manager know that. From this point + * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m"); + } + } + execve(command->path, final_argv, accum_env); + r = -errno; + + if (exec_fd >= 0) { + uint8_t hot = 0; - if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { - log_struct_errno(LOG_INFO, errno, + /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager + * that POLLHUP on it no longer means execve() succeeded. */ + + if (write(exec_fd, &hot, sizeof(hot)) < 0) { + *exit_status = EXIT_EXEC; + return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m"); + } + } + + if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) { + log_struct_errno(LOG_INFO, r, "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR, LOG_UNIT_ID(unit), LOG_UNIT_INVOCATION_ID(unit), @@ -3421,7 +3614,7 @@ static int exec_child( } *exit_status = EXIT_EXEC; - return log_unit_error_errno(unit, errno, "Failed to execute command: %m"); + return log_unit_error_errno(unit, r, "Failed to execute command: %m"); } static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); @@ -3435,13 +3628,11 @@ int exec_spawn(Unit *unit, DynamicCreds *dcreds, pid_t *ret) { + int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL; + _cleanup_free_ char *subcgroup_path = NULL; _cleanup_strv_free_ char **files_env = NULL; - int *fds = NULL; size_t n_storage_fds = 0, n_socket_fds = 0; _cleanup_free_ char *line = NULL; - int socket_fd, r; - int named_iofds[3] = { -1, -1, -1 }; - char **argv; pid_t pid; assert(unit); @@ -3449,7 +3640,7 @@ int exec_spawn(Unit *unit, assert(context); assert(ret); assert(params); - assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0)); + assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0)); if (context->std_input == EXEC_INPUT_SOCKET || context->std_output == EXEC_OUTPUT_SOCKET || @@ -3469,8 +3660,8 @@ int exec_spawn(Unit *unit, } else { socket_fd = -1; fds = params->fds; - n_storage_fds = params->n_storage_fds; n_socket_fds = params->n_socket_fds; + n_storage_fds = params->n_storage_fds; } r = exec_context_named_iofds(context, params, named_iofds); @@ -3481,8 +3672,7 @@ int exec_spawn(Unit *unit, if (r < 0) return log_unit_error_errno(unit, r, "Failed to load environment files: %m"); - argv = params->argv ?: command->argv; - line = exec_command_line(argv); + line = exec_command_line(command->argv); if (!line) return log_oom(); @@ -3492,6 +3682,17 @@ int exec_spawn(Unit *unit, LOG_UNIT_ID(unit), LOG_UNIT_INVOCATION_ID(unit)); + if (params->cgroup_path) { + r = exec_parameters_get_cgroup_path(params, &subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m"); + if (r > 0) { /* We are using a child cgroup */ + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); + } + } + pid = fork(); if (pid < 0) return log_unit_error_errno(unit, errno, "Failed to fork: %m"); @@ -3505,12 +3706,11 @@ int exec_spawn(Unit *unit, params, runtime, dcreds, - argv, socket_fd, named_iofds, fds, - n_storage_fds, n_socket_fds, + n_storage_fds, files_env, unit->manager->user_lookup_fds[1], &exit_status); @@ -3530,13 +3730,11 @@ int exec_spawn(Unit *unit, log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid); - /* We add the new process to the cgroup both in the child (so - * that we can be sure that no user code is ever executed - * outside of the cgroup) and in the parent (so that we can be - * sure that when we kill the cgroup the process will be - * killed too). */ - if (params->cgroup_path) - (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid); + /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever + * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the + * process will be killed too). */ + if (subcgroup_path) + (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid); exec_status_start(&command->exec_status, pid); @@ -3624,6 +3822,9 @@ void exec_context_done(ExecContext *c) { exec_context_free_log_extra_fields(c); + c->log_rate_limit_interval_usec = 0; + c->log_rate_limit_burst = 0; + c->stdin_data = mfree(c->stdin_data); c->stdin_data_size = 0; } @@ -3655,7 +3856,6 @@ static void exec_command_done(ExecCommand *c) { assert(c); c->path = mfree(c->path); - c->argv = strv_free(c->argv); } @@ -3685,6 +3885,24 @@ void exec_command_free_array(ExecCommand **c, size_t n) { c[i] = exec_command_free_list(c[i]); } +void exec_command_reset_status_array(ExecCommand *c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) + exec_status_reset(&c[i].exec_status); +} + +void exec_command_reset_status_list_array(ExecCommand **c, size_t n) { + size_t i; + + for (i = 0; i < n; i++) { + ExecCommand *z; + + LIST_FOREACH(command, z, c[i]) + exec_status_reset(&z->exec_status); + } +} + typedef struct InvalidEnvInfo { const Unit *unit; const char *path; @@ -3813,7 +4031,7 @@ static int exec_context_load_environment(const Unit *unit, const ExecContext *c, assert(pglob.gl_pathc > 0); for (n = 0; n < pglob.gl_pathc; n++) { - k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p); + k = load_env_file(NULL, pglob.gl_pathv[n], &p); if (k < 0) { if (ignore) continue; @@ -3976,9 +4194,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { for (i = 0; i < RLIM_NLIMITS; i++) if (c->rlimit[i]) { - fprintf(f, "Limit%s%s: " RLIM_FMT "\n", + fprintf(f, "%sLimit%s: " RLIM_FMT "\n", prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max); - fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n", + fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n", prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur); } @@ -4036,8 +4254,12 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]); if (c->std_output == EXEC_OUTPUT_FILE) fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); + if (c->std_output == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]); if (c->std_error == EXEC_OUTPUT_FILE) fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]); + if (c->std_error == EXEC_OUTPUT_FILE_APPEND) + fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]); if (c->tty_path) fprintf(f, @@ -4084,6 +4306,17 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) { fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t)); } + if (c->log_rate_limit_interval_usec > 0) { + char buf_timespan[FORMAT_TIMESPAN_MAX]; + + fprintf(f, + "%sLogRateLimitIntervalSec: %s\n", + prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC)); + } + + if (c->log_rate_limit_burst > 0) + fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst); + if (c->n_log_extra_fields > 0) { size_t j; @@ -4331,18 +4564,22 @@ void exec_context_free_log_extra_fields(ExecContext *c) { void exec_status_start(ExecStatus *s, pid_t pid) { assert(s); - zero(*s); - s->pid = pid; + *s = (ExecStatus) { + .pid = pid, + }; + dual_timestamp_get(&s->start_timestamp); } void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) { assert(s); - if (s->pid && s->pid != pid) - zero(*s); + if (s->pid != pid) { + *s = (ExecStatus) { + .pid = pid, + }; + } - s->pid = pid; dual_timestamp_get(&s->exit_timestamp); s->code = code; @@ -4350,12 +4587,18 @@ void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int if (context) { if (context->utmp_id) - utmp_put_dead_process(context->utmp_id, pid, code, status); + (void) utmp_put_dead_process(context->utmp_id, pid, code, status); exec_context_tty_reset(context, NULL); } } +void exec_status_reset(ExecStatus *s) { + assert(s); + + *s = (ExecStatus) {}; +} + void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) { char buf[FORMAT_TIMESTAMP_MAX]; @@ -4487,8 +4730,7 @@ int exec_command_set(ExecCommand *c, const char *path, ...) { return -ENOMEM; } - free(c->path); - c->path = p; + free_and_replace(c->path, p); return strv_free_and_replace(c->argv, l); } @@ -4920,10 +5162,8 @@ void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) { finalize: r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL); - if (r < 0) { + if (r < 0) log_debug_errno(r, "Failed to add exec-runtime: %m"); - return; - } } void exec_runtime_vacuum(Manager *m) { @@ -4942,6 +5182,13 @@ void exec_runtime_vacuum(Manager *m) { } } +void exec_params_clear(ExecParameters *p) { + if (!p) + return; + + strv_free(p->environment); +} + static const char* const exec_input_table[_EXEC_INPUT_MAX] = { [EXEC_INPUT_NULL] = "null", [EXEC_INPUT_TTY] = "tty", @@ -4968,6 +5215,7 @@ static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = { [EXEC_OUTPUT_SOCKET] = "socket", [EXEC_OUTPUT_NAMED_FD] = "fd", [EXEC_OUTPUT_FILE] = "file", + [EXEC_OUTPUT_FILE_APPEND] = "append", }; DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput); @@ -4998,6 +5246,16 @@ static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = { DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType); +static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = { + [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY", + [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY", + [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY", + [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY", + [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY", +}; + +DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType); + static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = { [EXEC_KEYRING_INHERIT] = "inherit", [EXEC_KEYRING_PRIVATE] = "private", |