summaryrefslogtreecommitdiff
path: root/src/core/execute.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/execute.c')
-rw-r--r--src/core/execute.c498
1 files changed, 378 insertions, 120 deletions
diff --git a/src/core/execute.c b/src/core/execute.c
index 8ac69d1a0f..595a3c6eca 100644
--- a/src/core/execute.c
+++ b/src/core/execute.c
@@ -50,12 +50,12 @@
#include "chown-recursive.h"
#include "cpu-set-util.h"
#include "def.h"
+#include "env-file.h"
#include "env-util.h"
#include "errno-list.h"
#include "execute.h"
#include "exit-status.h"
#include "fd-util.h"
-#include "fileio.h"
#include "format-util.h"
#include "fs-util.h"
#include "glob-util.h"
@@ -76,7 +76,6 @@
#if HAVE_SECCOMP
#include "seccomp-util.h"
#endif
-#include "securebits.h"
#include "securebits-util.h"
#include "selinux-util.h"
#include "signal-util.h"
@@ -89,6 +88,7 @@
#include "strv.h"
#include "syslog-util.h"
#include "terminal-util.h"
+#include "umask-util.h"
#include "unit.h"
#include "user-util.h"
#include "util.h"
@@ -147,11 +147,11 @@ static int shift_fds(int fds[], size_t n_fds) {
return 0;
}
-static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) {
+static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
size_t i, n_fds;
int r;
- n_fds = n_storage_fds + n_socket_fds;
+ n_fds = n_socket_fds + n_storage_fds;
if (n_fds <= 0)
return 0;
@@ -323,7 +323,8 @@ static int connect_logger_as(
uid_t uid,
gid_t gid) {
- int fd, r;
+ _cleanup_close_ int fd = -1;
+ int r;
assert(context);
assert(params);
@@ -339,14 +340,12 @@ static int connect_logger_as(
if (r < 0)
return r;
- if (shutdown(fd, SHUT_RD) < 0) {
- safe_close(fd);
+ if (shutdown(fd, SHUT_RD) < 0)
return -errno;
- }
(void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
- dprintf(fd,
+ if (dprintf(fd,
"%s\n"
"%s\n"
"%i\n"
@@ -360,10 +359,12 @@ static int connect_logger_as(
!!context->syslog_level_prefix,
is_syslog_output(output),
is_kmsg_output(output),
- is_terminal_output(output));
+ is_terminal_output(output)) < 0)
+ return -errno;
- return move_fd(fd, nfd, false);
+ return move_fd(TAKE_FD(fd), nfd, false);
}
+
static int open_terminal_as(const char *path, int flags, int nfd) {
int fd;
@@ -378,10 +379,9 @@ static int open_terminal_as(const char *path, int flags, int nfd) {
}
static int acquire_path(const char *path, int flags, mode_t mode) {
- union sockaddr_union sa = {
- .sa.sa_family = AF_UNIX,
- };
- int fd, r;
+ union sockaddr_union sa = {};
+ _cleanup_close_ int fd = -1;
+ int r, salen;
assert(path);
@@ -390,11 +390,11 @@ static int acquire_path(const char *path, int flags, mode_t mode) {
fd = open(path, flags|O_NOCTTY, mode);
if (fd >= 0)
- return fd;
+ return TAKE_FD(fd);
if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
return -errno;
- if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
+ if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
return -ENXIO;
/* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
@@ -403,25 +403,24 @@ static int acquire_path(const char *path, int flags, mode_t mode) {
if (fd < 0)
return -errno;
- strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
- if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
- safe_close(fd);
+ salen = sockaddr_un_set_path(&sa.un, path);
+ if (salen < 0)
+ return salen;
+
+ if (connect(fd, &sa.sa, salen) < 0)
return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
* indication that his wasn't an AF_UNIX socket after all */
- }
if ((flags & O_ACCMODE) == O_RDONLY)
r = shutdown(fd, SHUT_WR);
else if ((flags & O_ACCMODE) == O_WRONLY)
r = shutdown(fd, SHUT_RD);
else
- return fd;
- if (r < 0) {
- safe_close(fd);
+ return TAKE_FD(fd);
+ if (r < 0)
return -errno;
- }
- return fd;
+ return TAKE_FD(fd);
}
static int fixup_input(
@@ -544,6 +543,30 @@ static int setup_input(
}
}
+static bool can_inherit_stderr_from_stdout(
+ const ExecContext *context,
+ ExecOutput o,
+ ExecOutput e) {
+
+ assert(context);
+
+ /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
+ * stderr fd */
+
+ if (e == EXEC_OUTPUT_INHERIT)
+ return true;
+ if (e != o)
+ return false;
+
+ if (e == EXEC_OUTPUT_NAMED_FD)
+ return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
+
+ if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
+ return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
+
+ return true;
+}
+
static int setup_output(
const Unit *unit,
const ExecContext *context,
@@ -602,7 +625,7 @@ static int setup_output(
return fileno;
/* Duplicate from stdout if possible */
- if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
+ if (can_inherit_stderr_from_stdout(context, o, e))
return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
o = e;
@@ -675,9 +698,10 @@ static int setup_output(
(void) fd_nonblock(named_iofds[fileno], false);
return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
- case EXEC_OUTPUT_FILE: {
+ case EXEC_OUTPUT_FILE:
+ case EXEC_OUTPUT_FILE_APPEND: {
bool rw;
- int fd;
+ int fd, flags;
assert(context->stdio_file[fileno]);
@@ -687,11 +711,15 @@ static int setup_output(
if (rw)
return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
- fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
+ flags = O_WRONLY;
+ if (o == EXEC_OUTPUT_FILE_APPEND)
+ flags |= O_APPEND;
+
+ fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
if (fd < 0)
return fd;
- return move_fd(fd, fileno, false);
+ return move_fd(fd, fileno, 0);
}
default:
@@ -914,7 +942,7 @@ static int get_fixed_user(const ExecContext *c, const char **user,
* (i.e. are "/" or "/bin/nologin"). */
name = c->user;
- r = get_user_creds_clean(&name, uid, gid, home, shell);
+ r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
if (r < 0)
return r;
@@ -932,7 +960,7 @@ static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid)
return 0;
name = c->group;
- r = get_group_creds(&name, gid);
+ r = get_group_creds(&name, gid, 0);
if (r < 0)
return r;
@@ -1004,7 +1032,7 @@ static int get_supplementary_groups(const ExecContext *c, const char *user,
return -E2BIG;
g = *i;
- r = get_group_creds(&g, l_gids+k);
+ r = get_group_creds(&g, l_gids+k, 0);
if (r < 0)
return r;
@@ -1151,6 +1179,16 @@ static int setup_pam(
goto fail;
}
+ if (!tty) {
+ _cleanup_free_ char *q = NULL;
+
+ /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
+ * out if that's the case, and read the TTY off it. */
+
+ if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
+ tty = strjoina("/dev/", q);
+ }
+
if (tty) {
pam_code = pam_set_item(handle, PAM_TTY, tty);
if (pam_code != PAM_SUCCESS)
@@ -1415,7 +1453,7 @@ static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_
return r;
}
- return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
+ return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
}
static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
@@ -1498,7 +1536,7 @@ static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
return 0;
- return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
}
static int apply_private_devices(const Unit *u, const ExecContext *c) {
@@ -1513,7 +1551,7 @@ static int apply_private_devices(const Unit *u, const ExecContext *c) {
if (skip_seccomp_unavailable(u, "PrivateDevices="))
return 0;
- return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
+ return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
}
static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
@@ -1585,6 +1623,8 @@ static void do_idle_pipe_dance(int idle_pipe[4]) {
idle_pipe[3] = safe_close(idle_pipe[3]);
}
+static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
+
static int build_environment(
const Unit *u,
const ExecContext *c,
@@ -1598,14 +1638,16 @@ static int build_environment(
char ***ret) {
_cleanup_strv_free_ char **our_env = NULL;
+ ExecDirectoryType t;
size_t n_env = 0;
char *x;
assert(u);
assert(c);
+ assert(p);
assert(ret);
- our_env = new0(char*, 14);
+ our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
if (!our_env)
return -ENOMEM;
@@ -1710,8 +1752,37 @@ static int build_environment(
our_env[n_env++] = x;
}
+ for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
+ _cleanup_free_ char *pre = NULL, *joined = NULL;
+ const char *n;
+
+ if (!p->prefix[t])
+ continue;
+
+ if (strv_isempty(c->directories[t].paths))
+ continue;
+
+ n = exec_directory_env_name_to_string(t);
+ if (!n)
+ continue;
+
+ pre = strjoin(p->prefix[t], "/");
+ if (!pre)
+ return -ENOMEM;
+
+ joined = strv_join_prefix(c->directories[t].paths, ":", pre);
+ if (!joined)
+ return -ENOMEM;
+
+ x = strjoin(n, "=", joined);
+ if (!x)
+ return -ENOMEM;
+
+ our_env[n_env++] = x;
+ }
+
our_env[n_env++] = NULL;
- assert(n_env <= 12);
+ assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
*ret = TAKE_PTR(our_env);
@@ -2010,7 +2081,7 @@ static int setup_exec_directory(
if (context->dynamic_user &&
!IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
- _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
+ _cleanup_free_ char *private_root = NULL;
/* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
* want to avoid leaving a directory around fully accessible that is owned by a dynamic user
@@ -2075,18 +2146,8 @@ static int setup_exec_directory(
goto fail;
}
- parent = dirname_malloc(p);
- if (!parent) {
- r = -ENOMEM;
- goto fail;
- }
-
- r = path_make_relative(parent, pp, &relative);
- if (r < 0)
- goto fail;
-
/* And link it up from the original place */
- r = symlink_idempotent(relative, p);
+ r = symlink_idempotent(pp, p, true);
if (r < 0)
goto fail;
@@ -2379,11 +2440,24 @@ static int apply_mount_namespace(
bind_mount_free_many(bind_mounts, n_bind_mounts);
- /* If we couldn't set up the namespace this is probably due to a
- * missing capability. In this case, silently proceeed. */
- if (IN_SET(r, -EPERM, -EACCES)) {
- log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
- return 0;
+ /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
+ * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
+ * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
+ * completely different execution environment. */
+ if (r == -ENOANO) {
+ if (n_bind_mounts == 0 &&
+ context->n_temporary_filesystems == 0 &&
+ !root_dir && !root_image &&
+ !context->dynamic_user) {
+ log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
+ return 0;
+ }
+
+ log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
+ "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
+ n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
+
+ return -EOPNOTSUPP;
}
return r;
@@ -2456,9 +2530,6 @@ static int setup_keyring(
* on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
* UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
- if (!(p->flags & EXEC_NEW_KEYRING))
- return 0;
-
if (context->keyring_mode == EXEC_KEYRING_INHERIT)
return 0;
@@ -2566,6 +2637,7 @@ static int close_remaining_fds(
const DynamicCreds *dcreds,
int user_lookup_fd,
int socket_fd,
+ int exec_fd,
int *fds, size_t n_fds) {
size_t n_dont_close = 0;
@@ -2582,6 +2654,8 @@ static int close_remaining_fds(
if (socket_fd >= 0)
dont_close[n_dont_close++] = socket_fd;
+ if (exec_fd >= 0)
+ dont_close[n_dont_close++] = exec_fd;
if (n_fds > 0) {
memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
n_dont_close += n_fds;
@@ -2707,6 +2781,37 @@ static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p
static char *exec_command_line(char **argv);
+static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
+ bool using_subcgroup;
+ char *p;
+
+ assert(params);
+ assert(ret);
+
+ if (!params->cgroup_path)
+ return -EINVAL;
+
+ /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
+ * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
+ * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
+ * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
+ * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
+ * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
+ * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
+ * flag, which is only passed for the former statements, not for the latter. */
+
+ using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
+ if (using_subcgroup)
+ p = strjoin(params->cgroup_path, "/.control");
+ else
+ p = strdup(params->cgroup_path);
+ if (!p)
+ return -ENOMEM;
+
+ *ret = p;
+ return using_subcgroup;
+}
+
static int exec_child(
Unit *unit,
const ExecCommand *command,
@@ -2714,20 +2819,20 @@ static int exec_child(
const ExecParameters *params,
ExecRuntime *runtime,
DynamicCreds *dcreds,
- char **argv,
int socket_fd,
int named_iofds[3],
int *fds,
- size_t n_storage_fds,
size_t n_socket_fds,
+ size_t n_storage_fds,
char **files_env,
int user_lookup_fd,
int *exit_status) {
_cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
- _cleanup_free_ char *home_buffer = NULL;
+ int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
_cleanup_free_ gid_t *supplementary_gids = NULL;
const char *username = NULL, *groupname = NULL;
+ _cleanup_free_ char *home_buffer = NULL;
const char *home = NULL, *shell = NULL;
dev_t journal_stream_dev = 0;
ino_t journal_stream_ino = 0;
@@ -2747,7 +2852,6 @@ static int exec_child(
#endif
uid_t uid = UID_INVALID;
gid_t gid = GID_INVALID;
- int r, ngids = 0;
size_t n_fds;
ExecDirectoryType dt;
int secure_bits;
@@ -2791,8 +2895,8 @@ static int exec_child(
/* In case anything used libc syslog(), close this here, too */
closelog();
- n_fds = n_storage_fds + n_socket_fds;
- r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
+ n_fds = n_socket_fds + n_storage_fds;
+ r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
if (r < 0) {
*exit_status = EXIT_FDS;
return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
@@ -2810,7 +2914,7 @@ static int exec_child(
const char *vc = params->confirm_spawn;
_cleanup_free_ char *cmdline = NULL;
- cmdline = exec_command_line(argv);
+ cmdline = exec_command_line(command->argv);
if (!cmdline) {
*exit_status = EXIT_MEMORY;
return log_oom();
@@ -2828,10 +2932,22 @@ static int exec_child(
}
}
+ /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
+ * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
+ * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
+ * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
+ * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
+ if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
+ setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
+ *exit_status = EXIT_MEMORY;
+ return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
+ }
+
if (context->dynamic_user && dcreds) {
_cleanup_strv_free_ char **suggested_paths = NULL;
- /* Make sure we bypass our own NSS module for any NSS checks */
+ /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
+ * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
*exit_status = EXIT_USER;
return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
@@ -2909,6 +3025,24 @@ static int exec_child(
if (socket_fd >= 0)
(void) fd_nonblock(socket_fd, false);
+ /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
+ * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
+ if (params->cgroup_path) {
+ _cleanup_free_ char *p = NULL;
+
+ r = exec_parameters_get_cgroup_path(params, &p);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
+ }
+
+ r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
+ if (r < 0) {
+ *exit_status = EXIT_CGROUP;
+ return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
+ }
+ }
+
r = setup_input(context, params, socket_fd, named_iofds);
if (r < 0) {
*exit_status = EXIT_STDIN;
@@ -2927,14 +3061,6 @@ static int exec_child(
return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
}
- if (params->cgroup_path) {
- r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
- if (r < 0) {
- *exit_status = EXIT_CGROUP;
- return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
- }
- }
-
if (context->oom_score_adjust_set) {
/* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
* prohibit write access to this file, and we shouldn't trip up over that. */
@@ -3130,11 +3256,6 @@ static int exec_child(
}
}
- /* Apply just after mount namespace setup */
- r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
- if (r < 0)
- return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
-
/* Drop groups as early as possbile */
if (needs_setuid) {
r = enforce_groups(gid, supplementary_gids, ngids);
@@ -3165,18 +3286,59 @@ static int exec_child(
}
/* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
- * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
- * was needed to upload the policy and can now be closed as well. */
- r = close_all_fds(fds, n_fds);
+ * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
+ * however if we have it as we want to keep it open until the final execve(). */
+
+ if (params->exec_fd >= 0) {
+ exec_fd = params->exec_fd;
+
+ if (exec_fd < 3 + (int) n_fds) {
+ int moved_fd;
+
+ /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
+ * process we are about to execute. */
+
+ moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
+ if (moved_fd < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
+ }
+
+ safe_close(exec_fd);
+ exec_fd = moved_fd;
+ } else {
+ /* This fd should be FD_CLOEXEC already, but let's make sure. */
+ r = fd_cloexec(exec_fd, true);
+ if (r < 0) {
+ *exit_status = EXIT_FDS;
+ return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
+ }
+ }
+
+ fds_with_exec_fd = newa(int, n_fds + 1);
+ memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
+ fds_with_exec_fd[n_fds] = exec_fd;
+ n_fds_with_exec_fd = n_fds + 1;
+ } else {
+ fds_with_exec_fd = fds;
+ n_fds_with_exec_fd = n_fds;
+ }
+
+ r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
if (r >= 0)
r = shift_fds(fds, n_fds);
if (r >= 0)
- r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
+ r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
if (r < 0) {
*exit_status = EXIT_FDS;
return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
}
+ /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
+ * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined,
+ * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
+ * came this far. */
+
secure_bits = context->secure_bits;
if (needs_sandboxing) {
@@ -3268,6 +3430,12 @@ static int exec_child(
}
}
+ /* Apply working directory here, because the working directory might be on NFS and only the user running
+ * this service might have the correct privilege to change to the working directory */
+ r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
+
if (needs_sandboxing) {
/* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
* influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
@@ -3389,7 +3557,7 @@ static int exec_child(
strv_free_and_replace(accum_env, ee);
}
- final_argv = replace_env_argv(argv, accum_env);
+ final_argv = replace_env_argv(command->argv, accum_env);
if (!final_argv) {
*exit_status = EXIT_MEMORY;
return log_oom();
@@ -3407,10 +3575,35 @@ static int exec_child(
LOG_UNIT_INVOCATION_ID(unit));
}
+ if (exec_fd >= 0) {
+ uint8_t hot = 1;
+
+ /* We have finished with all our initializations. Let's now let the manager know that. From this point
+ * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
+
+ if (write(exec_fd, &hot, sizeof(hot)) < 0) {
+ *exit_status = EXIT_EXEC;
+ return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
+ }
+ }
+
execve(command->path, final_argv, accum_env);
+ r = -errno;
+
+ if (exec_fd >= 0) {
+ uint8_t hot = 0;
- if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
- log_struct_errno(LOG_INFO, errno,
+ /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
+ * that POLLHUP on it no longer means execve() succeeded. */
+
+ if (write(exec_fd, &hot, sizeof(hot)) < 0) {
+ *exit_status = EXIT_EXEC;
+ return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
+ }
+ }
+
+ if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
+ log_struct_errno(LOG_INFO, r,
"MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
LOG_UNIT_ID(unit),
LOG_UNIT_INVOCATION_ID(unit),
@@ -3421,7 +3614,7 @@ static int exec_child(
}
*exit_status = EXIT_EXEC;
- return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
+ return log_unit_error_errno(unit, r, "Failed to execute command: %m");
}
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
@@ -3435,13 +3628,11 @@ int exec_spawn(Unit *unit,
DynamicCreds *dcreds,
pid_t *ret) {
+ int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
+ _cleanup_free_ char *subcgroup_path = NULL;
_cleanup_strv_free_ char **files_env = NULL;
- int *fds = NULL;
size_t n_storage_fds = 0, n_socket_fds = 0;
_cleanup_free_ char *line = NULL;
- int socket_fd, r;
- int named_iofds[3] = { -1, -1, -1 };
- char **argv;
pid_t pid;
assert(unit);
@@ -3449,7 +3640,7 @@ int exec_spawn(Unit *unit,
assert(context);
assert(ret);
assert(params);
- assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
+ assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
if (context->std_input == EXEC_INPUT_SOCKET ||
context->std_output == EXEC_OUTPUT_SOCKET ||
@@ -3469,8 +3660,8 @@ int exec_spawn(Unit *unit,
} else {
socket_fd = -1;
fds = params->fds;
- n_storage_fds = params->n_storage_fds;
n_socket_fds = params->n_socket_fds;
+ n_storage_fds = params->n_storage_fds;
}
r = exec_context_named_iofds(context, params, named_iofds);
@@ -3481,8 +3672,7 @@ int exec_spawn(Unit *unit,
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
- argv = params->argv ?: command->argv;
- line = exec_command_line(argv);
+ line = exec_command_line(command->argv);
if (!line)
return log_oom();
@@ -3492,6 +3682,17 @@ int exec_spawn(Unit *unit,
LOG_UNIT_ID(unit),
LOG_UNIT_INVOCATION_ID(unit));
+ if (params->cgroup_path) {
+ r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
+ if (r > 0) { /* We are using a child cgroup */
+ r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
+ if (r < 0)
+ return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
+ }
+ }
+
pid = fork();
if (pid < 0)
return log_unit_error_errno(unit, errno, "Failed to fork: %m");
@@ -3505,12 +3706,11 @@ int exec_spawn(Unit *unit,
params,
runtime,
dcreds,
- argv,
socket_fd,
named_iofds,
fds,
- n_storage_fds,
n_socket_fds,
+ n_storage_fds,
files_env,
unit->manager->user_lookup_fds[1],
&exit_status);
@@ -3530,13 +3730,11 @@ int exec_spawn(Unit *unit,
log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
- /* We add the new process to the cgroup both in the child (so
- * that we can be sure that no user code is ever executed
- * outside of the cgroup) and in the parent (so that we can be
- * sure that when we kill the cgroup the process will be
- * killed too). */
- if (params->cgroup_path)
- (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
+ /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
+ * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
+ * process will be killed too). */
+ if (subcgroup_path)
+ (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
exec_status_start(&command->exec_status, pid);
@@ -3624,6 +3822,9 @@ void exec_context_done(ExecContext *c) {
exec_context_free_log_extra_fields(c);
+ c->log_rate_limit_interval_usec = 0;
+ c->log_rate_limit_burst = 0;
+
c->stdin_data = mfree(c->stdin_data);
c->stdin_data_size = 0;
}
@@ -3655,7 +3856,6 @@ static void exec_command_done(ExecCommand *c) {
assert(c);
c->path = mfree(c->path);
-
c->argv = strv_free(c->argv);
}
@@ -3685,6 +3885,24 @@ void exec_command_free_array(ExecCommand **c, size_t n) {
c[i] = exec_command_free_list(c[i]);
}
+void exec_command_reset_status_array(ExecCommand *c, size_t n) {
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ exec_status_reset(&c[i].exec_status);
+}
+
+void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ ExecCommand *z;
+
+ LIST_FOREACH(command, z, c[i])
+ exec_status_reset(&z->exec_status);
+ }
+}
+
typedef struct InvalidEnvInfo {
const Unit *unit;
const char *path;
@@ -3813,7 +4031,7 @@ static int exec_context_load_environment(const Unit *unit, const ExecContext *c,
assert(pglob.gl_pathc > 0);
for (n = 0; n < pglob.gl_pathc; n++) {
- k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
+ k = load_env_file(NULL, pglob.gl_pathv[n], &p);
if (k < 0) {
if (ignore)
continue;
@@ -3976,9 +4194,9 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
for (i = 0; i < RLIM_NLIMITS; i++)
if (c->rlimit[i]) {
- fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
+ fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
- fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
+ fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
}
@@ -4036,8 +4254,12 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
if (c->std_output == EXEC_OUTPUT_FILE)
fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
+ if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
+ fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
if (c->std_error == EXEC_OUTPUT_FILE)
fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
+ if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
+ fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
if (c->tty_path)
fprintf(f,
@@ -4084,6 +4306,17 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
}
+ if (c->log_rate_limit_interval_usec > 0) {
+ char buf_timespan[FORMAT_TIMESPAN_MAX];
+
+ fprintf(f,
+ "%sLogRateLimitIntervalSec: %s\n",
+ prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
+ }
+
+ if (c->log_rate_limit_burst > 0)
+ fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
+
if (c->n_log_extra_fields > 0) {
size_t j;
@@ -4331,18 +4564,22 @@ void exec_context_free_log_extra_fields(ExecContext *c) {
void exec_status_start(ExecStatus *s, pid_t pid) {
assert(s);
- zero(*s);
- s->pid = pid;
+ *s = (ExecStatus) {
+ .pid = pid,
+ };
+
dual_timestamp_get(&s->start_timestamp);
}
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
assert(s);
- if (s->pid && s->pid != pid)
- zero(*s);
+ if (s->pid != pid) {
+ *s = (ExecStatus) {
+ .pid = pid,
+ };
+ }
- s->pid = pid;
dual_timestamp_get(&s->exit_timestamp);
s->code = code;
@@ -4350,12 +4587,18 @@ void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int
if (context) {
if (context->utmp_id)
- utmp_put_dead_process(context->utmp_id, pid, code, status);
+ (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
exec_context_tty_reset(context, NULL);
}
}
+void exec_status_reset(ExecStatus *s) {
+ assert(s);
+
+ *s = (ExecStatus) {};
+}
+
void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
char buf[FORMAT_TIMESTAMP_MAX];
@@ -4487,8 +4730,7 @@ int exec_command_set(ExecCommand *c, const char *path, ...) {
return -ENOMEM;
}
- free(c->path);
- c->path = p;
+ free_and_replace(c->path, p);
return strv_free_and_replace(c->argv, l);
}
@@ -4920,10 +5162,8 @@ void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
finalize:
r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
- if (r < 0) {
+ if (r < 0)
log_debug_errno(r, "Failed to add exec-runtime: %m");
- return;
- }
}
void exec_runtime_vacuum(Manager *m) {
@@ -4942,6 +5182,13 @@ void exec_runtime_vacuum(Manager *m) {
}
}
+void exec_params_clear(ExecParameters *p) {
+ if (!p)
+ return;
+
+ strv_free(p->environment);
+}
+
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
[EXEC_INPUT_NULL] = "null",
[EXEC_INPUT_TTY] = "tty",
@@ -4968,6 +5215,7 @@ static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
[EXEC_OUTPUT_SOCKET] = "socket",
[EXEC_OUTPUT_NAMED_FD] = "fd",
[EXEC_OUTPUT_FILE] = "file",
+ [EXEC_OUTPUT_FILE_APPEND] = "append",
};
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
@@ -4998,6 +5246,16 @@ static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
+static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
+ [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
+ [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
+ [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
+ [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
+ [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
+};
+
+DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
+
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
[EXEC_KEYRING_INHERIT] = "inherit",
[EXEC_KEYRING_PRIVATE] = "private",