diff options
Diffstat (limited to 'src/nspawn/nspawn.c')
-rw-r--r-- | src/nspawn/nspawn.c | 371 |
1 files changed, 282 insertions, 89 deletions
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 4e3803be82..71b14e2302 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -1,3 +1,4 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ /*** This file is part of systemd. @@ -77,6 +78,7 @@ #include "mount-util.h" #include "netlink-util.h" #include "nspawn-cgroup.h" +#include "nspawn-def.h" #include "nspawn-expose-ports.h" #include "nspawn-mount.h" #include "nspawn-network.h" @@ -106,11 +108,11 @@ #include "user-util.h" #include "util.h" -/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit - * UID range here. We leave a bit of room at the lower end and a lot of room at the upper end, so that other subsystems - * may have their own allocation ranges too. */ -#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000)) -#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000)) +#if HAVE_SPLIT_USR +#define STATIC_RESOLV_CONF "/lib/systemd/resolv.conf" +#else +#define STATIC_RESOLV_CONF "/usr/lib/systemd/resolv.conf" +#endif /* nspawn is listening on the socket at the path in the constant nspawn_notify_socket_path * nspawn_notify_socket_path is relative to the container @@ -188,6 +190,7 @@ static bool arg_network_veth = false; static char **arg_network_veth_extra = NULL; static char *arg_network_bridge = NULL; static char *arg_network_zone = NULL; +static char *arg_network_namespace_path = NULL; static unsigned long arg_personality = PERSONALITY_INVALID; static char *arg_image = NULL; static VolatileMode arg_volatile_mode = VOLATILE_NO; @@ -258,6 +261,9 @@ static void help(void) { " and attach it to an existing bridge on the host\n" " --network-zone=NAME Similar, but attach the new interface to an\n" " an automatically managed bridge interface\n" + " --network-namespace-path=PATH\n" + " Set network namespace to the one represented by\n" + " the specified kernel namespace file node\n" " -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n" " Expose 
a container IP port on the host\n" " -Z --selinux-context=SECLABEL\n" @@ -318,7 +324,7 @@ static int custom_mount_check_all(void) { return 0; } -static int detect_unified_cgroup_hierarchy(const char *directory) { +static int detect_unified_cgroup_hierarchy_from_environment(void) { const char *e; int r; @@ -332,11 +338,16 @@ static int detect_unified_cgroup_hierarchy(const char *directory) { arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL; else arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; - - return 0; } - /* Otherwise inherit the default from the host system */ + return 0; +} + +static int detect_unified_cgroup_hierarchy_from_image(const char *directory) { + int r; + + /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd in the + * image actually supports. */ r = cg_all_unified(); if (r < 0) return log_error_errno(r, "Failed to determine whether we are in all unified mode."); @@ -362,6 +373,10 @@ static int detect_unified_cgroup_hierarchy(const char *directory) { } else arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE; + log_debug("Using %s hierarchy for container.", + arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" : + arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? 
"hybrid" : "unified"); + return 0; } @@ -423,6 +438,7 @@ static int parse_argv(int argc, char *argv[]) { ARG_NETWORK_BRIDGE, ARG_NETWORK_ZONE, ARG_NETWORK_VETH_EXTRA, + ARG_NETWORK_NAMESPACE_PATH, ARG_PERSONALITY, ARG_VOLATILE, ARG_TEMPLATE, @@ -439,55 +455,56 @@ static int parse_argv(int argc, char *argv[]) { }; static const struct option options[] = { - { "help", no_argument, NULL, 'h' }, - { "version", no_argument, NULL, ARG_VERSION }, - { "directory", required_argument, NULL, 'D' }, - { "template", required_argument, NULL, ARG_TEMPLATE }, - { "ephemeral", no_argument, NULL, 'x' }, - { "user", required_argument, NULL, 'u' }, - { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK }, - { "as-pid2", no_argument, NULL, 'a' }, - { "boot", no_argument, NULL, 'b' }, - { "uuid", required_argument, NULL, ARG_UUID }, - { "read-only", no_argument, NULL, ARG_READ_ONLY }, - { "capability", required_argument, NULL, ARG_CAPABILITY }, - { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY }, - { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL }, - { "bind", required_argument, NULL, ARG_BIND }, - { "bind-ro", required_argument, NULL, ARG_BIND_RO }, - { "tmpfs", required_argument, NULL, ARG_TMPFS }, - { "overlay", required_argument, NULL, ARG_OVERLAY }, - { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, - { "machine", required_argument, NULL, 'M' }, - { "slice", required_argument, NULL, 'S' }, - { "setenv", required_argument, NULL, 'E' }, - { "selinux-context", required_argument, NULL, 'Z' }, - { "selinux-apifs-context", required_argument, NULL, 'L' }, - { "quiet", no_argument, NULL, 'q' }, - { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */ - { "register", required_argument, NULL, ARG_REGISTER }, - { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT }, - { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE }, - { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN }, - { 
"network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN }, - { "network-veth", no_argument, NULL, 'n' }, - { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA }, - { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE }, - { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE }, - { "personality", required_argument, NULL, ARG_PERSONALITY }, - { "image", required_argument, NULL, 'i' }, - { "volatile", optional_argument, NULL, ARG_VOLATILE }, - { "port", required_argument, NULL, 'p' }, - { "property", required_argument, NULL, ARG_PROPERTY }, - { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, - { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, - { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, - { "settings", required_argument, NULL, ARG_SETTINGS }, - { "chdir", required_argument, NULL, ARG_CHDIR }, - { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT }, - { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, - { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, - { "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, ARG_VERSION }, + { "directory", required_argument, NULL, 'D' }, + { "template", required_argument, NULL, ARG_TEMPLATE }, + { "ephemeral", no_argument, NULL, 'x' }, + { "user", required_argument, NULL, 'u' }, + { "private-network", no_argument, NULL, ARG_PRIVATE_NETWORK }, + { "as-pid2", no_argument, NULL, 'a' }, + { "boot", no_argument, NULL, 'b' }, + { "uuid", required_argument, NULL, ARG_UUID }, + { "read-only", no_argument, NULL, ARG_READ_ONLY }, + { "capability", required_argument, NULL, ARG_CAPABILITY }, + { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY }, + { "link-journal", required_argument, NULL, ARG_LINK_JOURNAL }, + { "bind", required_argument, NULL, ARG_BIND }, + { "bind-ro", required_argument, NULL, ARG_BIND_RO }, + { 
"tmpfs", required_argument, NULL, ARG_TMPFS }, + { "overlay", required_argument, NULL, ARG_OVERLAY }, + { "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO }, + { "machine", required_argument, NULL, 'M' }, + { "slice", required_argument, NULL, 'S' }, + { "setenv", required_argument, NULL, 'E' }, + { "selinux-context", required_argument, NULL, 'Z' }, + { "selinux-apifs-context", required_argument, NULL, 'L' }, + { "quiet", no_argument, NULL, 'q' }, + { "share-system", no_argument, NULL, ARG_SHARE_SYSTEM }, /* not documented */ + { "register", required_argument, NULL, ARG_REGISTER }, + { "keep-unit", no_argument, NULL, ARG_KEEP_UNIT }, + { "network-interface", required_argument, NULL, ARG_NETWORK_INTERFACE }, + { "network-macvlan", required_argument, NULL, ARG_NETWORK_MACVLAN }, + { "network-ipvlan", required_argument, NULL, ARG_NETWORK_IPVLAN }, + { "network-veth", no_argument, NULL, 'n' }, + { "network-veth-extra", required_argument, NULL, ARG_NETWORK_VETH_EXTRA }, + { "network-bridge", required_argument, NULL, ARG_NETWORK_BRIDGE }, + { "network-zone", required_argument, NULL, ARG_NETWORK_ZONE }, + { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH }, + { "personality", required_argument, NULL, ARG_PERSONALITY }, + { "image", required_argument, NULL, 'i' }, + { "volatile", optional_argument, NULL, ARG_VOLATILE }, + { "port", required_argument, NULL, 'p' }, + { "property", required_argument, NULL, ARG_PROPERTY }, + { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, + { "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN }, + { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, + { "settings", required_argument, NULL, ARG_SETTINGS }, + { "chdir", required_argument, NULL, ARG_CHDIR }, + { "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT }, + { "notify-ready", required_argument, NULL, ARG_NOTIFY_READY }, + { "root-hash", required_argument, NULL, ARG_ROOT_HASH }, + { "system-call-filter", 
required_argument, NULL, ARG_SYSTEM_CALL_FILTER }, {} }; @@ -573,8 +590,7 @@ static int parse_argv(int argc, char *argv[]) { if (r < 0) return log_oom(); - /* fall through */ - + _fallthrough_; case 'n': arg_network_veth = true; arg_private_network = true; @@ -628,13 +644,19 @@ static int parse_argv(int argc, char *argv[]) { if (strv_extend(&arg_network_ipvlan, optarg) < 0) return log_oom(); - /* fall through */ - + _fallthrough_; case ARG_PRIVATE_NETWORK: arg_private_network = true; arg_settings_mask |= SETTING_NETWORK; break; + case ARG_NETWORK_NAMESPACE_PATH: + r = parse_path_argument_and_warn(optarg, false, &arg_network_namespace_path); + if (r < 0) + return r; + + break; + case 'b': if (arg_start_mode == START_PID2) { log_error("--boot and --as-pid2 may not be combined."); @@ -1094,6 +1116,17 @@ static int parse_argv(int argc, char *argv[]) { assert_not_reached("Unhandled option"); } + /* If --network-namespace-path is given with any other network-related option, + * we need to error out, to avoid conflicts between different network options. */ + if (arg_network_namespace_path && + (arg_network_interfaces || arg_network_macvlan || + arg_network_ipvlan || arg_network_veth_extra || + arg_network_bridge || arg_network_zone || + arg_network_veth || arg_private_network)) { + log_error("--network-namespace-path cannot be combined with other network options."); + return -EINVAL; + } + parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC); parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID); parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS); @@ -1120,6 +1153,8 @@ static int parse_argv(int argc, char *argv[]) { arg_userns_chown = true; if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0) { + /* Save the user from accidentally registering either user-$SESSION.scope or user@.service. + * The latter is not technically a user session, but we don't need to labour the point. 
*/ log_error("--keep-unit --register=yes may not be used when invoked from a user session."); return -EINVAL; } @@ -1410,7 +1445,7 @@ static int setup_resolv_conf(const char *dest) { return 0; } - if (access("/usr/lib/systemd/resolv.conf", F_OK) >= 0 && + if (access(STATIC_RESOLV_CONF, F_OK) >= 0 && resolved_listening() > 0) { /* resolved is enabled on the host. In this, case bind mount its static resolv.conf file into the @@ -1422,7 +1457,7 @@ static int setup_resolv_conf(const char *dest) { if (found == 0) /* missing? */ (void) touch(resolved); - r = mount_verbose(LOG_DEBUG, "/usr/lib/systemd/resolv.conf", resolved, NULL, MS_BIND, NULL); + r = mount_verbose(LOG_DEBUG, STATIC_RESOLV_CONF, resolved, NULL, MS_BIND, NULL); if (r >= 0) return mount_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL); } @@ -2015,8 +2050,7 @@ static int wait_for_container(pid_t pid, ContainerStatus *container) { return 0; } - /* fall through */ - + _fallthrough_; case CLD_DUMPED: log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status)); return -EIO; @@ -2044,18 +2078,27 @@ static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo } static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) { + pid_t pid; + + assert(s); + assert(ssi); + + pid = PTR_TO_PID(userdata); + for (;;) { siginfo_t si = {}; + if (waitid(P_ALL, 0, &si, WNOHANG|WNOWAIT|WEXITED) < 0) return log_error_errno(errno, "Failed to waitid(): %m"); if (si.si_pid == 0) /* No pending children. */ break; - if (si.si_pid == PTR_TO_PID(userdata)) { + if (si.si_pid == pid) { /* The main process we care for has exited. Return from * signal handler but leave the zombie. */ sd_event_exit(sd_event_source_get_event(s), 0); break; } + /* Reap all other children. 
*/ (void) waitid(P_PID, si.si_pid, &si, WNOHANG|WEXITED); } @@ -2063,6 +2106,24 @@ static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, vo return 0; } +static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) { + pid_t pid; + + assert(m); + + pid = PTR_TO_PID(userdata); + + if (arg_kill_signal > 0) { + log_info("Container termination requested. Attempting to halt container."); + (void) kill(pid, arg_kill_signal); + } else { + log_info("Container termination requested. Exiting."); + sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0); + } + + return 0; +} + static int determine_names(void) { int r; @@ -2089,7 +2150,7 @@ static int determine_names(void) { return -ENOENT; } - if (i->type == IMAGE_RAW) + if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK)) r = free_and_strdup(&arg_image, i->path); else r = free_and_strdup(&arg_directory, i->path); @@ -2494,12 +2555,15 @@ static int outer_child( int kmsg_socket, int rtnl_socket, int uid_shift_socket, - FDSet *fds) { + int unified_cgroup_hierarchy_socket, + FDSet *fds, + int netns_fd) { pid_t pid; ssize_t l; int r; _cleanup_close_ int fd = -1; + bool create_netns; assert(barrier); assert(directory); @@ -2544,7 +2608,13 @@ static int outer_child( return r; if (dissected_image) { - r = dissected_image_mount(dissected_image, directory, DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)); + /* If we are operating on a disk image, then mount its root directory now, but leave out the rest. We + * can read the UID shift from it if we need to. Further down we'll mount the rest, but then with the + * uid shift known. That way we can mount VFAT file systems shifted to the right place right away. This + * makes sure ESP partitions and userns are compatible. */ + + r = dissected_image_mount(dissected_image, directory, arg_uid_shift, + DISSECT_IMAGE_MOUNT_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? 
DISSECT_IMAGE_READ_ONLY : 0)); if (r < 0) return r; } @@ -2580,6 +2650,32 @@ static int outer_child( log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range); } + if (dissected_image) { + /* Now we know the uid shift, let's now mount everything else that might be in the image. */ + r = dissected_image_mount(dissected_image, directory, arg_uid_shift, + DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|DISSECT_IMAGE_DISCARD_ON_LOOP|(arg_read_only ? DISSECT_IMAGE_READ_ONLY : 0)); + if (r < 0) + return r; + } + + if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { + /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */ + + r = detect_unified_cgroup_hierarchy_from_image(directory); + if (r < 0) + return r; + + l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send cgroup mode: %m"); + if (l != sizeof(arg_unified_cgroup_hierarchy)) { + log_error("Short write while sending cgroup mode: %m"); + return -EIO; + } + + unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket); + } + /* Turn directory into bind mount */ r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL); if (r < 0) @@ -2718,9 +2814,11 @@ static int outer_child( if (fd < 0) return fd; + create_netns = !arg_network_namespace_path && arg_private_network; + pid = raw_clone(SIGCHLD|CLONE_NEWNS| arg_clone_ns_flags | - (arg_private_network ? CLONE_NEWNET : 0) | + (create_netns ? CLONE_NEWNET : 0) | (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0)); if (pid < 0) return log_error_errno(errno, "Failed to fork inner child: %m"); @@ -2734,6 +2832,12 @@ static int outer_child( * requested, so that we all are owned by the user if * user namespaces are turned on. 
*/ + if (arg_network_namespace_path) { + r = namespace_enter(-1, -1, netns_fd, -1, -1); + if (r < 0) + return r; + } + r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, fds); if (r < 0) _exit(EXIT_FAILURE); @@ -2766,11 +2870,13 @@ static int outer_child( notify_socket = safe_close(notify_socket); kmsg_socket = safe_close(kmsg_socket); rtnl_socket = safe_close(rtnl_socket); + netns_fd = safe_close(netns_fd); return 0; } static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { + bool tried_hashed = false; unsigned n_tries = 100; uid_t candidate; int r; @@ -2785,13 +2891,13 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { (void) mkdir("/run/systemd/nspawn-uid", 0755); for (;;) { - char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; + char lock_path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1]; _cleanup_release_lock_file_ LockFile lf = LOCK_FILE_INIT; if (--n_tries <= 0) return -EBUSY; - if (candidate < UID_SHIFT_PICK_MIN || candidate > UID_SHIFT_PICK_MAX) + if (candidate < CONTAINER_UID_BASE_MIN || candidate > CONTAINER_UID_BASE_MAX) goto next; if ((candidate & UINT32_C(0xFFFF)) != 0) goto next; @@ -2819,14 +2925,27 @@ static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) { return 0; next: - random_bytes(&candidate, sizeof(candidate)); - candidate = (candidate % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN; + if (arg_machine && !tried_hashed) { + /* Try to hash the base from the container name */ + + static const uint8_t hash_key[] = { + 0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf, + 0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72 + }; + + candidate = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key); + + tried_hashed = true; + } else + random_bytes(&candidate, sizeof(candidate)); + + candidate = (candidate % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN; candidate &= (uid_t) 
UINT32_C(0xFFFF0000); } } static int setup_uid_map(pid_t pid) { - char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; + char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; int r; assert(pid > 1); @@ -3212,18 +3331,22 @@ static int run(int master, pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, notify_socket_pair[2] = { -1, -1 }, - uid_shift_socket_pair[2] = { -1, -1 }; + uid_shift_socket_pair[2] = { -1, -1 }, + unified_cgroup_hierarchy_socket_pair[2] = { -1, -1}; + _cleanup_close_ int notify_socket= -1; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; ContainerStatus container_status = 0; char last_char = 0; int ifi = 0, r; ssize_t l; sigset_t mask_chld; + _cleanup_close_ int netns_fd = -1; assert_se(sigemptyset(&mask_chld) == 0); assert_se(sigaddset(&mask_chld, SIGCHLD) == 0); @@ -3264,6 +3387,10 @@ static int run(int master, if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) return log_error_errno(errno, "Failed to create uid shift socket pair: %m"); + if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0) + return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m"); + /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt * parent's blocking calls and give it a chance to call wait() and terminate. 
*/ r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); @@ -3274,6 +3401,20 @@ static int run(int master, if (r < 0) return log_error_errno(errno, "Failed to install SIGCHLD handler: %m"); + if (arg_network_namespace_path) { + netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC); + if (netns_fd < 0) + return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path); + + r = fd_is_network_ns(netns_fd); + if (r < 0 && r != -ENOTTY) + return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path); + if (r == 0) { + log_error("Path %s doesn't refer to a network namespace", arg_network_namespace_path); + return -EINVAL; + } + } + *pid = raw_clone(SIGCHLD|CLONE_NEWNS); if (*pid < 0) return log_error_errno(errno, "clone() failed%s: %m", @@ -3292,6 +3433,7 @@ static int run(int master, uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]); notify_socket_pair[0] = safe_close(notify_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); + unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]); (void) reset_all_signal_handlers(); (void) reset_signal_mask(); @@ -3308,7 +3450,9 @@ static int run(int master, kmsg_socket_pair[1], rtnl_socket_pair[1], uid_shift_socket_pair[1], - fds); + unified_cgroup_hierarchy_socket_pair[1], + fds, + netns_fd); if (r < 0) _exit(EXIT_FAILURE); @@ -3325,6 +3469,7 @@ static int run(int master, uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); notify_socket_pair[1] = safe_close(notify_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); + unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]); if (arg_userns_mode != USER_NAMESPACE_NO) { /* The child just let us know the UID shift it might have read from the image. 
*/ @@ -3355,6 +3500,17 @@ static int run(int master, } } + if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) { + /* The child let us know the support cgroup mode it might have read from the image. */ + l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0); + if (l < 0) + return log_error_errno(errno, "Failed to read cgroup mode: %m"); + if (l != sizeof(arg_unified_cgroup_hierarchy)) { + log_error("Short read while reading cgroup mode."); + return -EIO; + } + } + /* Wait for the outer child. */ r = wait_for_terminate_and_warn("namespace helper", *pid, NULL); if (r != 0) @@ -3449,8 +3605,31 @@ static int run(int master, return r; } + if (arg_register || !arg_keep_unit) { + r = sd_bus_default_system(&bus); + if (r < 0) + return log_error_errno(r, "Failed to open system bus: %m"); + } + + if (!arg_keep_unit) { + /* When a new scope is created for this container, then we'll be registered as its controller, in which + * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the + * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. 
*/ + r = sd_bus_add_match(bus, NULL, + "type='signal'," + "sender='org.freedesktop.systemd1'," + "interface='org.freedesktop.systemd1.Scope'," + "member='RequestStop'", + on_request_stop, PID_TO_PTR(*pid)); + if (r < 0) + return log_error_errno(r, "Failed to install request stop match: %m"); + } + if (arg_register) { + r = register_machine( + bus, arg_machine, *pid, arg_directory, @@ -3464,8 +3643,11 @@ static int run(int master, arg_container_service_name); if (r < 0) return r; + } else if (!arg_keep_unit) { + r = allocate_scope( + bus, arg_machine, *pid, arg_slice, @@ -3488,7 +3670,7 @@ static int run(int master, return r; } - r = chown_cgroup(*pid, arg_uid_shift); + r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift); if (r < 0) return r; @@ -3511,6 +3693,14 @@ static int run(int master, if (r < 0) return log_error_errno(r, "Failed to get default event source: %m"); + (void) sd_event_set_watchdog(event, true); + + if (bus) { + r = sd_bus_attach_event(bus, event, 0); + if (r < 0) + return log_error_errno(r, "Failed to attach bus to event loop: %m"); + } + r = setup_sd_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source); if (r < 0) return r; @@ -3572,8 +3762,8 @@ static int run(int master, putc('\n', stdout); /* Kill if it is not dead yet anyway */ - if (arg_register && !arg_keep_unit) - terminate_machine(*pid); + if (arg_register && !arg_keep_unit && bus) + terminate_machine(bus, *pid); /* Normally redundant, but better safe than sorry */ (void) kill(*pid, SIGKILL); @@ -3644,11 +3834,10 @@ int main(int argc, char *argv[]) { if (r <= 0) goto finish; - if (geteuid() != 0) { - log_error("Need to be root."); - r = -EPERM; + r = must_be_root(); + if (r < 0) goto finish; - } + r = determine_names(); if (r < 0) goto finish; @@ -3661,6 +3850,10 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; + r = detect_unified_cgroup_hierarchy_from_environment(); + if (r < 0) + goto finish; + n_fd_passed = sd_listen_fds(false); if 
(n_fd_passed > 0) { r = fdset_new_listen_fds(&fds, false); @@ -3883,6 +4076,10 @@ int main(int argc, char *argv[]) { log_error_errno(r, "--image= is not supported, compiled without blkid support."); goto finish; } + if (r == -EPROTONOSUPPORT) { + log_error_errno(r, "Device is loopback block device with partition scanning turned off, please turn it on."); + goto finish; + } if (r < 0) { log_error_errno(r, "Failed to dissect image: %m"); goto finish; @@ -3904,10 +4101,6 @@ int main(int argc, char *argv[]) { if (r < 0) goto finish; - r = detect_unified_cgroup_hierarchy(arg_directory); - if (r < 0) - goto finish; - interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0; |