summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2020-08-14 18:56:54 +0200
committerZbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>2020-09-02 11:48:55 +0200
commit0b3c497347028bbcc3c6f911967c205b6d0f275f (patch)
tree4fa12a094c23f89a598b31c48839ec07c850a394
parent2239965c299e53db961f4294ccd5cbbda4f377df (diff)
downloadsystemd-0b3c497347028bbcc3c6f911967c205b6d0f275f.tar.gz
nspawn,pid1: pass "inaccessible" nodes from cntr mgr to pid1 payload via /run/host
Let's make /run/host the sole place we pass stuff from host to container in and place the "inaccessible" nodes in /run/host too. In contrast to the previous two commits this is a minor compat break, but not a relevant one I think. Previously the container manager would place these nodes in /run/systemd/inaccessible/ and that's where PID 1 in the container would try to add them too when missing. Container manager and PID 1 in the container would thus manage the same dir together. With this change the container manager now passes an immutable directory to the container and leaves /run/systemd entirely untouched, and managed exclusively by PID 1 inside the container, which is nice to have clear separation on who manages what. In order to make sure systemd then uses the /run/host/inaccessible/ nodes this commit changes PID 1 to look for that dir and, if it exists, symlink it to /run/systemd/inaccessible. Now, this will work fine if new nspawn and new PID 1 in the container work together, as then the symlink is created and the difference between the two dirs won't matter. For the case where an old nspawn invokes a new PID 1: in this case things work as they always worked: the dir is managed together. For the case where a different container manager invokes a new PID 1: in this case the nodes aren't typically passed in, and PID 1 in the container will try to create them and will likely fail partially (though gracefully) when trying to create char/block device nodes. This is fine though as there are fallbacks in place for that case. For the case where a new nspawn invokes an old PID 1: this is where the (minor) incompatibility happens: in this case new nspawn will place the nodes in the /run/host/inaccessible/ subdir, but the PID 1 in the container won't look for them there. Since the nodes are also not pre-created in /run/systemd/inaccessible/ PID 1 will try to create them there as if a different container manager sets them up. 
This is of course not sexy, but is not a total loss, since as mentioned fallbacks are in place anyway. Hence I think it's OK to accept this minor incompatibility. (cherry picked from commit 9fac502920a648d82e21b207989bfc3c00fbdebc)
-rw-r--r--src/core/mount-setup.c13
-rw-r--r--src/login/user-runtime-dir.c5
-rw-r--r--src/nspawn/nspawn.c2
-rw-r--r--src/shared/dev-setup.c29
-rw-r--r--src/shared/dev-setup.h2
-rw-r--r--src/test/test-dev-setup.c5
6 files changed, 33 insertions, 23 deletions
diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c
index feb88f3e6e..39662ebb08 100644
--- a/src/core/mount-setup.c
+++ b/src/core/mount-setup.c
@@ -536,8 +536,17 @@ int mount_setup(bool loaded_policy, bool leave_propagation) {
(void) mkdir_label("/run/systemd/system", 0755);
/* Also create /run/systemd/inaccessible nodes, so that we always have something to mount
- * inaccessible nodes from. */
- (void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID);
+ * inaccessible nodes from. If we run in a container the host might have created these for us already
+ * in /run/host/inaccessible/. Use those if we can, since that way we likely get access to block/char
+ * device nodes that are inaccessible, and if userns is used to nodes that are on mounts owned by a
+ * userns outside the container and thus nicely read-only and not remountable. */
+ if (access("/run/host/inaccessible/", F_OK) < 0) {
+ if (errno != ENOENT)
+ log_debug_errno(errno, "Failed to check if /run/host/inaccessible exists, ignoring: %m");
+
+ (void) make_inaccessible_nodes("/run/systemd", UID_INVALID, GID_INVALID);
+ } else
+ (void) symlink("../host/inaccessible", "/run/systemd/inaccessible");
return 0;
}
diff --git a/src/login/user-runtime-dir.c b/src/login/user-runtime-dir.c
index 8ba916f05e..fcddbc7df6 100644
--- a/src/login/user-runtime-dir.c
+++ b/src/login/user-runtime-dir.c
@@ -54,6 +54,7 @@ static int user_mkdir_runtime_path(
uint64_t runtime_dir_size,
uint64_t runtime_dir_inodes) {
+ const char *p;
int r;
assert(runtime_path);
@@ -104,7 +105,9 @@ static int user_mkdir_runtime_path(
}
/* Set up inaccessible nodes now so they're available if we decide to use them with user namespaces. */
- (void) make_inaccessible_nodes(runtime_path, uid, gid);
+ p = strjoina(runtime_path, "/systemd");
+ (void) mkdir(p, 0755);
+ (void) make_inaccessible_nodes(p, uid, gid);
return 0;
fail:
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index eb7c3321ac..43712565c2 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -3531,7 +3531,7 @@ static int outer_child(
(void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
- p = prefix_roota(directory, "/run");
+ p = prefix_roota(directory, "/run/host");
(void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
r = setup_pts(directory);
diff --git a/src/shared/dev-setup.c b/src/shared/dev-setup.c
index 6e57e2a99d..528440b82f 100644
--- a/src/shared/dev-setup.c
+++ b/src/shared/dev-setup.c
@@ -57,7 +57,7 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid) {
}
int make_inaccessible_nodes(
- const char *runtime_dir,
+ const char *parent_dir,
uid_t uid,
gid_t gid) {
@@ -65,28 +65,26 @@ int make_inaccessible_nodes(
const char *name;
mode_t mode;
} table[] = {
- { "/systemd", S_IFDIR | 0755 },
- { "/systemd/inaccessible", S_IFDIR | 0000 },
- { "/systemd/inaccessible/reg", S_IFREG | 0000 },
- { "/systemd/inaccessible/dir", S_IFDIR | 0000 },
- { "/systemd/inaccessible/fifo", S_IFIFO | 0000 },
- { "/systemd/inaccessible/sock", S_IFSOCK | 0000 },
+ { "inaccessible", S_IFDIR | 0755 },
+ { "inaccessible/reg", S_IFREG | 0000 },
+ { "inaccessible/dir", S_IFDIR | 0000 },
+ { "inaccessible/fifo", S_IFIFO | 0000 },
+ { "inaccessible/sock", S_IFSOCK | 0000 },
/* The following two are likely to fail if we lack the privs for it (for example in an userns
* environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0
* device nodes to be created). But that's entirely fine. Consumers of these files should carry
* fallback to use a different node then, for example <root>/inaccessible/sock, which is close
* enough in behaviour and semantics for most uses. */
- { "/systemd/inaccessible/chr", S_IFCHR | 0000 },
- { "/systemd/inaccessible/blk", S_IFBLK | 0000 },
+ { "inaccessible/chr", S_IFCHR | 0000 },
+ { "inaccessible/blk", S_IFBLK | 0000 },
};
_cleanup_umask_ mode_t u;
- size_t i;
int r;
- if (!runtime_dir)
- runtime_dir = "/run";
+ if (!parent_dir)
+ parent_dir = "/run/systemd";
u = umask(0000);
@@ -95,10 +93,10 @@ int make_inaccessible_nodes(
* to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the
* underlying file, i.e. in the best case we offer the same node type as the underlying node. */
- for (i = 0; i < ELEMENTSOF(table); i++) {
+ for (size_t i = 0; i < ELEMENTSOF(table); i++) {
_cleanup_free_ char *path = NULL;
- path = path_join(runtime_dir, table[i].name);
+ path = path_join(parent_dir, table[i].name);
if (!path)
return log_oom();
@@ -107,8 +105,7 @@ int make_inaccessible_nodes(
else
r = mknod_label(path, table[i].mode, makedev(0, 0));
if (r < 0) {
- if (r != -EEXIST)
- log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
+ log_debug_errno(r, "Failed to create '%s', ignoring: %m", path);
continue;
}
diff --git a/src/shared/dev-setup.h b/src/shared/dev-setup.h
index 72b90ec4de..437c0e96e6 100644
--- a/src/shared/dev-setup.h
+++ b/src/shared/dev-setup.h
@@ -5,4 +5,4 @@
int dev_setup(const char *prefix, uid_t uid, gid_t gid);
-int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid);
+int make_inaccessible_nodes(const char *parent_dir, uid_t uid, gid_t gid);
diff --git a/src/test/test-dev-setup.c b/src/test/test-dev-setup.c
index 038484e475..11196cd4d6 100644
--- a/src/test/test-dev-setup.c
+++ b/src/test/test-dev-setup.c
@@ -3,6 +3,7 @@
#include "capability-util.h"
#include "dev-setup.h"
#include "fs-util.h"
+#include "mkdir.h"
#include "path-util.h"
#include "rm-rf.h"
#include "tmpfile-util.h"
@@ -17,8 +18,8 @@ int main(int argc, char *argv[]) {
assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0);
- f = prefix_roota(p, "/run");
- assert_se(mkdir(f, 0755) >= 0);
+ f = prefix_roota(p, "/run/systemd");
+ assert_se(mkdir_p(f, 0755) >= 0);
assert_se(make_inaccessible_nodes(f, 1, 1) >= 0);