summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luca Boccassi <luca.boccassi@microsoft.com> 2021-06-30 15:51:03 +0100
committer: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> 2021-07-12 13:30:54 +0200
commit: f284d1d451b77b2c659936355d60a04329e31df9 (patch)
tree: 1f51e050d2d1b8e7dbdd8dd39d3f4323ad1aae45
parent: b3b992edea3f69cd08d9b711dd2c1ac6dfc21335 (diff)
download: systemd-f284d1d451b77b2c659936355d60a04329e31df9.tar.gz
core: when recursively bind-remounting nested mounts, use options from top one
When mount points are stacked, bind_remount_recursive_with_mountinfo() uses the existing mount options of the "lower" level mount (i.e. the first one that was mounted on a mount point). But the actual mount point in use is the "top" one (i.e. the last one that was mounted on a mount point), so in practice if the mount options are different between the layers, the bottom options are used by mistake on the top mount, which is not what we want. This is because libmount returns the "bottom" one first.

If the hashmap returns -EEXIST, which means the same key (path) with a different value (options) is already present, update the hashmap instead of discarding the result. This way, the last/top mount options are always used when mounts are stacked on a mount point.

This was found to cause problems as LXC version 4.x stacks two /sys mounts, the bottom one read-write and the top one read-only. systemd accidentally remounts the top one read-write, breaking various expectations since a read-only /sys is the way we decide whether we are running in a container or not (in this particular case, networkd tests are broken as networkd expects to be able to modify network settings with a writable /sys). Future versions of LXC will no longer do this double-stacking, but we need to support running inside older versions too.

This was triggered by https://github.com/systemd/systemd/commit/6720e356c137 as that causes a recursive remount of '/', which processes '/sys' as one of the submounts, from make_nosuid(). But it's likely that other combinations of options could trigger this as well.

Before:

root@systemd-debug:/# systemd-run -t --wait --property ProtectSystem=yes findmnt
Running as unit: run-u9.service
Press ^] three times within 1s to disconnect TTY.
TARGET SOURCE FSTYPE OPTIONS / /dev/sda2[/var/lib/lxc/systemd-debug/rootfs] │ ext4 ro,nosuid,relatime,errors=remount-ro,stripe= ├─/dev none tmpfs rw,nosuid,relatime,size=492k,mode=755 │ ├─/dev/.lxc/proc proc proc rw,nosuid,relatime │ ├─/dev/.lxc/sys sys sysfs rw,nosuid,relatime │ ├─/dev/console devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/pts devpts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/ptmx devpts[/ptmx] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty1 devpts[/0] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty2 devpts[/1] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty3 devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty4 devpts[/3] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/shm tmpfs tmpfs rw,nosuid,nodev │ ├─/dev/hugepages hugetlbfs hugetlbfs rw,nosuid,relatime,pagesize=2M │ └─/dev/mqueue mqueue mqueue rw,nosuid,nodev,noexec,relatime ├─/proc proc proc rw,nosuid,nodev,noexec,relatime │ ├─/proc/sys proc[/sys] proc ro,nosuid,nodev,noexec,relatime │ │ ├─/proc/sys/net proc[/sys/net] proc rw,nosuid,nodev,noexec,relatime │ │ └─/proc/sys/kernel/random/boot_id │ │ none[/.lxc-boot-id] tmpfs ro,nosuid,nodev,noexec,relatime,size=492k,mo │ └─/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,nosuid,nodev,noexec,relatime ├─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime │ └─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime │ ├─/sys/devices/virtual/net sysfs sysfs rw,relatime │ │ └─/sys/devices/virtual/net │ │ sysfs[/devices/virtual/net] sysfs rw,nosuid,relatime │ ├─/sys/fs/fuse/connections fusectl fusectl rw,nosuid,nodev,noexec,relatime │ └─/sys/fs/cgroup cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,m ├─/run tmpfs tmpfs ro,nosuid,nodev,size=4912348k,nr_inodes=8192 │ ├─/run/credentials tmpfs[/systemd/inaccessible/dir] tmpfs ro,nosuid,nodev,noexec,size=4912348k,nr_inod │ └─/run/systemd/incoming 
tmpfs[/systemd/propagate/run-u9.service] │ tmpfs ro,nosuid,nodev,size=4912348k,nr_inodes=8192 ├─/tmp tmpfs tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409 │ └─/tmp tmpfs[/systemd-private-b730df90da424397a3f246cb15dcdbb1-run-u9.service-K6EUwf/tmp] │ tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409 └─/var/tmp /dev/sda2[/var/lib/lxc/systemd-debug/rootfs/var/tmp/systemd-private-b730df90da424397a3f246cb15dcdbb1-run-u9.service-vEHyRi/tmp] ext4 rw,nosuid,relatime,errors=remount-ro,stripe= Finished with result: success Main processes terminated with: code=exited/status=0 Service runtime: 14.249s CPU time consumed: 37ms After: root@systemd-debug:/# systemd-run -t --wait --property ProtectSystem=yes findmnt Running as unit: run-u3.service Press ^] three times within 1s to disconnect TTY. TARGET SOURCE FSTYPE OPTIONS / /dev/sda2[/var/lib/lxc/systemd-debug/rootfs] │ ext4 rw,relatime,errors=remount-ro,stripe=32699 ├─/dev none tmpfs rw,relatime,size=492k,mode=755 │ ├─/dev/.lxc/proc proc proc rw,relatime │ ├─/dev/.lxc/sys sys sysfs rw,relatime │ ├─/dev/console devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/pts devpts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/ptmx devpts[/ptmx] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty1 devpts[/0] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty2 devpts[/1] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty3 devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty4 devpts[/3] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/shm tmpfs tmpfs rw,nosuid,nodev │ ├─/dev/hugepages hugetlbfs hugetlbfs rw,relatime,pagesize=2M │ └─/dev/mqueue mqueue mqueue rw,nosuid,nodev,noexec,relatime ├─/proc proc proc rw,nosuid,nodev,noexec,relatime │ ├─/proc/sys proc[/sys] proc ro,nosuid,nodev,noexec,relatime │ │ ├─/proc/sys/net proc[/sys/net] proc rw,nosuid,nodev,noexec,relatime │ │ 
└─/proc/sys/kernel/random/boot_id │ │ none[/.lxc-boot-id] tmpfs ro,nosuid,nodev,noexec,relatime,size=492k,mode=75 │ └─/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,nosuid,nodev,noexec,relatime ├─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime │ └─/sys sysfs sysfs ro,nosuid,nodev,noexec,relatime │ ├─/sys/devices/virtual/net sysfs sysfs rw,relatime │ │ └─/sys/devices/virtual/net │ │ sysfs[/devices/virtual/net] sysfs rw,nosuid,nodev,noexec,relatime │ ├─/sys/fs/fuse/connections fusectl fusectl rw,nosuid,nodev,noexec,relatime │ └─/sys/fs/cgroup cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory ├─/run tmpfs tmpfs rw,nosuid,nodev,size=4912348k,nr_inodes=819200,mo │ ├─/run/credentials tmpfs[/systemd/inaccessible/dir] │ │ tmpfs ro,nosuid,nodev,noexec,size=4912348k,nr_inodes=81 │ └─/run/systemd/incoming tmpfs[/systemd/propagate/run-u3.service] │ tmpfs ro,nosuid,nodev,size=4912348k,nr_inodes=819200,mo ├─/tmp tmpfs tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409600 ├─/boot /dev/sda2[/var/lib/lxc/systemd-debug/rootfs/boot] │ ext4 ro,relatime,errors=remount-ro,stripe=32699 └─/usr /dev/sda2[/var/lib/lxc/systemd-debug/rootfs/usr] ext4 ro,relatime,errors=remount-ro,stripe=32699 Finished with result: success Main processes terminated with: code=exited/status=0 Service runtime: 14ms CPU time consumed: 5ms Host (LXC): root@systemd-debug:/# findmnt TARGET SOURCE FSTYPE OPTIONS / /dev/sda2[/var/lib/lxc/systemd-debug/rootfs] │ ext4 rw,relatime,errors=remount-ro,stripe=32699 ├─/run tmpfs tmpfs rw,nosuid,nodev,size=4912348k,nr_inodes=819200,mode=755 ├─/tmp tmpfs tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409600 ├─/dev none tmpfs rw,relatime,size=492k,mode=755 │ ├─/dev/pts devpts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/ptmx devpts[/ptmx] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty1 devpts[/0] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty2 devpts[/1] devpts 
rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty3 devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty4 devpts[/3] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/shm tmpfs tmpfs rw,nosuid,nodev │ ├─/dev/hugepages hugetlbfs hugetlbfs rw,relatime,pagesize=2M │ ├─/dev/mqueue mqueue mqueue rw,nosuid,nodev,noexec,relatime │ ├─/dev/console devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 │ ├─/dev/.lxc/proc proc proc rw,relatime │ └─/dev/.lxc/sys sys sysfs rw,relatime ├─/proc proc proc rw,nosuid,nodev,noexec,relatime │ ├─/proc/sys proc[/sys] proc ro,nosuid,nodev,noexec,relatime │ │ ├─/proc/sys/kernel/random/boot_id │ │ │ none[/.lxc-boot-id] tmpfs ro,nosuid,nodev,noexec,relatime,size=492k,mode=755 │ │ └─/proc/sys/net proc[/sys/net] proc rw,nosuid,nodev,noexec,relatime │ └─/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,nosuid,nodev,noexec,relatime └─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime └─/sys sysfs sysfs ro,nosuid,nodev,noexec,relatime ├─/sys/devices/virtual/net sysfs sysfs rw,relatime │ └─/sys/devices/virtual/net │ sysfs[/devices/virtual/net] │ sysfs rw,nosuid,nodev,noexec,relatime ├─/sys/fs/fuse/connections fusectl fusectl rw,nosuid,nodev,noexec,relatime └─/sys/fs/cgroup cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recurs Fixes https://github.com/systemd/systemd/issues/20032 (cherry picked from commit e01030633c73d3974390292bba381aca1224709b)
-rw-r--r-- src/shared/mount-util.c 8
1 files changed, 7 insertions, 1 deletions
diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c
index 13f202d7e7..9851d2f341 100644
--- a/src/shared/mount-util.c
+++ b/src/shared/mount-util.c
@@ -247,7 +247,13 @@ int bind_remount_recursive_with_mountinfo(
r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
if (r == -EEXIST)
- continue;
+ /* If the same path was recorded, but with different mount flags, update it:
+ * it means a mount point is overmounted, and libmount returns the "bottom" (or
+ * older one) first, but we want to reapply the flags from the "top" (or newer
+ * one). See: https://github.com/systemd/systemd/issues/20032
+ * Note that this shouldn't really fail, as we were just told that the key
+ * exists, and it's an update so we want 'd' to be freed immediately. */
+ r = hashmap_update(todo, d, ULONG_TO_PTR(flags));
if (r < 0)
return r;
if (r > 0)