buildstream/sandbox/_sandboxbwrap.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407

#
#  Copyright (C) 2016 Codethink Limited
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
#
#  Authors:
#        Andrew Leeming <andrew.leeming@codethink.co.uk>
#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>
import os
import sys
import time
import errno
import signal
import subprocess
import shutil
from contextlib import ExitStack

import psutil

from .._exceptions import SandboxError
from .. import utils, _signals
from ._mount import MountMap
from . import Sandbox, SandboxFlags


# SandboxBwrap()
#
# Default bubblewrap based sandbox implementation.
#
class SandboxBwrap(Sandbox):

    # Minimal set of devices for the sandbox
    DEVICES = [
        '/dev/full',
        '/dev/null',
        '/dev/urandom',
        '/dev/random',
        '/dev/zero'
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.user_ns_available = kwargs['user_ns_available']
        self.die_with_parent_available = kwargs['die_with_parent_available']

    def run(self, command, flags, *, cwd=None, env=None):
        stdout, stderr = self._get_output()

        # Allowable access to underlying storage as we're part of the sandbox
        root_directory = self.get_virtual_directory()._get_underlying_directory()

        # Fallback to the sandbox default settings for
        # the cwd and env.
        #
        cwd = self._get_work_directory(cwd=cwd)
        env = self._get_environment(cwd=cwd, env=env)

        if not self._has_command(command[0], env):
            raise SandboxError("Staged artifacts do not provide command "
                               "'{}'".format(command[0]),
                               reason='missing-command')

        # We want command args as a list of strings
        if isinstance(command, str):
            command = [command]

        # Create the mount map, this will tell us where
        # each mount point needs to be mounted from and to
        mount_map = MountMap(self, flags & SandboxFlags.ROOT_READ_ONLY)
        root_mount_source = mount_map.get_mount_source('/')

        # Grab the full path of the bwrap binary
        bwrap_command = [utils.get_host_tool('bwrap')]

        for k, v in env.items():
            bwrap_command += ['--setenv', k, v]
        for k in os.environ.keys() - env.keys():
            bwrap_command += ['--unsetenv', k]

        # Create a new pid namespace, this also ensures that any subprocesses
        # are cleaned up when the bwrap process exits.
        bwrap_command += ['--unshare-pid']

        # Ensure subprocesses are cleaned up when the bwrap parent dies.
        if self.die_with_parent_available:
            bwrap_command += ['--die-with-parent']

        # Add in the root filesystem stuff first.
        #
        # The rootfs is mounted as RW initially so that further mounts can be
        # placed on top. If a RO root is required, after all other mounts are
        # complete, root is remounted as RO
        bwrap_command += ["--bind", root_mount_source, "/"]

        if not flags & SandboxFlags.NETWORK_ENABLED:
            bwrap_command += ['--unshare-net']
            bwrap_command += ['--unshare-uts', '--hostname', 'buildstream']
            bwrap_command += ['--unshare-ipc']

        # Give it a proc and tmpfs
        bwrap_command += [
            '--proc', '/proc',
            '--tmpfs', '/tmp'
        ]

        # In interactive mode, we want a complete devpts inside
        # the container, so there is a /dev/console and such. In
        # the regular non-interactive sandbox, we want to hand pick
        # a minimal set of devices to expose to the sandbox.
        #
        if flags & SandboxFlags.INTERACTIVE:
            bwrap_command += ['--dev', '/dev']
        else:
            for device in self.DEVICES:
                bwrap_command += ['--dev-bind', device, device]

        # Add bind mounts to any marked directories
        marked_directories = self._get_marked_directories()
        mount_source_overrides = self._get_mount_sources()
        for mark in marked_directories:
            mount_point = mark['directory']
            if mount_point in mount_source_overrides:  # pylint: disable=consider-using-get
                mount_source = mount_source_overrides[mount_point]
            else:
                mount_source = mount_map.get_mount_source(mount_point)

            # Use --dev-bind for all mounts, this is simply a bind mount which does
            # not restrictive about devices.
            #
            # While it's important for users to be able to mount devices
            # into the sandbox for `bst shell` testing purposes, it is
            # harmless to do in a build environment where the directories
            # we mount just never contain device files.
            #
            bwrap_command += ['--dev-bind', mount_source, mount_point]

        if flags & SandboxFlags.ROOT_READ_ONLY:
            bwrap_command += ["--remount-ro", "/"]

        if cwd is not None:
            bwrap_command += ['--dir', cwd]
            bwrap_command += ['--chdir', cwd]

        # Set UID and GUI
        if self.user_ns_available:
            bwrap_command += ['--unshare-user']
            if not flags & SandboxFlags.INHERIT_UID:
                uid = self._get_config().build_uid
                gid = self._get_config().build_gid
                bwrap_command += ['--uid', str(uid), '--gid', str(gid)]

        # Add the command
        bwrap_command += command

        # bwrap might create some directories while being suid
        # and may give them to root gid, if it does, we'll want
        # to clean them up after, so record what we already had
        # there just in case so that we can safely cleanup the debris.
        #
        existing_basedirs = {
            directory: os.path.exists(os.path.join(root_directory, directory))
            for directory in ['tmp', 'dev', 'proc']
        }

        # Use the MountMap context manager to ensure that any redirected
        # mounts through fuse layers are in context and ready for bwrap
        # to mount them from.
        #
        with ExitStack() as stack:
            stack.enter_context(mount_map.mounted(self))

            # If we're interactive, we want to inherit our stdin,
            # otherwise redirect to /dev/null, ensuring process
            # disconnected from terminal.
            if flags & SandboxFlags.INTERACTIVE:
                stdin = sys.stdin
            else:
                stdin = stack.enter_context(open(os.devnull, "r"))

            # Run bubblewrap !
            exit_code = self.run_bwrap(bwrap_command, stdin, stdout, stderr,
                                       (flags & SandboxFlags.INTERACTIVE))

            # Cleanup things which bwrap might have left behind, while
            # everything is still mounted because bwrap can be creating
            # the devices on the fuse mount, so we should remove it there.
            if not flags & SandboxFlags.INTERACTIVE:
                for device in self.DEVICES:
                    device_path = os.path.join(root_mount_source, device.lstrip('/'))

                    # This will remove the device in a loop, allowing some
                    # retries in case the device file leaked by bubblewrap is still busy
                    self.try_remove_device(device_path)

            # Remove /tmp, this is a bwrap owned thing we want to be sure
            # never ends up in an artifact
            for basedir in ['tmp', 'dev', 'proc']:

                # Skip removal of directories which already existed before
                # launching bwrap
                if existing_basedirs[basedir]:
                    continue

                base_directory = os.path.join(root_mount_source, basedir)

                if flags & SandboxFlags.INTERACTIVE:
                    # Be more lenient in interactive mode here.
                    #
                    # In interactive mode; it's possible that the project shell
                    # configuration has mounted some things below the base
                    # directories, such as /dev/dri, and in this case it's less
                    # important to consider cleanup, as we wont be collecting
                    # this build result and creating an artifact.
                    #
                    # Note: Ideally; we should instead fix upstream bubblewrap to
                    #       cleanup any debris it creates at startup time, and do
                    #       the same ourselves for any directories we explicitly create.
                    #
                    shutil.rmtree(base_directory, ignore_errors=True)
                else:
                    try:
                        os.rmdir(base_directory)
                    except FileNotFoundError:
                        # ignore this, if bwrap cleaned up properly then it's not a problem.
                        #
                        # If the directory was not empty on the other hand, then this is clearly
                        # a bug, bwrap mounted a tempfs here and when it exits, that better be empty.
                        pass

        self._vdir._mark_changed()
        return exit_code

    def run_bwrap(self, argv, stdin, stdout, stderr, interactive):
        # Wrapper around subprocess.Popen() with common settings.
        #
        # This function blocks until the subprocess has terminated.
        #
        # It then returns a tuple of (exit code, stdout output, stderr output).
        # If stdout was not equal to subprocess.PIPE, stdout will be None. Same for
        # stderr.

        # Fetch the process actually launched inside the bwrap sandbox, or the
        # intermediat control bwrap processes.
        #
        # NOTE:
        #   The main bwrap process itself is setuid root and as such we cannot
        #   send it any signals. Since we launch bwrap with --unshare-pid, it's
        #   direct child is another bwrap process which retains ownership of the
        #   pid namespace. This is the right process to kill when terminating.
        #
        #   The grandchild is the binary which we asked bwrap to launch on our
        #   behalf, whatever this binary is, it is the right process to use
        #   for suspending and resuming. In the case that this is a shell, the
        #   shell will be group leader and all build scripts will stop/resume
        #   with that shell.
        #
        def get_user_proc(bwrap_pid, grand_child=False):
            bwrap_proc = psutil.Process(bwrap_pid)
            bwrap_children = bwrap_proc.children()
            if bwrap_children:
                if grand_child:
                    bwrap_grand_children = bwrap_children[0].children()
                    if bwrap_grand_children:
                        return bwrap_grand_children[0]
                else:
                    return bwrap_children[0]
            return None

        def terminate_bwrap():
            if process:
                user_proc = get_user_proc(process.pid)
                if user_proc:
                    user_proc.kill()

        def suspend_bwrap():
            if process:
                user_proc = get_user_proc(process.pid, grand_child=True)
                if user_proc:
                    group_id = os.getpgid(user_proc.pid)
                    os.killpg(group_id, signal.SIGSTOP)

        def resume_bwrap():
            if process:
                user_proc = get_user_proc(process.pid, grand_child=True)
                if user_proc:
                    group_id = os.getpgid(user_proc.pid)
                    os.killpg(group_id, signal.SIGCONT)

        with ExitStack() as stack:

            # We want to launch bwrap in a new session in non-interactive
            # mode so that we handle the SIGTERM and SIGTSTP signals separately
            # from the nested bwrap process, but in interactive mode this
            # causes launched shells to lack job control (we dont really
            # know why that is).
            #
            if interactive:
                new_session = False
            else:
                new_session = True
                stack.enter_context(_signals.suspendable(suspend_bwrap, resume_bwrap))
                stack.enter_context(_signals.terminator(terminate_bwrap))

            process = subprocess.Popen(
                argv,
                # The default is to share file descriptors from the parent process
                # to the subprocess, which is rarely good for sandboxing.
                close_fds=True,
                stdin=stdin,
                stdout=stdout,
                stderr=stderr,
                start_new_session=new_session
            )

            # Wait for the child process to finish, ensuring that
            # a SIGINT has exactly the effect the user probably
            # expects (i.e. let the child process handle it).
            try:
                while True:
                    try:
                        _, status = os.waitpid(process.pid, 0)
                        # If the process exits due to a signal, we
                        # brutally murder it to avoid zombies
                        if not os.WIFEXITED(status):
                            user_proc = get_user_proc(process.pid)
                            if user_proc:
                                utils._kill_process_tree(user_proc.pid)

                    # If we receive a KeyboardInterrupt we continue
                    # waiting for the process since we are in the same
                    # process group and it should also have received
                    # the SIGINT.
                    except KeyboardInterrupt:
                        continue

                    break
            # If we can't find the process, it has already died of its
            # own accord, and therefore we don't need to check or kill
            # anything.
            except psutil.NoSuchProcess:
                pass

            # Return the exit code - see the documentation for
            # os.WEXITSTATUS to see why this is required.
            if os.WIFEXITED(status):
                exit_code = os.WEXITSTATUS(status)
            else:
                exit_code = -1

            if interactive and stdin.isatty():
                # Make this process the foreground process again, otherwise the
                # next read() on stdin will trigger SIGTTIN and stop the process.
                # This is required because the sandboxed process does not have
                # permission to do this on its own (running in separate PID namespace).
                #
                # tcsetpgrp() will trigger SIGTTOU when called from a background
                # process, so ignore it temporarily.
                handler = signal.signal(signal.SIGTTOU, signal.SIG_IGN)
                os.tcsetpgrp(0, os.getpid())
                signal.signal(signal.SIGTTOU, handler)

        return exit_code

    def try_remove_device(self, device_path):

        # Put some upper limit on the tries here
        max_tries = 1000
        tries = 0

        while True:
            try:
                os.unlink(device_path)
            except OSError as e:
                if e.errno == errno.EBUSY:
                    # This happens on some machines, seems there is a race sometimes
                    # after bubblewrap returns and the device files it bind-mounted did
                    # not finish unmounting.
                    #
                    if tries < max_tries:
                        tries += 1
                        time.sleep(1 / 100)
                        continue
                    else:
                        # We've reached the upper limit of tries, bail out now
                        # because something must have went wrong
                        #
                        raise
                elif e.errno == errno.ENOENT:
                    # Bubblewrap cleaned it up for us, no problem if we cant remove it
                    break
                else:
                    # Something unexpected, reraise this error
                    raise
            else:
                # Successfully removed the symlink
                break