diff options
author | Sam Thursfield <sam.thursfield@codethink.co.uk> | 2015-04-13 12:31:46 +0000 |
---|---|---|
committer | Baserock Gerrit <gerrit@baserock.org> | 2015-04-21 18:25:12 +0000 |
commit | 985d512ad9969b9216720a7dc9274b41bb2802eb (patch) | |
tree | 6d45125cf8aed806f91be5b542b073329e6c5dc8 /distbuild-trove-nfsboot.write | |
parent | 64465445f2a95d74cb4a5bae3ab0d1783d6de68e (diff) | |
download | definitions-985d512ad9969b9216720a7dc9274b41bb2802eb.tar.gz |
Add distbuild-trove-nfsboot.write
The nfsboot.write deployment extension has been deprecated for a while
because it's not generally useful. It's only used for deploying
distbuild nodes to a Trove, as far as I know.
We still need to support setting up a bunch of machines that boot over
NFS from a Trove. But we can do this in a special-purpose .write
extension.
The new distbuild-trove-nfsboot.write is much more efficient than
the more generic nfsboot.write: instead of treating each system
individually (thus copying an almost identical ~2GB rootfs to the Trove
once per node) it copies the system image to the Trove once, and /then/
sets up a rootfs per node.
Upgrades are now supported, although the code assumes distbuild nodes
are stateless (as they should be) so nothing special is done for
upgrades, other than checking that there is already a version of the
given system in existance. The new extension does not create an
orig/ and run/ version of each system, because there is no need when the
deployed system is stateless.
There could be further gains in efficiency, but I don't have time to do
them right now. This write extension is full of compromises, its goal is
to better support the existing users who have a Trove and a distbuild
network deployed via NFS. It is specifically not intended to be useful
for other purposes.
Change-Id: I9a50c58b714ed272212d1d6c55b289aaa96051b1
Diffstat (limited to 'distbuild-trove-nfsboot.write')
-rwxr-xr-x | distbuild-trove-nfsboot.write | 283 |
1 files changed, 283 insertions, 0 deletions
diff --git a/distbuild-trove-nfsboot.write b/distbuild-trove-nfsboot.write new file mode 100755 index 00000000..a5a5b094 --- /dev/null +++ b/distbuild-trove-nfsboot.write @@ -0,0 +1,283 @@ +#!/usr/bin/python +# Copyright (C) 2013-2015 Codethink Limited +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; version 2 of the License. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program. If not, see <http://www.gnu.org/licenses/>. + + +'''Morph .write extension for a distbuild network booting off a Trove with NFS. + +''' + + +import os +import sys +import tempfile + +import cliapp +import morphlib.writeexts + + +class DistbuildTroveNFSBootWriteExtension(morphlib.writeexts.WriteExtension): + + '''Create an NFS root and kernel on TFTP during Morph's deployment. + + See distbuild-trove-nfsboot.help for documentation. + + ''' + + nfsboot_root = '/srv/nfsboot' + remote_user = 'root' + + def system_path(self, system_name, version_label=None): + if version_label: + # The 'run' directory is kind of a historical artifact. Baserock + # systems that have Btrfs root disks maintain an orig/ and a run/ + # subvolume, so that one can find changes that have been made at + # runtime. For distbuild systems, this isn't necessary because the + # root filesystems of the nodes are effectively stateless. However, + # existing systems have bootloaders configured to look for the + # 'run' directory, so we need to keep creating it. + return os.path.join(self.nfsboot_root, system_name, 'systems', + version_label, 'run') + else: + return os.path.join(self.nfsboot_root, system_name) + + def process_args(self, args): + if len(args) != 2: + raise cliapp.AppException('Wrong number of command line args') + + local_system_path, nfs_host = args + + nfs_netloc = '%s@%s' % (self.remote_user, nfs_host) + + version_label = os.getenv('VERSION_LABEL', 'factory') + + controller_name = os.getenv('DISTBUILD_CONTROLLER') + worker_names = os.getenv('DISTBUILD_WORKERS').split() + system_names = set([controller_name] + worker_names) + + git_server = os.getenv('DISTBUILD_GIT_SERVER') + shared_artifact_cache = os.getenv('DISTBUILD_SHARED_ARTIFACT_CACHE') + trove_id = os.getenv('DISTBUILD_TROVE_ID') + worker_ssh_key_path = os.getenv('DISTBUILD_WORKER_SSH_KEY') + + host_map = self.parse_host_map_string(os.getenv('HOST_MAP', '')) + + kernel_relpath = self.find_kernel(local_system_path) + + copied_rootfs = None + for system_name in system_names: + remote_system_path = self.system_path(system_name, version_label) + if copied_rootfs is None: + self.transfer_system( + nfs_netloc, local_system_path, remote_system_path) + copied_rootfs = remote_system_path + else: + self.duplicate_remote_system( + nfs_netloc, copied_rootfs, remote_system_path) + + for system_name in system_names: + remote_system_path = self.system_path(system_name, version_label) + self.link_kernel_to_tftpboot_path( + nfs_netloc, system_name, version_label, kernel_relpath) + self.set_hostname( + nfs_netloc, system_name, remote_system_path) + self.write_distbuild_config( + nfs_netloc, system_name, remote_system_path, git_server, + shared_artifact_cache, trove_id, worker_ssh_key_path, + controller_name, worker_names, host_map=host_map) + + self.configure_nfs_exports(nfs_netloc, system_names) + + for system_name in system_names: + self.update_default_version(nfs_netloc, system_name, version_label) + + def parse_host_map_string(self, host_map_string): + '''Parse the HOST_MAP variable + + Returns a dict mapping hostname to value (where value is an IP + address, a fully-qualified domain name, an alternate hostname, or + whatever). + + ''' + pairs = host_map_string.split(' ') + return morphlib.util.parse_environment_pairs({}, pairs) + + def transfer_system(self, nfs_netloc, local_system_path, + remote_system_path): + self.status(msg='Copying rootfs to %(nfs_netloc)s', + nfs_netloc=nfs_netloc) + cliapp.ssh_runcmd( + nfs_netloc, ['mkdir', '-p', remote_system_path]) + # The deployed rootfs may have been created by OSTree, so definitely + # don't pass --hard-links to `rsync`. + cliapp.runcmd( + ['rsync', '--archive', '--delete', '--info=progress2', + '--protect-args', '--partial', '--sparse', '--xattrs', + local_system_path + '/', + '%s:%s' % (nfs_netloc, remote_system_path)], stdout=sys.stdout) + + def duplicate_remote_system(self, nfs_netloc, source_system_path, + target_system_path): + self.status(msg='Duplicating rootfs to %(target_system_path)s', + target_system_path=target_system_path) + cliapp.ssh_runcmd(nfs_netloc, + ['mkdir', '-p', target_system_path]) + # We can't pass --info=progress2 here, because it may not be available + # in the remote 'rsync'. The --info setting was added in RSync 3.1.0, + # old versions of Baserock have RSync 3.0.9. So the user doesn't get + # any progress info on stdout for the 'duplicate' stage. + cliapp.ssh_runcmd(nfs_netloc, + ['rsync', '--archive', '--delete', '--protect-args', '--partial', + '--sparse', '--xattrs', source_system_path + '/', + target_system_path], stdout=sys.stdout) + + def find_kernel(self, local_system_path): + bootdir = os.path.join(local_system_path, 'boot') + image_names = ['vmlinuz', 'zImage', 'uImage'] + + for name in image_names: + try_path = os.path.join(bootdir, name) + if os.path.exists(try_path): + kernel_path = os.path.relpath(try_path, local_system_path) + break + else: + raise cliapp.AppException( + 'Could not find a kernel in the system: none of ' + '%s found' % ', '.join(image_names)) + return kernel_path + + def link_kernel_to_tftpboot_path(self, nfs_netloc, system_name, + version_label, kernel_relpath): + '''Create links for TFTP server for a system's kernel.''' + + remote_system_path = self.system_path(system_name, version_label) + kernel_dest = os.path.join(remote_system_path, kernel_relpath) + + self.status(msg='Creating links to %(name)s kernel in tftp directory', + name=system_name) + tftp_dir = os.path.join(self.nfsboot_root , 'tftp') + + versioned_kernel_name = "%s-%s" % (system_name, version_label) + kernel_name = system_name + + cliapp.ssh_runcmd(nfs_netloc, + ['ln', '-f', kernel_dest, + os.path.join(tftp_dir, versioned_kernel_name)]) + + cliapp.ssh_runcmd(nfs_netloc, + ['ln', '-sf', versioned_kernel_name, + os.path.join(tftp_dir, kernel_name)]) + + def set_remote_file_contents(self, nfs_netloc, path, text): + with tempfile.NamedTemporaryFile() as f: + f.write(text) + f.flush() + cliapp.runcmd( + ['scp', f.name, '%s:%s' % (nfs_netloc, path)]) + + def set_hostname(self, nfs_netloc, system_name, system_path): + hostname_path = os.path.join(system_path, 'etc', 'hostname') + self.set_remote_file_contents( + nfs_netloc, hostname_path, system_name + '\n') + + def write_distbuild_config(self, nfs_netloc, system_name, system_path, + git_server, shared_artifact_cache, trove_id, + worker_ssh_key_path, controller_name, + worker_names, host_map = {}): + '''Write /etc/distbuild/distbuild.conf on the node. + + This .write extension takes advantage of the 'generic' mode of + distbuild.configure. Each node is not configured until first-boot, + when distbuild-setup.service runs and configures the node based on the + contents of /etc/distbuild/distbuild.conf. + + ''' + def host(hostname): + return host_map.get(hostname, hostname) + + config = { + 'ARTIFACT_CACHE_SERVER': host(shared_artifact_cache), + 'CONTROLLERHOST': host(controller_name), + 'TROVE_HOST': host(git_server), + 'TROVE_ID': trove_id, + 'DISTBUILD_CONTROLLER': system_name == controller_name, + 'DISTBUILD_WORKER': system_name in worker_names, + 'WORKERS': ', '.join(map(host, worker_names)), + 'WORKER_SSH_KEY': '/etc/distbuild/worker.key', + } + + config_text = '\n'.join( + '%s: %s' % (key, value) for key, value in config.iteritems()) + config_text = \ + '# Generated by distbuild-trove-nfsboot.write\n' + \ + config_text + '\n' + path = os.path.join(system_path, 'etc', 'distbuild') + cliapp.ssh_runcmd( + nfs_netloc, ['mkdir', '-p', path]) + cliapp.runcmd( + ['scp', worker_ssh_key_path, '%s:%s' % (nfs_netloc, path)]) + self.set_remote_file_contents( + nfs_netloc, os.path.join(path, 'distbuild.conf'), config_text) + + def configure_nfs_exports(self, nfs_netloc, system_names): + '''Ensure the Trove is set up to export the NFS roots we need. + + This doesn't handle setting up the TFTP daemon. We assume that is + already running. + + ''' + for system_name in system_names: + exported_path = self.system_path(system_name) + exports_path = '/etc/exports' + + # Rather ugly SSH hackery follows to ensure each system path is + # listed in /etc/exports. + try: + cliapp.ssh_runcmd( + nfs_netloc, ['grep', '-q', exported_path, exports_path]) + except cliapp.AppException: + ip_mask = '*' + options = 'rw,no_subtree_check,no_root_squash,async' + exports_string = '%s %s(%s)\n' % (exported_path, ip_mask, + options) + exports_append_sh = '''\ + set -eu + target="$1" + temp=$(mktemp) + cat "$target" > "$temp" + cat >> "$temp" + mv "$temp" "$target" + ''' + cliapp.ssh_runcmd( + nfs_netloc, + ['sh', '-c', exports_append_sh, '--', exports_path], + feed_stdin=exports_string) + + cliapp.ssh_runcmd(nfs_netloc, + ['systemctl', 'restart', 'nfs-server.service']) + + def update_default_version(self, remote_netloc, system_name, + version_label): + self.status(msg='Linking \'default\' to %(version)s for %(system)s', + version=version_label, system=system_name) + system_path = self.system_path(system_name) + system_version_path = os.path.join(system_path, 'systems', + version_label) + default_path = os.path.join(system_path, 'systems', 'default') + + cliapp.ssh_runcmd(remote_netloc, + ['ln', '-sfn', system_version_path, default_path]) + + +DistbuildTroveNFSBootWriteExtension().run() |