#
#  Copyright (C) 2017 Codethink Limited
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
#
#  Authors:
#        Jonathan Maw

"""
tar - stage files from tar archives
===================================

**Host dependencies:**

  * lzip (for .tar.lz files)

**Usage:**

.. code:: yaml

   # Specify the tar source kind
   kind: tar

   # Specify the tar url. Using an alias defined in your project
   # configuration is encouraged. 'bst source track' will update the
   # sha256sum in 'ref' to the downloaded file's sha256sum.
   url: upstream:foo.tar

   # Specify the ref. It's a sha256sum of the file you download.
   ref: 6c9f6f68a131ec6381da82f2bff978083ed7f4f7991d931bfa767b7965ebc94b

   # Specify a glob pattern to indicate the base directory to extract
   # from the tarball. The first matching directory will be used.
   #
   # Note that this is '*' by default since most standard release
   # tarballs contain a self named subdirectory at the root which
   # contains the files one normally wants to extract to build.
   #
   # To extract the root of the tarball directly, this can be set
   # to an empty string.
   base-dir: '*'

See :ref:`built-in functionality documentation <core_source_builtins>` for
details on common configuration options for sources.
"""

import os
import tarfile
from contextlib import contextmanager
from tempfile import TemporaryFile

from buildstream import SourceError
from buildstream import utils

from ._downloadablefilesource import DownloadableFileSource


def _is_within(root, path):
    """Return True if *path* is *root* itself or is located beneath it.

    Both arguments are expected to be absolute, normalized paths. The
    comparison is done on whole path components, so ``/stage-evil`` is
    not considered to be inside ``/stage``.
    """
    try:
        return os.path.commonpath([root, path]) == root
    except ValueError:
        # commonpath() refuses to compare unrelated kinds of paths;
        # such a path cannot be inside the root.
        return False


class ReadableTarInfo(tarfile.TarInfo):
    """
    Override the permissions `TarFile.extractall` applies to extracted members.

    The `mode` attribute of `TarInfo`, the class that encapsulates the
    internal meta-data of the tarball, is replaced by a property. The mode
    recorded in the archive is stored privately, and the mode reported back
    is derived from the current umask: directories and files whose archived
    mode has the owner-execute bit get ``0o777 & ~umask``, every other
    member gets ``0o666 & ~umask``.
    """

    @property
    def mode(self):
        # Respect umask instead of the file mode stored in the archive.
        # The only bit used from the embedded mode is the executable bit for files.
        umask = utils.get_umask()
        if self.isdir() or self.__permission & 0o100:
            return 0o777 & ~umask
        return 0o666 & ~umask

    @mode.setter
    def mode(self, permission):
        # Keep the archived mode around so the getter can test its exec bit.
        self.__permission = permission  # pylint: disable=attribute-defined-outside-init


class TarSource(DownloadableFileSource):
    """Source plugin which stages the contents of a downloaded tar archive."""

    # pylint: disable=attribute-defined-outside-init

    def configure(self, node):
        """Load the 'base-dir' option (default '*') and validate the node keys."""
        super().configure(node)

        self.base_dir = node.get_str("base-dir", "*")
        node.validate_keys(DownloadableFileSource.COMMON_CONFIG_KEYS + ["base-dir"])

    def preflight(self):
        """Look up the host 'lzip' tool when the url ends with '.lz'."""
        self.host_lzip = None
        if self.url.endswith(".lz"):
            self.host_lzip = utils.get_host_tool("lzip")

    def get_unique_key(self):
        """Return the parent's unique key extended with the base-dir pattern."""
        return super().get_unique_key() + [self.base_dir]

    @contextmanager
    def _run_lzip(self):
        """Yield a temporary file holding the lzip-decompressed mirror file.

        The temporary file is rewound to its start before being yielded
        and is closed when the context exits.
        """
        assert self.host_lzip
        with TemporaryFile() as lzip_stdout:
            # The mirror file is compressed binary data, open it as such.
            with open(self._get_mirror_file(), "rb") as lzip_file:
                self.call([self.host_lzip, "-d"], stdin=lzip_file, stdout=lzip_stdout)

            lzip_stdout.seek(0, 0)
            yield lzip_stdout

    @contextmanager
    def _get_tar(self):
        """Yield an open `TarFile` for the mirror file.

        Urls ending in '.lz' are first decompressed through lzip and then
        opened as an uncompressed tar stream; anything else is opened
        directly from the mirror file. Members are always represented
        by `ReadableTarInfo`.
        """
        if self.url.endswith(".lz"):
            with self._run_lzip() as lzip_dec:
                with tarfile.open(fileobj=lzip_dec, mode="r:", tarinfo=ReadableTarInfo) as tar:
                    yield tar
        else:
            with tarfile.open(self._get_mirror_file(), tarinfo=ReadableTarInfo) as tar:
                yield tar

    def stage(self, directory):
        """Extract the archive into *directory*.

        When a base directory is configured and found, only its content
        is extracted, with the base directory prefix removed.

        Raises:
           SourceError: if reading or extracting the archive fails, or
                        if no directory matches the base-dir pattern.
        """
        try:
            with self._get_tar() as tar:
                base_dir = None
                if self.base_dir:
                    base_dir = self._find_base_dir(tar, self.base_dir)

                if base_dir:
                    tar.extractall(path=directory, members=self._extract_members(tar, base_dir, directory))
                else:
                    # NOTE(review): this branch extracts every member without
                    # the path checks performed by _extract_members(); consider
                    # applying the same checks (or a tarfile extraction filter).
                    tar.extractall(path=directory)

        except (tarfile.TarError, OSError) as e:
            raise SourceError("{}: Error staging source: {}".format(self, e)) from e

    # Override and translate which filenames to extract
    def _extract_members(self, tar, base_dir, target_dir):
        """Yield the members located under *base_dir*, rebased onto the root.

        Each yielded member has the *base_dir* prefix removed from its
        path (and from its link name, for hardlinks pointing inside
        *base_dir*), so that it extracts directly into *target_dir*.

        Raises:
           SourceError: if a member or a hardlink target would end up
                        outside of *target_dir*.
        """
        # Normalize once, so that the containment checks below compare
        # like with like regardless of how target_dir was spelled.
        target_root = os.path.abspath(target_dir)

        # Assert that a tarfile is safe to extract; specifically, make
        # sure that we don't do anything outside of the target
        # directory (this is possible, if, say, someone engineered a
        # tarfile to contain paths that start with ..).
        def assert_safe(member):
            final_path = os.path.abspath(os.path.join(target_root, member.path))
            if not _is_within(target_root, final_path):
                raise SourceError(
                    "{}: Tarfile attempts to extract outside the staging area: "
                    "{} -> {}".format(self, member.path, final_path)
                )

            if member.islnk():
                linked_path = os.path.abspath(os.path.join(target_root, member.linkname))
                if not _is_within(target_root, linked_path):
                    raise SourceError(
                        "{}: Tarfile attempts to hardlink outside the staging area: "
                        "{} -> {}".format(self, member.path, linked_path)
                    )

            # Don't need to worry about symlinks because they're just
            # files here and won't be able to do much harm once we are
            # in a sandbox.

        if not base_dir.endswith(os.sep):
            base_dir = base_dir + os.sep

        prefix_len = len(base_dir)
        for member in tar.getmembers():

            # First, ensure that a member never starts with `./`
            if member.path.startswith("./"):
                member.path = member.path[2:]
            if member.islnk() and member.linkname.startswith("./"):
                member.linkname = member.linkname[2:]

            # Now extract only the paths which match the normalized path
            if member.path.startswith(base_dir):
                # Hardlinks are smart and collapse into the "original"
                # when their counterpart doesn't exist. This means we
                # only need to modify links to files whose location we
                # change.
                #
                # Since we assert that we're not linking to anything
                # outside the target directory, this should only ever
                # be able to link to things inside the target
                # directory, so we should cover all bases doing this.
                #
                if member.islnk() and member.linkname.startswith(base_dir):
                    member.linkname = member.linkname[prefix_len:]

                member.path = member.path[prefix_len:]

                assert_safe(member)
                yield member

    # We want to iterate over all paths of a tarball, but getmembers()
    # is not enough because some tarballs simply do not contain the leading
    # directory paths for the archived files.
    def _list_tar_paths(self, tar):
        """Yield the directory paths of *tar*, including implied ones.

        Directory members are yielded by name. For every other member,
        each leading directory of its path which has no member of its
        own in the archive is yielded once.
        """
        visited = set()
        for member in tar.getmembers():

            # Remove a leading './', offer more consistent behavior
            # across tarballs encoded with or without a leading '.'
            #
            # Only the literal prefix is removed: str.lstrip("./") would
            # strip any run of '.' and '/' characters, mangling names
            # such as '.hidden/file' and turning '.' into ''.
            member_name = member.name
            if member_name.startswith("./"):
                member_name = member_name[2:]

            if not member.isdir():

                # Loop over the components of a path, for a path of a/b/c/d
                # we will first visit 'a', then 'a/b' and then 'a/b/c', excluding
                # the final component
                components = member_name.split("/")
                for depth in range(1, len(components)):
                    dir_component = "/".join(components[:depth])
                    if dir_component in visited:
                        continue

                    visited.add(dir_component)
                    try:
                        # Dont yield directory members which actually do
                        # exist in the archive
                        tar.getmember(dir_component)
                    except KeyError:
                        if dir_component != ".":
                            yield dir_component

                continue

            # Avoid considering the '.' directory, if any is included in the archive
            # this is to avoid the default 'base-dir: *' value behaving differently
            # depending on whether the tarball was encoded with a leading '.' or not
            if member_name == ".":
                continue

            yield member_name

    def _find_base_dir(self, tar, pattern):
        """Return the first path of *tar*, in sorted order, matching *pattern*.

        Raises:
           SourceError: if no path in the archive matches *pattern*.
        """
        paths = self._list_tar_paths(tar)
        matches = sorted(utils.glob(paths, pattern))
        if not matches:
            raise SourceError("{}: Could not find base directory matching pattern: {}".format(self, pattern))

        return matches[0]


def setup():
    """Plugin entry point: return the source class implemented by this module."""
    return TarSource