#
# Copyright (C) 2017 Codethink Limited
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see .
#
# Authors:
# Jonathan Maw
"""
tar - stage files from tar archives
===================================
**Host dependencies:**
* lzip (for .tar.lz files)
**Usage:**
.. code:: yaml
# Specify the tar source kind
kind: tar
# Specify the tar url. Using an alias defined in your project
# configuration is encouraged. 'bst source track' will update the
# sha256sum in 'ref' to the downloaded file's sha256sum.
url: upstream:foo.tar
# Specify the ref. It's a sha256sum of the file you download.
ref: 6c9f6f68a131ec6381da82f2bff978083ed7f4f7991d931bfa767b7965ebc94b
# Specify a glob pattern to indicate the base directory to extract
# from the tarball. The first matching directory will be used.
#
# Note that this is '*' by default since most standard release
# tarballs contain a self named subdirectory at the root which
# contains the files one normally wants to extract to build.
#
# To extract the root of the tarball directly, this can be set
# to an empty string.
base-dir: '*'
See :ref:`built-in functionality doumentation ` for
details on common configuration options for sources.
"""
import os
import tarfile
from contextlib import contextmanager
from tempfile import TemporaryFile
from buildstream import SourceError
from buildstream import utils
from ._downloadablefilesource import DownloadableFileSource
class ReadableTarInfo(tarfile.TarInfo):
"""
The goal is to override `TarFile`'s `extractall` semantics by ensuring that on extraction, the
files are readable by the owner of the file. This is done by overriding the accessor for the
`mode` attribute in `TarInfo`, the class that encapsulates the internal meta-data of the tarball,
so that the owner-read bit is always set.
"""
@property
def mode(self):
# Respect umask instead of the file mode stored in the archive.
# The only bit used from the embedded mode is the executable bit for files.
umask = utils.get_umask()
if self.isdir() or bool(self.__permission | 0o100):
return 0o777 & ~umask
else:
return 0o666 & ~umask
@mode.setter
def mode(self, permission):
self.__permission = permission # pylint: disable=attribute-defined-outside-init
class TarSource(DownloadableFileSource):
# pylint: disable=attribute-defined-outside-init
def configure(self, node):
super().configure(node)
self.base_dir = node.get_str("base-dir", "*")
node.validate_keys(DownloadableFileSource.COMMON_CONFIG_KEYS + ["base-dir"])
def preflight(self):
self.host_lzip = None
if self.url.endswith(".lz"):
self.host_lzip = utils.get_host_tool("lzip")
def get_unique_key(self):
return super().get_unique_key() + [self.base_dir]
@contextmanager
def _run_lzip(self):
assert self.host_lzip
with TemporaryFile() as lzip_stdout:
with open(self._get_mirror_file(), "r") as lzip_file:
self.call([self.host_lzip, "-d"], stdin=lzip_file, stdout=lzip_stdout)
lzip_stdout.seek(0, 0)
yield lzip_stdout
@contextmanager
def _get_tar(self):
if self.url.endswith(".lz"):
with self._run_lzip() as lzip_dec:
with tarfile.open(fileobj=lzip_dec, mode="r:", tarinfo=ReadableTarInfo) as tar:
yield tar
else:
with tarfile.open(self._get_mirror_file(), tarinfo=ReadableTarInfo) as tar:
yield tar
def stage(self, directory):
try:
with self._get_tar() as tar:
base_dir = None
if self.base_dir:
base_dir = self._find_base_dir(tar, self.base_dir)
if base_dir:
tar.extractall(path=directory, members=self._extract_members(tar, base_dir, directory))
else:
tar.extractall(path=directory)
except (tarfile.TarError, OSError) as e:
raise SourceError("{}: Error staging source: {}".format(self, e)) from e
# Override and translate which filenames to extract
def _extract_members(self, tar, base_dir, target_dir):
# Assert that a tarfile is safe to extract; specifically, make
# sure that we don't do anything outside of the target
# directory (this is possible, if, say, someone engineered a
# tarfile to contain paths that start with ..).
def assert_safe(member):
final_path = os.path.abspath(os.path.join(target_dir, member.path))
if not final_path.startswith(target_dir):
raise SourceError(
"{}: Tarfile attempts to extract outside the staging area: "
"{} -> {}".format(self, member.path, final_path)
)
if member.islnk():
linked_path = os.path.abspath(os.path.join(target_dir, member.linkname))
if not linked_path.startswith(target_dir):
raise SourceError(
"{}: Tarfile attempts to hardlink outside the staging area: "
"{} -> {}".format(self, member.path, final_path)
)
# Don't need to worry about symlinks because they're just
# files here and won't be able to do much harm once we are
# in a sandbox.
if not base_dir.endswith(os.sep):
base_dir = base_dir + os.sep
L = len(base_dir)
for member in tar.getmembers():
# First, ensure that a member never starts with `./`
if member.path.startswith("./"):
member.path = member.path[2:]
if member.islnk() and member.linkname.startswith("./"):
member.linkname = member.linkname[2:]
# Now extract only the paths which match the normalized path
if member.path.startswith(base_dir):
# Hardlinks are smart and collapse into the "original"
# when their counterpart doesn't exist. This means we
# only need to modify links to files whose location we
# change.
#
# Since we assert that we're not linking to anything
# outside the target directory, this should only ever
# be able to link to things inside the target
# directory, so we should cover all bases doing this.
#
if member.islnk() and member.linkname.startswith(base_dir):
member.linkname = member.linkname[L:]
member.path = member.path[L:]
assert_safe(member)
yield member
# We want to iterate over all paths of a tarball, but getmembers()
# is not enough because some tarballs simply do not contain the leading
# directory paths for the archived files.
def _list_tar_paths(self, tar):
visited = set()
for member in tar.getmembers():
# Remove any possible leading './', offer more consistent behavior
# across tarballs encoded with or without a leading '.'
member_name = member.name.lstrip("./")
if not member.isdir():
# Loop over the components of a path, for a path of a/b/c/d
# we will first visit 'a', then 'a/b' and then 'a/b/c', excluding
# the final component
components = member_name.split("/")
for i in range(len(components) - 1):
dir_component = "/".join([components[j] for j in range(i + 1)])
if dir_component not in visited:
visited.add(dir_component)
try:
# Dont yield directory members which actually do
# exist in the archive
_ = tar.getmember(dir_component)
except KeyError:
if dir_component != ".":
yield dir_component
continue
# Avoid considering the '.' directory, if any is included in the archive
# this is to avoid the default 'base-dir: *' value behaving differently
# depending on whether the tarball was encoded with a leading '.' or not
if member_name == ".":
continue
yield member_name
def _find_base_dir(self, tar, pattern):
paths = self._list_tar_paths(tar)
matches = sorted(list(utils.glob(paths, pattern)))
if not matches:
raise SourceError("{}: Could not find base directory matching pattern: {}".format(self, pattern))
return matches[0]
def setup():
return TarSource