#
#  Copyright (C) 2016-2018 Codethink Limited
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU Lesser General Public
#  License as published by the Free Software Foundation; either
#  version 2 of the License, or (at your option) any later version.
#
#  This library is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  Lesser General Public License for more details.
#
#  You should have received a copy of the GNU Lesser General Public
#  License along with this library. If not, see <http://www.gnu.org/licenses/>.
#
#  Authors:
#        Tristan Van Berkom <tristan.vanberkom@codethink.co.uk>

"""
Utilities
=========
"""
import calendar
25
import errno
26 27
import functools
import gc
28
import hashlib
Gökçen Nurlu's avatar
Gökçen Nurlu committed
29
import os
30
import re
Gökçen Nurlu's avatar
Gökçen Nurlu committed
31 32 33
import shutil
import signal
import stat
34
from stat import S_ISDIR
Gökçen Nurlu's avatar
Gökçen Nurlu committed
35 36
import string
import subprocess
37
import tempfile
38
import itertools
39
from contextlib import contextmanager
Gökçen Nurlu's avatar
Gökçen Nurlu committed
40 41 42

import psutil

Jürg Billeter's avatar
Jürg Billeter committed
43
from . import _signals
44
from ._exceptions import BstError, ErrorDomain
45
from ._protos.build.bazel.remote.execution.v2 import remote_execution_pb2
46

47 48 49
# The magic number for timestamps: 2011-11-11 11:11:11
_magic_timestamp = calendar.timegm([2011, 11, 11, 11, 11, 11])


# The separator we use for user specified aliases
_ALIAS_SEPARATOR = ':'

# The URI schemes we recognize when inspecting source URLs
_URI_SCHEMES = ["http", "https", "ftp", "file", "git", "sftp", "ssh"]
54 55


56 57 58 59 60 61 62 63
class UtilError(BstError):
    """Raised by utility functions when system calls fail.

    This will be handled internally by the BuildStream core,
    if you need to handle this error, then it should be reraised,
    or either of the :class:`.ElementError` or :class:`.SourceError`
    exceptions should be raised from this error.
    """

    def __init__(self, message, reason=None):
        super().__init__(message, reason=reason, domain=ErrorDomain.UTIL)
66 67 68 69 70 71 72


class ProgramNotFoundError(BstError):
    """Raised if a required program is not found.

    It is normally unneeded to handle this exception from plugin code.
    """

    def __init__(self, message, reason=None):
        super().__init__(message, reason=reason, domain=ErrorDomain.PROG_NOT_FOUND)
75 76


77 78 79 80 81
class DirectoryExistsError(OSError):
    """Raised when a `os.rename` is attempted but the destination is an existing directory."""


82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
class FileListResult():
    """An object which stores the result of one of the operations
    which run on a list of files.
    """

    def __init__(self):

        self.overwritten = []
        """List of files which were overwritten in the target directory"""

        self.ignored = []
        """List of files which were ignored, because they would have
        replaced a non empty directory"""

        self.failed_attributes = []
        """List of files for which attributes could not be copied over"""

        self.files_written = []
        """List of files that were written."""

    def combine(self, other):
        """Create a new FileListResult that contains the results of both.
        """
        combined = FileListResult()

        # Concatenate each of the tracked lists pairwise
        for field in ('overwritten', 'ignored', 'failed_attributes', 'files_written'):
            setattr(combined, field, getattr(self, field) + getattr(other, field))

        return combined

114

115
def list_relative_paths(directory):
    """A generator for walking directory relative paths

    This generator is useful for checking the full manifest of
    a directory.

    Symbolic links will not be followed, but will be included
    in the manifest.

    Args:
       directory (str): The directory to list files in

    Yields:
       Relative filenames in `directory`
    """
    for (dirpath, dirnames, filenames) in os.walk(directory):

        # os.walk() does not descend into symlinked directories, but it
        # still reports them as subdirectories in `dirnames`; treat any
        # symlink to a directory as a file so that it shows up in the
        # manifest.
        filenames.extend(d for d in dirnames
                         if os.path.islink(os.path.join(dirpath, d)))

        # Sorting `dirnames` in place also fixes the order in which
        # os.walk() will visit the subdirectories.
        dirnames.sort()
        filenames.sort()

        relpath = os.path.relpath(dirpath, directory)

        # Avoid a leading "./" on the entries in the root of `directory`,
        # prefer no prefix at all in that case.
        basepath = '' if relpath == '.' or dirpath == directory else relpath

        # First yield the walked directory itself, except for the root
        if basepath:
            yield basepath

        # Then yield each file within the walked directory
        for filename in filenames:
            yield os.path.join(basepath, filename)
164 165


166
def glob(paths, pattern):
    r"""A generator to yield paths which match the glob pattern

    Args:
       paths (iterable): The paths to check
       pattern (str): A glob pattern

    This generator will iterate over the passed *paths* and
    yield only the filenames which matched the provided *pattern*.

    +--------+------------------------------------------------------------------+
    | Meta   | Description                                                      |
    +========+==================================================================+
    | \*     | Zero or more of any character, excepting path separators         |
    +--------+------------------------------------------------------------------+
    | \**    | Zero or more of any character, including path separators         |
    +--------+------------------------------------------------------------------+
    | ?      | One of any character, except for path separators                 |
    +--------+------------------------------------------------------------------+
    | [abc]  | One of any of the specified characters                           |
    +--------+------------------------------------------------------------------+
    | [a-z]  | One of the characters in the specified range                     |
    +--------+------------------------------------------------------------------+
    | [!abc] | Any single character, except the specified characters            |
    +--------+------------------------------------------------------------------+
    | [!a-z] | Any single character, except those in the specified range        |
    +--------+------------------------------------------------------------------+

    .. note::

       Escaping of the metacharacters is not possible
    """
    # NOTE: The docstring is a raw string because ``\*`` is an invalid
    # escape sequence (a SyntaxWarning on modern Python); the rendered
    # ``__doc__`` text is unchanged and the former
    # ``pylint: disable=anomalous-backslash-in-string`` is no longer needed.

    # Ensure leading slash, just because we want patterns
    # to match file lists regardless of whether the patterns
    # or file lists had a leading slash or not.
    if not pattern.startswith(os.sep):
        pattern = os.sep + pattern

    expression = _glob2re(pattern)
    regexer = re.compile(expression)

    for filename in paths:
        filename_try = filename
        if not filename_try.startswith(os.sep):
            filename_try = os.sep + filename_try

        if regexer.match(filename_try):
            yield filename


218 219 220 221 222 223 224 225 226 227
def sha256sum(filename):
    """Calculate the sha256sum of a file

    Args:
       filename (str): A path to a file on disk

    Returns:
       (str): An sha256 checksum string

    Raises:
       UtilError: In the case there was an issue opening
                  or reading `filename`
    """
    digest = hashlib.sha256()
    try:
        with open(filename, "rb") as f:
            # Feed the digest in fixed sized chunks, so that very
            # large files need not be held in memory.
            while True:
                chunk = f.read(65536)
                if not chunk:
                    break
                digest.update(chunk)

    except OSError as e:
        raise UtilError("Failed to get a checksum of file '{}': {}"
                        .format(filename, e)) from e

    return digest.hexdigest()


244
def safe_copy(src, dest, *, result=None):
    """Copy a file while preserving attributes

    Args:
       src (str): The source filename
       dest (str): The destination filename
       result (:class:`~.FileListResult`): An optional collective result

    Raises:
       UtilError: In the case of unexpected system call failures

    This is almost the same as shutil.copy2(), except that
    we unlink *dest* before overwriting it if it exists, just
    incase *dest* is a hardlink to a different file.
    """
    # Remove the target first in case it is a hardlink to
    # some unrelated file.
    try:
        os.unlink(dest)
    except FileNotFoundError:
        # Nothing to remove
        pass
    except OSError as e:
        raise UtilError("Failed to remove destination file '{}': {}"
                        .format(dest, e)) from e

    shutil.copyfile(src, dest)
    try:
        shutil.copystat(src, dest)
    except PermissionError:
        # Failing to copy file attributes over is not fatal; this has
        # a tendency of happening when attempting to copy over extended
        # file attributes. Record it so the caller can issue a warning.
        if result:
            result.failed_attributes.append(dest)

    except shutil.Error as e:
        raise UtilError("Failed to copy '{} -> {}': {}"
                        .format(src, dest, e)) from e
283 284


285
def safe_link(src, dest, *, result=None, _unlink=False):
    """Try to create a hardlink, but resort to copying in the case of cross device links.

    Args:
       src (str): The source filename
       dest (str): The destination filename
       result (:class:`~.FileListResult`): An optional collective result

    Raises:
       UtilError: In the case of unexpected system call failures
    """

    if _unlink:
        # A previous attempt collided with an existing file,
        # clear the way before retrying.
        try:
            os.unlink(dest)
        except OSError as e:
            if e.errno != errno.ENOENT:
                raise UtilError("Failed to remove destination file '{}': {}"
                                .format(dest, e)) from e

    try:
        os.link(src, dest)
    except OSError as e:
        if e.errno == errno.EXDEV:
            # Hardlinks cannot cross devices, fall back to copying
            safe_copy(src, dest)
        elif e.errno == errno.EEXIST and not _unlink:
            # The target exists already, unlink it and retry exactly once
            safe_link(src, dest, result=result, _unlink=True)
        else:
            raise UtilError("Failed to link '{} -> {}': {}"
                            .format(src, dest, e)) from e
318 319


320 321 322 323
def safe_remove(path):
    """Removes a file or directory

    This will remove a file if it exists, and will
    remove a directory if the directory is empty.

    Args:
       path (str): The path to remove

    Returns:
       True if `path` was removed or did not exist, False
       if `path` was a non empty directory.

    Raises:
       UtilError: In the case of unexpected system call failures
    """
    try:
        if S_ISDIR(os.lstat(path).st_mode):
            os.rmdir(path)
        else:
            os.unlink(path)

        # File removed/unlinked successfully
        return True

    except OSError as e:
        if e.errno == errno.ENOTEMPTY:
            # Path is non-empty directory
            return False
        elif e.errno == errno.ENOENT:
            # Path does not exist
            return True

        # Chain the original error so the underlying cause is not lost
        # (previously raised without `from e`, unlike the rest of this module)
        raise UtilError("Failed to remove '{}': {}"
                        .format(path, e)) from e
355 356


357
def copy_files(src, dest, *, filter_callback=None, ignore_missing=False, report_written=False):
    """Copy files from source to destination.

    Args:
       src (str): The source file or directory
       dest (str): The destination directory
       filter_callback (callable): Optional filter callback. Called with the relative path as
                                   argument for every file in the source directory. The file is
                                   copied only if the callable returns True. If no filter callback
                                   is specified, all files will be copied.
       ignore_missing (bool): Dont raise any error if a source file is missing
       report_written (bool): Add to the result object the full list of files written

    Returns:
       (:class:`~.FileListResult`): The result describing what happened during this file operation

    Raises:
       UtilError: In the case of unexpected system call failures

    .. note::

       Directories in `dest` are replaced with files from `src`,
       unless the existing directory in `dest` is not empty in which
       case the path will be reported in the return value.

       UNIX domain socket files from `src` are ignored.
    """
    result = FileListResult()
    try:
        _process_list(src, dest, safe_copy, result,
                      filter_callback=filter_callback,
                      ignore_missing=ignore_missing,
                      report_written=report_written)
    except OSError as e:
        # Chain the original OSError so the underlying cause is preserved
        # (previously raised without `from e`)
        raise UtilError("Failed to copy '{} -> {}': {}"
                        .format(src, dest, e)) from e

    return result
394 395


396
def link_files(src, dest, *, filter_callback=None, ignore_missing=False, report_written=False):
    """Hardlink files from source to destination.

    Args:
       src (str): The source file or directory
       dest (str): The destination directory
       filter_callback (callable): Optional filter callback. Called with the relative path as
                                   argument for every file in the source directory. The file is
                                   hardlinked only if the callable returns True. If no filter
                                   callback is specified, all files will be hardlinked.
       ignore_missing (bool): Dont raise any error if a source file is missing
       report_written (bool): Add to the result object the full list of files written

    Returns:
       (:class:`~.FileListResult`): The result describing what happened during this file operation

    Raises:
       UtilError: In the case of unexpected system call failures

    .. note::

       Directories in `dest` are replaced with files from `src`,
       unless the existing directory in `dest` is not empty in which
       case the path will be reported in the return value.

    .. note::

       If a hardlink cannot be created due to crossing filesystems,
       then the file will be copied instead.

       UNIX domain socket files from `src` are ignored.
    """
    result = FileListResult()
    try:
        _process_list(src, dest, safe_link, result,
                      filter_callback=filter_callback,
                      ignore_missing=ignore_missing,
                      report_written=report_written)
    except OSError as e:
        # Chain the original OSError so the underlying cause is preserved
        # (previously raised without `from e`)
        raise UtilError("Failed to link '{} -> {}': {}"
                        .format(src, dest, e)) from e

    return result
439 440


441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456
def get_host_tool(name):
    """Get the full path of a host tool

    Args:
       name (str): The name of the program to search for

    Returns:
       The full path to the program, if found

    Raises:
       :class:`.ProgramNotFoundError`
    """
    search_path = os.environ.get('PATH')
    program_path = shutil.which(name, path=search_path)

    if program_path is None:
        raise ProgramNotFoundError(
            "Did not find '{}' in PATH: {}".format(name, search_path))

    return program_path


462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478
def url_directory_name(url):
    """Normalizes a url into a directory name

    Args:
       url (str): A url string

    Returns:
       A string which can be used as a directory name
    """
    # Any character outside this set is mapped to an underscore
    valid_chars = string.digits + string.ascii_letters + '%_'

    return ''.join(c if c in valid_chars else '_' for c in url)


479 480 481 482 483 484 485 486
def get_bst_version():
    """Gets the major, minor release portion of the
    BuildStream version.

    Returns:
       (int): The major version
       (int): The minor version

    Raises:
       UtilError: If the version is missing, or cannot be parsed
                  into major/minor integer components
    """
    # Import this only conditionally, it's not resolved at bash complete time
    from . import __version__  # pylint: disable=cyclic-import
    versions = __version__.split('.')[:2]

    if versions[0] == '0+untagged':
        raise UtilError("Your git repository has no tags - BuildStream can't "
                        "determine its version. Please run `git fetch --tags`.")

    try:
        return (int(versions[0]), int(versions[1]))
    except IndexError as e:
        # Chain the original error so the cause is preserved
        # (previously raised without `from e`)
        raise UtilError("Cannot detect Major and Minor parts of the version\n"
                        "Version: {} not in XX.YY.whatever format"
                        .format(__version__)) from e
    except ValueError as e:
        raise UtilError("Cannot convert version to integer numbers\n"
                        "Version: {} not in Integer.Integer.whatever format"
                        .format(__version__)) from e
505 506


507
def move_atomic(source, destination, *, ensure_parents=True):
    """Move the source to the destination using atomic primitives.

    This uses `os.rename` to move a file or directory to a new destination.
    It wraps some `OSError` thrown errors to ensure their handling is correct.

    The main reason for this to exist is that rename can throw different errors
    for the same symptom (https://www.unix.com/man-page/POSIX/3posix/rename/)
    when we are moving a directory.

    We are especially interested here in the case when the destination already
    exists, is a directory and is not empty. In this case, either EEXIST or
    ENOTEMPTY can be thrown.

    In order to ensure consistent handling of these exceptions, this function
    should be used instead of `os.rename`

    Args:
      source (str or Path): source to rename
      destination (str or Path): destination to which to move the source
      ensure_parents (bool): Whether or not to create the parent's directories
                             of the destination (default: True)

    Raises:
      DirectoryExistsError: if the destination directory already exists and is
                            not empty
      OSError: if another filesystem level error occured
    """
    src, dst = str(source), str(destination)

    if ensure_parents:
        os.makedirs(os.path.dirname(dst), exist_ok=True)

    try:
        os.rename(src, dst)
    except OSError as exc:
        # Normalize both EEXIST and ENOTEMPTY to a single exception type
        if exc.errno not in (errno.EEXIST, errno.ENOTEMPTY):
            raise
        raise DirectoryExistsError(*exc.args) from exc


545 546
@contextmanager
def save_file_atomic(filename, mode='w', *, buffering=-1, encoding=None,
                     errors=None, newline=None, closefd=True, opener=None, tempdir=None):
    """Save a file with a temporary name and rename it into place when ready.

    This is a context manager which is meant for saving data to files.
    The data is written to a temporary file, which gets renamed to the target
    name when the context is closed. This avoids readers of the file from
    getting an incomplete file.

    **Example:**

    .. code:: python

      with save_file_atomic('/path/to/foo', 'w') as f:
          f.write(stuff)

    The file will be called something like ``tmpCAFEBEEF`` until the
    context block ends, at which point it gets renamed to ``foo``. The
    temporary file will be created in the same directory as the output file.
    The ``filename`` parameter must be an absolute path.

    If an exception occurs or the process is terminated, the temporary file will
    be deleted.
    """
    # This feature has been proposed for upstream Python in the past, e.g.:
    # https://bugs.python.org/issue8604

    assert os.path.isabs(filename), "The utils.save_file_atomic() parameter ``filename`` must be an absolute path"

    if tempdir is None:
        tempdir = os.path.dirname(filename)

    fd, tempname = tempfile.mkstemp(dir=tempdir)
    os.close(fd)

    f = open(tempname, mode=mode, buffering=buffering, encoding=encoding,
             errors=errors, newline=newline, closefd=closefd, opener=opener)

    def discard_tempfile():
        # Best effort removal of the partially written temporary file
        f.close()
        try:
            os.remove(tempname)
        except FileNotFoundError:
            pass
        except OSError as e:
            raise UtilError("Failed to cleanup temporary file {}: {}".format(tempname, e)) from e

    try:
        # Also discard the temporary file if we get terminated
        with _signals.terminator(discard_tempfile):
            f.real_filename = filename
            yield f
            f.close()
            # This rename is atomic, at least on platforms we care about:
            # https://bugs.python.org/issue8828
            os.replace(tempname, filename)
    except Exception:
        discard_tempfile()
        raise


604 605 606 607
# _get_dir_size():
#
# Get the disk usage of a given directory in bytes.
#
# This function assumes that files do not inadvertantly
# disappear while this function is running.
#
# Arguments:
#     (str) The path whose size to check.
#
# Returns:
#     (int) The size on disk in bytes.
#
def _get_dir_size(path):
    path = os.path.abspath(path)

    # Sum the sizes of all entries, recursing into real (non symlink)
    # subdirectories.
    def accumulate(directory):
        size = 0
        for entry in os.scandir(directory):
            size += entry.stat(follow_symlinks=False).st_size
            if entry.is_dir(follow_symlinks=False):
                size += accumulate(entry.path)
        return size

    return accumulate(path)


634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654
# _get_volume_size():
#
# Gets the overall usage and total size of a mounted filesystem in bytes.
#
# Args:
#    path (str): The path to check
#
# Returns:
#    (int): The total number of bytes on the volume
#    (int): The number of available bytes on the volume
#
def _get_volume_size(path):
    try:
        vfs = os.statvfs(path)
    except OSError as e:
        raise UtilError("Failed to retrieve stats on volume for path '{}': {}"
                        .format(path, e)) from e

    # Scale the block counts by the filesystem block size to get bytes
    return vfs.f_bsize * vfs.f_blocks, vfs.f_bsize * vfs.f_bavail


655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688
# _parse_size():
#
# Convert a string representing data size to a number of
# bytes. E.g. "2K" -> 2048.
#
# This uses the same format as systemd's
# [resource-control](https://www.freedesktop.org/software/systemd/man/systemd.resource-control.html#).
#
# Arguments:
#     size (str) The string to parse
#     volume (str) A path on the volume to consider for percentage
#                  specifications
#
# Returns:
#     (int|None) The number of bytes, or None if 'infinity' was specified.
#
# Raises:
#     UtilError if the string is not a valid data size.
#
def _parse_size(size, volume):
    if size == 'infinity':
        return None

    match = re.fullmatch(r'([0-9]+\.?[0-9]*)([KMGT%]?)', size)
    if match is None:
        raise UtilError("{} is not a valid data size.".format(size))

    num, unit = match.group(1), match.group(2)

    if unit == '%':
        percentage = float(num)
        if percentage > 100:
            raise UtilError("{}% is not a valid percentage value.".format(percentage))

        # Percentages are interpreted relative to the total volume size
        disk_size, _ = _get_volume_size(volume)
        return disk_size * (percentage / 100)

    multipliers = ('', 'K', 'M', 'G', 'T')
    return int(num) * 1024 ** multipliers.index(unit)


697 698
# _pretty_size()
#
# Converts a number of bytes into a string representation in KiB, MiB, GiB, TiB
# represented as K, M, G, T etc.
#
# Args:
#   size (int): The size to convert in bytes.
#   dec_places (int): The number of decimal places to output to.
#
# Returns:
#   (str): The string representation of the number of bytes in the largest
def _pretty_size(size, dec_places=0):
    units = ('B', 'K', 'M', 'G', 'T')

    # Divide down until the value fits the unit, or we run out of units
    value = size
    index = 0
    while value >= 1024 and index < len(units) - 1:
        value /= 1024
        index += 1

    return "{size:g}{unit}".format(size=round(value, dec_places), unit=units[index])

Javier Jardón's avatar
Javier Jardón committed
719

720 721 722 723 724 725 726 727 728 729 730 731
# Main process pid, recorded at module import time
_main_pid = os.getpid()


# _is_main_process()
#
# Return whether we are in the main process or not.
#
def _is_main_process():
    assert _main_pid is not None
    return _main_pid == os.getpid()

732

733 734
# Recursively remove directories, ignoring file permissions as much as
# possible.
#
# Args:
#    rootpath (str): The directory tree to remove
#    kwargs: Forwarded to shutil.rmtree()
#
# Raises:
#    UtilError: If permissions cannot be fixed up, or removal fails
#
def _force_rmtree(rootpath, **kwargs):
    # First ensure that every subdirectory is writable, so that
    # removal of read-only directory trees does not fail.
    for root, dirs, _ in os.walk(rootpath):
        for d in dirs:
            path = os.path.join(root, d.lstrip('/'))
            if os.path.exists(path) and not os.path.islink(path):
                try:
                    os.chmod(path, 0o755)
                except OSError as e:
                    # Chain the original error so the cause is preserved
                    # (previously raised without `from e`)
                    raise UtilError("Failed to ensure write permission on file '{}': {}"
                                    .format(path, e)) from e

    try:
        shutil.rmtree(rootpath, **kwargs)
    except OSError as e:
        raise UtilError("Failed to remove cache directory '{}': {}"
                        .format(rootpath, e)) from e
751 752 753


# Recursively make directories in target area
#
# Yields (directory, mode) tuples for each directory created, so
# that the caller can apply permissions after files are written.
def _copy_directories(srcdir, destdir, target):
    this_dir = os.path.dirname(target)
    new_dir = os.path.join(destdir, this_dir)

    # Nothing to do if the destination directory already exists
    if os.path.lexists(new_dir):
        return

    # Create the parent directories first
    if this_dir:
        yield from _copy_directories(srcdir, destdir, this_dir)

    old_dir = os.path.join(srcdir, this_dir)
    if not os.path.lexists(old_dir):
        return

    mode = os.lstat(old_dir).st_mode
    if not (stat.S_ISDIR(mode) or stat.S_ISLNK(mode)):
        raise UtilError('Source directory tree has file where '
                        'directory expected: {}'.format(old_dir))

    os.makedirs(new_dir)
    yield (new_dir, mode)
773 774


775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799
# _ensure_real_directory()
#
# Ensure `path` is a real directory and there are no symlink components.
#
# Symlink components are allowed in `root`.
#
def _ensure_real_directory(root, path):
    destpath = root

    # NOTE: os.path.split() only separates the final component from its
    # head, so iterating over it previously checked at most two entries
    # and never validated the intermediate components of deeper paths.
    # Split on the path separator so every component is validated.
    for name in path.split(os.sep):
        destpath = os.path.join(destpath, name)
        try:
            deststat = os.lstat(destpath)
            if not stat.S_ISDIR(deststat.st_mode):
                relpath = destpath[len(root):]

                if stat.S_ISLNK(deststat.st_mode):
                    filetype = 'symlink'
                elif stat.S_ISREG(deststat.st_mode):
                    filetype = 'regular file'
                else:
                    filetype = 'special file'

                raise UtilError('Destination is a {}, not a directory: {}'.format(filetype, relpath))
        except FileNotFoundError:
            os.makedirs(destpath)
800 801


802 803 804 805 806 807 808 809 810 811 812
# _process_list()
#
# Internal helper for copying/moving/linking file lists
#
# This will handle directories, symlinks and special files
# internally, the `actionfunc` will only be called for regular files.
#
# Args:
#    srcdir: The source base directory
#    destdir: The destination base directory
#    actionfunc: The function to call for regular files
#    result: The FileListResult
#    filter_callback: Optional callback to invoke for every directory entry
#    ignore_missing: Don't raise any error if a source file is missing
#    report_written: Whether to collect written files in result.files_written
#
def _process_list(srcdir, destdir, actionfunc, result,
                  filter_callback=None,
                  ignore_missing=False, report_written=False):

    # Keep track of directory permissions, since these need to be set
    # *after* files have been written.
    permissions = []

    filelist = list_relative_paths(srcdir)

    if filter_callback:
        filelist = [path for path in filelist if filter_callback(path)]

    # Now walk the list
    for path in filelist:
        srcpath = os.path.join(srcdir, path)
        destpath = os.path.join(destdir, path)

        # Ensure that the parent of the destination path exists without symlink
        # components.
        _ensure_real_directory(destdir, os.path.dirname(path))

        # Add to the results the list of files written
        if report_written:
            result.files_written.append(path)

        # Collect overlaps
        if os.path.lexists(destpath) and not os.path.isdir(destpath):
            result.overwritten.append(path)

        # The destination directory may not have been created separately
        permissions.extend(_copy_directories(srcdir, destdir, path))

        try:
            file_stat = os.lstat(srcpath)
            mode = file_stat.st_mode

        except FileNotFoundError as e:
            # Skip this missing file
            if ignore_missing:
                continue
            else:
                raise UtilError("Source file is missing: {}".format(srcpath)) from e

        if stat.S_ISDIR(mode):
            # Ensure directory exists in destination
            _ensure_real_directory(destdir, path)
            # Permissions are applied after all files are written
            permissions.append((destpath, os.stat(srcpath).st_mode))

        elif stat.S_ISLNK(mode):
            if not safe_remove(destpath):
                result.ignored.append(path)
                continue

            target = os.readlink(srcpath)
            os.symlink(target, destpath)

        elif stat.S_ISREG(mode):
            # Process the file.
            if not safe_remove(destpath):
                result.ignored.append(path)
                continue

            actionfunc(srcpath, destpath, result=result)

        elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
            # Block or character device. Put contents of st_dev in a mknod.
            if not safe_remove(destpath):
                result.ignored.append(path)
                continue

            if os.path.lexists(destpath):
                os.remove(destpath)
            os.mknod(destpath, file_stat.st_mode, file_stat.st_rdev)
            os.chmod(destpath, file_stat.st_mode)

        elif stat.S_ISFIFO(mode):
            os.mkfifo(destpath, mode)

        elif stat.S_ISSOCK(mode):
            # We can't duplicate the process serving the socket anyway
            pass

        else:
            # Unsupported type.
            raise UtilError('Cannot extract {} into staging-area. Unsupported type.'.format(srcpath))

    # Write directory permissions now that all files have been written
    for d, perms in permissions:
        os.chmod(d, perms)


# _set_deterministic_user()
#
# Set the uid/gid for every file in a directory tree to the process'
# euid/guid.
#
# Args:
#    directory (str): The directory to recursively set the uid/gid on
#
def _set_deterministic_user(directory):
    user = os.geteuid()
    group = os.getegid()

    for root, dirs, files in os.walk(directory.encode("utf-8"), topdown=False):
        for filename in files:
924
            os.chown(os.path.join(root, filename), user, group, follow_symlinks=False)
925 926

        for dirname in dirs:
927
            os.chown(os.path.join(root, dirname), user, group, follow_symlinks=False)
928 929


# _set_deterministic_mtime()
#
# Set the mtime for every file in a directory tree to the same.
#
# Args:
#    directory (str): The directory to recursively set the mtime on
#
def _set_deterministic_mtime(directory):
    magic = (_magic_timestamp, _magic_timestamp)

    for dirpath, _, filenames in os.walk(directory.encode("utf-8"), topdown=False):
        for name in filenames:
            fullpath = os.path.join(dirpath, name)

            # Python's os.utime only ever modifies the timestamp of the
            # link *target*; when staging the link target we will set its
            # timestamp anyway, so skip symlinks here.
            #
            # We should however find a way to modify the actual link's
            # timestamp, this outdated python bug report claims that
            # it is impossible:
            #
            #   http://bugs.python.org/issue623782
            #
            # However, nowadays it is possible at least on gnuish systems
            # with the lutimes glibc function.
            if not os.path.islink(fullpath):
                os.utime(fullpath, magic)

        os.utime(dirpath, magic)


# _tempdir()
#
# A context manager for doing work in a temporary directory.
#
# Args:
#    dir (str): A path to a parent directory for the temporary directory
#    suffix (str): A suffix for the temproary directory name
#    prefix (str): A prefix for the temporary directory name
#
# Yields:
#    (str): The temporary directory
#
# In addition to the functionality provided by python's
# tempfile.TemporaryDirectory() context manager, this one additionally
# supports cleaning up the temp directory on SIGTERM.
#
@contextmanager
def _tempdir(suffix="", prefix="tmp", dir=None):  # pylint: disable=redefined-builtin
    directory = tempfile.mkdtemp(suffix=suffix, prefix=prefix, dir=dir)

    # Removal helper, shared by the SIGTERM handler and the finally clause
    def rm_tempdir():
        if os.path.isdir(directory):
            _force_rmtree(directory)

    try:
        with _signals.terminator(rm_tempdir):
            yield directory
    finally:
        rm_tempdir()


# _tempnamedfile()
#
# A context manager for doing work on an open temporary file
# which is guaranteed to be named and have an entry in the filesystem.
#
# Args:
#    dir (str): A path to a parent directory for the temporary file
#    suffix (str): A suffix for the temproary file name
#    prefix (str): A prefix for the temporary file name
#
# Yields:
#    (str): The temporary file handle
#
# Do not use tempfile.NamedTemporaryFile() directly, as this will
# leak files on the filesystem when BuildStream exits a process
# on SIGTERM.
#
@contextmanager
def _tempnamedfile(suffix="", prefix="tmp", dir=None):  # pylint: disable=redefined-builtin
    handle = None

    # Close the file on SIGTERM so its filesystem entry is removed
    # before the process exits.
    def close_handle():
        if handle is not None:
            handle.close()

    with _signals.terminator(close_handle), \
        tempfile.NamedTemporaryFile(suffix=suffix, prefix=prefix, dir=dir) as handle:
        yield handle


# _kill_process_tree()
#
# Brutally murder a process and all of its children
#
# Args:
#    pid (int): Process ID
#
def _kill_process_tree(pid):
    try:
        proc = psutil.Process(pid)
        children = proc.children(recursive=True)
    except psutil.NoSuchProcess:
        # The process exited before we could look it up;
        # nothing left to kill.
        return

    def kill_proc(p):
        try:
            p.kill()
        except psutil.AccessDenied:
            # Ignore this error, it can happen with
            # some setuid bwrap processes.
            pass
        except psutil.NoSuchProcess:
            # It is certain that this has already been sent
            # SIGTERM, so there is a window where the process
            # could have exited already.
            pass

    # Bloody Murder
    for child in children:
        kill_proc(child)
    kill_proc(proc)


# _call()
#
# A wrapper for subprocess.call() supporting suspend and resume
#
# Args:
#    popenargs (list): Popen() arguments
#    terminate (bool): Whether to attempt graceful termination before killing
#    rest_of_args (kwargs): Remaining arguments to subprocess.call()
#
# Returns:
#    (int): The process exit code.
#    (str): The program output.
#
def _call(*popenargs, terminate=False, **kwargs):

    # Launch the child in its own session so the whole process group
    # can be suspended/resumed with a single killpg().
    kwargs['start_new_session'] = True

    process = None

    old_preexec_fn = kwargs.pop('preexec_fn', None)

    # Apply a restrictive umask in the child before exec, chaining up
    # to any preexec_fn the caller provided.
    def preexec_fn():
        os.umask(stat.S_IWGRP | stat.S_IWOTH)
        if old_preexec_fn is not None:
            old_preexec_fn()

    # Handle termination, suspend and resume
    def kill_proc():
        if process:

            # Some callers know that their subprocess can be
            # gracefully terminated, make an attempt first
            if terminate:
                proc = psutil.Process(process.pid)
                proc.terminate()

                try:
                    proc.wait(20)
                except psutil.TimeoutExpired:
                    # Did not terminate within the timeout: murder
                    _kill_process_tree(process.pid)

            else:
                # FIXME: This is a brutal but reliable approach
                #
                # Other variations I've tried which try SIGTERM first
                # and then wait for child processes to exit gracefully
                # have not reliably cleaned up process trees and have
                # left orphaned git or ssh processes alive.
                #
                # This cleans up the subprocesses reliably but may
                # cause side effects such as possibly leaving stale
                # locks behind. Hopefully this should not be an issue
                # as long as any child processes only interact with
                # the temp directories which we control and cleanup
                # ourselves.
                #
                _kill_process_tree(process.pid)

    def suspend_proc():
        if process:
            group_id = os.getpgid(process.pid)
            os.killpg(group_id, signal.SIGSTOP)

    def resume_proc():
        if process:
            group_id = os.getpgid(process.pid)
            os.killpg(group_id, signal.SIGCONT)

    with _signals.suspendable(suspend_proc, resume_proc), _signals.terminator(kill_proc):
        process = subprocess.Popen(  # pylint: disable=subprocess-popen-preexec-fn
            *popenargs, preexec_fn=preexec_fn, universal_newlines=True, **kwargs)
        output, _ = process.communicate()
        exit_code = process.poll()

    return (exit_code, output)


# _glob2re()
#
# Function to translate a glob style pattern into a regex
#
# Args:
#    pat (str): The glob pattern
#
# This is a modified version of the python standard library's
# fnmatch.translate() function which supports path like globbing
# a bit more correctly, and additionally supports recursive glob
# patterns with double asterisk.
#
# Note that this will only support the most basic of standard
# glob patterns, and additionally the recursive double asterisk.
#
# Support includes:
#
#   *          Match any pattern except a path separator
#   **         Match any pattern, including path separators
#   ?          Match any single character
#   [abc]      Match one of the specified characters
#   [A-Z]      Match one of the characters in the specified range
#   [!abc]     Match any single character, except the specified characters
#   [!A-Z]     Match any single character, except those in the specified range
#
def _glob2re(pat):
    i, n = 0, len(pat)
1159
    res = '(?ms)'
1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192
    while i < n:
        c = pat[i]
        i = i + 1
        if c == '*':
            # fnmatch.translate() simply uses the '.*' separator here,
            # we only want that for double asterisk (bash 'globstar' behavior)
            #
            if i < n and pat[i] == '*':
                res = res + '.*'
                i = i + 1
            else:
                res = res + '[^/]*'
        elif c == '?':
            # fnmatch.translate() simply uses the '.' wildcard here, but
            # we dont want to match path separators here
            res = res + '[^/]'
        elif c == '[':
            j = i
            if j < n and pat[j] == '!':
                j = j + 1
            if j < n and pat[j] == ']':
                j = j + 1
            while j < n and pat[j] != ']':
                j = j + 1
            if j >= n:
                res = res + '\\['
            else:
                stuff = pat[i:j].replace('\\', '\\\\')
                i = j + 1
                if stuff[0] == '!':
                    stuff = '^' + stuff[1:]
                elif stuff[0] == '^':
                    stuff = '\\' + stuff
1193
                res = '{}[{}]'.format(res, stuff)
1194 1195
        else:
            res = res + re.escape(c)
1196
    return res + r'\Z'


# _deduplicate()
#
# Remove duplicate entries in a list or other iterable.
#
# Copied verbatim from the unique_everseen() example at
# https://docs.python.org/3/library/itertools.html#itertools-recipes
#
# Args:
#    iterable (iterable): What to deduplicate
#    key (callable): Optional function to map from list entry to value
#
# Returns:
#    (generator): Generator that produces a deduplicated version of 'iterable'
#
def _deduplicate(iterable, key=None):
    seen = set()
    seen_add = seen.add
    if key is None:
        for element in itertools.filterfalse(seen.__contains__, iterable):
            seen_add(element)
            yield element
    else:
        for element in iterable:
            k = key(element)
            if k not in seen:
                seen_add(k)
                yield element


# Like os.path.getmtime(), but returns the mtime of a link rather than
# the target, if the filesystem supports that.
#
def _get_link_mtime(path):
    path_stat = os.lstat(path)
    return path_stat.st_mtime


# _message_digest()
#
# Args:
#    message_buffer (str): String to create digest of
#
# Returns:
#    (remote_execution_pb2.Digest): Content digest
#
def _message_digest(message_buffer):
    digest = remote_execution_pb2.Digest()
    # The remote execution protocol identifies content by its
    # sha256 hex digest plus the byte length
    digest.hash = hashlib.sha256(message_buffer).hexdigest()
    digest.size_bytes = len(message_buffer)
    return digest


# _search_upward_for_files()
#
# Searches upwards (from directory, then directory's parent directory...)
# for any of the files listed in `filenames`.
#
# If multiple filenames are specified, and present in the same directory,
# the first filename in the list will be returned.
#
# Args:
#    directory (str): The directory to begin searching for files from
#    filenames (list of str): The names of files to search for
#
# Returns:
#    (str): The directory a file was found in, or None
#    (str): The name of the first file that was found in that directory, or None
#
def _search_upward_for_files(directory, filenames):
    directory = os.path.abspath(directory)
    while True:
        for filename in filenames:
            file_path = os.path.join(directory, filename)
            if os.path.isfile(file_path):
                return directory, filename

        parent_dir = os.path.dirname(directory)
        if directory == parent_dir:
            # i.e. we've reached the root of the filesystem
            return None, None
        directory = parent_dir


# _with_gc_disabled()
#
# Decorate a function to disable the garbage collector across its execution.
#
# In general, disabling the garbage collector should be considered to be an
# extreme action.  Only use this in carefully selected subsets of the code
# where we generally create a lot more objects than we throw away.  For example
# in loading the stream.
#
# Args:
#    func (callable): The callable to disable the GC for
#
# Returns:
#    (callable): The decorated callable
#
def _with_gc_disabled(func):
    @functools.wraps(func)
    def _gc_disabled(*args, **kwargs):
        try:
            gc.disable()
            return func(*args, **kwargs)
        finally:
            gc.enable()
            # Clean up to ensure we don't grow any more, freeing up room to be
            # used by other objects during the course of running BuildStream.
            gc.collect()
    return _gc_disabled


# _deterministic_umask()
#
# Context managed to apply a umask to a section that may be affected by a users
# umask. Restores old mask afterwards.
#
@contextmanager
def _deterministic_umask():
    old_umask = os.umask(0o022)

    try:
        yield
    finally:
        os.umask(old_umask)