prune_index.py 7.67 KB
Newer Older
Stefan Scherfke's avatar
Stefan Scherfke committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246
"""
Prune packages in the given directory.

Apply the following rules:

- own-* packages:
  - Only keep the latest build for each version
  - Keep all packages of the last month
  - Keep one package per week for the last 6 months
  - Delete everything older
  - Keep at least one version of a package (even if too old)

- external packages:
  - Only keep the latest build for each version

..   - Delete if no recipe in external-recipes

.. code-block:: console

    $ ownconda prune-index stable/linux-64
    Keep  stable/linux-64/eggs-2.2-0.tar.bz2
    Prune stable/linux-64/spam-1.2-0.tar.bz2
    Keep  stable/linux-64/spam-1.2-1.tar.bz2
    1/3 would be deleted.  Pass the "--force" option to actually delete them.

    $ ownconda prune-index stable/linux-64 --force
    Deleted 1/3 packages

"""
import itertools
import json
import os
import pathlib

import click
import pendulum

from .. import click_util, recipes, util, version


def collect_pkgs(path):
    """Return the metadata of all packages in the Conda index at *path*.

    The ``*.tar.bz2`` files found in *path* must exactly match the entries
    listed under ``packages`` in ``repodata.json``, otherwise a
    :class:`click.ClickException` is raised.

    Each returned dict is the package's entry from ``repodata.json``,
    extended by the following keys:

    - ``filename``: path of the package file (*path* / file name)
    - ``internal``: ``True`` if the package name starts with ``own-``
    - ``datetime``: build time derived from ``timestamp``; the epoch (with a
      warning) if the entry has no timestamp

    The list is ordered by file path.

    """
    on_disk = sorted(path.glob('*.tar.bz2'))
    repodata = json.loads((path / 'repodata.json').read_text())
    index = repodata['packages']
    indexed = sorted(path / name for name in index)

    # Refuse to work on an index that does not reflect the directory contents
    if on_disk != indexed:
        raise click.ClickException('Filesystem and repodata.json are not in sync')

    pkgs = []
    for pkg_file in indexed:
        entry = index[pkg_file.name]
        entry['filename'] = pkg_file
        entry['internal'] = entry['name'].startswith('own-')
        try:
            millis = entry['timestamp']
        except KeyError:
            click_util.warning(f'WARNING: No timestamp set for "{pkg_file.name}"')
            seconds = 0
        else:
            # repodata.json stores milliseconds (repodata2.json uses seconds)
            seconds = millis // 1000
        entry['datetime'] = pendulum.from_timestamp(seconds)
        pkgs.append(entry)

    return pkgs


def prune_builds(pkgs):
    """Return the filenames of all superseded builds in *pkgs*.

    Packages are grouped by name, version and build string without the
    trailing build number (e.g., ``py37`` for ``py37_1``; the empty string if
    the build string is just the build number).  Within each group, only the
    packages with the highest build number are kept.  The ``filename`` values
    of all other packages are returned as a set.

    """
    def variant_of(pkg):
        build = pkg['build']
        if build == str(pkg['build_number']):
            stem = ''
        else:
            # Drop the trailing "_<num>" and keep the rest (e.g., "py37")
            stem = build.rsplit('_', 1)[0]
        return (pkg['name'], pkg['version'], stem)

    superseded = set()
    ordered = sorted(pkgs, key=variant_of)
    for _, members in itertools.groupby(ordered, variant_of):
        members = list(members)
        newest = max(pkg['build_number'] for pkg in members)
        superseded.update(
            pkg['filename'] for pkg in members if pkg['build_number'] != newest
        )

    return superseded


def prune_outdated(pkgs, keep_after, prune_before, filter_fn=lambda p: True):
    """Prune old packages from *pkgs* and return their filenames as a set.

    Only packages for which *filter_fn* returns ``True`` are considered.  They
    are grouped by package name and build string without the build number
    (e.g., ``py37`` for ``py37_1``).  Within each group:

    - The newest package (by its ``datetime``) is always kept, so that at
      least one version of each package remains (even if it is too old).
    - Packages at or after :class:`datetime.datetime` *keep_after* are kept.
    - Packages older than :class:`datetime.datetime` *prune_before* are
      deleted.
    - For the time span between *prune_before* and *keep_after*, only the
      newest package per ISO calendar week is kept.  This weekly thinning is
      only applied to packages whose ``internal`` flag is set; other packages
      in that time span are kept.

    """
    # Group by package name and build string w/o number
    def group_key(p):
        if p['build'] == str(p['build_number']):
            return (p['name'], '')
        else:
            # Strip num from build and only use the first part (e.g., "py37"):
            build_parts = p['build'].rsplit('_', 1)
            return (p['name'], build_parts[0])

    candidates = sorted((pkg for pkg in pkgs if filter_fn(pkg)), key=group_key)

    fnames = set()
    for _, group in itertools.groupby(candidates, group_key):
        # Keep at least one pkg: the newest one is never a candidate.
        group = sorted(group, key=lambda p: p['datetime'])[:-1]

        weeks = {}
        for pkg in group:
            if pkg['datetime'] >= keep_after:
                # Keep!
                continue
            if pkg['datetime'] < prune_before:
                # Delete!
                fnames.add(pkg['filename'])
                continue
            # The week number alone repeats every year, so the ISO year must
            # be part of the key.  Otherwise, packages from different years
            # would share a bucket if the time span exceeds one year.
            iso_year, iso_week, _ = pkg['datetime'].isocalendar()
            weeks.setdefault((iso_year, iso_week), []).append(pkg)

        # Only keep one per week (only applies to internal packages).
        # *group* is sorted by date, so the last entry per week is the newest.
        for week_pkgs in weeks.values():
            fnames.update(p['filename'] for p in week_pkgs[:-1] if p['internal'])

    return fnames


def prune_deleted_external(pkgs, recipe_data):
    """Return the filenames of external packages that have no recipe.

    *recipe_data* is an iterable of 2-tuples whose first item is a recipe
    dict; the package name is read from ``recipe['package']['name']``.

    Packages flagged as ``internal`` are never returned.  The ``filename`` of
    every other package whose name is not among the recipe names is added to
    the resulting set.

    """
    known_names = set()
    for recipe, _ in recipe_data:
        known_names.add(recipe['package']['name'])

    return {
        pkg['filename']
        for pkg in pkgs
        if not pkg['internal'] and pkg['name'] not in known_names
    }


@click.command()
@click.pass_obj
@click.argument(
    'path',
    type=click.Path(exists=True, file_okay=False),
    callback=lambda c, p, v: pathlib.Path(v),
)
@click.option(
    '--force',
    '-f',
    'dry_run',
    flag_value=False,
    default=True,
    help='Actually delete the packages.',
)
@click.option(
    '--external-recipes',
    type=click_util.Path(exists=True, file_okay=False),
    help=(
        'Path to the external-recipes repository.  If provided, only keep '
        'packages that are listed there.'
    ),
)
@click.option(
    '--prune-internal-before',
    type=click.IntRange(min=0),
    default=6,
    show_default=True,
    help='Delete internal packages older than this amount of months.',
)
@click.option(
    '--prune-external-before',
    type=click.IntRange(min=0),
    default=12,
    show_default=True,
    help='Delete external packages older than this amount of months.',
)
def cli(
        info, path, dry_run, external_recipes, prune_internal_before,
        prune_external_before
):
    """Delete old packages from the local Conda index at PATH.

    By default, the command will only print what would be deleted.  You must
    explicitly pass the "--force" option in order to actually delete packages.

    """
    # NOTE: The docstring above is shown as the command's help text by click.
    pkgs = collect_pkgs(path)
    pkgs = sorted(pkgs, key=lambda p: (p['name'], version.parse(p['version']),
                                       p['build_number']))

    # Rule 1: Only keep the latest build of each version
    to_delete = prune_builds(pkgs)

    # Rule 2: Prune by age.  The options are given in months and are converted
    # to absolute points in time here (without rebinding the int parameters).
    now = pendulum.now(tz='UTC')
    keep_after = now.subtract(months=1)
    internal_cutoff = now.subtract(months=prune_internal_before)
    external_cutoff = now.subtract(months=prune_external_before)
    to_delete |= prune_outdated(
        pkgs,
        keep_after=keep_after,
        prune_before=internal_cutoff,
        filter_fn=lambda pkg: pkg['internal'],
    )
    to_delete |= prune_outdated(
        pkgs,
        keep_after=keep_after,
        prune_before=external_cutoff,
        filter_fn=lambda pkg: not pkg['internal'],
    )

    # Rule 3 (optional): Drop external packages without a recipe
    if external_recipes:
        recipe_data = recipes.load_recipes(external_recipes)
        to_delete |= prune_deleted_external(pkgs, recipe_data)

    if dry_run:
        # Only report what would happen; *pkgs* is sorted, so is the report.
        for pkg in pkgs:
            fname = pkg['filename']
            if fname in to_delete:
                click_util.echo(f'Prune {fname}', fg='red')
            else:
                click_util.echo(f'Keep  {fname}', fg='green')
        click_util.info(
            f'{len(to_delete)}/{len(pkgs)} would be deleted.  Pass the "--force" '
            f'option to actually delete them.'
        )
    else:
        # Sets have no stable order; sort for a deterministic deletion log.
        for fname in sorted(to_delete):
            click_util.status(f'Deleting {fname}')
            os.remove(fname)
        click_util.info(f'Deleted {len(to_delete)}/{len(pkgs)} packages')
        # Re-run "conda index" on PATH after the files have been removed.
        util.run([info.conda_exe, 'index', str(path)])