Source code for mhcflurry.downloads

"""
Manage local downloaded data.
"""

from __future__ import (
    print_function,
    division,
    absolute_import,
)
import logging
import yaml
from os.path import join, exists
from os import environ
from pipes import quote
from collections import OrderedDict
from appdirs import user_data_dir
from pkg_resources import resource_string

import pandas

ENVIRONMENT_VARIABLES = [
    "MHCFLURRY_DATA_DIR",
    "MHCFLURRY_DOWNLOADS_CURRENT_RELEASE",
    "MHCFLURRY_DOWNLOADS_DIR",
    "MHCFLURRY_DEFAULT_CLASS1_MODELS"
]

_DOWNLOADS_DIR = None
_CURRENT_RELEASE = None
_METADATA = None
_MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR = environ.get(
    "MHCFLURRY_DEFAULT_CLASS1_MODELS")
_MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR = environ.get(
    "MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR")
_MHCFLURRY_DEFAULT_CLASS1_PROCESSING_MODELS_DIR = environ.get(
    "MHCFLURRY_DEFAULT_CLASS1_PROCESSING_MODELS_DIR")


[docs]def get_downloads_dir():
    """
    Return the path to local downloaded data
    """
    return _DOWNLOADS_DIR


[docs]def get_current_release():
    """
    Return the current downloaded data release
    """
    return _CURRENT_RELEASE


[docs]def get_downloads_metadata():
    """
    Return the contents of downloads.yml as a dict
    """
    global _METADATA
    if _METADATA is None:
        _METADATA = yaml.safe_load(resource_string(__name__, "downloads.yml"))
    return _METADATA


[docs]def get_default_class1_models_dir(test_exists=True):
    """
    Return the absolute path to the default class1 models dir.

    If environment variable MHCFLURRY_DEFAULT_CLASS1_MODELS is set to an
    absolute path, return that path. If it's set to a relative path (i.e. does
    not start with /) then return that path taken to be relative to the mhcflurry
    downloads dir.

    If environment variable MHCFLURRY_DEFAULT_CLASS1_MODELS is NOT set,
    then return the path to downloaded models in the "models_class1" download.

    Parameters
    ----------

    test_exists : boolean, optional
        Whether to raise an exception of the path does not exist

    Returns
    -------
    string : absolute path
    """
    if _MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR:
        result = join(get_downloads_dir(), _MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR)
        if test_exists and not exists(result):
            raise IOError("No such directory: %s" % result)
        return result
    return get_path(
        "models_class1_pan", "models.combined", test_exists=test_exists)


[docs]def get_default_class1_presentation_models_dir(test_exists=True):
    """
    Return the absolute path to the default class1 presentation models dir.

    See `get_default_class1_models_dir`.

    If environment variable MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS is set
    to an absolute path, return that path. If it's set to a relative path (does
    not start with /) then return that path taken to be relative to the mhcflurry
    downloads dir.

    Parameters
    ----------

    test_exists : boolean, optional
        Whether to raise an exception of the path does not exist

    Returns
    -------
    string : absolute path
    """
    if _MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR:
        result = join(
            get_downloads_dir(),
            _MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR)
        if test_exists and not exists(result):
            raise IOError("No such directory: %s" % result)
        return result
    return get_path(
        "models_class1_presentation", "models", test_exists=test_exists)


[docs]def get_default_class1_processing_models_dir(test_exists=True):
    """
    Return the absolute path to the default class1 processing models dir.

    See `get_default_class1_models_dir`.

    If environment variable MHCFLURRY_DEFAULT_CLASS1_PROCESSING_MODELS is set
    to an absolute path, return that path. If it's set to a relative path (does
    not start with /) then return that path taken to be relative to the mhcflurry
    downloads dir.

    Parameters
    ----------

    test_exists : boolean, optional
        Whether to raise an exception of the path does not exist

    Returns
    -------
    string : absolute path
    """
    if _MHCFLURRY_DEFAULT_CLASS1_PROCESSING_MODELS_DIR:
        result = join(
            get_downloads_dir(),
            _MHCFLURRY_DEFAULT_CLASS1_PROCESSING_MODELS_DIR)
        if test_exists and not exists(result):
            raise IOError("No such directory: %s" % result)
        return result
    return get_path(
        "models_class1_processing", "models", test_exists=test_exists)


[docs]def get_current_release_downloads():
    """
    Return a dict of all available downloads in the current release.

    The dict keys are the names of the downloads. The values are a dict
    with two entries:

    downloaded : bool
        Whether the download is currently available locally

    metadata : dict
        Info about the download from downloads.yml such as URL

    up_to_date : bool or None
        Whether the download URL(s) match what was used to download the current
        data. This is None if it cannot be determined.
    """
    downloads = (
        get_downloads_metadata()
        ['releases']
        [get_current_release()]
        ['downloads'])

    def up_to_date(dir, urls):
        try:
            df = pandas.read_csv(join(dir, "DOWNLOAD_INFO.csv"))
            return list(df.url) == list(urls)
        except IOError:
            return None

    return OrderedDict(
        (download["name"], {
            'downloaded': exists(join(get_downloads_dir(), download["name"])),
            'up_to_date': up_to_date(
                join(get_downloads_dir(), download["name"]),
                [download['url']] if 'url' in download else download['part_urls']),
            'metadata': download,
        }) for download in downloads
    )


[docs]def get_path(download_name, filename='', test_exists=True):
    """
    Get the local path to a file in a MHCflurry download

    Parameters
    -----------
    download_name : string

    filename : string
        Relative path within the download to the file of interest

    test_exists : boolean
        If True (default) throw an error telling the user how to download the
        data if the file does not exist

    Returns
    -----------
    string giving local absolute path
    """
    assert '/' not in download_name, "Invalid download: %s" % download_name
    path = join(get_downloads_dir(), download_name, filename)
    if test_exists and not exists(path):
        raise RuntimeError(
            "Missing MHCflurry downloadable file: %s. "
            "To download this data, run:\n\tmhcflurry-downloads fetch %s\n"
            "in a shell."
            % (quote(path), download_name))
    return path


[docs]def configure():
    """
    Setup various global variables based on environment variables.
    """
    global _DOWNLOADS_DIR
    global _CURRENT_RELEASE

    _CURRENT_RELEASE = None
    _DOWNLOADS_DIR = environ.get("MHCFLURRY_DOWNLOADS_DIR")
    if not _DOWNLOADS_DIR:
        metadata = get_downloads_metadata()
        _CURRENT_RELEASE = environ.get("MHCFLURRY_DOWNLOADS_CURRENT_RELEASE")
        if not _CURRENT_RELEASE:
            _CURRENT_RELEASE = metadata['current-release']

        current_release_compatability = (
            metadata["releases"][_CURRENT_RELEASE]["compatibility-version"])
        current_compatability = metadata["current-compatibility-version"]
        if current_release_compatability != current_compatability:
            logging.warning(
                "The specified downloads are not compatible with this version "
                "of the MHCflurry codebase. Downloads: release %s, "
                "compatability version: %d. Code compatability version: %d",
                _CURRENT_RELEASE,
                current_release_compatability,
                current_compatability)

        data_dir = environ.get("MHCFLURRY_DATA_DIR")
        if not data_dir:
            # increase the version every time we make a breaking change in
            # how the data is organized. For changes to e.g. just model
            # serialization, the downloads release numbers should be used.
            data_dir = user_data_dir("mhcflurry", version="4")
        _DOWNLOADS_DIR = join(data_dir, _CURRENT_RELEASE)

    logging.debug("Configured MHCFLURRY_DOWNLOADS_DIR: %s", _DOWNLOADS_DIR)


configure()