Source code for mhcflurry.predict_command

'''
Run MHCflurry predictor on specified peptides.

By default, the presentation predictor is used, and predictions for
MHC I binding affinity, antigen processing, and the composite presentation score
are returned. If you just want binding affinity predictions, pass
--affinity-only.

Examples:

Write a CSV file containing the contents of INPUT.csv plus additional columns
giving MHCflurry predictions:

$ mhcflurry-predict INPUT.csv --out RESULT.csv

The input CSV file is expected to contain columns "allele" and "peptide" and,
optionally, "n_flank" and "c_flank".

If `--out` is not specified, results are written to stdout.

You can also run on alleles and peptides specified on the command line, in
which case predictions are written for *all combinations* of alleles and
peptides:

$ mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL

Instead of individual alleles (in a CSV or on the command line), you can also
give a comma-separated list of alleles specifying a sample genotype. In this case,
the tightest binding affinity across the alleles for the sample will be
returned. For example:

$ mhcflurry-predict --peptides SIINFEKL DENDREKLLL \
    --alleles \
        HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \
        HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01

will give, for each peptide, the tightest predicted affinity across the alleles
of each of the two genotypes specified.
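
To get only binding affinity predictions (skipping antigen processing and the
composite presentation score), add --affinity-only:

$ mhcflurry-predict INPUT.csv --affinity-only --out RESULT.csv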
'''
from __future__ import (
    print_function,
    division,
    absolute_import,
)

import sys
import argparse
import itertools
import logging
import os

import pandas

from .downloads import get_default_class1_presentation_models_dir
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .version import __version__


parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    add_help=False)


helper_args = parser.add_argument_group(title="Help")
helper_args.add_argument(
    "-h", "--help",
    action="help",
    help="Show this help message and exit"
)
helper_args.add_argument(
    "--list-supported-alleles",
    action="store_true",
    default=False,
    help="Prints the list of supported alleles and exits"
)
helper_args.add_argument(
    "--list-supported-peptide-lengths",
    action="store_true",
    default=False,
    help="Prints the list of supported peptide lengths and exits"
)
helper_args.add_argument(
    "--version",
    action="version",
    version="mhcflurry %s" % __version__,
)

input_args = parser.add_argument_group(title="Input (required)")
input_args.add_argument(
    "input",
    metavar="INPUT.csv",
    nargs="?",
    help="Input CSV")
input_args.add_argument(
    "--alleles",
    metavar="ALLELE",
    nargs="+",
    help="Alleles to predict (exclusive with passing an input CSV)")
input_args.add_argument(
    "--peptides",
    metavar="PEPTIDE",
    nargs="+",
    help="Peptides to predict (exclusive with passing an input CSV)")

input_mod_args = parser.add_argument_group(title="Input options")
input_mod_args.add_argument(
    "--allele-column",
    metavar="NAME",
    default="allele",
    help="Input column name for alleles. Default: '%(default)s'")
input_mod_args.add_argument(
    "--peptide-column",
    metavar="NAME",
    default="peptide",
    help="Input column name for peptides. Default: '%(default)s'")
input_mod_args.add_argument(
    "--n-flank-column",
    metavar="NAME",
    default="n_flank",
    help="Column giving N-terminal flanking sequence. Default: '%(default)s'")
input_mod_args.add_argument(
    "--c-flank-column",
    metavar="NAME",
    default="c_flank",
    help="Column giving C-terminal flanking sequence. Default: '%(default)s'")
input_mod_args.add_argument(
    "--no-throw",
    action="store_true",
    default=False,
    help="Return NaNs for unsupported alleles or peptides instead of raising")

output_args = parser.add_argument_group(title="Output options")
output_args.add_argument(
    "--out",
    metavar="OUTPUT.csv",
    help="Output CSV")
output_args.add_argument(
    "--prediction-column-prefix",
    metavar="NAME",
    default="mhcflurry_",
    help="Prefix for output column names. Default: '%(default)s'")
output_args.add_argument(
    "--output-delimiter",
    metavar="CHAR",
    default=",",
    help="Delimiter character for results. Default: '%(default)s'")
output_args.add_argument(
    "--no-affinity-percentile",
    default=False,
    action="store_true",
    help="Do not include affinity percentile rank")
output_args.add_argument(
    "--always-include-best-allele",
    default=False,
    action="store_true",
    help="Always include the best_allele column even when it is identical "
    "to the allele column (i.e. all queries are monoallelic).")

model_args = parser.add_argument_group(title="Model options")
model_args.add_argument(
    "--models",
    metavar="DIR",
    default=None,
    help="Directory containing models. Either a binding affinity predictor or "
    "a presentation predictor can be used. "
    "Default: %s" % get_default_class1_presentation_models_dir(
        test_exists=False))
model_args.add_argument(
    "--affinity-only",
    action="store_true",
    default=False,
    help="Affinity prediction only (no antigen processing or presentation)")
model_args.add_argument(
    "--no-flanking",
    action="store_true",
    default=False,
    help="Do not use flanking sequence information even when available")


def run(argv=sys.argv[1:]):
    logging.getLogger('tensorflow').disabled = True

    if not argv:
        parser.print_help()
        parser.exit(1)

    args = parser.parse_args(argv)

    # It's hard to pass a tab in a shell, so we correct a common error:
    if args.output_delimiter == "\\t":
        args.output_delimiter = "\t"

    models_dir = args.models
    if models_dir is None:
        # The reason we set the default here instead of in the argument parser
        # is that we want to test_exists at this point, so the user gets a
        # message instructing them to download the models if needed.
        models_dir = get_default_class1_presentation_models_dir(test_exists=True)

    if os.path.exists(os.path.join(models_dir, "weights.csv")):
        # Using a presentation predictor.
        predictor = Class1PresentationPredictor.load(models_dir)
    else:
        # Using just an affinity predictor.
        affinity_predictor = Class1AffinityPredictor.load(models_dir)
        predictor = Class1PresentationPredictor(
            affinity_predictor=affinity_predictor)
        if not args.affinity_only:
            logging.warning(
                "Specified models are an affinity predictor, which implies "
                "--affinity-only. Specify this argument to silence this warning.")
            args.affinity_only = True

    if args.list_supported_alleles:
        print("\n".join(predictor.supported_alleles))
        return

    if args.list_supported_peptide_lengths:
        min_len, max_len = predictor.supported_peptide_lengths
        print("\n".join([str(l) for l in range(min_len, max_len + 1)]))
        return

    if args.input:
        if args.alleles or args.peptides:
            parser.error(
                "If an input file is specified, do not specify --alleles "
                "or --peptides")
        df = pandas.read_csv(args.input)
        print("Read input CSV with %d rows, columns are: %s" % (
            len(df), ", ".join(df.columns)))
        for col in [args.allele_column, args.peptide_column]:
            if col not in df.columns:
                raise ValueError(
                    "No such column '%s' in CSV. Columns are: %s" % (
                        col, ", ".join(["'%s'" % c for c in df.columns])))
    else:
        if not args.alleles or not args.peptides:
            parser.error(
                "Specify either an input CSV file or both the "
                "--alleles and --peptides arguments")

        pairs = list(itertools.product(args.alleles, args.peptides))
        df = pandas.DataFrame({
            "allele": [p[0] for p in pairs],
            "peptide": [p[1] for p in pairs],
        })
        logging.info(
            "Predicting for %d alleles and %d peptides = %d predictions" % (
                len(args.alleles), len(args.peptides), len(df)))

    allele_string_to_alleles = (
        df.drop_duplicates(args.allele_column).set_index(
            args.allele_column, drop=False)[
            args.allele_column
        ].str.split(r"[,\s]+")).to_dict()

    if args.affinity_only:
        predictions = predictor.predict_affinity(
            peptides=df[args.peptide_column].values,
            alleles=allele_string_to_alleles,
            sample_names=df[args.allele_column],
            throw=not args.no_throw,
            include_affinity_percentile=not args.no_affinity_percentile)
    else:
        n_flanks = None
        c_flanks = None
        if not args.no_flanking:
            if args.n_flank_column in df.columns and args.c_flank_column in df.columns:
                n_flanks = df[args.n_flank_column]
                c_flanks = df[args.c_flank_column]
            else:
                logging.warning(
                    "No flanking information provided. Specify --no-flanking "
                    "to silence this warning")

        predictions = predictor.predict(
            peptides=df[args.peptide_column].values,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
            alleles=allele_string_to_alleles,
            sample_names=df[args.allele_column],
            throw=not args.no_throw,
            include_affinity_percentile=not args.no_affinity_percentile)

    # If each query is just for a single allele, the "best_allele" column
    # is redundant so we remove it.
    if not args.always_include_best_allele:
        if all(len(a) == 1 for a in allele_string_to_alleles.values()):
            del predictions["best_allele"]

    for col in predictions.columns:
        if col not in ("allele", "peptide", "sample_name", "peptide_num"):
            df[args.prediction_column_prefix + col] = predictions[col]

    if args.out:
        df.to_csv(args.out, index=False, sep=args.output_delimiter)
        print("Wrote: %s" % args.out)
    else:
        df.to_csv(sys.stdout, index=False, sep=args.output_delimiter)
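
As a minimal sketch, the same entry point can be exercised from Python rather
than the shell. This assumes the trained models have already been downloaded
(for example with the mhcflurry-downloads tool), and "result.csv" is just an
illustrative output path:

    from mhcflurry.predict_command import run

    # Predict for all combinations of the given alleles and peptides and
    # write the results to result.csv, as in the CLI examples above.
    run([
        "--alleles", "HLA-A0201", "H-2Kb",
        "--peptides", "SIINFEKL", "DENDREKLLL",
        "--out", "result.csv",
    ])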