# Source code for mhcflurry.train_presentation_models_command

"""
Train Class1 presentation models.
"""
from __future__ import print_function
import argparse
import os
import signal
import sys
import time
import traceback

import pandas
import tqdm  # progress bar
tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481

from .class1_processing_predictor import Class1ProcessingPredictor
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .common import configure_logging

# Command-line interface for this script. The module docstring is used as the
# usage text shown by argparse.
parser = argparse.ArgumentParser(usage=__doc__)

# --data is required: main() passes it straight to pandas.read_csv, so leaving
# it unset would otherwise surface as an opaque pandas error rather than a
# usage message.
parser.add_argument(
    "--data",
    metavar="FILE.csv",
    required=True,
    help="Training data CSV. Expected columns: peptide, n_flank, c_flank, hit")
parser.add_argument(
    "--out-models-dir",
    metavar="DIR",
    required=True,
    help="Directory to write models and manifest")

# Previously trained component predictors that are loaded by main().
parser.add_argument(
    "--affinity-predictor",
    metavar="DIR",
    required=True,
    help="Affinity predictor models dir")
parser.add_argument(
    "--processing-predictor-with-flanks",
    metavar="DIR",
    required=True,
    help="Processing predictor with flanks")
parser.add_argument(
    "--processing-predictor-without-flanks",
    metavar="DIR",
    required=True,
    help="Processing predictor without flanks")

# Diagnostics.
parser.add_argument(
    "--verbosity",
    type=int,
    help="Default: %(default)s",
    default=1)
parser.add_argument(
    "--debug",
    action="store_true",
    default=False,
    help="Launch python debugger on error")

# Names of the columns in --data that main() reads.
parser.add_argument(
    "--hla-column",
    default="hla",
    help="Column in data giving space-separated MHC I alleles")
parser.add_argument(
    "--target-column",
    default="hit",
    help="Column in data giving hit (1) vs decoy (0)")

def run(argv=sys.argv[1:]):
    """
    Command-line entry point: parse arguments and call `main`.

    Parameters
    ----------
    argv : list of string
        Command-line arguments, not including the program name. The default
        is the value of ``sys.argv[1:]`` captured when this function was
        defined (i.e. at import time).

    Returns
    -------
    The return value of `main`.

    Raises
    ------
    Exception
        Anything raised by `main`. When ``--debug`` is given, the exception
        is printed and an ipdb session is opened before it is re-raised.
    """
    # SIGUSR1 only exists on Unix-like platforms; referencing it
    # unconditionally raises AttributeError elsewhere (e.g. Windows), so the
    # stack-trace hook is installed only where the signal is available.
    if hasattr(signal, "SIGUSR1"):
        # On sigusr1 print stack trace
        print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
        signal.signal(
            signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    if not args.debug:
        return main(args)

    try:
        return main(args)
    except Exception as e:
        print(e)
        # Imported lazily so ipdb is only needed when --debug is used.
        import ipdb  # pylint: disable=import-error
        ipdb.set_trace()
        raise
def main(args):
    """
    Fit a Class1PresentationPredictor on the training data and save it.

    The component predictors are written to ``args.out_models_dir`` before
    fitting; the weights, percent ranks, info and metadata are written to the
    same directory after fitting.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed command-line arguments as produced by the module-level
        ``parser``. ``args.out_models_dir`` is replaced in place by its
        absolute path.

    Returns
    -------
    None
    """
    print("Arguments:")
    print(args)

    args.out_models_dir = os.path.abspath(args.out_models_dir)

    configure_logging(verbose=args.verbosity > 1)

    df = pandas.read_csv(args.data)
    print("Loaded training data: %s" % (str(df.shape)))

    # Compute the lengths once and take an explicit copy of the selection:
    # a column is added to the result below, and assigning into a filtered
    # frame that may still be tied to its parent triggers pandas'
    # SettingWithCopyWarning.
    peptide_lengths = df.peptide.str.len()
    df = df.loc[(peptide_lengths >= 8) & (peptide_lengths <= 15)].copy()
    print("Subselected to 8-15mers: %s" % (str(df.shape)))

    # Each distinct value of the HLA column is treated as one experiment;
    # its alleles are obtained by splitting that value on whitespace.
    df["experiment_id"] = df[args.hla_column]
    experiment_to_alleles = dict(
        (key, key.split()) for key in df.experiment_id.unique())

    if not os.path.exists(args.out_models_dir):
        print("Attempting to create directory: %s" % args.out_models_dir)
        # makedirs (rather than mkdir) so that missing parent directories of
        # the output path do not cause a failure.
        os.makedirs(args.out_models_dir)
        print("Done.")

    affinity_predictor = Class1AffinityPredictor.load(
        args.affinity_predictor,
        optimization_level=0)
    processing_predictor_with_flanks = Class1ProcessingPredictor.load(
        args.processing_predictor_with_flanks)
    processing_predictor_without_flanks = Class1ProcessingPredictor.load(
        args.processing_predictor_without_flanks)

    print("Loaded affinity predictor", affinity_predictor)
    print(
        "Loaded processing_predictor_with_flanks",
        processing_predictor_with_flanks)
    print(
        "Loaded processing_predictor_without_flanks",
        processing_predictor_without_flanks)

    predictor = Class1PresentationPredictor(
        affinity_predictor=affinity_predictor,
        processing_predictor_with_flanks=processing_predictor_with_flanks,
        processing_predictor_without_flanks=processing_predictor_without_flanks)

    # We want to predict using an optimized Class1AffinityPredictor but
    # save the presentation models using an un-optimized
    # Class1AffinityPredictor, since there seems to be issues saving and
    # loading giant tensorflow graphs (which is what the optimization routine
    # produces). Hence the component predictors are saved here, before
    # optimize() is called.
    # NOTE(review): the keyword is spelled "write_metdata" throughout;
    # presumably that is the parameter name declared by save() -- confirm
    # before renaming.
    print("Before fit: saving affinity and processing predictors.")
    predictor.save(
        args.out_models_dir,
        write_affinity_predictor=True,
        write_processing_predictor=True,
        write_weights=False,
        write_percent_ranks=False,
        write_info=False,
        write_metdata=False)
    print("Done writing: ", args.out_models_dir)

    print("Optimizing affinity predictor.")
    optimized = affinity_predictor.optimize()
    print("Optimization performed: ", optimized)

    print("Fitting.")
    start = time.time()
    predictor.fit(
        targets=df[args.target_column].values,
        peptides=df.peptide.values,
        alleles=experiment_to_alleles,
        sample_names=df.experiment_id,
        n_flanks=df.n_flank.values,
        c_flanks=df.c_flank.values,
        verbose=args.verbosity)
    print("Done fitting in", time.time() - start, "seconds")

    # Second save: only the fitted parts, so the un-optimized component
    # predictors written above are not rewritten.
    print("Saving weights and metadata.")
    predictor.save(
        args.out_models_dir,
        write_affinity_predictor=False,
        write_processing_predictor=False,
        write_weights=True,
        write_percent_ranks=True,
        write_info=True,
        write_metdata=True)
    print("Wrote", args.out_models_dir)
# Run the command when this module is executed directly as a script.
if __name__ == '__main__':
    run()