#!/usr/bin/env python # -*- coding: utf-8 -*- #------------------------------------------------------------------------------- # reference.py: builds HLA references for genotyping. #------------------------------------------------------------------------------- #------------------------------------------------------------------------------- # This file is part of arcasHLA. # # arcasHLA is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # arcasHLA is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with arcasHLA. If not, see <https://www.gnu.org/licenses/>. #------------------------------------------------------------------------------- import sys import re import pickle import argparse import logging as log from argparse import RawTextHelpFormatter from os.path import isfile, isdir, dirname, realpath from subprocess import PIPE, run from textwrap import wrap from scipy import stats from collections import defaultdict from Bio.Seq import Seq from Bio import SeqIO from Bio.SeqRecord import SeqRecord from arcas_utilities import * __version__ = '0.2.0' __date__ = '2019-06-26' #------------------------------------------------------------------------------- # Paths and fileames #------------------------------------------------------------------------------- rootDir = dirname(realpath(__file__)) + '/../' IMGTHLA = rootDir + 'dat/IMGTHLA/' IMGTHLA_git = 'https://github.com/ANHIG/IMGTHLA.git' hla_dat = rootDir + 'dat/IMGTHLA/hla.dat' hla_nom_g = rootDir + 'dat/IMGTHLA/wmda/hla_nom_g.txt' hla_nom_p = rootDir + 'dat/IMGTHLA/wmda/hla_nom_p.txt' hla_convert = rootDir + 'dat/ref/hla.convert.p' hla_fa = rootDir + 'dat/ref/hla.fasta' partial_fa = rootDir + 'dat/ref/hla_partial.fasta' hla_p = rootDir + 'dat/ref/hla.p' partial_p = rootDir + 'dat/ref/hla_partial.p' hla_idx = rootDir + 'dat/ref/hla.idx' partial_idx = rootDir + 'dat/ref/hla_partial.idx' parameters = rootDir + 'dat/info/parameters.p' #------------------------------------------------------------------------------- # Fetch and process IMGTHLA database #------------------------------------------------------------------------------- def get_mode(lengths): return stats.mode(lengths)[0][0] def check_ref(): '''Check if IMGTHLA and constructed HLA references exist.''' if not isfile(hla_dat): fetch_hla_dat() build_convert(False) build_fasta() def fetch_hla_dat(): '''Clones IMGTHLA github to database.''' if isdir(IMGTHLA): run_command(['rm', '-rf', IMGTHLA]) command = ['git', 'lfs clone', IMGTHLA_git, IMGTHLA] run_command(command, '[reference] Cloning IMGT/HLA database:') def checkout_version(commithash, verbose = True): '''Checks out a specific IMGTHLA github version given a commithash.''' if not isfile(hla_dat): fetch_hla_dat() command = ['git', '-C', IMGTHLA, 'lfs checkout', commithash] if verbose: run_command(command, '[reference] Checking out IMGT/HLA:') else: run_command(command) def hla_dat_version(print_version = False): '''Returns commithash of downloaded IMGTHLA database.''' results = run_command(['git', '-C', IMGTHLA, 'rev-parse HEAD']) commit = results.stdout.decode() if print_version: log.info(commit) return commit[:-1] def process_hla_dat(): '''Processes IMGTHLA database, returning HLA sequences, exon locations, lists of complete and partial alleles and possible exon combinations. ''' sequences = dict() utrs = defaultdict(dict) exons = defaultdict(dict) gene_exons = defaultdict(set) sequence = partial = utr = exon = False gene_set = set() complete_alleles = set() complete_2fields = set() partial_alleles = set() with open(hla_dat, 'r', encoding='UTF-8') as file: lines = file.read().splitlines() # Check if hla.dat failed to download if len(lines) < 10: sys.exit('[reference] Error: dat/IMGTHLA/hla.dat empty or corrupted.') for line in lines: # Denotes end of sequence, add allele to database if line.startswith('//'): if sequence and allele in exons: sequences[allele] = seq gene_exons[gene].add(number) gene_set.add(gene) if not partial: complete_alleles.add(allele) complete_2fields.add(process_allele(allele,2)) else: partial_alleles.add(allele) partial = False # Denotes partial alleles elif line.startswith('FT') and 'partial' in line: partial = True # Allele name and gene elif line.startswith('FT') and re.search('allele\="HLA-', line): allele = re.split('HLA-', re.sub('["\n]','',line))[1] gene = get_gene(allele) exon = sequence = False seq = '' # Exon coordinates elif line.startswith('FT') and re.search('exon',line): info = re.split('\s+', line) start = int(info[2].split('..')[0]) - 1 stop = int(info[2].split('..')[1]) exon_coord = [start, stop] exon = True # Exon number on following line elif exon: number = re.split('"', line)[1] exons[allele][number] = exon_coord exon = False # UTRs elif line.startswith('FT') and (re.search('\sUTR\s',line)): info = re.split('\s+', line) start = int(info[2].split('..')[0]) - 1 stop = int(info[2].split('..')[1]) utr_coord = [start, stop] if allele not in exons: utrs[allele]['utr5'] = utr_coord else: utrs[allele]['utr3'] = utr_coord # Start of sequence elif line.startswith('SQ'): sequence = True elif sequence and line.startswith(' '): seq += ''.join(line.split()[:-1]).upper() # select only 2-field partial alleles partial_alleles = {allele for allele in partial_alleles if process_allele(allele,2) not in complete_2fields} # get most common final exon length to truncate stop-loss alleles final_exon_length = defaultdict(list) for allele in complete_alleles: gene = get_gene(allele) exon = sorted(gene_exons[gene])[-1] if exon not in exons[allele]: continue start, stop = exons[allele][exon] final_exon_length[gene].append(stop-start) for gene, lengths in final_exon_length.items(): exon = sorted(gene_exons[gene])[-1] length = get_mode(lengths) final_exon_length[gene] = [exon,length] return (complete_alleles, partial_alleles, gene_set, sequences, utrs, exons, final_exon_length) def process_hla_nom(hla_nom): '''Processes nomenclature files for arcasHLA convert.''' allele_to_group = defaultdict(dict) single_alleles = set() grouped_alleles = set() for line in open(hla_nom, 'r', encoding='UTF-8'): if line.startswith('#'): continue gene, alleles, group = line.split(';') alleles = [gene + allele for allele in alleles.split('/')] if len(group) == 1: single_alleles.add(alleles[0]) continue group = gene + group[:-1] for allele in alleles: grouped_alleles.add(process_allele(allele,2)) for i in range(2,5): allele_to_group[i][process_allele(allele,i)] = group # Alleles not included in a group for allele in single_alleles: # Checks if 2-field allele already represented by a group if process_allele(allele,2) not in grouped_alleles: for i in range(2,5): allele_to_group[i][process_allele(allele,i)] = process_allele(allele,2) else: allele_to_group[2][allele] = process_allele(allele,3) allele_to_group[3][allele] = process_allele(allele,3) allele_to_group[4][allele] = allele return allele_to_group #------------------------------------------------------------------------------- # Saving reference files #------------------------------------------------------------------------------- def write_reference(sequences, info, fasta, idx, database, type): '''Writes and idxes HLA references.''' with open(fasta,'w') as file: SeqIO.write(sequences, file, 'fasta') commithash = hla_dat_version() with open(database,'wb') as file: pickle.dump([commithash,info],file) run_command(['kallisto', 'index', '-i', idx, fasta] , '[reference] indexing ' + type + ' reference with Kallisto:') #------------------------------------------------------------------------------- # Constructing reference #------------------------------------------------------------------------------- def get_exon_combinations(): '''Generates exon combinations used in partial allele typing.''' exon_combinations = [] exon_set = set() for i in range(2,8): exon_set |= {str(i)} exon_combinations.append(sorted(exon_set)) if i > 2: exon_combinations.append(sorted(exon_set | {'1'})) return exon_combinations def build_fasta(): '''Constructs HLA reference from processed sequences and exon locations.''' log.info('[reference] IMGT/HLA database version:\n') hla_dat_version(True) log.info('[reference] Processing IMGT/HLA database') # Constructs cDNA sequences for alleles and adds UTRs to the set of # non-coding sequences def build_complete(allele): gene = get_gene(allele) allele_exons = sorted(exons[allele].items()) coords = [[start,stop] for n,(start,stop) in allele_exons] seq = [sequences[allele][start:stop] for start,stop in coords] seq = ''.join(seq) exon, exon_length = final_exon_length[gene] if exon in exons[allele]: start, stop = exons[allele][exon] if stop-start > exon_length: seq = seq[:exon_length - (stop - start) + 1] cDNA[seq].add(allele) gene_length[gene].append(len(seq)) if allele in utrs: for (start,stop) in utrs[allele].values(): seq = sequences[allele][start:stop] other.add(seq) # Constructs exon combination sequences for complete and # partial alleles def build_combination(allele): for exon_group in exon_combinations: if set(exon_group) & set(exons[allele]) != set(exon_group): continue coords = [] for n in exon_group: coords.append(exons[allele][n]) seq = [sequences[allele][start:stop] for start,stop in coords] seq = ''.join(seq) combo[str(exon_group)][seq].add(allele) # Adds cDNA sequences of complete alleles and separate UTRs to a # list of sequence records def complete_records(cDNA, other): seq_out = [] allele_idx = dict() lengths = dict() # Adds coding sequences cDNA = sorted(cDNA.items(), key=lambda x:x[1]) for i,(seq,alleles) in enumerate(cDNA): idx = str(i) record = SeqRecord(Seq(seq), id=str(idx), description='') seq_out.append(record) allele_idx[idx] = sorted(alleles) lengths[idx] = len(seq) # Adds UTRs offset = i + 1 for i, seq in enumerate(other): idx = str(i + offset) record = SeqRecord(Seq(seq), id=str(idx), description='') seq_out.append(record) allele_idx[idx] = None return seq_out, allele_idx, lengths # Adds exon combination sequences of complete alleles to list of # sequence records, including UTRs def partial_records(sequences, other): seq_out = [] exon_idx = dict() allele_idx = dict() lengths = dict() # Adds exon combination sequences offset = 0 for exon in sorted(sequences): exon_sequences = sorted(sequences[exon].items(),key=lambda x:x[1]) for i,(seq,alleles) in enumerate(exon_sequences): idx = str(i + offset) length = len(seq) record = SeqRecord(Seq(seq), id=str(idx), description='') seq_out.append(record) allele_idx[idx] = sorted(alleles) lengths[idx] = len(seq) exon_idx[idx] = exon offset += i + 1 # Adds UTRs for i, seq in enumerate(other): idx = str(i + offset) record = SeqRecord(Seq(seq), id=str(idx), description='') seq_out.append(record) allele_idx[idx] = None return seq_out, allele_idx, lengths, exon_idx (complete_alleles, partial_alleles, gene_set, sequences, utrs, exons, final_exon_length) = process_hla_dat() exon_combinations = get_exon_combinations() gene_length = defaultdict(list) cDNA = defaultdict(set) combo = {str(i):defaultdict(set) for i in exon_combinations} other = set() # Build sequences for each allele for allele in complete_alleles: build_complete(allele) build_combination(allele) for allele in partial_alleles: build_combination(allele) cDNA = {seq:sorted(alleles) for seq, alleles in cDNA.items()} other = list(other) gene_length = {g:get_mode(lengths) for g, lengths in gene_length.items()} log.info('[reference] Building HLA database') seq_out, allele_idx, lengths = complete_records(cDNA, other) write_reference(seq_out, [gene_set, allele_idx, lengths, gene_length], hla_fa, hla_idx, hla_p, 'complete') log.info('[reference] Building partial HLA database') seq_out, allele_idx, lengths, exon_idx = partial_records(combo, other) partial_exons = {allele:exons[allele] for allele in partial_alleles} write_reference(seq_out, [gene_set, allele_idx, exon_idx, lengths, partial_exons, partial_alleles], partial_fa, partial_idx, partial_p, 'partial') def build_convert(reset=False): '''Creates conversion tables for arcasHLA convert.''' log.info('[reference] Building nomenclature conversion tables.') if reset: commit = hla_dat_version() checkout_version('origin', False) p_group = process_hla_nom(hla_nom_p) g_group = process_hla_nom(hla_nom_g) with open(hla_convert, 'wb') as file: pickle.dump([p_group,g_group], file) if reset: checkout_version(commit, False) #------------------------------------------------------------------------------- # Main #------------------------------------------------------------------------------- if __name__ == '__main__': with open(parameters, 'rb') as file: _, _, versions = pickle.load(file) parser = argparse.ArgumentParser(prog='arcasHLA reference', usage='%(prog)s [options]', add_help=False, formatter_class=RawTextHelpFormatter) parser.add_argument('-h', '--help', action = 'help', help='show this help message and exit\n\n', default=argparse.SUPPRESS) parser.add_argument('--update', action = 'count', help='update to latest IMGT/HLA version\n\n') parser.add_argument('--rebuild', help='rebuild HLA database\n\n', action='count') parser.add_argument('--version', type = str, help='checkout IMGT/HLA version using version\n' + \ '\n'.join(wrap('options: ' + ', '.join(sorted(versions.keys())), 60)) +'\n\n', default=False, metavar='',) parser.add_argument('--commit', type = str, help='checkout IMGT/HLA version using commithash\n\n', default=False, metavar='',) parser.add_argument('-v', '--verbose', action = 'count', default=False) args = parser.parse_args() if args.verbose: log.basicConfig(level = log.DEBUG, format = '%(message)s') else: handlers = [log.StreamHandler()] log.basicConfig(format = '%(message)s') log.info('') hline() check_path(rootDir + 'dat/ref') if args.update: log.info('[reference] Updating HLA reference') checkout_version('origin') build_convert(False) build_fasta() elif args.rebuild: build_convert() build_fasta() elif args.version: if args.version not in versions: sys.exit('[reference] Error: invalid version.') checkout_version(versions[args.version]) build_fasta() build_convert() elif args.commit: check_ref() checkout_version(args.commit) build_fasta() build_convert() else: check_ref() hla_dat_version(True) hline() log.info('') #-------------------------------------------------------------------------------