"""Creation of neural network input features.""" import abc from collections import defaultdict import concurrent.futures import functools import inspect import itertools import os import time from timeit import default_timer as now import numpy as np import libmedaka import medaka.common import medaka.datastore import medaka.labels def _plp_data_to_numpy(plp_data, n_rows): """Create numpy representation of feature data. Copy the feature matrix and alignment column names from a `plp_data` structure returned from C library function calls. :param plp_data: a cffi proxy to a `plp_data*` pointer :param nrows: the number of rows in the plp_data.matrix (the number of elements in the feature per pileup column). :returns: pileup counts numpy array, reference positions """ ffi = libmedaka.ffi size_sizet = np.dtype(np.uintp).itemsize np_counts = np.frombuffer(ffi.buffer( plp_data.matrix, size_sizet * plp_data.n_cols * n_rows), dtype=np.uintp ).reshape(plp_data.n_cols, n_rows).copy() positions = np.empty(plp_data.n_cols, dtype=[ ('major', int), ('minor', int)]) np.copyto( positions['major'], np.frombuffer( ffi.buffer(plp_data.major, size_sizet * plp_data.n_cols), dtype=np.uintp)) np.copyto( positions['minor'], np.frombuffer(ffi.buffer( plp_data.minor, size_sizet * plp_data.n_cols), dtype=np.uintp)) return np_counts, positions def __enforce_pileup_chunk_contiguity(pileups): """Split and join ordered pileup chunks to ensure contiguity. :param pileups: iterable of (counts, pileups) as constructed by `_plp_data_to_numpy`. :returns: a list of reconstituted (counts, pileups) where discontinuities in the inputs cause breaks and abutting inputs are joined. """ split_results = list() # First pass: need to check for discontinuities within chunks, # these show up as >1 changes in the major coordinate for counts, positions in pileups: move = np.ediff1d(positions['major']) gaps = np.where(move > 1)[0] + 1 if len(gaps) == 0: split_results.append((counts, positions)) else: start = 0 for i in gaps: split_results.append((counts[start:i], positions[start:i])) start = i split_results.append((counts[start:], positions[start:])) # Second pass: stitch abutting chunks together, anything not neighbouring # is kept separate whether it came from the same chunk originally or not def _finalize_chunk(c_buf, p_buf): chunk_counts = np.concatenate(c_buf) chunk_positions = np.concatenate(p_buf) return chunk_counts, chunk_positions counts_buffer, positions_buffer = list(), list() chunk_results = list() last = None for counts, positions in split_results: if len(positions) == 0: continue first = positions['major'][0] if len(counts_buffer) == 0 or first - last == 1: # new or contiguous counts_buffer.append(counts) positions_buffer.append(positions) last = positions['major'][-1] else: # discontinuity chunk_results.append(_finalize_chunk( counts_buffer, positions_buffer)) counts_buffer = [counts] positions_buffer = [positions] last = positions['major'][-1] if len(counts_buffer) != 0: chunk_results.append(_finalize_chunk(counts_buffer, positions_buffer)) return chunk_results def _tidy_libfunc_args( dtype_prefixes, tag_name, tag_value, keep_missing, read_group): ffi = libmedaka.ffi if dtype_prefixes is None or isinstance(dtype_prefixes, str) \ or len(dtype_prefixes) == 1: num_dtypes, dtypes, _dtypes = 1, ffi.NULL, [ffi.NULL] else: num_dtypes = len(dtype_prefixes) _dtypes = [ffi.new("char[]", d.encode()) for d in dtype_prefixes] dtypes = ffi.new("char *[]", _dtypes) if tag_name is None: tag_name = ffi.new("char[2]", "".encode()) tag_value = 0 keep_missing = False elif len(tag_name) != 2: raise ValueError("'tag_name' must be a length-2 string.") else: tag_name = ffi.new("char[2]", tag_name.encode()) if read_group is None: read_group = ffi.NULL else: read_group = ffi.new("char[]", read_group.encode()) return (num_dtypes, dtypes, _dtypes, tag_name, tag_value, keep_missing, read_group) def pileup_counts( region, bam, dtype_prefixes=None, region_split=100000, workers=8, tag_name=None, tag_value=None, keep_missing=False, num_qstrat=1, weibull_summation=False, read_group=None): """Create pileup counts feature array for region. :param region: `medaka.common.Region` object :param bam: .bam file with alignments. :param dtype_prefixes: prefixes for query names which to separate counts. If `None` (or of length 1), counts are not split. :param region_split: largest region to process in single thread. :param workers: worker threads for calculating pileup. :param tag_name: two letter tag name by which to filter reads. :param tag_value: integer value of tag for reads to keep. :param keep_missing: whether to keep reads when tag is missing. :param num_qstrat: number of layers for qscore stratification. :param weibull_summation: use a Weibull partial-counts approach, requires 'WL' and 'WK' float-array tags. :returns: pileup counts array, reference positions, insertion positions """ lib = libmedaka.lib featlen = lib.featlen (num_dtypes, dtypes, _dtypes, tag_name, tag_value, keep_missing, read_group) = _tidy_libfunc_args( dtype_prefixes, tag_name, tag_value, keep_missing, read_group) def _process_region(reg): # htslib start is 1-based, medaka.common.Region object is 0-based region_str = '{}:{}-{}'.format(reg.ref_name, reg.start + 1, reg.end) counts = lib.calculate_pileup( region_str.encode(), bam.encode(), num_dtypes, dtypes, num_qstrat, tag_name, tag_value, keep_missing, weibull_summation, read_group ) np_counts, positions = _plp_data_to_numpy( counts, featlen * num_dtypes * num_qstrat) lib.destroy_plp_data(counts) return np_counts, positions # split large regions for performance regions = region.split(region_split, fixed_size=False) with concurrent.futures.ThreadPoolExecutor(max_workers=workers) \ as executor: results = executor.map(_process_region, regions) chunk_results = __enforce_pileup_chunk_contiguity(results) return chunk_results def get_trimmed_reads( region, bam, dtype_prefixes=None, region_split=750, chunk_overlap=150, workers=8, tag_name=None, tag_value=None, keep_missing=False, partial=True, num_qstrat=1, read_group=None): """Fetch reads trimmed to a region. Overlapping chunks of the imput region will be produced, with each chunk having its reads trimmed to the reference sequence coordinates of th chunk. :param region: `medaka.common.Region` object :param bam: .bam file with alignments. :param dtype_prefixes: prefixes for query names which to separate counts. If `None` (or of length 1), counts are not split. :param region_split: largest region to process in single thread. :param chunk_overlap: overlap between chunks. :param workers: worker threads for calculating pileup. :param tag_name: two letter tag name by which to filter reads. :param tag_value: integer value of tag for reads to keep. :param keep_missing: whether to keep reads when tag is missing. :param partial: whether to keep reads which don't fully span the region. :param num_qstrat: number of layers for qscore stratification. :returns: lists of trimmed reads. """ ffi, lib = libmedaka.ffi, libmedaka.lib (num_dtypes, dtypes, _dtypes, tag_name, tag_value, keep_missing, read_group) = _tidy_libfunc_args( dtype_prefixes, tag_name, tag_value, keep_missing, read_group) def _process_region(reg): # htslib start is 1-based, medaka.common.Region object is 0-based region_str = '{}:{}-{}'.format(reg.ref_name, reg.start + 1, reg.end) stuff = lib.PY_retrieve_trimmed_reads( region_str.encode(), bam.encode(), num_dtypes, dtypes, tag_name, tag_value, keep_missing, partial, read_group, ) # last string is reference seqs = [(False, ffi.string(stuff.seqs[stuff.n_seqs - 1]).decode())] for i in range(stuff.n_seqs - 1): seqs.append((stuff.is_rev[i], ffi.string(stuff.seqs[i]).decode())) lib.PY_destroy_reads(stuff) return reg, seqs # split large regions for performance regions = region.split(region_split, chunk_overlap) with concurrent.futures.ThreadPoolExecutor(max_workers=workers) \ as executor: results = executor.map(_process_region, regions) return results def pileup_counts_norm_indices(dtypes, num_qstrat=1): """Calculate feature vector normalization groups. Calculates (per-datatype, per-read-orientation) locations of bases in `pileup_counts` output for the purpose of various normalization strategies. e.g. For two datatype the feature vector is counts of bases: (acgtACGTdDacgtACGTdD), i.e. the first datatype followed by the second, and where lowercase denotes reverse reads and the base ordering is defined in the library consts `plp_bases`. The resultant dictionary contains entries such as: (datatype1, False):[4,5,6,7,9], for the datatype1 forward bases. :param dtypes: list of datatype names. :param num_qstrat: number of layers for qscore stratification. :returns: a dictionary of the form `{(datatype, is_rev): [base1_index, base2_index, ...]}`. """ ffi, lib = libmedaka.ffi, libmedaka.lib plp_bases = lib.plp_bases featlen = lib.featlen indices = defaultdict(list) codes = ffi.string(plp_bases).decode() assert len(codes) == featlen # from the C code: # pileup->matrix[major_col + featlen * dtype * num_qstrat + \ # featlen * qstrat + base_i] += 1; # where j is the insert index, so feature is ordered: datatype > base for dti, dt in enumerate(dtypes): for qindex in range(num_qstrat): for base_i, code in enumerate(codes): is_rev = code.islower() indices[dt, is_rev].append( base_i + dti * num_qstrat * len(codes) + qindex * len(codes)) return dict(indices) feature_encoders = dict() class FeatureEncoderRegistrar(type): """Class for registering feature encoders.""" def __new__(cls, clsname, bases, attrs): """Register class to `feature_encoders` dict upon instantiation.""" newclass = super(FeatureEncoderRegistrar, cls).__new__( cls, clsname, bases, attrs) cls.register_feature_encoder(clsname, newclass) return newclass def register_feature_encoder(clsname, cls): """Add `FeatureEncoder` to `feature_encoders` dict.""" # do not display base class as command line option if clsname != 'BaseFeatureEncoder': feature_encoders[clsname] = cls class FeatureEncoderMeta(abc.ABC, FeatureEncoderRegistrar): """Metaclass facilitating registration of `FeatureEncoder` s.""" pass class BaseFeatureEncoder(metaclass=FeatureEncoderMeta): """Base class for creation of feature arrays from a `.bam` file.""" @abc.abstractmethod def __init__(*args, **kwargs): """Initialize feature encoder.""" raise NotImplementedError @abc.abstractmethod def _pileup_function(self, region, reads_bam): # Called to create pileup matrix and position arrays raise NotImplementedError @abc.abstractmethod def _post_process_pileup(self, matrix, positions, region): # A chance to mutate the output of pileup_function raise NotImplementedError @abc.abstractmethod def bams_to_training_samples( self, truth_bam, bam, region, label_scheme, truth_haplotag=None, min_length=1000): """Create labelled samples, should internally call bam_to_sample.""" # TODO: should be moved outside this class? raise NotImplementedError @property @abc.abstractmethod def feature_vector_length(self): """Return size of a single feature vector. The length of a neural network input at a single time point. """ raise NotImplementedError # this shouldn't be overridden def bam_to_sample(self, reads_bam, region): """Convert a section of an alignment pileup to a sample. :param reads_bam: (sorted indexed) bam with read alignment to reference :param region: `medaka.common.Region` object with ref_name, start and end attributes. :returns: `medaka.common.Sample` object """ pileups = self._pileup_function(region, reads_bam) samples = list() for counts, positions in pileups: if len(counts) == 0: msg = ( 'Pileup-feature is zero-length for {} indicating no ' 'reads in this region.').format(region) self.logger.warning(msg) samples.append( medaka.common.Sample( ref_name=region.ref_name, features=None, labels=None, ref_seq=None, positions=positions, label_probs=None ) ) continue samples.append(self._post_process_pileup( counts, positions, region)) return samples def __getstate__(self): """Modify object so it is pickleable.""" # Logs are not picklable in python < 3.7 state = self.__dict__.copy() del state['logger'] return state def __setstate__(self, state): """Modify object after unpickling.""" self.__dict__.update(state) self.logger = medaka.common.get_named_logger('Feature') class CountsFeatureEncoder(BaseFeatureEncoder): """Create a pileup array of counts of observed bases.""" _norm_modes_ = ['total', 'fwd_rev', None] feature_dtype = np.float32 def __init__( self, normalise='total', dtypes=('',), tag_name=None, tag_value=None, tag_keep_missing=False, read_group=None): """Initialize creation of neural network input features. :param normalise: str, how to normalise the data. :param dtypes: iterable of str, read id prefixes of distinct data types that should be counted separately. :param tag_name: two letter tag name by which to filter reads. :param tag_value: integer value of tag for reads to keep. :param tag_keep_missing: whether to keep reads when tag is missing. :param read_group: value of RG tag to which to filter reads. """ self.logger = medaka.common.get_named_logger('Feature') self.normalise = normalise self.dtypes = dtypes self.feature_indices = pileup_counts_norm_indices(self.dtypes) self.tag_name = tag_name self.tag_value = tag_value self.tag_keep_missing = tag_keep_missing self.read_group = read_group if self.normalise not in self._norm_modes_: raise ValueError('normalise={} is not one of {}'.format( self.normalise, self._norm_modes_)) # TODO: refactor/remove/move to base class. opts = inspect.signature( CountsFeatureEncoder.__init__).parameters.keys() opts = {k: getattr(self, k) for k in opts if k != 'self'} self.logger.debug("Creating features with: {}".format(opts)) @property def feature_vector_length(self): """Return size of a single feature vector. The length of a neural network input at a single time point. """ featlen = libmedaka.lib.featlen return len(self.dtypes) * featlen def _pileup_function(self, region, bam): return pileup_counts( region, bam, dtype_prefixes=self.dtypes, tag_name=self.tag_name, tag_value=self.tag_value, keep_missing=self.tag_keep_missing, read_group=self.read_group) def _post_process_pileup(self, counts, positions, region): # Normalise produced counts using chosen method. start, end = positions['major'][0], positions['major'][-1] # TODO investigate off-by-one if start != region.start or end + 1 != region.end: self.logger.warning( 'Pileup counts do not span requested region, requested {}, ' 'received {}-{}.'.format(region, start, end) ) # find the position index for parent major position of all minor # positions minor_inds = np.where(positions['minor'] > 0) major_pos_at_minor_inds = positions['major'][minor_inds] major_ind_at_minor_inds = np.searchsorted( positions['major'], major_pos_at_minor_inds, side='left') depth = np.sum(counts, axis=1) depth[minor_inds] = depth[major_ind_at_minor_inds] if self.normalise == 'total': # normalize counts by total depth at major position, since the # counts include deletions this is a count of spanning reads # max just to avoid div error feature_array = counts / np.maximum(1, depth).reshape((-1, 1)) elif self.normalise == 'fwd_rev': # normalize forward and reverse and by dtype feature_array = np.empty_like(counts, dtype=self.feature_dtype) for (dt, is_rev), inds in self.feature_indices.items(): dt_depth = np.sum(counts[:, inds], axis=1) dt_depth[minor_inds] = dt_depth[major_ind_at_minor_inds] # max just to avoid div err feature_array[:, inds] = \ counts[:, inds] / np.maximum(1, dt_depth).reshape((-1, 1)) else: feature_array = counts feature_array = feature_array.astype(self.feature_dtype) sample = medaka.common.Sample( ref_name=region.ref_name, features=feature_array, labels=None, ref_seq=None, positions=positions, label_probs=None ) self.logger.info('Processed {} (median depth {})'.format( sample.name, np.median(depth))) return sample def bams_to_training_samples( self, truth_bam, bam, region, label_scheme, truth_haplotag=None, min_length=1000): """Prepare training data chunks. :param truth_bam: .bam file of truth aligned to ref to generate labels. :param bam: input .bam file. :param region: `medaka.common.Region` instance for region to process. :param label_scheme: a `LabelScheme` describing network outputs. :param truth_haplotag: two letter tag name used for grouping truth labels by haplotype. :param min_length: minimum length for valid alignments. :returns: tuple of `medaka.common.Sample` objects. .. note:: Chunks might be missing if `truth_bam` is provided and regions with multiple mappings were encountered. """ # Find truth alignments (with some filtering). alns = medaka.labels.TruthAlignment.bam_to_alignments( truth_bam, region, haplotag=truth_haplotag, min_length=min_length) if len(alns) == 0: self.logger.info( "Filtering and grouping removed all alignments " "of truth to ref from {}.".format(region)) samples = [] for aln in alns: # get labels from truth alignments. truth_pos, truth_labels = label_scheme.encode(aln) # get features from read alignment data aln_samples = self.bam_to_sample(bam, medaka.common.Region( region.ref_name, aln[0].start, aln[0].end)) for sample in aln_samples: # create a label array that respects positions present # in the feature's position array (where single reads # may have inserted bases, creating minor postions absent # from the labels position array) shape = list(truth_labels.shape) shape[0] = len(sample.positions) padded_labels = np.full(shape, label_scheme.padding_vector, dtype=truth_labels.dtype) truth_inds = np.where(np.in1d(truth_pos, sample.positions)) sample_inds = np.where(np.in1d(sample.positions, truth_pos)) assert len(truth_inds[0]) == len(sample_inds[0]) assert np.alltrue( truth_pos[truth_inds] == sample.positions[sample_inds]) padded_labels[sample_inds] = truth_labels[truth_inds] sample = sample.amend(labels=padded_labels) samples.append(sample) return tuple(samples) class HardRLEFeatureEncoder(CountsFeatureEncoder): """Create a pileup array of counts of observed bases. Counts are segregated according to run lengths stored in the quality information of reads. """ def __init__( self, normalise='total', dtypes=('', ), tag_name=None, tag_value=None, tag_keep_missing=False, num_qstrat=15, read_group=None): """Class to generate neural network input features. :param normalise: str, how to normalise the data. :param dtypes: iterable of str, read id prefixes of distinct data types that should be counted separately. :param tag_name: two letter tag name by which to filter reads. :param tag_value: integer value of tag for reads to keep. :param tag_keep_missing: whether to keep reads when tag is missing. :param num_qstrat: number of layers for qscore stratification. :param read_group: value of RG tag to which to filter reads. """ self.num_qstrat = num_qstrat super().__init__( normalise, dtypes=dtypes, tag_name=tag_name, tag_value=tag_value, tag_keep_missing=tag_keep_missing, read_group=read_group) self.feature_indices = pileup_counts_norm_indices( self.dtypes, num_qstrat=self.num_qstrat) def _pileup_function(self, region, bam): return pileup_counts( region, bam, dtype_prefixes=self.dtypes, tag_name=self.tag_name, tag_value=self.tag_value, keep_missing=self.tag_keep_missing, num_qstrat=self.num_qstrat, read_group=self.read_group) @property def feature_vector_length(self): """Return size of a single feature vector. The length of a neural network input at a single time point. """ featlen = libmedaka.lib.featlen return len(self.dtypes) * featlen * self.num_qstrat class SymHardRLEFeatureEncoder(HardRLEFeatureEncoder): """HRLE encoder where lack of insertion == deletion. In minor positions, when a read spans an insertion but does not contain the inserted base, this will be counted as a deletion. """ def _pileup_function(self, region, bam): [(counts, positions)] = super()._pileup_function(region, bam) minor_inds = np.where(positions['minor'] > 0) major_pos_at_minor_inds = positions['major'][minor_inds] major_ind_at_minor_inds = np.searchsorted( positions['major'], major_pos_at_minor_inds, side='left') # Correct count of indels in minor positions, where reads that # do not contain the insertion are not counted as deletion in # minor positions, unlike in major positions for (dt, is_rev), inds in self.feature_indices.items(): dt_depth = np.sum(counts[:, inds], axis=1) # Every dtype needs a vector with size featlen x num_qstrat, # the elements divided in two (split fwd/reverse). Also, by design, # indels in RLE pileups are accumulated in the first layer of # stratification available to that dtype. E.g., for num_qstrat=2 # and 2 dtypes, 'R1' and 'R2': # {('R1', False): [4, 5, 6, 7, 9, 14, 15, 16, 17, 19], # ('R1', True): [0, 1, 2, 3, 8, 10, 11, 12, 13, 18], # ('R2', False): [24, 25, 26, 27, 29, 34, 35, 36, 37, 39], # ('R2', True): [20, 21, 22, 23, 28, 30, 31, 32, 33, 38]} featlen_index = libmedaka.lib.rev_del if is_rev else \ libmedaka.lib.fwd_del dtype_size = libmedaka.lib.featlen * self.num_qstrat del_ind = [x for x in inds if x % dtype_size == featlen_index][0] counts[minor_inds, del_ind] = dt_depth[major_ind_at_minor_inds] -\ dt_depth[minor_inds] return [(counts, positions)] class SoftRLEFeatureEncoder(HardRLEFeatureEncoder): """Create pileups using soft RLE calls.""" def _pileup_function(self, region, bam): return pileup_counts( region, bam, dtype_prefixes=self.dtypes, tag_name=self.tag_name, tag_value=self.tag_value, keep_missing=self.tag_keep_missing, num_qstrat=self.num_qstrat, weibull_summation=True, read_group=self.read_group) class SampleGenerator(object): """Orchestration of neural network inputs with chunking.""" def __init__(self, bam, region, feature_encoder, truth_bam=None, label_scheme=None, truth_haplotag=None, chunk_len=1000, chunk_overlap=200, enable_chunking=True, min_truth_length=1000): """Generate chunked inference (or training) samples. :param bam: `.bam` containing alignments from which to generate samples. :param region: a `medaka.common.Region` for which to generate samples. :param feature_encoder: a `FeatureEncoder` object used for constructing training features. :param truth_bam: a `.bam` containing alignment of truth sequence to `reference` sequence. Required only for creating training chunks. :param label_scheme: a `LabelScheme` used for deriving training truth labels. :param truth_haplotag: two letter tag name used for grouping truth labels by haplotype. :param reference: reference `.fasta`, should correspond to `bam`. :param enable_chunking: when yielding samples, do so in chunks. :param min_truth_length: minimum length of truth alignments to accepts for generation of truth labels. """ self.logger = medaka.common.get_named_logger("Sampler") self.sample_type = "training" if truth_bam is not None else "consensus" self.logger.info("Initializing sampler for {} of region {}.".format( self.sample_type, region)) self.fencoder = feature_encoder self.bam = bam self.region = region self.truth_bam = truth_bam self.label_scheme = label_scheme self.truth_haplotag = truth_haplotag self.chunk_len = chunk_len self.chunk_overlap = chunk_overlap self.enable_chunking = enable_chunking self.min_truth_length = min_truth_length self._source = None # the base data to be chunked self._quarantined = list() # samples which are shorter than chunk size if self.truth_bam is not None and self.label_scheme is None: raise ValueError( "A `LabelScheme` must be given to create training data.") def _fill_features(self): if self._source is None: self._quarantined = None t0 = now() if self.truth_bam is not None: self._source = self.fencoder.bams_to_training_samples( self.truth_bam, self.bam, self.region, self.label_scheme, truth_haplotag=self.truth_haplotag, min_length=self.min_truth_length) else: self._source = self.fencoder.bam_to_sample( self.bam, self.region) t1 = now() self.logger.info("Took {:.2f}s to make features.".format(t1-t0)) def _quarantine_sample(self, sample): """Add sample name and pileup width to a list.""" # Note: the below assumes we haven't split a pileup on minor positions. # This should be the case: chunking on minor positions only occurs for # larger regions. start, _ = sample.first_pos end, _ = sample.last_pos end += 1 # end exclusive self._quarantined.append(( medaka.common.Region(sample.ref_name, start, end), sample.size )) @property def samples(self): """List of (possibly) chunked samples.""" self._fill_features() self._quarantined = list() all_data = [] for source in self._source: if source.is_empty: continue if not self.enable_chunking: chunks = [source] else: if source.size < self.chunk_len: msg = ( "Region {} ({} positions) is smaller than " "inference chunk length {}, quarantining.").format( source.name, source.size, self.chunk_len) self.logger.warning(msg) self._quarantine_sample(source) continue self.logger.debug( "Chunking pileup data into {} columns with overlap " "of {}.".format(self.chunk_len, self.chunk_overlap)) chunks = source.chunks( chunk_len=self.chunk_len, overlap=self.chunk_overlap) self.logger.debug("Pileup for {} is of width {}".format( source.name, source.size)) all_data.extend(chunks) return all_data def _samples_worker(args, region, feature_encoder, label_scheme): logger = medaka.common.get_named_logger('PrepWork') logger.info("Processing region {}.".format(region)) data_gen = SampleGenerator( args.bam, region, feature_encoder, truth_bam=args.truth, label_scheme=label_scheme, truth_haplotag=args.truth_haplotag, chunk_len=args.chunk_len, chunk_overlap=args.chunk_ovlp) return list(data_gen.samples), region def create_samples(args): """Entry point for creation of feature .hdfs (labelled or unlabelled).""" logger = medaka.common.get_named_logger('Prepare') if args.chunk_ovlp >= args.chunk_len: raise ValueError( 'chunk_ovlp {} is not smaller than chunk_len {}'.format( args.chunk_ovlp, args.chunk_len)) regions = medaka.common.get_regions(args.bam, args.regions) reg_str = '\n'.join(['\t\t\t{}'.format(r) for r in regions]) logger.info('Got regions:\n{}'.format(reg_str)) if args.truth is None: logger.warn( 'Running medaka features without a truth bam, ' 'unlabelled data will be produced. Is this intended?') time.sleep(3) no_data = False with medaka.datastore.DataStore(args.output, 'w') as ds: # write feature options to file logger.info("Writing meta data to file.") num_qstrat = args.feature_encoder_args.get('num_qstrat') max_run = args.label_scheme_args.get('max_run') # If one of them is set, set the other to agree. # If both are set, force them to agree. # If none is set or they are the same continue merrily. if max_run is None and num_qstrat is not None: args.label_scheme_args['max_run'] = num_qstrat elif max_run is not None and num_qstrat is None: args.feature_encoder_args['num_qstrat'] = max_run elif max_run != num_qstrat: raise ValueError( 'num_qstrat in feature_encoder_args must agree ' 'with max_run in feature_encoder_args') # Create and serialise to file model ancilliaries feature_encoder = feature_encoders[args.feature_encoder]( **args.feature_encoder_args) ds.set_meta(feature_encoder, 'feature_encoder') label_scheme = medaka.labels.label_schemes[args.label_scheme]( **args.label_scheme_args) ds.set_meta(label_scheme, 'label_scheme') model_function = functools.partial( medaka.models.build_model, feature_encoder.feature_vector_length, len(label_scheme._decoding)) ds.set_meta(model_function, 'model_function') # TODO: this parallelism would be better in # `SampleGenerator.bams_to_training_samples` since training # alignments are usually chunked. ExecutorClass = concurrent.futures.ProcessPoolExecutor with ExecutorClass(max_workers=args.threads) as executor: # break up overly long chunks MAX_SIZE = int(1e6) regions = itertools.chain(*(r.split(MAX_SIZE) for r in regions)) futures = [executor.submit( _samples_worker, args, reg, feature_encoder, label_scheme) for reg in regions] for fut in concurrent.futures.as_completed(futures): if fut.exception() is None: samples, region = fut.result() logger.info( "Writing {} samples for region {}".format( len(samples), region)) for sample in samples: ds.write_sample(sample) else: logger.info(fut.exception()) logger.info(fut.result()) fut._result = None # python issue 27144 no_data = ds.n_samples == 0 if no_data: logger.critical( "Warning: No training data was written to file, " "deleting output.") os.remove(args.output)