# -*- coding: utf-8 -*-
import io
import itertools
import logging
import sys
import warnings
from functools import partial
from multiprocessing import Pool

import cooler
import numpy as np
import pandas as pd
import yaml
from cooltools import numutils, snipping
from natsort import index_natsorted, natsorted, order_by_index
from scipy import sparse
from scipy.linalg import toeplitz


def save_array_with_header(array, header, filename):
    """Save a numpy array with a YAML header generated from a dictionary

    Parameters
    ----------
    array : np.array
        Array to save.
    header : dict
        Dictionary to save into the header.
    filename : string
        Name of file to save array and metadata into.
    """
    header = yaml.dump(header).strip()
    np.savetxt(filename, array, header=header)


def load_array_with_header(filename):
    """Load array from files generated using `save_array_with_header`.

    They are simple txt files with an optional header in the first lines,
    commented using "# ". If uncommented, the header is in YAML.

    Parameters
    ----------
    filename : string
        File to load from.

    Returns
    -------
    data : dict
        Dictionary with information from the header. Access the associated data
        in an array using data['data'].
    """
    with open(filename) as f:
        read_data = f.read()
    lines = read_data.split("\n")
    # Header lines are commented with "# "; strip the comment marker
    header = "\n".join([line[2:] for line in lines if line.startswith("# ")])
    if len(header) > 0:
        metadata = yaml.load(header, Loader=yaml.FullLoader)
    else:
        metadata = {}
    data = "\n".join([line for line in lines if not line.startswith("# ")])
    with io.StringIO(data) as f:
        metadata["data"] = np.loadtxt(f)
    return metadata


def corner_cv(amap, i=4):
    """Get coefficient of variation for upper left and lower right corners of a
    pileup to estimate how noisy it is

    Parameters
    ----------
    amap : 2D array
        Pileup.
    i : int, optional
        How many bins to use from each upper left and lower right corner: final
        corner shape is i^2. The default is 4.

    Returns
    -------
    CV : float
        Coefficient of variation for the corner pixels.
    """
    corners = np.concatenate((amap[0:i, 0:i], amap[-i:, -i:]))
    corners = corners[np.isfinite(corners)]
    return np.std(corners) / np.mean(corners)


def norm_cis(amap, i=3):
    """Normalize the pileup by mean of pixels from upper left and lower right
    corners

    Parameters
    ----------
    amap : 2D array
        Pileup.
    i : int, optional
        How many bins to use from each upper left and lower right corner: final
        corner shape is i^2. 0 will not normalize. The default is 3.

    Returns
    -------
    amap : 2D array
        Normalized pileup.
    """
    if i > 0:
        return amap / np.nanmean((amap[0:i, 0:i] + amap[-i:, -i:])) * 2
    else:
        return amap


def get_enrichment(amap, n):
    """Get values from the center of a pileup for a square with side *n*

    Parameters
    ----------
    amap : 2D array
        Pileup.
    n : int
        Side of the central square to use.

    Returns
    -------
    enrichment : float
        Mean of the pixels in the central square.
    """
    c = int(np.floor(amap.shape[0] / 2))
    return np.nanmean(amap[c - n // 2 : c + n // 2 + 1, c - n // 2 : c + n // 2 + 1])


def get_local_enrichment(amap, pad=1):
    """Get values from the central part of a pileup for a square, ignoring padding

    Parameters
    ----------
    amap : 2D array
        Pileup.
    pad : int
        Relative padding used, i.e. if 1 the central third is used, if 2 the
        central fifth is used. The default is 1.

    Returns
    -------
    enrichment : float
        Mean of the pixels in the central square.

    Raises
    ------
    ValueError
        If the pileup side is not divisible into (pad * 2 + 1) equal parts.
    """
    c = amap.shape[0] / (pad * 2 + 1)
    # Raise instead of assert so the check survives `python -O`
    if int(c) != c:
        raise ValueError(
            f"Pileup side {amap.shape[0]} is not divisible by pad * 2 + 1 ({pad * 2 + 1})"
        )
    c = int(c)
    return np.nanmean(amap[c:-c, c:-c])


def get_insulation_strength(amap, ignore_central=0):
    """Divide values in upper left and lower right corners over upper right and
    lower left, ignoring the central bins.

    Parameters
    ----------
    amap : 2D array
        Pileup.
    ignore_central : int, optional
        How many central bins to ignore. Has to be odd or 0. The default is 0.

    Returns
    -------
    float
        Insulation strength.
    """
    # Only a non-zero *even* value is invalid (it can't be centred on the middle
    # bin). The previous check raised for every non-zero value, including the
    # documented valid odd ones.
    if ignore_central != 0 and ignore_central % 2 != 1:
        raise ValueError(f'ignore_central has to be odd (or 0), got {ignore_central}')
    i = (amap.shape[0] - ignore_central) // 2
    intra = np.nansum(amap[:i, :i]) + np.nansum(amap[-i:, -i:])
    inter = np.nansum(amap[:i, -i:]) + np.nansum(amap[-i:, :i])
    return intra / inter
def prepare_single(item):
    """Generate enrichment and corner CV, reformat into a list

    Parameters
    ----------
    item : tuple
        Key, (n, pileup).

    Returns
    -------
    list
        Concatenated list of key, n, enrichment1, enrichment3, cv3, cv5.
    """
    key, (n, amap) = item
    enr1 = get_enrichment(amap, 1)
    enr3 = get_enrichment(amap, 3)
    cv3 = corner_cv(amap, 3)
    cv5 = corner_cv(amap, 5)
    return list(key) + [n, enr1, enr3, cv3, cv5]


def norm_coverage(loop, cov_start, cov_end):
    """Normalize a pileup by coverage arrays

    Note: `loop` is modified in place and also returned.

    Parameters
    ----------
    loop : 2D array
        Pileup.
    cov_start : 1D array
        Accumulated coverage of the left side of the pileup.
    cov_end : 1D array
        Accumulated coverage of the bottom side of the pileup.

    Returns
    -------
    loop : 2D array
        Normalized pileup.
    """
    coverage = np.outer(cov_start, cov_end)
    coverage /= np.nanmean(coverage)
    loop /= coverage
    loop[np.isnan(loop)] = 0
    return loop


class CoordCreator:
    def __init__(
        self,
        baselist,
        resolution,
        bed2=None,
        bed2_ordered=True,
        anchor=False,
        pad=100000,
        chroms="all",
        minshift=10 ** 5,
        maxshift=10 ** 6,
        nshifts=10,
        mindist="auto",
        maxdist=None,
        minsize=0,
        maxsize=None,
        local=False,
        subset=0,
        seed=None,
    ):
        """Generator of coordinate pairs for pileups.

        Parameters
        ----------
        baselist : str
            Path to a bed- or bedpe-style file with coordinates.
        resolution : int
            Data resolution.
        bed2 : str, optional
            Path to a second bed-style file with coordinates. If specified,
            interactions between baselist and bed2 are used. The default is None.
        bed2_ordered : bool, optional
            Whether interactions always have coordinates from baselist on the
            left and from bed2 on the bottom. If False, all interactions will be
            considered irrespective of order. The default is True.
        anchor : tuple of (str, int, int), optional
            Coordinates (chr, start, end) of an anchor region used to create
            interactions with baselist (in bp). Anchor is on the left of the
            final pileup. The default is False.
        pad : int, optional
            Padding around the central bin, in bp. For example, with 5000 bp
            resolution and 100000 pad, final pileup is 205000×205000 bp.
            The default is 100000.
        chroms : str or list, optional
            Which chromosomes to use for pileups. Has to be in a list even for a
            single chromosome, e.g. ['chr1']. The default is "all"
        minshift : int, optional
            Minimal shift applied when generating random controls, in bp.
            The default is 10 ** 5.
        maxshift : int, optional
            Maximal shift applied when generating random controls, in bp.
            The default is 10 ** 6.
        nshifts : int, optional
            How many shifts to generate per region of interest. Does not take
            chromosome boundaries into account The default is 10.
        mindist : int, optional
            Shortest interactions to consider. Uses midpoints of regions of
            interest. "auto" selects it to avoid the two shortest diagonals of
            the matrix, i.e. 2 * pad + 2 * resolution The default is "auto".
        maxdist : int, optional
            Longest interactions to consider. The default is None.
        minsize : int, optional
            Shortest regions to consider. Only applies to bed files.
            The default is 0.
        maxsize : int, optional
            Longest regions to consider. Only applies to bed files.
            The default is None.
        local : bool, optional
            Whether to generate local coordinates, i.e. on-diagonal.
            The default is False.
        subset : int, optional
            What subset of the coordinate files to use. 0 or negative to use
            all. The default is 0.
        seed : int, optional
            Seed for np.random to make it reproducible. The default is None.

        Returns
        -------
        Object that generates coordinates for pileups required for PileUpper.
        """
        self.baselist = baselist
        self.stdin = self.baselist == sys.stdin
        self.resolution = resolution
        self.bed2 = bed2
        self.bed2_ordered = bed2_ordered
        self.anchor = anchor
        self.pad = pad
        self.pad_bins = pad // self.resolution
        self.chroms = chroms
        self.minshift = minshift
        self.maxshift = maxshift
        self.nshifts = nshifts
        if mindist == "auto":
            # Avoid the two shortest diagonals of the matrix
            self.mindist = 2 * self.pad + 2 * self.resolution
        else:
            self.mindist = mindist
        self.maxdist = np.inf if maxdist is None else maxdist
        self.minsize = minsize
        self.maxsize = np.inf if maxsize is None else maxsize
        self.local = local
        self.subset = subset
        self.seed = seed

    def filter_bed(self, df):
        """Filter bed-style intervals by length and chromosome.

        Raises ValueError if any end coordinate is smaller than its start.
        """
        length = df["end"] - df["start"]
        df = df[(length >= self.minsize) & (length <= self.maxsize)]
        if self.chroms != "all":
            df = df[df["chr"].isin(self.chroms)]
        if not np.all(df["end"] >= df["start"]):
            raise ValueError("Some ends in the file are smaller than starts")
        return df

    def filter_bedpe(self, df):
        """Filter bedpe-style intervals by midpoint distance and chromosome."""
        mid1 = np.mean(df[["start1", "end1"]], axis=1)
        mid2 = np.mean(df[["start2", "end2"]], axis=1)
        length = mid2 - mid1
        df = df[(length >= self.mindist) & (length <= self.maxdist)]
        if self.chroms != "all":
            df = df[(df["chr1"].isin(self.chroms)) & (df["chr2"].isin(self.chroms))]
        return df

    def auto_read_bed(self, file, kind="auto"):
        """Read a bed- or bedpe-style file in chunks and filter it.

        Parameters
        ----------
        file : str or stream
            Path of the file, or the stdin stream.
        kind : str, optional
            "bed", "bedpe", or "auto" to guess from the number of columns in
            the first row. The default is "auto".

        Returns
        -------
        bases : DataFrame
            Filtered intervals.
        kind : str
            The detected (or given) kind: "bed" or "bedpe".
        """
        filetype = kind
        row1 = None
        if kind == "auto":
            # Guessing whether it's bed or bedpe style file from the first row
            if self.stdin:
                row1 = file.__next__().split("\t")
            else:
                with open(file, "r") as fobject:
                    row1 = fobject.readline().split("\t")
            if len(row1) == 6:
                filetype = "bedpe"
                row1 = [
                    row1[0],
                    int(row1[1]),
                    int(row1[2]),
                    row1[3],
                    int(row1[4]),
                    int(row1[5]),
                ]
            elif len(row1) == 3:
                filetype = "bed"
                row1 = [row1[0], int(row1[1]), int(row1[2])]
            else:
                raise ValueError(
                    f"""Input bed(pe) file has unexpected number of
                            columns: got {len(row1)}, expect 3 (bed) or 6 (bedpe)
                            """
                )
        # Resolve the parsing setup once; previously the non-"auto" path
        # referenced the undefined names `filetype`/`row1` and crashed.
        if filetype == "bed":
            filter_func = self.filter_bed
            names = ["chr", "start", "end"]
            dtype = {"chr": "str", "start": "int", "end": "int"}
        elif filetype == "bedpe":
            filter_func = self.filter_bedpe
            names = ["chr1", "start1", "end1", "chr2", "start2", "end2"]
            dtype = {
                "chr1": "str",
                "start1": "int",
                "end1": "int",
                "chr2": "str",
                "start2": "int",
                "end2": "int",
            }
        else:
            raise ValueError(
                f"""Unsupported input kind: {kind}.
                     Expect bed or bedpe"""
            )
        bases = []
        appended = False
        if row1 is not None:
            row1 = filter_func(pd.DataFrame([row1], columns=names).astype(dtype=dtype))
            if row1.shape[0] == 1:
                bases.append(row1)
                appended = True
        for chunk in pd.read_csv(
            file,
            sep="\t",
            names=names,
            index_col=False,
            chunksize=10 ** 4,
            dtype=dtype,
        ):
            bases.append(filter_func(chunk))
        bases = pd.concat(bases)
        if appended:
            # Would mean we read it twice when checking and in the first chunk
            bases = bases.iloc[1:]
        return bases, filetype

    def subset(self, df):
        # NOTE(review): this method is shadowed by the instance attribute
        # `self.subset` (an int) assigned in __init__, so on instances it is
        # only reachable as CoordCreator.subset(instance, df); `process` samples
        # directly instead of calling it.
        if self.seed is not None:
            np.random.seed(self.seed)
        if 0 < self.subset < len(df):
            return df.sample(self.subset)
        else:
            return df

    def bedpe2bed(self, df, ends=True, how="center"):
        """Convert a bedpe-style DataFrame into bed-style.

        If `ends` is True, pools coordinates of both loop ends into one
        bed-style df. Otherwise converts each loop into one long interval:
        `how` is "center" (between the centers of the two loop bases),
        "outer" or "inner" edges of the loop bases.
        """
        if ends:
            df1 = df[["chr1", "start1", "end1"]]
            df1.columns = ["chr", "start", "end"]
            df2 = df[["chr2", "start2", "end2"]]
            df2.columns = ["chr", "start", "end"]
            return (
                pd.concat([df1, df2])
                .sort_values(["chr", "start", "end"])
                .reset_index(drop=True)
            )
        if how == "center":
            # Average the two columns of each anchor; np.mean(a, b) would have
            # treated the second Series as the `axis` argument and crashed.
            df["start"] = np.mean(df[["start1", "end1"]], axis=1)
            df["end"] = np.mean(df[["start2", "end2"]], axis=1)
        elif how == "outer":
            df = df[["chr1", "start1", "end2"]]
            df.columns = ["chr", "start", "end"]
        elif how == "inner":
            df = df[["chr1", "end1", "start2"]]
            df.columns = ["chr", "start", "end"]
        return df

    def _get_mids(self, intervals):
        """Convert intervals into midpoint/bin/pad records, deduplicated by bin."""
        if self.kind == "bed":
            intervals = intervals.sort_values(["chr", "start"])
            mids = np.round((intervals["end"] + intervals["start"]) / 2).astype(int)
            widths = np.round((intervals["end"] - intervals["start"])).astype(int)
            mids = pd.DataFrame(
                {
                    "chr": intervals["chr"],
                    "Mids": mids,
                    "Bin": mids // self.resolution,
                    "Pad": widths / 2,
                }
            ).drop_duplicates(["chr", "Bin"])
        elif self.kind == "bedpe":
            intervals = intervals.sort_values(["chr1", "chr2", "start1", "start2"])
            mids1 = np.round((intervals["end1"] + intervals["start1"]) / 2).astype(int)
            widths1 = np.round((intervals["end1"] - intervals["start1"])).astype(int)
            mids2 = np.round((intervals["end2"] + intervals["start2"]) / 2).astype(int)
            widths2 = np.round((intervals["end2"] - intervals["start2"])).astype(int)
            mids = pd.DataFrame(
                {
                    "chr1": intervals["chr1"],
                    "Mids1": mids1,
                    "Bin1": mids1 // self.resolution,
                    "Pad1": widths1 / 2,
                    "chr2": intervals["chr2"],
                    "Mids2": mids2,
                    "Bin2": mids2 // self.resolution,
                    "Pad2": widths2 / 2,
                },
            ).drop_duplicates(["chr1", "chr2", "Bin1", "Bin2"])
        else:
            raise ValueError(
                """
                kind can only be "bed" or "bedpe"
                """
            )
        return mids

    def filter_func_all(self, mids):
        # Identity filter
        return mids

    def _filter_func_chrom(self, mids, chrom):
        return mids[mids["chr"] == chrom]

    def _filter_func_pairs_chrom(self, mids, chrom):
        return mids[(mids["chr1"] == chrom) & (mids["chr2"] == chrom)]

    def filter_func_chrom(self, chrom):
        """Return a callable filtering mids to one chromosome."""
        if self.kind == "bed":
            return partial(self._filter_func_chrom, chrom=chrom)
        else:
            return partial(self._filter_func_pairs_chrom, chrom=chrom)

    def _filter_func_region(self, mids, region):
        chrom, start, end = region
        start /= self.resolution
        end /= self.resolution
        return mids[
            (mids["chr"] == chrom) & (mids["Bin"] >= start) & (mids["Bin"] < end)
        ]

    def _filter_func_pairs_region(self, mids, region):
        chrom, start, end = region
        start /= self.resolution
        end /= self.resolution
        return mids[
            (mids["chr1"] == chrom)
            & (mids["chr2"] == chrom)
            & (mids["Bin1"] >= start)
            & (mids["Bin1"] < end)
            & (mids["Bin2"] >= start)
            & (mids["Bin2"] < end)
        ]

    def filter_func_region(self, region):
        """Return a callable filtering mids to one (chrom, start, end) region.

        The region must be bound by keyword: positionally it would land in the
        `mids` slot of the underlying filter.
        """
        if self.kind == "bed":
            return partial(self._filter_func_region, region=region)
        else:
            return partial(self._filter_func_pairs_region, region=region)

    def get_combinations(self, filter_func, mids=None, mids2=None, anchor=None):
        """Yield (bin_left, bin_right, pad_left, pad_right) for all valid
        combinations of bed intervals after applying `filter_func`.

        Yields a single (None, None, None, None) marker if the selection is
        empty (fewer than two intervals).
        """
        if anchor is None:
            anchor = self.anchor
        if mids is None:
            mids = self.mids
        if mids2 is None:
            mids2 = self.mids2
        if self.local and (anchor or mids2 is not None):
            raise ValueError(
                """Can't have a local pileup with an anchor or with two bed
                   files"""
            )
        # Apply the filter to the provided mids (previously self.mids was
        # always filtered, silently ignoring the `mids` argument)
        mids = filter_func(mids)
        if not len(mids) > 1:
            logging.debug("Empty selection")
            yield None, None, None, None
            return
        m = mids["Bin"].values.astype(int)
        p = (mids["Pad"] // self.resolution).values.astype(int)
        if mids2 is not None:
            mids2 = filter_func(mids2)
            m2 = mids2["Bin"].values.astype(int)
            p2 = (mids2["Pad"] // self.resolution).values.astype(int)
        if self.local:
            for i, pi in zip(m, p):
                yield i, i, pi, pi
        elif anchor:
            anchor_bin = int((anchor[1] + anchor[2]) / 2 // self.resolution)
            anchor_pad = int(round((anchor[2] - anchor[1]) / 2)) // self.resolution
            for i, pi in zip(m, p):
                yield anchor_bin, i, anchor_pad, pi
        elif mids2 is None:
            for i, j in zip(itertools.combinations(m, 2), itertools.combinations(p, 2)):
                yield list(i) + list(j)
        elif self.bed2_ordered:
            for i, j in zip(itertools.product(m, m2), itertools.product(p, p2)):
                if i[1] > i[0]:
                    yield list(i) + list(j)
        else:
            for i, j in itertools.chain(
                zip(itertools.product(m, m2), itertools.product(p, p2)),
                zip(itertools.product(m2, m), itertools.product(p2, p)),
            ):
                yield list(i) + list(j)

    def get_positions_stream(self, filter_func, mids=None):
        """Yield (bin, pad) for each bed interval after filtering.

        Yields a single (None, None) marker if the selection is empty.
        """
        if mids is None:
            mids = self.mids
        mids = filter_func(mids)
        if not len(mids) >= 1:
            logging.debug("Empty selection")
            yield None, None
            return
        m = mids["Bin"].astype(int).values
        p = (mids["Pad"] // self.resolution).astype(int).values
        for posdata in zip(m, p):
            yield posdata

    def get_position_pairs_stream(self, filter_func, mids=None):
        """Yield (bin1, bin2, pad1, pad2) for each bedpe interval after filtering.

        Yields a single (None, None, None, None) marker if the selection is
        empty.
        """
        if mids is None:
            mids = self.mids
        mids = filter_func(mids)
        if not len(mids) >= 1:
            logging.debug("Empty selection")
            yield None, None, None, None
            return
        m1 = mids["Bin1"].astype(int).values
        m2 = mids["Bin2"].astype(int).values
        p1 = (mids["Pad1"] // self.resolution).astype(int).values
        p2 = (mids["Pad2"] // self.resolution).astype(int).values
        for posdata in zip(m1, m2, p1, p2):
            yield posdata

    def control_regions(self, filter_func, pos_pairs=None):
        """Yield randomly shifted coordinates (`nshifts` per position) as
        controls. Shift magnitude is between minshift and maxshift bins, with a
        random sign; chromosome boundaries are not taken into account.
        """
        if self.seed is not None:
            np.random.seed(self.seed)
        minbin = self.minshift // self.resolution
        maxbin = self.maxshift // self.resolution
        if pos_pairs is None:
            source = self.pos_stream(filter_func)
        elif hasattr(pos_pairs, "itertuples"):
            # A DataFrame of positions: drop the index from each row
            source = map(lambda x: x[1:], pos_pairs.itertuples())
        else:
            # Already a stream of (start, end, pad1, pad2) tuples, as produced
            # by get_combinations (a generator has no .itertuples)
            source = iter(pos_pairs)
        row1 = source.__next__()
        if row1[0] is None:  # Checking if empty selection
            logging.debug("Empty selection")
            yield row1
        else:
            source = itertools.chain([row1], source)
            for start, end, p1, p2 in source:
                for i in range(self.nshifts):
                    shift = np.random.randint(minbin, maxbin)
                    sign = np.sign(np.random.random() - 0.5).astype(int)
                    shift *= sign
                    yield start + shift, end + shift, p1, p2

    def get_combinations_by_window(self, chrom, ctrl=False, mids=None):
        """For each bed interval on `chrom`, yield its (start, end) window and a
        stream of its combinations with all other intervals on the chromosome.
        """
        assert self.kind == "bed"
        if mids is None:
            chrmids = self.filter_func_chrom(chrom)(self.mids)
        else:
            chrmids = self.filter_func_chrom(chrom)(mids)
        for i, (b, m, p) in chrmids[["Bin", "Mids", "Pad"]].astype(int).iterrows():
            out_stream = self.get_combinations(
                self.filter_func_all, mids=chrmids, anchor=(chrom, m, m)
            )
            if ctrl:
                # self.control_regions: CoordCreator has no `CC` attribute
                # (self.CC.control_regions raised AttributeError)
                out_stream = self.control_regions(self.filter_func_all, out_stream)
            yield (m - p, m + p), out_stream

    def process(self):
        """Read and filter the input files, determine usable chromosomes and
        prepare the position stream. Must be called before use with PileUpper.
        """
        self.bases, self.kind = self.auto_read_bed(self.baselist)
        if self.bed2 is not None:
            self.bed2, self.bed2kind = self.auto_read_bed(self.bed2)
            if self.kind != "bed":
                raise ValueError(
                    """Please provide two BED files; baselist doesn't seem to
                       be one"""
                )
            elif self.bed2kind != "bed":
                raise ValueError(
                    """Please provide two BED files; bed2 doesn't seem to
                       be one"""
                )
        if self.kind == "bed":
            basechroms = set(self.bases["chr"])
            if self.anchor:
                if self.anchor[0] not in basechroms:
                    raise ValueError(
                        """The anchor chromosome is not found in the baselist.
                           Are they in the same format, e.g. starting with "chr"?
                           Alternatively, all regions in that chromosome might
                           have been filtered by some filters."""
                    )
                else:
                    basechroms = [self.anchor[0]]
        else:
            if self.anchor:
                raise ValueError("Can't use anchor with both sides of loops defined")
            elif self.local:
                raise ValueError("Can't make local with both sides of loops defined")
            basechroms = set(self.bases["chr1"]) & set(self.bases["chr2"])
        if self.bed2 is not None:
            # Restrict to chromosomes present in both files; previously this
            # mistakenly read self.bases again instead of self.bed2
            bed2chroms = set(self.bed2["chr"])
            basechroms = set(basechroms) & bed2chroms
        self.basechroms = natsorted(list(basechroms))
        if self.chroms == "all":
            self.final_chroms = natsorted(list(basechroms))
        else:
            self.final_chroms = natsorted(list(set(self.chroms) & set(self.basechroms)))
        if len(self.final_chroms) == 0:
            raise ValueError(
                """No chromosomes are in common between the coordinate file/anchor
                   and the cooler file. Are they in the same format, e.g. starting
                   with "chr"?
                   Alternatively, all regions might have been filtered by
                   distance/size filters."""
            )
        self.mids = self._get_mids(self.bases)
        if self.bed2 is not None:
            self.mids2 = self._get_mids(self.bed2)
        else:
            self.mids2 = None
        if self.subset > 0:
            self.mids = self.mids.sample(self.subset)
            if self.mids2 is not None:
                self.mids2 = self.mids2.sample(self.subset)
        if self.kind == "bed":
            self.pos_stream = self.get_combinations
        else:
            self.pos_stream = self.get_position_pairs_stream

    def _chrom_mids(self, chroms, mids):
        """Yield (chrom, mids-of-that-chrom) for each chromosome."""
        for chrom in chroms:
            if self.kind == "bed":
                yield chrom, mids[mids["chr"] == chrom]
            else:
                yield chrom, mids[(mids["chr1"] == chrom) & (mids["chr2"] == chrom)]

    def chrom_mids(self):
        """Yield paired per-chromosome mids for baselist and bed2 (or Nones)."""
        chrommids = self._chrom_mids(self.final_chroms, self.mids)
        if self.mids2 is not None:
            chrommids2 = self._chrom_mids(self.final_chroms, self.mids2)
        else:
            chrommids2 = zip(itertools.cycle([None]), itertools.cycle([None]))
        chrommids = zip(chrommids, chrommids2)
        for i in chrommids:
            yield i
class PileUpper:
    def __init__(
        self,
        clr,
        CC,
        balance="weight",
        expected=False,
        control=False,
        coverage_norm=False,
        rescale=False,
        rescale_pad=1,
        rescale_size=99,
        ignore_diags=2,
    ):
        """Creates pileups

        Parameters
        ----------
        clr : cool
            Cool file with Hi-C data.
        CC : CoordCreator
            CoordCreator object with correct settings.
        balance : bool or str, optional
            Whether to use balanced data, and which column to use as weights.
            The default is "weight".
        expected : DataFrame, optional
            If using expected, pandas DataFrame with chromosome-wide expected.
            The default is False.
        control : bool, optional
            Whether to use randomly shifted controls. The default is False.
        coverage_norm : bool, optional
            Whether to normalize the final pileup by accumulated coverage as an
            alternative to balancing. Useful for single-cell Hi-C data.
            The default is False.
        rescale : bool, optional
            Whether to use real sizes of all ROIs and rescale them to the same
            shape. The default is False.
        rescale_pad : float, optional
            Fraction of ROI size added on each end when extracting snippets, if
            rescale. The default is 1.
        rescale_size : int, optional
            Final shape of rescaled pileups. E.g. if 99, pileups will be
            squares of 99×99 pixels. The default is 99.
        ignore_diags : int, optional
            How many diagonals to ignore to avoid short-distance artefacts.
            The default is 2.

        Returns
        -------
        Object that generates pileups.
        """
        self.clr = clr
        self.resolution = self.clr.binsize
        self.CC = CC
        assert self.resolution == self.CC.resolution
        # Mirror all CoordCreator settings (pad, anchor, local, ...) locally
        self.__dict__.update(self.CC.__dict__)
        self.balance = balance
        self.expected = expected
        self.control = control
        self.pad_bins = self.pad // self.resolution
        self.coverage_norm = coverage_norm
        self.rescale = rescale
        self.rescale_pad = rescale_pad
        self.rescale_size = rescale_size
        self.ignore_diags = ignore_diags
        self.matsizes = np.ceil(self.clr.chromsizes / self.resolution).astype(int)
        self.chroms = natsorted(
            list(set(self.CC.final_chroms) & set(self.clr.chromnames))
        )
        self.regions = {
            chrom: cooler.util.parse_region_string(chrom) for chrom in self.chroms
        }
        if self.expected is not False:
            if self.control:
                warnings.warn(
                    "Can't do both expected and control shifts; defaulting to expected"
                )
                self.control = False
            assert isinstance(self.expected, pd.DataFrame)
            self.ExpSnipper = snipping.ExpectedSnipper(self.clr, self.expected)
            self.expected_selections = {
                chrom: self.ExpSnipper.select(region, region)
                for chrom, region in self.regions.items()
            }
            self.expected = True

    def get_expected_matrix(self, chrom, left_interval, right_interval):
        """Generate expected matrix for a region

        Parameters
        ----------
        chrom : str
            Chromosome name.
        left_interval : tuple
            Tuple of (lo_left, hi_left) bin IDs in the chromosome.
        right_interval : tuple
            Tuple of (lo_right, hi_right) bin IDs in the chromosome.

        Returns
        -------
        exp_matrix : array
            Array of expected values for the selected coordinates.
        """
        lo_left, hi_left = left_interval
        lo_right, hi_right = right_interval
        lo_left *= self.resolution
        hi_left *= self.resolution
        lo_right *= self.resolution
        hi_right *= self.resolution
        exp_matrix = self.ExpSnipper.snip(
            self.expected_selections[chrom],
            self.regions[chrom],
            self.regions[chrom],
            (lo_left, hi_left, lo_right, hi_right),
        )
        return exp_matrix

    def make_outmap(self,):
        """Generate zero-filled array of the right shape

        Returns
        -------
        outmap: array
            Array of zeros of correct shape.
        """
        if self.rescale:
            outmap = np.zeros((self.rescale_size, self.rescale_size))
        else:
            outmap = np.zeros((2 * self.pad_bins + 1, 2 * self.pad_bins + 1))
        return outmap

    def get_data(self, region):
        """Get sparse data for a region

        Parameters
        ----------
        region : tuple or str
            Region for which to load the data. Either tuple of (chr, start,
            end), or string with chromosome name.

        Returns
        -------
        data : csr
            Sparse csr matrix for the corresponding region (upper triangular).
        """
        logging.debug("Loading data")
        data = self.clr.matrix(sparse=True, balance=self.balance).fetch(region)
        data = sparse.triu(data)
        return data.tocsr()

    def get_coverage(self, data):
        """Get total coverage profile for upper triangular data

        Parameters
        ----------
        data : array_like
            2D array with upper triangular data.

        Returns
        -------
        coverage : array
            1D array of coverage.
        """
        coverage = np.nan_to_num(np.ravel(np.sum(data, axis=0))) + np.nan_to_num(
            np.ravel(np.sum(data, axis=1))
        )
        return coverage

    def _do_pileups(self, mids, chrom, expected=False):
        """Accumulate pileups over a stream of (stBin, endBin, stPad, endPad).

        Returns the summed pileup, per-pixel counts of finite values, the
        accumulated left/bottom coverage profiles, and the number of snippets.
        """
        if expected:
            data = None
            logging.debug("Doing expected")
        else:
            data = self.get_data(chrom)
        max_right = self.matsizes[chrom]
        mymap = self.make_outmap()
        if self.coverage_norm:
            coverage = self.get_coverage(data)
        cov_start = np.zeros(mymap.shape[0])
        cov_end = np.zeros(mymap.shape[1])
        num = np.zeros_like(mymap)
        n = 0
        for stBin, endBin, stPad, endPad in mids:
            rot_flip = False
            rot = False
            if stBin > endBin:
                stBin, stPad, endBin, endPad = endBin, endPad, stBin, stPad
                # Fixed: was `if self.anchor is None`, but anchor defaults to
                # False (see CoordCreator), so the flip branch never ran;
                # everywhere else anchor is tested for truthiness.
                if not self.anchor:
                    rot_flip = True
                else:
                    rot = True
            if self.rescale:
                stPad = stPad + int(round(self.rescale_pad * 2 * stPad))
                endPad = endPad + int(round(self.rescale_pad * 2 * endPad))
            else:
                stPad, endPad = self.pad_bins, self.pad_bins
            lo_left = stBin - stPad
            hi_left = stBin + stPad + 1
            lo_right = endBin - endPad
            hi_right = endBin + endPad + 1
            # Skip snippets that reach beyond the chromosome edges
            if lo_left < 0 or hi_right > max_right:
                continue
            diag = hi_left - lo_right
            if not expected:
                try:
                    newmap = data[lo_left:hi_left, lo_right:hi_right].toarray()
                except (IndexError, ValueError):
                    continue
            else:
                newmap = self.get_expected_matrix(
                    chrom, (lo_left, hi_left), (lo_right, hi_right)
                )
            newmap = newmap.astype(float)
            if not self.local:
                # Mask pixels crossing the ignored short diagonals
                ignore_indices = np.tril_indices_from(
                    newmap, diag - (stPad * 2 + 1) - 1 + self.ignore_diags
                )
                newmap[ignore_indices] = np.nan
            else:
                # Data is upper triangular; symmetrize on-diagonal snippets
                newmap = np.triu(newmap, self.ignore_diags)
                newmap += np.triu(newmap, 1).T
            if self.rescale:
                if newmap.size == 0 or np.all(np.isnan(newmap)):
                    newmap = np.zeros((self.rescale_size, self.rescale_size))
                else:
                    newmap = numutils.zoom_array(
                        newmap, (self.rescale_size, self.rescale_size)
                    )
            if rot_flip:
                newmap = np.rot90(np.flipud(newmap), 1)
            elif rot:
                newmap = np.rot90(newmap, -1)
            mymap = np.nansum([mymap, newmap], axis=0)
            if self.coverage_norm and not expected and (self.balance is False):
                new_cov_start = coverage[lo_left:hi_left]
                new_cov_end = coverage[lo_right:hi_right]
                if self.rescale:
                    if len(new_cov_start) == 0:
                        new_cov_start = np.zeros(self.rescale_size)
                    if len(new_cov_end) == 0:
                        new_cov_end = np.zeros(self.rescale_size)
                    new_cov_start = numutils.zoom_array(
                        new_cov_start, (self.rescale_size,)
                    )
                    new_cov_end = numutils.zoom_array(
                        new_cov_end, (self.rescale_size,)
                    )
                else:
                    l = len(new_cov_start)
                    r = len(new_cov_end)
                    new_cov_start = np.pad(
                        new_cov_start, (mymap.shape[0] - l, 0), "constant"
                    )
                    new_cov_end = np.pad(
                        new_cov_end, (0, mymap.shape[1] - r), "constant"
                    )
                cov_start += np.nan_to_num(new_cov_start)
                cov_end += np.nan_to_num(new_cov_end)
            num += np.isfinite(newmap).astype(int)
            n += 1
        return mymap, num, cov_start, cov_end, n

    def pileup_chrom(self, chrom, expected=False, ctrl=False):
        """
        Parameters
        ----------
        chrom : str
            Chromosome name.
        expected : bool, optional
            Whether to create pileup of expected values. The default is False.
        ctrl : bool, optional
            Whether to pileup randomly shifted control regions.
            The default is False.

        Returns
        -------
        pileup : 2D array
            Pileup for the specified chromosome.
        num : 2D array
            Per-pixel counts of finite values.
        cov_start : 1D array
            Accumulated coverage of the left side of the pileup.
        cov_end : 1D array
            Accumulated coverage of the bottom side of the pileup.
        n : int
            How many ROIs were piled up.
        """
        mymap = self.make_outmap()
        cov_start = np.zeros(mymap.shape[0])
        cov_end = np.zeros(mymap.shape[1])
        if self.anchor:
            assert chrom == self.anchor[0]
            logging.info(f"Anchor: {chrom}:{self.anchor[1]}-{self.anchor[2]}")
        filter_func = self.CC.filter_func_chrom(chrom=chrom)
        if ctrl:
            mids = self.CC.control_regions(filter_func)
        else:
            mids = self.CC.pos_stream(filter_func)
        mids_row1 = mids.__next__()
        if mids_row1[0] is None:  # Checking if empty selection
            logging.info(f"Nothing to sum up in chromosome {chrom}")
            return mymap, mymap, cov_start, cov_end, 0
        else:
            mids = itertools.chain([mids_row1], mids)
        mymap, num, cov_start, cov_end, n = self._do_pileups(
            mids=mids, chrom=chrom, expected=expected
        )
        logging.info(f"{chrom}: {n}")
        return mymap, num, cov_start, cov_end, n

    def pileupsWithControl(self, nproc=1):
        """Perform pileups across all chromosomes and applies required
        normalization

        Parameters
        ----------
        nproc : int, optional
            How many cores to use. Sends a whole chromosome per process.
            The default is 1.

        Returns
        -------
        loop : 2D array
            Normalized pileup.
        """
        if nproc > 1:
            p = Pool(nproc)
            mymap = p.map
        else:
            mymap = map
        # Loops
        f = partial(self.pileup_chrom, ctrl=False, expected=False,)
        loops, nums, cov_starts, cov_ends, ns = list(zip(*mymap(f, self.chroms)))
        loop = np.sum(loops, axis=0)
        n = np.sum(ns)
        num = np.sum(nums, axis=0)
        if self.coverage_norm:
            cov_start = np.sum(cov_starts, axis=0)
            cov_end = np.sum(cov_ends, axis=0)
            loop = norm_coverage(loop, cov_start, cov_end)
        loop /= num
        logging.info(f"Total number of piled up windows: {n}")
        # Controls
        if self.expected is not False:
            f = partial(self.pileup_chrom, ctrl=False, expected=True,)
            exps, nums, cov_starts, cov_ends, ns = list(zip(*mymap(f, self.chroms)))
            exp = np.sum(exps, axis=0)
            num = np.sum(nums, axis=0)
            exp /= num
            loop /= exp
        elif self.control:
            f = partial(self.pileup_chrom, ctrl=True, expected=False,)
            ctrls, nums, cov_starts, cov_ends, ns = list(zip(*mymap(f, self.chroms)))
            ctrl = np.sum(ctrls, axis=0)
            num = np.sum(nums, axis=0)
            n = np.sum(ns)
            if self.coverage_norm:
                cov_start = np.sum(cov_starts, axis=0)
                cov_end = np.sum(cov_ends, axis=0)
                ctrl = norm_coverage(ctrl, cov_start, cov_end)
            ctrl /= num
            logging.info(f"Total number of piled up control windows: {n}")
            loop /= ctrl
        if nproc > 1:
            p.close()
        loop[~np.isfinite(loop)] = 0
        return loop

    def pileupsByWindow(self, chrom, expected=False, ctrl=False):
        """Creates pileups for each window against the rest for a chromosome

        Parameters
        ----------
        chrom : str
            Chromosome name.
        expected : bool, optional
            Whether to create pileup of expected values. The default is False.
        ctrl : bool, optional
            Whether to pileup randomly shifted control regions.
            The default is False.

        Returns
        -------
        pileups : dict
            Keys are tuples of (start, end) coordinates.
            Values are tuples of (n, pileup)
            n : int
                How many ROIs were piled up.
            pileup : 2D array
                Pileup for the region
        """
        pileups = dict()
        for (start, end), stream in self.CC.get_combinations_by_window(chrom, ctrl):
            pileup, nums, cov_starts, cov_ends, ns = self._do_pileups(
                mids=stream, chrom=chrom, expected=expected
            )
            n = np.sum(ns)
            num = np.sum(nums, axis=0)
            if n > 0:
                pileup = pileup / num
            else:
                pileup = self.make_outmap()
            pileups[(start, end)] = n, pileup
        n_pileups = len(pileups)
        if expected:
            kind = "expected"
        elif ctrl:
            kind = "control"
        else:
            kind = ""
        logging.info(f"{chrom}: {n_pileups} {kind} by-window pileups")
        return pileups

    def pileupsByWindowWithControl(self, nproc=1):
        """Perform by-window pileups across all chromosomes and applies
        required normalization

        Parameters
        ----------
        nproc : int, optional
            How many cores to use. Sends a whole chromosome per process.
            The default is 1.

        Returns
        -------
        finloops : dict
            Keys are tuples of (chrom, start, end) coordinates.
            Values are tuples of (n, pileup)
            n : int
                How many ROIs were piled up.
            pileup : 2D array
                Pileup for the region
        """
        if nproc > 1:
            p = Pool(nproc)
            mymap = p.map
        else:
            mymap = map
        # Loops
        f = partial(self.pileupsByWindow, ctrl=False, expected=False,)
        loops = {chrom: lps for chrom, lps in zip(self.chroms, mymap(f, self.chroms))}
        # Controls
        if self.expected is not False:
            f = partial(self.pileupsByWindow, ctrl=False, expected=True,)
            ctrls = {
                chrom: lps for chrom, lps in zip(self.chroms, mymap(f, self.chroms))
            }
        elif self.control:
            f = partial(self.pileupsByWindow, ctrl=True, expected=False,)
            ctrls = {
                chrom: lps for chrom, lps in zip(self.chroms, mymap(f, self.chroms))
            }
        if nproc > 1:
            p.close()
        finloops = {}
        for chrom in loops.keys():
            for pos, lp in loops[chrom].items():
                if self.expected is not False or self.control:
                    loop = lp[1] / ctrls[chrom][pos][1]
                else:
                    # Without control/expected, return the raw pileups;
                    # previously finloops stayed empty in this case
                    loop = lp[1]
                loop[~np.isfinite(loop)] = 0
                finloops[(chrom, pos[0], pos[1])] = lp[0], loop
        return finloops