####!/usr/bin/env python2.7 ### ### This script is part of Sequenza ### http://www.cbs.dtu.dk/biotools/sequenza/ ### import argparse, os, sys, gzip, math, multiprocessing, time, logging, json, gc from itertools import izip_longest from multiprocessing.pool import ThreadPool from functools import partial from multiprocessing.queues import SimpleQueue from contextlib import contextmanager from shutil import rmtree from subprocess import Popen, check_call, PIPE from tempfile import mkdtemp VERSION = "2.1.0" DATE = "07 October 2014" AUTHOR = "Favero Francesco" MAIL = "favero@cbs.dtu.dk" @contextmanager def named_pipe(): ''' Thanks to google for this nice solution :) ''' dirname = mkdtemp() try: path = os.path.join(dirname, "temp.fifo") os.mkfifo(path) yield path finally: rmtree(dirname) def check_dir(directory): ''' Check the given directory and return the whole path or set the default to working directory. ''' if directory == '': directory = None if directory == None: directory = os.getcwd() if os.path.isdir(directory): directory = os.path.abspath(directory) else: sys.exit("Not existing directory, please create the directory first") return directory def walklevel(directory, level=1): ''' ''' directory = directory.rstrip(os.path.sep) assert os.path.isdir(directory) num_sep = directory.count(os.path.sep) for root, dirs, files in os.walk(directory): yield root, dirs, files num_sep_this = root.count(os.path.sep) if num_sep + level <= num_sep_this: del dirs[:] def xopen(filename, mode='r'): ''' Replacement for the "open" function that can also open files that have been compressed with gzip. If the filename ends with .gz, the file is opened with gzip.open(). If it doesn't, the regular open() is used. If the filename is '-', standard output (mode 'w') or input (mode 'r') is returned. ''' assert isinstance(filename, basestring) if filename == '-': return sys.stdin if 'rb' in mode else sys.stdout if filename.endswith('.gz'): return gzip.open(filename, mode) else: return open(filename, mode) class IterableQueue(SimpleQueue): _sentinel = object() def next(self): item = self.get() if item == "EOF": self.close() raise StopIteration return item def __iter__(self): return (iter(self.next, self._sentinel)) def close(self): self.put(self._sentinel) def grouper(n, iterable, padvalue=None): """grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')""" return izip_longest(*[iter(iterable)]*n, fillvalue=padvalue) def pileup_partial_split(pileup_line): ''' split pileup line in chromosome, position and rest of line, without even strip ''' chromosome, position, line = pileup_line.split('\t', 2) return [chromosome, int(position), line] class multiPileups: ''' Define an iterable object merging two pileups ''' def __init__(self, p1, p2): self.p1 = p1 self.p2 = p2 self._last_chromosome = None _sentinel = object() def next(self): try: chromosome1, position1, line1 = pileup_partial_split(self.p1.next()) chromosome2, position2, line2 = pileup_partial_split(self.p2.next()) except ValueError: raise StopIteration if chromosome1 == chromosome2: self._last_chromosome = chromosome1 going_on = True while going_on: ''' If one of the files finishes, it will raise StopIteration and will stop the iteration cleanly ''' try: if chromosome1 == chromosome2: self._last_chromosome = chromosome1 if position1 == position2: ref1, line1 = line1.split('\t', 1) ref2, line2 = line2.split('\t', 1) return [chromosome1, position1, ref1, line1.strip(), line2.strip()] going_on = False elif position1 > position2: chromosome2, position2, line2 = pileup_partial_split(self.p2.next()) elif position1 < position2: chromosome1, position1, line1 = pileup_partial_split(self.p1.next()) else: if chromosome1 != self._last_chromosome: chromosome2, position2, line2 = pileup_partial_split(self.p2.next()) else: chromosome1, position1, line1 = pileup_partial_split(self.p1.next()) except StopIteration: going_on = False raise StopIteration def close(self): self.p1.close() self.p2.close() def __iter__(self): return (iter(self.next, self._sentinel)) def next_gcfile(self): gc_line = self.gc.next() try: pos, self._last_GC = gc_line.split("\t", 1) self._last_window_s = int(pos) self._last_window_e = self._last_window_s + self._last_span return self except ValueError: line, chrom, span = gc_line.strip().split() self._chromosome = chrom.split('chrom=')[1] self._last_span = int(span.split('span=')[1]) pos, self._last_GC = self.gc.next().strip().split("\t", 1) self._last_window_s = int(pos) self._last_window_e = self._last_window_s + self._last_span return self except ValueError: if self._chromosome: raise StopIteration else: sys.exit("Error: the GC-content file is not in the expected format. See for example http://hgdownload-test.cse.ucsc.edu/goldenPath/hg19/gc5Base/ ") class GCmultiPileups: ''' Add GC-content to the pileup, and still make an iterable object ''' def __init__(self,multiPileups,gc): self.gc = gc self = next_gcfile(self) self.mpileup = multiPileups self._last_chromosome = None _sentinel = object() def next(self): self.pile_line = self.mpileup.next() going_on = True while going_on: if self._chromosome == self.pile_line[0]: self._last_chromosome = self._chromosome if self.pile_line[1] >= self._last_window_s and self.pile_line[1] < self._last_window_e: self.pile_line.append(self._last_GC.strip()) return self.pile_line going_on = False elif self.pile_line[1] < self._last_window_s: self.pile_line = self.mpileup.next() elif self.pile_line[1] >= self._last_window_e: self = next_gcfile(self) else: if self._last_chromosome != self._chromosome and self._last_chromosome != None: self.pile_line = self.mpileup.next() else: self = next_gcfile(self) def close(self): self.mpileup.close() self.gc.close() def __iter__(self): return (iter(self.next, self._sentinel)) class GCmultiPileupsAltDepth: ''' Add alternative depth to the GCpileup, and again, keep it iterable ''' def __init__(self,GCmultiPileups,p2): self.mpileup = GCmultiPileups self.p2 = p2 self._last_chromosome = None _sentinel = object() def next(self): self.pile_line = self.mpileup.next() try: chromosome2, position2, line2 = pileup_partial_split(self.p2.next()) except ValueError: raise StopIteration if self.pile_line[0] == chromosome2: self._last_chromosome = self.pile_line[0] going_on = True while going_on: ''' If one of the files ever finish it would rise StopIteration and it would stop the iteration cleanly ''' try: if self.pile_line[0] == chromosome2: self._last_chromosome = self.pile_line[0] if self.pile_line[1] == position2: ref2, depth2, line2 = line2.split('\t', 2) self.pile_line.append(depth2.strip()) return self.pile_line going_on = False elif self.pile_line[1] > position2: chromosome2, position2, line2 = pileup_partial_split(self.p2.next()) elif self.pile_line[1] < position2: self.pile_line = self.mpileup.next() else: if self.pile_line[0] != self._last_chromosome: chromosome2, position2, line2 = pileup_partial_split(self.p2.next()) else: self.pile_line = self.mpileup.next() except StopIteration: going_on = False raise StopIteration def close(self): self.mpileup.close() self.p2.close() def __iter__(self): return (iter(self.next, self._sentinel)) def parse_pileup(ref_base, line, qlimit=20, qformat='sanger'): ''' Parse the pileup format ''' depth, mut_list, mut_qual = line ref_base = ref_base.upper() depth = int(depth) if ref_base != "N": freq_dict = parse_pileup_seq(mut_list, mut_qual, depth, ref_base, qlimit, qformat) return [ref_base, depth, freq_dict['A'], freq_dict['C'], freq_dict['G'], freq_dict['T'], freq_dict['Z']] else: return [ref_base, depth, 0, 0, 0, 0, [0, 0, 0, 0]] def parse_pileup_str(line, min_depth, qlimit=20, qformat='sanger'): ''' Parse the pileup format ''' if line: line = line.strip() try: chromosome, n_base, ref_base, depth, mut_list, mut_qual = line.split() rd = int(depth) rebase = ref_base.upper() if rd >= min_depth and rebase != "N": freq_dict = parse_pileup_seq(mut_list, mut_qual, rd, rebase, qlimit, qformat) line = [chromosome, n_base, rebase, depth, freq_dict['A'], freq_dict['C'], freq_dict['G'], freq_dict['T'], ':'.join(map(str, freq_dict['Z']))] return '\t'.join(map(str,line)) else: pass except ValueError: pass else: pass def parse_pileup_seq(seq, quality, depth, reference, qlimit=20, qformat='sanger', noend=False, nostart=False): ''' Parse the piled up sequence and return the frequence of mutation. ''' nucleot_dict = {'A':0, 'C':0, 'G':0, 'T':0} strand_dict = {'A':0, 'C':0, 'G':0, 'T':0} reference = reference.strip().upper() H = 0 n = 0 if qformat == 'sanger': qlimit = qlimit + 33 elif qformat == 'illumina': qlimit = qlimit + 64 else: sys.exit("Supported quality format are only \"illumina\" and \"sanger\"(default).") quality = quality.strip() block = {'seq':'','length':0} start = False del_ins = False l_del_ins = '' last_base = '' ins_del_length = 0 for base in seq.strip(): if block['length'] == 0: if base == '$': if noend: nucleot_dict[last_base] -= 1 elif base == '^': start = True block['length'] += 1 block['seq'] = base elif base == '+' or base == '-': del_ins = True block['length'] += 1 block['seq'] = base elif base == '.' or base == ',': if ord(quality[n]) >= qlimit: nucleot_dict[reference] += 1 if base == '.': strand_dict[reference] += 1 last_base = reference n += 1 elif base.strip().upper() in nucleot_dict: if ord(quality[n]) >= qlimit: nucleot_dict[base.strip().upper()] += 1 if base.strip().isupper(): strand_dict[base.strip().upper()] += 1 last_base = base.strip().upper() n +=1 else: n += 1 else: if start: block['length'] += 1 block['seq'] += base if block['length'] == 3: if not nostart: if base == '.' or base == ',': if ord(quality[n]) >= qlimit: nucleot_dict[reference] += 1 if base == '.': strand_dict[reference] += 1 elif base.strip().upper() in nucleot_dict: if ord(quality[n]) >= qlimit: nucleot_dict[base.strip().upper()] += 1 if base.strip().isupper(): strand_dict[base.strip().upper()] += 1 block['length'] = 0 block['seq'] = '' start = False n += 1 elif del_ins: if base.isdigit(): l_del_ins += base block['seq'] += base block['length'] += 1 else: ins_del_length = int(l_del_ins)+ 1 + len(l_del_ins) block['seq'] += base block['length'] += 1 if block['length'] == ins_del_length: block['length'] = 0 block['seq'] = '' l_del_ins = '' ins_del = False ins_del_length = 0 # Debug line #print str(n) + " " + base + " " + seq if n == depth: nucleot_dict["Z"] = [strand_dict['A'], strand_dict['C'], strand_dict['G'], strand_dict['T']] return nucleot_dict else: #print seq + " " + str(depth)+" " + str(n) sys.exit('Something went wrong on the block counting!!') def line_worker(line, depth_sum, qlimit=20, qformat='sanger', hom_t=0.85, het_t=0.35, alt_pileup=False): ''' After the 3 files are syncronized we need to transform the pileup in a readable format, find the alleles, and compute the allele frequency in tumor ''' if line: bases_list = ['A', 'C', 'G', 'T'] if alt_pileup: chromosome, position, ref, p1_str, p2_str, gc, alt_depth = line else: chromosome, position, ref, p1_str, p2_str, gc = line p1_list = p1_str.split() p2_list = p2_str.split() if int(p1_list[0]) + int(p2_list[0]) >= depth_sum and len(p1_list) == len(p2_list): p1_mu = parse_pileup(ref, p1_list, qlimit, qformat) sum_p1 = float(sum(p1_mu[2:6])) if sum_p1 > 0: p1_freq = [x/sum_p1 for x in p1_mu[2:6]] sort_freq_p1 = sorted(p1_freq, reverse = True) if sort_freq_p1[0] >= hom_t: # Homozygous positions here i = p1_freq.index(sort_freq_p1[0]) p2_mu = parse_pileup(ref, p2_list, qlimit, qformat) sum_p2 = float(sum(p2_mu[2:6])) if sum_p2 > 0: p2_freq = [x/sum_p2 for x in p2_mu[2:6]] homoz_p2 = p2_freq[i] no_zero_idx = [val for val in range(len(p2_freq)) if p2_freq[val] > 0] no_zero_bases = [str(bases_list[ll])+str(round(p2_freq[ll], 3)) for ll in no_zero_idx if ll != i] if no_zero_bases == []: no_zero_bases = '.' strands_bases = '0' else: no_zero_bases = ":".join(map(str, no_zero_bases)) strands_bases = [str(bases_list[ll])+str(round(p2_mu[6][ll]/float(p2_mu[2+ll]), 3)) for ll in no_zero_idx if ll != i] strands_bases = ":".join(map(str, strands_bases)) #homoz_p2 = p2_mu[2 + i]/sum_p2 # chromosome, n_base, base_ref, depth_normal, depth_sample, depth.ratio, Af, Bf, zygosity.normal, GC-content, reads above quality, AB.ref, AB.tum, percentage AB.tum in fw if alt_pileup: line_out = [chromosome, position, p1_mu[0], alt_depth, p2_mu[1], round(p2_mu[1]/float(alt_depth), 3), round(homoz_p2, 3), 0, 'hom', gc, int(sum_p2), bases_list[i], no_zero_bases, strands_bases] else: line_out = [chromosome, position, p1_mu[0], p1_mu[1], p2_mu[1], round(p2_mu[1]/float(p1_mu[1]), 3), round(homoz_p2, 3), 0, 'hom', gc, int(sum_p2), bases_list[i], no_zero_bases, strands_bases] return line_out else: pass else: if sort_freq_p1[1] >= het_t: # Heterozygous position here allele = list() for b in xrange(4): if p1_freq[b] >= het_t: allele.append(bases_list[b]) if len(allele) == 2: p2_mu = parse_pileup(ref, p2_list, qlimit, qformat) sum_p2 = float(sum(p2_mu[2:6])) if sum_p2 > 0: i = bases_list.index(allele[0]) ii = bases_list.index(allele[1]) het_a_p2 = p2_mu[2 + i]/sum_p2 het_b_p2 = p2_mu[2 + ii]/sum_p2 if het_a_p2 >= het_b_p2: if alt_pileup: line_out = [chromosome, position, p1_mu[0], alt_depth, p2_mu[1], round(p2_mu[1]/float(alt_depth), 3), round(het_a_p2 , 3), round(het_b_p2 , 3) , 'het', gc, int(sum_p2), bases_list[i]+bases_list[ii], ".", "0"] else: line_out = [chromosome, position, p1_mu[0], p1_mu[1], p2_mu[1], round(p2_mu[1]/float(p1_mu[1]), 3), round(het_a_p2 , 3), round(het_b_p2 , 3) , 'het', gc, int(sum_p2), bases_list[i]+bases_list[ii], ".", "0"] return line_out elif het_a_p2 < het_b_p2: if alt_pileup: line_out = [chromosome, position, p1_mu[0], alt_depth, p2_mu[1], round(p2_mu[1]/float(alt_depth), 3), round(het_b_p2 , 3), round(het_a_p2 , 3) , 'het', gc, int(sum_p2), bases_list[ii]+bases_list[i], ".", "0"] else: line_out = [chromosome, position, p1_mu[0], p1_mu[1], p2_mu[1], round(p2_mu[1]/float(p1_mu[1]), 3), round(het_b_p2 , 3), round(het_a_p2 , 3) , 'het', gc, int(sum_p2), bases_list[ii]+bases_list[i], ".", "0"] return line_out else: pass else: pass else: pass else: pass class abfreReduce: def __init__(self, abf, w): self.abf = abf self.w = w self._status = 1 self._header = next(self.abf).strip() line = next(self.abf).strip() line_ls = line.split('\t') self.__refresh__(line_ls, line) _sentinel = object() def next(self): if self._status == 1: try: line = next(self.abf).strip() line_ls = line.split('\t') except StopIteration: self._status = 0 self.__do_dict__() #return [line_ls[1], self.w + self._last_edge, self._n] return self.line_dict if self.w + self._last_edge < int(line_ls[1]) or self._last_chromosome != line_ls[0]: self.__do_dict__() line_dict = self.line_dict self.__refresh__(line_ls, line) #return [line_ls[1], self.w + self._last_edge, self._n] return line_dict else: self.__addline__(line_ls, line) elif self._status == 0: raise StopIteration def __refresh__(self, line_ls, line): self._last_chromosome = line_ls[0] self._last_edge = int(line_ls[1]) self._last_position = self._last_edge self._n_depth = int(line_ls[3]) self._t_depth = int(line_ls[4]) self._ratio = float(line_ls[5]) self._gc = float(line_ls[9]) self._n = 1 self.line_dict = {'top': '', 'middle': [], 'end': ''} if line_ls[12]+line_ls[8] != '.hom': self.line_dict['top'] = line_ls def __addline__(self, line_ls, line): self._last_position = int(line_ls[1]) self._n_depth += int(line_ls[3]) self._t_depth += int(line_ls[4]) self._ratio += float(line_ls[5]) self._gc += float(line_ls[9]) self._n += 1 if line_ls[12]+line_ls[8] != '.hom': self.line_dict['middle'].append(line_ls) def __do_dict__(self): gc = str(int(round(self._gc/self._n, 0))) avg_line = [self._last_chromosome, self._last_position, 'N', self._n_depth/self._n, self._t_depth/self._n, round(self._ratio/self._n, 3), 1.0, 0, 'hom', gc , self._n, 'N', '.', '0'] self.line_dict['end'] = map(str, avg_line) if self.line_dict['top'] == '': avg_line[1] = self._last_edge self.line_dict['top'] = map(str, avg_line) else: self.line_dict['top'][9] = gc for mid in self.line_dict['middle']: mid[9] = gc def close(self): self.abf.close() def __iter__(self): return (iter(self.next, self._sentinel)) def stream_fasta(fa_file, window_size, out_queue): """ Read and stream a fasta file in a Pipe """ tmp_line = '' for line in fa_file: line = line.strip() if line.startswith('>'): chromosome = line if tmp_line == '': #print chromosome out_queue.put(chromosome) else: groups = grouper(window_size, tmp_line.upper()) for group in groups: #print group out_queue.put(group) tmp_line = '' #print chromosome out_queue.put(chromosome) else: tmp_line = tmp_line + line if len(tmp_line)%window_size == 0: groups = grouper(window_size, tmp_line.upper()) for group in groups: #print group out_queue.put(group) tmp_line = '' else: #print tmp_line pass if tmp_line == '': pass else: groups = grouper(window_size, tmp_line.upper()) for group in groups: #print group out_queue.put(group) out_queue.put('EOF') def seq_map(seq_list): """ ... """ counter = {"A":0, "C":0, "G":0, "N":0, "T":0, "R":0, "Y":0, "S":0, "W":0, "K":0, "M":0, "B":0, "D":0, "H":0, "V":0} for nucleotide in seq_list: counter[nucleotide] +=1 return counter def process_gc_from_pipe(in_queue, window_size): """ Process fasta from a multiprocessing.queue and compute the percentage of GC within a given window """ base_counter = 1 window_size = float(window_size) while True: seq_list = in_queue.get() #print seq_list if seq_list != 'EOF': if type(seq_list) == str: if seq_list.startswith('>'): chromosome = seq_list.strip('>').split(" ")[0] print "variableStep chrom=" + chromosome + " span=" + str(int(window_size)) base_counter = 1 else: seq_list = filter(None,seq_list) stats = seq_map(seq_list) seq_len = len(seq_list) if seq_len == window_size: if (stats['N'] + stats['R'] + stats['Y'] + stats['K'] + stats['M'] + stats['B'] + stats['D'] + stats['H'] + stats['V'])/window_size >= 0.75: base_counter = base_counter + window_size pass else: gc_percent = 100 * ((stats['G'] + stats['C'] + stats['S'])/window_size) print str(int(base_counter)) + "\t" + str(gc_percent) base_counter = base_counter + window_size else: if (stats['N'] + stats['R'] + stats['Y'] + stats['K'] + stats['M'] + stats['B'] + stats['D'] + stats['H'] + stats['V'])/window_size >= 0.75: base_counter = base_counter + seq_len pass else: gc_percent = 100 * ((stats['G'] + stats['C'] + stats['S'])/float(seq_len)) print str(int(base_counter)) + "\t" + str(gc_percent) base_counter = base_counter + seq_len else: break class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter): def _format_action(self, action): parts = super(argparse.RawDescriptionHelpFormatter, self)._format_action(action) if action.nargs == argparse.PARSER: parts = "\n".join(parts.split("\n")[1:]) return parts class DefaultHelpParser(argparse.ArgumentParser): def error(self, message): sys.stderr.write('error: %s\n' % message) self.print_help() sys.exit(2) def DOpup2seqz(p1, p2, gc, n2, n, qlimit, qformat, hom, het, fileout, out_header): stream_mpileup = IterableQueue() if not n2: line_worker_partial = partial(line_worker, depth_sum=n, qlimit=qlimit, qformat=qformat, hom_t=hom, het_t=het) with xopen(p1, 'rb') as normal, xopen(p2, 'rb') as tumor, xopen(gc, 'rb') as gc_file: pup = multiPileups(normal,tumor) pup = GCmultiPileups(pup, gc_file) fileout.write("\t".join(out_header) + '\n') for line in pup: res = line_worker_partial(line) if res: fileout.write('\t'.join(map(str,res))+'\n') else: line_worker_partial = partial(line_worker, depth_sum=n, qlimit=qlimit, qformat=qformat, hom_t=hom, het_t=het, alt_pileup=True) with xopen(p1, 'rb') as normal, xopen(p2, 'rb') as tumor, xopen(gc, 'rb') as gc_file, xopen(n2, 'rb') as alt_normal: pup = multiPileups(normal,tumor) pup = GCmultiPileups(pup, gc_file) pup = GCmultiPileupsAltDepth(pup, alt_normal) fileout.write("\t".join(out_header) + '\n') for line in pup: res = line_worker_partial(line) if res: fileout.write('\t'.join(map(str,res))+'\n') def pileup2acgt(parser, subparser): subparser.add_argument('pileup', metavar='pileup', help='Name of the input pileup (SAMtools) file. If the filename ends in .gz it will be opened in gzip mode. If the file name is - it will be read from STDIN.') subparser.add_argument('-n', dest='n', type=int, help='The minimum read depth on a base required to make a mutation call.') parser_pup2muoutput = subparser.add_argument_group(title='Output', description='Arguments that involve the output destination.') parser_pup2muoutput.add_argument('-o', '--output', dest='output', type = str, default = '-', help='Name of the output file. To use gzip compression name the file ending in .gz. (default STDOUT).') parser_pup2muoutput.add_argument('--quiet', dest='quiet', action="store_true", help='Do not output additional debugging information.') parser_pup2muqualitysets = subparser.add_argument_group(title='Quality and Format', description='Argument that change the quality threshold or the quality format.') parser_pup2muqualitysets.add_argument('-q', '--qlimit', dest='qlimit', default=20,type=int, help='Minimum nucleotide quality score for inclusion in the counts.') parser_pup2muqualitysets.add_argument('-f', '--qformat', dest='qformat', default="sanger", help='Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value.') parser_pup2muinfo = subparser.add_argument_group(title='Info',description='Other Information.') parser_pup2muinfo.add_argument('-v', '--version', action="version", version='%(prog)s version: ' + VERSION + ". " + DATE, help='Display the version information and exit.') return parser.parse_args() def pileup2seqz(parser, subparser): parser_ABinput = subparser.add_argument_group(title='Input Files',description='Required input files.') parser_ABgenotype = subparser.add_argument_group(title='Genotyper',description='Options regarding the genotyping.') parser_ABqualitysets = subparser.add_argument_group(title='Quality and Format', description='Options that change the quality threshold and format.') parser_ABinput.add_argument('-n', '--normal', dest = 'normal', required = True, help='Name of the pileup file from the reference/normal sample') parser_ABinput.add_argument('-t', '--tumor', dest = 'tumor', required = True, help='Name of the pileup file from the tumor sample') parser_ABinput.add_argument('-gc', dest = 'gc', metavar = 'gc', required = True, help='The GC-content file coming from UCSC genome browser, or generated in the same UCSC format') parser_ABinput.add_argument('-n2', '--normal2', dest = 'normal2', type = str, default = None, help='EXPERIMENTAL: Optional pileup used only to compute the depth.normal and depth-ratio, instead of \"normal\"') parser_ABqualitysets.add_argument('-q', '--qlimit', dest = 'qlimit', default = 20, type = int, help='Minimum nucleotide quality score for inclusion in the counts. Default 20.') parser_ABqualitysets.add_argument('-f', '--qformat', dest = 'qformat', default = "sanger", help='Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value. Default "sanger".') parser_ABqualitysets.add_argument('-N', dest = 'n', type = int, default = 20, help='Threshold to filter positions by the sum of read depth of the two samples. Default 20.') parser_ABgenotype.add_argument('--hom', dest = 'hom', type = float, default = 0.9, help='Threshold to select homozygous positions. Default 0.9.') parser_ABgenotype.add_argument('--het', dest = 'het', type = float, default = 0.25, help='Threshold to select heterozygous positions. Default 0.25.') return parser.parse_args() def bam2seqz(parser, subparser): parser_ABinput = subparser.add_argument_group(title='Input Files',description='Required input files.') parser_ABgenotype = subparser.add_argument_group(title='Genotyper',description='Options regarding the genotyping.') parser_ABsamtools = subparser.add_argument_group(title='Samtools', description='Options regarding samtools.') parser_ABqualitysets = subparser.add_argument_group(title='Quality and Format', description='Options that change the quality threshold and format.') parser_ABinput.add_argument('-n', '--normal', dest = 'normal', required = True, help='Name of the BAM file from the reference/normal sample') parser_ABinput.add_argument('-t', '--tumor', dest = 'tumor', required = True, help='Name of the BAM file from the tumor sample') parser_ABinput.add_argument('-gc', dest = 'gc', metavar = 'gc', required = True, help='The GC-content file coming from UCSC genome browser, or generated in the same UCSC format') parser_ABinput.add_argument('-F', "--fasta", dest = 'fasta', metavar = 'fasta', required = True, help='The reference FASTA file used to generate the pileup') parser_ABinput.add_argument('-n2', '--normal2', dest = 'normal2', type = str, default = None, help='EXPERIMENTAL: Optional BAM used only to compute the depth.normal and depth-ratio, instead of \"normal\"') parser_ABqualitysets.add_argument('-q', '--qlimit', dest = 'qlimit', default = 20, type = int, help='Minimum nucleotide quality score for inclusion in the counts. Default 20.') parser_ABqualitysets.add_argument('-f', '--qformat', dest = 'qformat', default = "sanger", help='Quality format, options are "sanger" or "illumina". This will add an offset of 33 or 64 respectively to the qlimit value. Default "sanger".') parser_ABqualitysets.add_argument('-N', dest = 'n', type = int, default = 20, help='Threshold to filter positions by the sum of read depth of the two samples. Default 20.') parser_ABgenotype.add_argument('--hom', dest = 'hom', type = float, default = 0.9, help='Threshold to select homozygous positions. Default 0.9.') parser_ABgenotype.add_argument('--het', dest = 'het', type = float, default = 0.25, help='Threshold to select heterozygous positions. Default 0.25.') parser_ABsamtools.add_argument("-S", '--samtools', dest = 'samtools', type = str, default = "samtools", help='Path of samtools to use for the pileup generation.') parser_ABsamtools.add_argument("-C", '--chromosome', dest = 'chr', type = str, default = None, help='Argument to restrict the input/output to a chromosome or a chromosome region. Coordinate format is Name:pos.start-pos.end, eg: chr17:7565097-7590856, for a particular region; eg: chr17, for the entire chromosome. Chromosome names can checked in the BAM files and are depending on the FASTA reference used for alignment. Default behaviour is to not selecting any cromosome.') return parser.parse_args() def GC_windows(parser, subparser): subparser.add_argument('fasta', metavar = 'fasta', help='the fasta file. It can be a file name or \"-\" to take the input from STDIN') subparser.add_argument('-w', dest = 'window', type = int, default = 50, help='The window size to calculate the GC-content percentage') return parser.parse_args() def merge_pileups(parser, subparser): subparser.add_argument('-1', '--pileup1', dest = 'p1', required = True, help='Name of the first pileup file') subparser.add_argument('-2', '--pileup2', dest = 'p2', required = True, help='Name of the second pileup, will show as the last columns set') return parser.parse_args() def reduce_seqz(parser, subparser): subparser.add_argument('-s', '--seqz', dest = 'seqz', required = True, help='An seqz file from the pileup2seqz function.') subparser.add_argument('-w', '--window', dest = 'w', type = int, default = 50, help='Window size used for binning the original seqz file. Default is 50.') return parser.parse_args() def main(): ''' Execute the function with args ''' parser = DefaultHelpParser(prog = __file__, formatter_class=lambda prog: SubcommandHelpFormatter(prog, max_help_position=20, width=75), description='\nsequenza-utils.py \nThis script is part of Sequenza http://www.cbs.dtu.dk/biotools/sequenza/ \n\nSequenza Utils is a set of tools to perform various tasks, primarily aimed to convert bam/pileup files to a format usable by the sequenza R package.', usage= 'sequenza-utils.py command [options]', epilog = 'This is version {0} - Francesco Favero - {1}'.format(VERSION, DATE)) subparsers = parser.add_subparsers(dest='module') subparsers.metavar = None parser_bam2seqz = subparsers.add_parser('bam2seqz', help = 'Process a paired set of BAM files (tumor and matching normal), and GC-content genome-wide information, to extract the common positions with A and B alleles frequencies',formatter_class=lambda prog: SubcommandHelpFormatter(prog,max_help_position=39, width=90)) parser_pileup2seqz = subparsers.add_parser('pileup2seqz', help = 'Process a paired set of pileup files (tumor and matching normal), and GC-content genome-wide information, to extract the common positions with A and B alleles frequencies',formatter_class=lambda prog: SubcommandHelpFormatter(prog,max_help_position=39, width=90)) parser_reduce_seqz = subparsers.add_parser('seqz-binning', help = 'Perform binning of the seqz file to reduce file size and memory requirement for the analysis.') parser_pup2mu = subparsers.add_parser('pileup2acgt', help = 'Convert pileup format to ACGT format',formatter_class=lambda prog: SubcommandHelpFormatter(prog,max_help_position=30, width=90)) parser_gc_window = subparsers.add_parser('GC-windows', help = 'Compute the average GC content in sliding windows from a fasta file, output in the same format as gc5Base from UCSC') parser_merge_pileups = subparsers.add_parser('merge-pileups', help = 'Merge two pileups by finding the common positions, and return an mpileup file adding the second pileup as the last 3 columns.') try: used_module = sys.argv[1] if used_module == "pileup2acgt": args = pileup2acgt(parser, parser_pup2mu) with xopen(args.output, "wb") as fileout, xopen(args.pileup, "rb") as f: fileout.write('chr' + "\t" + 'n_base' + "\t" + 'ref_base' + "\t" + 'read.depth' + "\t" + 'A' + "\t" + 'C' + "\t" + 'G' + "\t" + 'T' + "\t" + "strand" + '\n') parse_pileup_partial = partial(parse_pileup_str, min_depth=args.n, qlimit=args.qlimit, qformat=args.qformat) for line in f: try: fileout.write(parse_pileup_partial(line) + '\n') except AttributeError: pass elif used_module == "bam2seqz": args = bam2seqz(parser, parser_bam2seqz) with xopen('-', "wb") as fileout: out_header = ["chromosome", "position", "base.ref", "depth.normal", "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", "GC.percent", "good.reads", "AB.normal", "AB.tumor", "tumor.strand"] if args.chr: cmd_tum1 = [args.samtools, "view", "-u", args.tumor, args.chr] cmd_nor1 = [args.samtools, "view", "-u", args.normal, args.chr] else: cmd_tum1 = [args.samtools, "view", "-u", args.tumor] cmd_nor1 = [args.samtools, "view", "-u", args.normal] cmd_tum2 = [args.samtools, "mpileup", "-f", args.fasta, "-q " + str(args.qlimit), "-"] cmd_nor2 = [args.samtools, "mpileup", "-f", args.fasta, "-q " + str(args.qlimit), "-"] if args.normal2: if args.chr: cmd_nor3 = [args.samtools, "view", "-u", args.normal2, args.chr] else: cmd_nor3 = [args.samtools, "view", "-u", args.normal2] cmd_nor4 = [args.samtools, "mpileup", "-f", args.fasta, "-q " + str(args.qlimit), "-"] tum1 = Popen(cmd_tum1, stdout = PIPE) nor1 = Popen(cmd_nor1, stdout = PIPE) nor2 = Popen(cmd_nor3, stdout = PIPE) with named_pipe() as tfifo, named_pipe() as nfifo, named_pipe() as n2fifo: res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, n2fifo, args.n, args.qlimit, args.qformat, args.hom, args.het, fileout, out_header)) res.start() with open(nfifo, 'wb', 0) as normal, open(tfifo, 'wb', 0) as tumor, open(n2fifo, 'wb', 0) as normal2: fifos = [Popen(cmd_tum2, stdin=tum1.stdout, stdout=tumor, stderr=PIPE), Popen(cmd_nor2, stdin=nor1.stdout, stdout=normal, stderr=PIPE), Popen(cmd_nor4, stdin=nor2.stdout, stdout=normal2, stderr=PIPE)] tum1.stdout.close() nor1.stdout.close() nor2.stdout.close() res.join() for fifo in fifos: fifo.terminate() else: tum1 = Popen(cmd_tum1, stdout = PIPE) nor1 = Popen(cmd_nor1, stdout = PIPE) with named_pipe() as tfifo, named_pipe() as nfifo: res = multiprocessing.Process(target = DOpup2seqz, args = (nfifo, tfifo, args.gc, args.normal2, args.n, args.qlimit, args.qformat, args.hom, args.het, fileout, out_header)) res.start() with open(nfifo, 'wb', 0) as normal, open(tfifo, 'wb', 0) as tumor: fifos = [Popen(cmd_tum2, stdin=tum1.stdout, stdout=tumor, stderr=PIPE), Popen(cmd_nor2, stdin=nor1.stdout, stdout=normal, stderr=PIPE)] tum1.stdout.close() nor1.stdout.close() res.join() for fifo in fifos: fifo.terminate() elif used_module == "pileup2seqz": args = pileup2seqz(parser, parser_pileup2seqz) with xopen('-', "wb") as fileout: out_header = ["chromosome", "position", "base.ref", "depth.normal", "depth.tumor", "depth.ratio", "Af", "Bf", "zygosity.normal", "GC.percent", "good.reads", "AB.normal", "AB.tumor", "tumor.strand"] res = multiprocessing.Process(target = DOpup2seqz, args = (args.normal, args.tumor, args.gc, args.normal2, args.n, args.qlimit, args.qformat, args.hom, args.het, fileout, out_header)) res.start() res.join() elif used_module == "GC-windows": args = GC_windows(parser, parser_gc_window) gc_queue = SimpleQueue() gc_process = multiprocessing.Process(target = process_gc_from_pipe, args = (gc_queue, args.window)) gc_process.deamon = True gc_process.start() with xopen(args.fasta, 'rb') as fa_file: stream_fasta(fa_file, args.window, gc_queue) #gc_process.terminate() #gc_process.join() elif used_module == "merge-pileups": args = merge_pileups(parser, parser_merge_pileups) with xopen(args.p1, 'rb') as pileup1, xopen(args.p2, 'rb') as pileup2: pup = multiPileups(pileup1,pileup2) for line in pup: print('\t'.join(map(str, line)) + '\n') elif used_module == "seqz-binning": args = reduce_seqz(parser, parser_reduce_seqz) with xopen(args.seqz, 'rb') as seqz: abfred = abfreReduce(seqz, args.w) print abfred._header for a in abfred: if a: print '\t'.join(a['top']) for mid in a['middle']: print '\t'.join(mid) if ['end'] != ['top']: print '\t'.join(a['end']) else: return parser.parse_args() except IndexError: args = parser.parse_args() if __name__ == "__main__": main()