''' Created on May 3, 2011 @author: Luca Pinello ''' import mmap import math import os, glob, string import numpy as np from numpy import zeros, inf, log2 from numpy.random import randint,permutation from scipy.stats import rv_discrete from scipy.io.matlab import savemat import subprocess import tempfile import time from bx.intervals.intersection import Intersecter, Interval from bx.seq.twobit import TwoBitFile import cPickle try: import pysam except: pass mask=lambda c: c if c.isupper() else 'N' def chunks(l, n): return [l[i:i+n] for i in range(0, len(l), n)] class constant_minus_one_dict (dict): def __missing__ (self, key): return -1 class constant_n_dict (dict): def __missing__ (self, key): return 'n' nt2int=constant_minus_one_dict({'a':0,'c':1,'g':2,'t':3}) int2nt=constant_n_dict({0:'a',1:'c',2:'g',3:'t'}) nt_complement=dict({'a':'t','c':'g','g':'c','t':'a','n':'n'}) def read_sequence_from_fasta(fin, bpstart, bpend, line_length=50.0): bpstart=bpstart-1 fin.seek(0) fin.readline() #read the first line; the pointer is at the second line nbp = bpend - bpstart offset = int( bpstart + math.floor(bpstart/line_length)) #assuming each line contains 50 characters; add 1 offset per line if offset > 0: fin.seek(int(offset),1) seq = fin.read(nbp+int(math.floor(nbp/line_length))+1) seq = seq.replace('\n','') if len(seq) < nbp: print 'Coordinate out of range:',bpstart,bpend return seq[0:nbp].lower() def FastaIterator(handle): while True: line = handle.readline() if line == "" : return if line[0] == ">": #skip > break while True: if line[0]!=">": raise ValueError("Fasta files should start with '>'") else: descr = line[1:].rstrip() id = descr.split()[0] name = id lines = [] line = handle.readline() while True: if not line : break if line[0] == ">": break lines.append(line.rstrip().replace(" ","").replace("\r","")) line = handle.readline() yield "".join(lines) if not line : return #End class Coordinate: def chr_id2_ord(self): id=self.chr_id.replace('chr','') try: id=int(id) except: cs=0 for c in id: cs+=ord(c) id=cs return id def __init__(self,chr_id,bpstart,bpend,name=None,score=None,strand=None): self.chr_id=chr_id self.bpstart=bpstart self.bpend=bpend self.name=name self.score=score self.strand=strand def bpcenter(self): return (self.bpstart+self.bpend)/2 def __eq__(self,other): return (self.chr_id==other.chr_id) & (self.bpstart==other.bpstart) & (self.bpend==other.bpend) def __ne__(self,other): return not self.__eq__(other) def __lt__(self, other): if self.chr_id2_ord<other.chr_id2_ord: return True elif self.chr_id2_ord>other.chr_id2_ord: return False elif self.bpstart<other.bpstart: return True elif self.bpstart>other.bpstart: return False else: return False def __gt__(self,other): return not ( self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) def __hash__(self): return hash((self.chr_id,self.bpstart,self.bpend)) def __str__(self): return self.chr_id+':'+str(self.bpstart)+'-'+str(self.bpend)+ (' '+self.name if self.name else '') + ( ' '+ str(self.score) if self.score else '')+ (' '+self.strand if self.strand else '') def __repr__(self): return self.chr_id+':'+str(self.bpstart)+'-'+str(self.bpend)+ (' '+self.name if self.name else '') + ( ' '+ str(self.score) if self.score else '')+ (' '+self.strand if self.strand else '') def __len__(self): return self.bpend-self.bpstart def __and__(self,other): if self.bpend < other.bpstart or other.bpend < self.bpstart or self.chr_id != other.chr_id: return None else: return Coordinate(self.chr_id,max(self.bpstart,other.bpstart),min(self.bpend,other.bpend)) def upstream(self,offset=2000): return Coordinate(self.chr_id,self.bpend+1,self.bpend+offset) def downstream(self,offset=2000): return Coordinate(self.chr_id,max(0,self.bpstart-offset),self.bpstart-1) @classmethod def bed_to_coordinates(cls,bed_filename,header_lines=0, cl_chr_id=0, cl_bpstart=1, cl_bpend=2, cl_name=3, cl_score=4, cl_strand=5): with open(bed_filename,'r') as infile: coordinates = list() for _ in range(header_lines): infile.readline() for line in infile: line=line.strip() try: coord=line.split('\t') chr_id=coord[cl_chr_id] bpstart=coord[cl_bpstart] bpend=coord[cl_bpend] try: name=coord[cl_name] except: name='ND' try: score=float(coord[cl_score]) except: score=0 try: strand=coord[cl_strand] except: strand='ND' coordinates.append(Coordinate(str(chr_id), int(bpstart), int(bpend), name, score,strand)) except: print 'Skipping line:',line return coordinates @classmethod def bed_to_coordinates_dict(cls,bed_file,header_lines=0,cl_chr_id=0, cl_bpstart=1, cl_bpend=2, cl_name=3, cl_score=4, cl_strand=5): with open(bed_file,'r') as infile: coordinates = dict() for _ in range(header_lines): infile.readline() for idx,line in enumerate(infile): coord=line.split() chr_id=coord[cl_chr_id] bpstart=coord[cl_bpstart] bpend=coord[cl_bpend] try: name=coord[cl_name] except: name='ND' try: score=coord[cl_score] except: score='ND' try: strand=coord[cl_strand] except: strand='ND' c=Coordinate(str(chr_id), int(bpstart), int(bpend), name, score,strand) print c coordinates[c]=idx+1 return coordinates @classmethod def coordinates_to_bed(cls,coordinates,bed_file,minimal_format=False): with open(bed_file,'w+') as outfile: for c in coordinates: if minimal_format: outfile.write('%s\t%d\t%d\n' %(c.chr_id,c.bpstart,c.bpend) ) else: outfile.write('%s\t%d\t%d\t%s\t%f\t%s\n' %(c.chr_id,c.bpstart,c.bpend,c.name if c.name else'ND',c.score if c.score else 0,c.strand if c.strand else '+') ) @classmethod def coordinates_to_nscore_format(cls,coordinates,bed_file): with open(bed_file,'w+') as outfile: for c in coordinates: outfile.write('%s\t%d\n' %(c.chr_id,c.bpcenter) ) @classmethod def coordinates_from_interval(cls,chr_id,interval): return Coordinate(chr_id, interval.start,interval.end,) @classmethod def coordinates_to_fasta(cls,coordinates,fasta_file,genome,chars_per_line=50,mask_repetitive=False): with open(fasta_file,'w+') as outfile: for c in coordinates: seq=genome.extract_sequence(c,mask_repetitive) outfile.write('>'+str(c)+'\n'+'\n'.join(chunks(seq,chars_per_line))+'\n') @classmethod def calculate_intersection(cls,coords1,coords2,build_matrix=False): coords_in_common=set() intersection_indexes=set() if build_matrix: intersection_matrix=sparse_matrix((len(coords1),len(coords2))) coord_to_row_index=dict() row_index=0 interval_tree=dict() #Build the interval tree on the first set of coordinates for c in coords1: if c.chr_id not in interval_tree: interval_tree[c.chr_id]=Intersecter() interval_tree[c.chr_id].add_interval(Interval(c.bpstart,c.bpend)) coord_to_row_index[c]=row_index row_index+=1 #Calculating the intersection #for each coordinates on the second set check intersection and fill the matrix for cl_index,c in enumerate(coords2): if interval_tree.has_key(c.chr_id): coords_hits=interval_tree[c.chr_id].find(c.bpstart, c.bpend) #coords_in_common+=coords_hits for coord_hit in coords_hits: c_to_add=Coordinate.coordinates_from_interval(c.chr_id, coord_hit) coords_in_common.add(c_to_add) row_index=coord_to_row_index[c_to_add] intersection_indexes.add(row_index) if build_matrix: intersection_matrix[row_index,cl_index]+=1 if build_matrix: return list(coords_in_common),list(intersection_indexes),intersection_matrix else: return list(coords_in_common),list(intersection_indexes) bpcenter=property(bpcenter) chr_id2_ord=property(chr_id2_ord) @classmethod def coordinates_of_intervals_around_center(cls,coords,window_size): half_window=window_size/2 return [Coordinate(c.chr_id,c.bpcenter-half_window,c.bpcenter+half_window,strand=c.strand,name=c.name,score=c.score) for c in coords] #intersecter class Coordinates_Intersecter: def __init__(self,coordinates): self.interval_tree=dict() for c in coordinates: if c.chr_id not in self.interval_tree: self.interval_tree[c.chr_id]=Intersecter() self.interval_tree[c.chr_id].add_interval(Interval(c.bpstart,c.bpend,c)) def find_intersections(self,c): coords_in_common=list() if self.interval_tree.has_key(c.chr_id): coords_hits=self.interval_tree[c.chr_id].find(c.bpstart-1, c.bpend+1) for coord_hit in coords_hits: coords_in_common.append(coord_hit.value) return coords_in_common class Gene: regions=[8000,2000,0,1000] def __init__(self,access,name,coordinate,regions=None,exons=None,introns=None,): self.access=access self.c=coordinate self.name=name self.exons=exons self.introns=introns if regions: self.regions=regions def __str__(self): return self.name+'_'+self.access+'_'+str(self.c) def __repr__(self): return self.name+'_'+self.access+'_'+str(self.c) def tss(self): if self.c.strand=='-': return self.c.bpend else: return self.c.bpstart def tes(self): if self.c.strand=='+': return self.c.bpend else: return self.c.bpstart def end(self): if self.c.strand=='+': return self.c.bpend else: return self.c.bpstart def distal_c(self): if self.c.strand=='+': return Coordinate(self.c.chr_id,self.tss-self.regions[0],self.tss-self.regions[1]-1,strand='+') else: return Coordinate(self.c.chr_id,self.tss+self.regions[1]+1,self.tss+self.regions[0],strand='-') def promoter_c(self): if self.c.strand=='+': return Coordinate(self.c.chr_id,self.tss-self.regions[1],self.tss+self.regions[2]-1,strand='+') else: return Coordinate(self.c.chr_id,self.tss-self.regions[2]+1,self.tss+self.regions[1],strand='-') def intra_c(self): if self.c.strand=='+': return Coordinate(self.c.chr_id,self.tss+self.regions[2],self.end+self.regions[3],strand='+') else: return Coordinate(self.c.chr_id,self.end-self.regions[3],self.tss-self.regions[2],strand='-') def full_c(self): if self.c.strand=='+': return Coordinate(self.c.chr_id,self.tss-self.regions[0],self.end+self.regions[3],strand='+') else: return Coordinate(self.c.chr_id,self.end-self.regions[3],self.tss+self.regions[0],strand='-') @classmethod def load_from_annotation(cls,gene_annotation_file,load_exons_introns_info=False,header_lines=1,regions=regions,format='txt'): genes_list=[] with open(gene_annotation_file,'r') as genes_file: for _ in range(header_lines): genes_file.readline() if format=='txt': idx_chr_id=2 idx_bpstart=4 idx_bpend=5 idx_strand=3 idx_access=1 idx_name=-4 idx_exon_starts=9 idx_exon_ends=10 else: idx_chr_id=0 idx_bpstart=1 idx_bpend=2 idx_strand=5 idx_access=3 idx_name=3 idx_exon_starts=10 idx_exon_ends=11 for gene_line in genes_file: fields=gene_line.split('\t') chr_id=fields[idx_chr_id] bpstart=int(fields[idx_bpstart]) bpend=int(fields[idx_bpend]) strand=fields[idx_strand] access=fields[idx_access] name=fields[idx_name] c=Coordinate(chr_id,bpstart,bpend,strand=strand) if load_exons_introns_info: exon_starts=map(int,fields[idx_exon_starts].split(',')[:-1]) exon_ends=map(int,fields[idx_exon_ends].split(',')[:-1]) exons=[Coordinate(chr_id,bpstart,bpend,strand=strand) for bpstart,bpend in zip(exon_starts,exon_ends)] introns=[Coordinate(chr_id,bpstart+1,bpend-1,strand=strand) for bpstart,bpend in zip(exon_ends[:-1],exon_starts[1:])] genes_list.append(Gene(access,name,c,exons=exons,introns=introns,regions=regions)) else: genes_list.append(Gene(access,name,c,regions=regions)) return genes_list @classmethod def exons_from_annotations(cls,gene_annotation_file,header_lines=1): exons_list=[] with open(gene_annotation_file,'r') as genes_file: for _ in range(header_lines): genes_file.readline() for gene_line in genes_file: fields=gene_line.split('\t') chr_id=fields[2] strand=fields[3] access=fields[1] exon_starts=map(int,fields[9].split(',')[:-1]) exon_ends=map(int,fields[10].split(',')[:-1]) exons=[Coordinate(chr_id,bpstart,bpend,strand=strand,name=access) for bpstart,bpend in zip(exon_starts,exon_ends)] exons_list+=exons return exons_list @classmethod def introns_from_annotations(cls,gene_annotation_file,header_lines=1): introns_list=[] with open(gene_annotation_file,'r') as genes_file: for _ in range(header_lines): genes_file.readline() for gene_line in genes_file: fields=gene_line.split('\t') chr_id=fields[2] strand=fields[3] access=fields[1] exon_starts=map(int,fields[9].split(',')[:-1]) exon_ends=map(int,fields[10].split(',')[:-1]) introns=[Coordinate(chr_id,bpstart-1,bpend,strand=strand,name=access) for bpstart,bpend in zip(exon_ends[:-1],exon_starts[1:])] introns_list+=introns return introns_list @classmethod def genes_coordinates_from_annotations(cls,gene_annotation_file,header_lines=1): genes_coordinates=[] with open(gene_annotation_file,'r') as genes_file: for _ in range(header_lines): genes_file.readline() for gene_line in genes_file: fields=gene_line.split('\t') chr_id=fields[2] bpstart=int(fields[4]) bpend=int(fields[5]) strand=fields[3] access=fields[1] name=fields[-4] genes_coordinates.append(Coordinate(chr_id,bpstart,bpend,strand=strand,name=access)) return genes_coordinates @classmethod def promoter_coordinates_from_annotations(cls,gene_annotation_file,load_exons_introns_info=False,header_lines=1,promoter_region=None): if promoter_region: return [g.promoter_c for g in cls.load_from_annotation(gene_annotation_file,load_exons_introns_info=False,header_lines=1,regions=[Gene.regions[0],promoter_region[0],promoter_region[1],Gene.regions[3]])] else: return [g.promoter_c for g in cls.load_from_annotation(gene_annotation_file,load_exons_introns_info=False,header_lines=1)] @classmethod def write_genomic_regions_bed(cls,gene_annotation_file,genome_name='',minimal_format=True, promoter_region=None): exons=Gene.exons_from_annotations(gene_annotation_file) Coordinate.coordinates_to_bed(exons,genome_name+'exons.bed',minimal_format=minimal_format) del(exons) introns=Gene.introns_from_annotations(gene_annotation_file) Coordinate.coordinates_to_bed(introns,genome_name+'introns.bed',minimal_format=minimal_format) del(introns) promoters=Gene.promoter_coordinates_from_annotations(gene_annotation_file,promoter_region=promoter_region) Coordinate.coordinates_to_bed(promoters,genome_name+'promoters.bed',minimal_format=True) del(promoters) tss=property(tss) tes=property(tes) distal_c=property(distal_c) promoter_c=property(promoter_c) intra_c=property(intra_c) full_c=property(full_c) class Sequence: def __init__(self,seq=''): self.seq=string.lower(seq) def set_bg_model(self,ACGT_probabilities): self.bg_model= rv_discrete(name='bg', values=([0, 1, 2,3], ACGT_probabilities)) def __str__(self): return self.seq def __len__(self): return len(self.seq) # seq_complement def _reverse_complement(self): return "".join([nt_complement[c] for c in self.seq[-1::-1]]) reverse_complement=property(_reverse_complement) @classmethod def reverse_complement(self,seq): return "".join([nt_complement[c] for c in seq[-1::-1]]) @classmethod def generate_random(cls,n,bgmodel=rv_discrete(name='bg', values=([0, 1, 2,3], [0.2955, 0.2045, 0.2045,0.2955]))): int_seq=cls.bg_model.rvs(size=n) return ''.join([int2nt[c] for c in int_seq]) class Seq_set(dict): def add_name(self,name): self.name=name def add_seq(self,seq,cord): self[cord]=seq def remove_seq_from_cord(self,cord): if cord in self.keys(): del self[cord] else: print('sequence not found') class Genome: def __init__(self,genome_directory,release='ND',verbose=False): self.chr=dict() self.genome_directory=genome_directory self.release=release self.chr_len=dict() self.verbose=verbose for infile in glob.glob( os.path.join(genome_directory, '*.fa') ): try: chr_id=os.path.basename(infile).replace('.fa','') self.chr[chr_id] = open(infile,'r') self.chr_len[chr_id]=0 self.chr[chr_id].readline() for line in self.chr[chr_id]: self.chr_len[chr_id]+=len(line.strip()) if verbose: print 'Read:'+ infile except: if verbose: print 'Error, not loaded:',infile if verbose: print 'Genome initializated' def estimate_background(self): counting={'a':.0,'c':.0,'g':.0,'t':.0} all=0.0 for chr_id in self.chr.keys(): if self.verbose: print 'Counting on:',chr_id self.chr[chr_id].seek(0) self.chr[chr_id].readline() for line in self.chr[chr_id]: for nt in counting.keys(): count_nt=line.lower().count(nt) counting[nt]+=count_nt all+=count_nt if self.verbose: print counting for nt in counting.keys(): counting[nt]/=all return counting def extract_sequence(self,coordinate, mask_repetitive=False,line_length=50.0): if not self.chr.has_key(coordinate.chr_id): if self.verbose: print "Warning: chromosome %s not present in the genome" % coordinate.chr_id else: bpstart= coordinate.bpstart-1 bpend=coordinate.bpend self.chr[coordinate.chr_id].seek(0) self.chr[coordinate.chr_id].readline() nbp = bpend - bpstart offset = int( bpstart + math.floor(bpstart/line_length))-1 if offset > 0: self.chr[coordinate.chr_id].seek(offset,1) seq = self.chr[coordinate.chr_id].read(nbp+int(math.floor(nbp/line_length))+1) seq = seq.replace('\n','') if len(seq) < nbp: if self.verbose: print 'Warning: coordinate out of range:',bpstart,bpend if mask_repetitive: seq= ''.join([mask(c) for c in seq[0:nbp]]).lower() else: seq=seq[0:nbp].lower() if coordinate.strand=='-': return Sequence.reverse_complement(seq).lower() else: return seq class Genome_mm: def __init__(self,genome_directory,release='ND',verbose=False): self.chr=dict() self.genome_directory=genome_directory self.release=release self.chr_len=dict() self.verbose=verbose for infile in glob.glob( os.path.join(genome_directory, '*.fa') ): mm_filename=infile.replace('.fa','.mm') filename=infile.replace(genome_directory,'').replace('.fa','') chr_id=os.path.basename(infile).replace('.fa','') if not os.path.isfile(mm_filename): if verbose: print 'Missing:'+chr_id+' generating memory mapped file (This is necessary only the first time) \n' with open(infile) as fi: with open(os.path.join(genome_directory,chr_id+'.mm'),'w+') as fo: #skip header fi.readline() for line in fi: fo.write(line.strip()) if verbose: print 'Memory mapped file generated for:',chr_id with open(mm_filename,'r') as f: self.chr[chr_id]= mmap.mmap(f.fileno(),0, access=mmap.ACCESS_READ) self.chr_len[chr_id]=len(self.chr[chr_id]) if verbose: print "Chromosome:%s Read"% chr_id if verbose: print 'Genome initializated' def extract_sequence(self,coordinate,mask_repetitive=False): if mask_repetitive: seq= ''.join([mask(c) for c in self.chr[coordinate.chr_id][coordinate.bpstart-1:coordinate.bpend]]).lower() else: seq= self.chr[coordinate.chr_id][coordinate.bpstart-1:coordinate.bpend].lower() if coordinate.strand=='-': return Sequence.reverse_complement(seq).lower() else: return seq def estimate_background(self): counting={'a':.0,'c':.0,'g':.0,'t':.0} all=0.0 for chr_id in self.chr.keys(): if self.verbose: print 'Counting on:',chr_id for nt in counting.keys(): count_nt=self.chr[chr_id][:].lower().count(nt) counting[nt]+=count_nt all+=count_nt if self.verbose: print counting for nt in counting.keys(): counting[nt]/=all return counting class Genome_2bit: def __init__(self,genome_2bit_file,verbose=False): self.genome=TwoBitFile(open(genome_2bit_file,'rb')) self.chr_len=dict() self.verbose=verbose for chr_id in self.genome.keys(): self.chr_len[chr_id]=len(self.genome[chr_id]) if verbose: print 'Genome initializated' def extract_sequence(self,coordinate,mask_repetitive=False): if mask_repetitive: seq= ''.join([mask(c) for c in self.genome[coordinate.chr_id][coordinate.bpstart-1:coordinate.bpend]]).lower() else: seq= self.genome[coordinate.chr_id][coordinate.bpstart-1:coordinate.bpend].lower() if coordinate.strand=='-': return Sequence.reverse_complement(seq) else: return seq def estimate_background(self): counting={'a':.0,'c':.0,'g':.0,'t':.0} all=0.0 for chr_id in self.genome.keys(): if self.verbose: start_time = time.time() print 'Counting on:',chr_id for nt in counting.keys(): count_nt=self.genome[chr_id][:].lower().count(nt) counting[nt]+=count_nt all+=count_nt print 'elapsed:',time.time() - start_time if self.verbose: print counting for nt in counting.keys(): counting[nt]/=all return counting def write_meme_background(self,filename): counting=self.estimate_background() with open(filename,'w+') as outfile: for nt in counting.keys(): outfile.write('%s\t%2.4f\n' % (nt,counting[nt])) def write_chr_len(self,filename): with open(filename,'w+') as outfile: for chr_id in self.genome.keys(): outfile.write('%s\t%s\n' % (chr_id,self.chr_len[chr_id]) ) class Fimo: def __init__(self,meme_motifs_filename, bg_filename,p_value=1.e-4,temp_directory='./'): #be aware that they have changed the command line interface recently! self.fimo_command= 'fimo --text --thresh '+str(p_value)+' --bgfile '+bg_filename+' '+meme_motifs_filename self.temp_directory=temp_directory with open(meme_motifs_filename) as infile: self.motif_id_to_name=dict() self.motif_id_to_index=dict() self.motif_names=[] self.motif_name_to_index=dict() self.motif_ids=[] motif_index=0 for line in infile: try: if 'MOTIF' in line: #in meme format sometime you don't have the name and id but only one string for both #like in jolma 2013... motif_id=line.split()[1] try: motif_name=line.split()[2] except: motif_name=motif_id self.motif_id_to_name[motif_id]=motif_name self.motif_id_to_index[motif_id]=motif_index self.motif_name_to_index[motif_name]=motif_name self.motif_ids.append(motif_id) self.motif_names.append(motif_name) motif_index+=1 except: print 'problem with this line:', line def extract_motifs(self,seq, report_mode='full'): if report_mode=='indexes_set': motifs_in_sequence=set() elif report_mode=='fq_array': motifs_in_sequence=np.zeros(len(self.motif_names)) elif report_mode=='full': motifs_in_sequence=list() elif report_mode=='fq_and_presence': motifs_in_sequence={'presence':set(),'fq':np.zeros(len(self.motif_names))} else: raise Exception('report_mode not recognized') with tempfile.NamedTemporaryFile('w+',dir=self.temp_directory,delete=False) as tmp_file: tmp_file.write(''.join(['>S\n',seq,'\n'])) tmp_filename=tmp_file.name tmp_file.close() fimo_process=subprocess.Popen(self.fimo_command+' '+tmp_filename,stdin=None,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True) output=fimo_process.communicate()[0] fimo_process.wait() os.remove(tmp_filename) lines=output.split('\n') lines=lines[1:] for line in lines: if line: fields=line.split('\t') motif_id=fields[0] motif_name=self.motif_id_to_name[motif_id] if report_mode=='full': c_start=int(fields[2]) c_end=int(fields[3]) strand=fields[4] score=float(fields[5]) p_value=float(fields[6]) length=len(fields[8]) motifs_in_sequence.append({'id':motif_id,'name':motif_name,'start':c_start,'end':c_end,'strand':strand,'score':score,'p_value':p_value,'length':length}) elif report_mode=='indexes_set': motifs_in_sequence.add(self.motif_name_to_index[motif_name]) elif report_mode=='fq_array': motifs_in_sequence[self.motif_name_to_index[motif_name]]+=1 elif report_mode=='fq_and_presence': motifs_in_sequence['presence'].add(self.motif_name_to_index[motif_name]) motifs_in_sequence['fq'][self.motif_name_to_index[motif_name]]+=1 return motifs_in_sequence if report_mode=='fq_array' or 'fq_and_presence' else list(motifs_in_sequence) def build_motif_in_seq_matrix(bed_filename,genome_directory,meme_motifs_filename,bg_filename,genome_mm=True,temp_directory='./',mask_repetitive=False,p_value=1.e-4, check_only_presence=False): print 'Loading coordinates from bed' target_coords=Coordinate.bed_to_coordinates(bed_filename) print 'Initialize Genome' if genome_mm: genome=Genome_mm(genome_directory) else: genome=Genome(genome_directory) print 'Initilize Fimo and load motifs' fimo=Fimo(meme_motifs_filename,bg_filename,temp_directory=temp_directory,p_value=p_value) print 'Initialize the matrix' motifs_in_sequences_matrix=np.zeros((len(target_coords),len(fimo.motif_names))) for idx_seq,c in enumerate(target_coords): seq=genome.extract_sequence(c,mask_repetitive) print idx_seq, len(target_coords) if check_only_presence: motifs_in_sequences_matrix[idx_seq,fimo.extract_motifs(seq,report_mode='indexes_set')]=1 else: motifs_in_sequences_matrix[idx_seq,:]+=fimo.extract_motifs(seq,report_mode='fq_array') return motifs_in_sequences_matrix, fimo.motif_names, fimo.motif_ids def build_motif_profile(target_coords,genome,meme_motifs_filename,bg_filename,genome_mm=True,temp_directory='./',mask_repetitive=False,p_value=1.e-4,check_only_presence=False): #print 'Initilize Fimo and load motifs' fimo=Fimo(meme_motifs_filename,bg_filename,temp_directory=temp_directory,p_value=p_value) #print 'Allocate memory' if check_only_presence: motifs_in_sequences_profile=np.zeros(len(fimo.motif_names)) else: motifs_in_sequences_profile={'fq':np.zeros((len(target_coords),len(fimo.motif_names))),'presence':np.zeros(len(fimo.motif_names))} for idx_seq,c in enumerate(target_coords): seq=genome.extract_sequence(c,mask_repetitive) print idx_seq, len(target_coords) if check_only_presence: motifs_in_sequences_profile[fimo.extract_motifs(seq,report_mode='indexes_set')]+=1 else: motifs_in_sequences=fimo.extract_motifs(seq,report_mode='fq_and_presence') #print motifs_in_sequences,motifs_in_sequences['presence'],motifs_in_sequences['fq'] motifs_in_sequences_profile['presence'][list(motifs_in_sequences['presence'])]+=1 motifs_in_sequences_profile['fq'][idx_seq,:]+=motifs_in_sequences['fq'] return motifs_in_sequences_profile, fimo.motif_names, fimo.motif_ids ''' given a set of coordinates in a bed file and a bam/sam file calculate the profile matrix ''' def calculate_profile_matrix_bed_bam(bed_filename,sam_filename,window_size=5000, resolution=50, fragment_length=200, use_strand=False,return_coordinates=False): cs=Coordinate.bed_to_coordinates(bed_filename) cs=Coordinate.coordinates_of_intervals_around_center(cs, window_size) samfile = pysam.Samfile(sam_filename) n_bins=(window_size/resolution)+1 profile_matrix=np.zeros((len(cs),n_bins)) for idx_c,c in enumerate(cs): n_start=max(0,c.bpcenter-window_size/2) n_end=c.bpcenter+window_size/2 for rd in samfile.fetch(c.chr_id, n_start,n_end): if rd.is_reverse: rd_st=rd.positions[-1]-fragment_length-1 rd_end=rd.positions[-1] else: rd_st=rd.positions[0] rd_end=max(rd.positions[-1],rd.positions[1]+fragment_length) bin_idx_st=max(0,1+(rd_st-n_start)/resolution) bin_idx_en=min(n_bins,1+ (rd_end-n_start)/resolution) #print rd.positions[1],rd.positions[-1],rd_st, rd_end,rd_end-rd_st,rd_st-n_start,rd_end-n_start,bin_idx_st,bin_idx_en profile_matrix[idx_c,bin_idx_st:bin_idx_en]+=1 if use_strand and c.strand == '-': profile_matrix[idx_c,:]=profile_matrix[idx_c,::-1] if return_coordinates: return profile_matrix,samfile.mapped,cs else: return profile_matrix,samfile.mapped def extract_bg_from_bed(bed_filename,genome_directory,bg_filename,genome_mm=True): acgt_fq={'a':0.0,'c':0.0,'g':0.0,'t':0.0} total=0 print 'Loading coordinates from bed' target_coords=Coordinate.bed_to_coordinates(bed_filename) print 'Initialize Genome' if genome_mm: genome=Genome_mm(genome_directory) else: genome=Genome(genome_directory) for idx_seq,c in enumerate(target_coords): seq=genome.extract_sequence(c) for nt in ['a','c','t','g']: acgt_fq[nt]+=seq.count(nt) total=sum(acgt_fq.values()) for nt in ['a','c','t','g']: acgt_fq[nt]/=total print acgt_fq with open(bg_filename, 'w+') as out_file: for nt in ['a','c','t','g']: out_file.write('%s\t%1.4f\n' % (nt,acgt_fq[nt])) #hgWiggle wrapper def read_from_wig(c,wig_path,wig_mask='.phastCons44way.hg18.compiled',only_average=False): position=c.chr_id+':'+str(c.bpstart)+'-'+str(c.bpend) wig_file=os.path.join(wig_path,c.chr_id+wig_mask) if only_average: command=' '.join(['hgWiggle','-position='+position,wig_file,"-doStats | sed -e '1,3d' | cut -f4,10"]) else: command=' '.join(['hgWiggle','-position='+position,wig_file," -rawDataOut "]) wig_process=subprocess.Popen(command,stdin=None,stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True) output=wig_process.communicate()[0] if only_average: return tuple(output.split()) else: values=output.split() return len(values),values class Annotator: def __init__(self,input_filename,annotations_filenames,annotation_names=None): self.annotations_filenames=annotations_filenames self.input_filename=input_filename print annotation_names #check if we have custom names defined if annotation_names is None: self.annotation_names=annotations_filenames else: self.annotation_names=annotation_names assert len(annotations_filenames) == len(annotation_names) #associate to each name a prime number for the multiple annotation trick.. self.annotation_names_to_prime={name:prime for (name,prime) in zip(self.annotation_names,self.__primes(len(self.annotation_names)))} print self.annotation_names_to_prime self.input_coordinates=Coordinate.bed_to_coordinates(input_filename) def annotate(self): #allocate_memory self.annotation_track=np.ones(len(self.input_coordinates),dtype=np.int) self.interval_tree=dict() self.coord_to_row_index=dict() self.row_index=0 #Build the interval Tree for c in self.input_coordinates: if c.chr_id not in self.interval_tree: self.interval_tree[c.chr_id]=Intersecter() self.interval_tree[c.chr_id].add_interval(Interval(c.bpstart,c.bpend)) self.coord_to_row_index[c]=self.row_index self.row_index+=1 for idx,bed_filename in enumerate(self.annotations_filenames): coordinates=Coordinate.bed_to_coordinates(bed_filename) prime_number=self.annotation_names_to_prime[self.annotation_names[idx]] for idx_intersection in self.__intersection_indexes(coordinates): self.annotation_track[idx_intersection]*=prime_number def coordinates_by_annotation_name(self,annotation_name): if annotation_name=='out': return [self.input_coordinates[idx] for idx,value in enumerate(self.annotation_track) if value ==1] else: prime_number=self.annotation_names_to_prime[annotation_name] return [self.input_coordinates[idx] for idx,value in enumerate(self.annotation_track) if (value % prime_number)==0] def save_annotation_track_matlab(self,filename): savemat(filename,{'annotation_track':self.annotation_track,'mapping':self.annotation_names_to_prime}) print 'Annotation track saved to:',filename def save_annotation_track_bed(self,filename): with open(filename,'w+') as outfile: for idx,c in enumerate(self.input_coordinates): annotated=False line='%s\t%s\t%d\t' % (c.chr_id, c.bpstart, c.bpend,) for name in self.annotation_names_to_prime.keys(): if self.annotation_track[idx] % self.annotation_names_to_prime[name] == 0: line+=name+',' annotated=True line=line[:-1] outfile.write(line+'\n') print 'Annotation track saved to:',filename def __intersection_indexes(self,coordinates): intersection_indexes=set() for cl_index,c in enumerate(coordinates): if self.interval_tree.has_key(c.chr_id): coords_hits=self.interval_tree[c.chr_id].find(c.bpstart, c.bpend) for coord_hit in coords_hits: intersection_indexes.add(self.coord_to_row_index[Coordinate.coordinates_from_interval(c.chr_id, coord_hit)]) return intersection_indexes def __gen_primes(self): D = {} q = 2 while True: if q not in D: yield q D[q * q] = [q] else: for p in D[q]: D.setdefault(p + q, []).append(p) del D[q] q += 1 def __primes(self,n): primes=self.__gen_primes() return [primes.next() for i in range(n)] class Ngram: def __init__(self,ngram_length=4,alphabet_size=4): #print 'Ngram initialization' #build useful dictionary self.alphabet_size=alphabet_size self.ngram_length=ngram_length all_ngram=map(self.int2ngram,range(self.alphabet_size**self.ngram_length),[self.ngram_length]*(self.alphabet_size**self.ngram_length),[self.alphabet_size]*(self.alphabet_size**self.ngram_length)) self.ngram2index=dict() self.index2ngram=dict() self.non_redundant_ngram2index=dict() self.non_redundant_index2ngram=dict() self.ngram_rev_complement=dict() self.number_of_ngrams=len(all_ngram) for idx,ngram in enumerate(all_ngram): self.ngram2index[ngram]=idx self.ngram_rev_complement[ngram]=Sequence.reverse_complement(ngram) self.index2ngram[idx]=ngram for idx,ngram in enumerate([all_ngram[i] for i in self.non_redundant_idx()]): self.non_redundant_ngram2index[ngram]=idx self.non_redundant_index2ngram[idx]=ngram self.number_of_non_redundant_ngrams=len(self.non_redundant_ngram2index) def int2ngram(self,idx,ngram_length,alphabet_size): l=[] for _ in range(ngram_length): l.append(int2nt[idx % alphabet_size]) idx/=alphabet_size return "".join(l)[-1::-1] def non_redundant_idx(self): ngram_taken=set() non_redundant_idxs=[] for idx in range(self.number_of_ngrams): n=self.index2ngram[idx] if (self.ngram2index[n] in ngram_taken) or (self.ngram2index[self.ngram_rev_complement[n]] in ngram_taken ): pass else: non_redundant_idxs.append(idx) ngram_taken.add(self.ngram2index[n]) ngram_taken.add(self.ngram2index[self.ngram_rev_complement[n]]) return tuple(non_redundant_idxs) def build_ngram_fq_vector(self,seq): ngram_vector=zeros(self.number_of_ngrams) for i in xrange(len(seq)-self.ngram_length+1): try: ngram_vector[self.ngram2index[seq[i:i+self.ngram_length]]]+=1 except: pass #ngram_vector[self.ngram2index[self.ngram_rev_complement[seq[i:i+self.ngram_length]]]]+=1 return ngram_vector def build_ngram_fq_vector_non_redundant(self,seq): ngram_vector=zeros(self.number_of_non_redundant_ngrams) for i in xrange(len(seq)-self.ngram_length+1): try: ngram_vector[self.non_redundant_ngram2index[seq[i:i+self.ngram_length]]]+=1 except: pass try: ngram_vector[self.non_redundant_ngram2index[self.ngram_rev_complement[seq[i:i+self.ngram_length]]]]+=1 except: pass return ngram_vector def build_ngrams_fq_matrix(self,seq_set,non_redundant=True): if non_redundant: ngram_matrix=zeros((len(seq_set),self.number_of_non_redundant_ngrams)) for idx_seq,seq in enumerate(seq_set): ngram_matrix[idx_seq,:]=self.build_ngram_fq_vector_non_redundant(seq) else: ngram_matrix=zeros((len(seq_set),self.number_of_ngrams)) for idx_seq,seq in enumerate(seq_set): ngram_matrix[idx_seq,:]=self.build_ngram_fq_vector(seq) return ngram_matrix,np.array([len(seq) for seq in seq_set],ndmin=2) def build_ngram_profile_vector_non_redundant(self,seq): profile=zeros(len(seq)-self.ngram_length+1) for i in xrange(len(seq)-self.ngram_length+1): try: profile[i]=self.non_redundant_ngram2index[seq[i:i+self.ngram_length]] except: pass try: profile[i]=self.non_redundant_ngram2index[self.ngram_rev_complement[seq[i:i+self.ngram_length]]] except: pass return profile def build_ngrams_profile_matrix_non_redundant(self,seq_set): profile_matrix=zeros((len(seq_set),len(seq_set[0])-self.ngram_length+1)) for idx_seq,seq in enumerate(seq_set): profile_matrix[idx_seq,:]=self.build_ngram_profile_vector_non_redundant(seq) return profile_matrix def save_to_file(self,filename=None): if not filename: filename='ng'+str(self.ngram_length) with open(filename,'wb+') as outfile: print 'saving...' cPickle.dump(self, outfile,2) print 'done' @classmethod def load_from_file(cls,filename): with open(filename,'rb') as infile: return cPickle.load(infile)