#!/usr/bin/env python """ Copyright 2014 Novartis Institutes for Biomedical Research Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import sys import re import glob import os import math import shlex import random import numpy import itertools from multiprocessing import * from subprocess import PIPE, Popen from pytee import * from string import Template import yap_cmd_checks import yap_file_io import yap_log import yap_workflow_dict ## global variables global basename_warnings global missing_path_errors ## global variable intialization basename_warnings = [] missing_path_errors = [] def create_dictionary(config_file): """ configuration file reader / creates dictionary of parameters """ try: result = numpy.loadtxt(config_file, dtype = "a100",comments = '#',delimiter = ':=') workflow_config_dict = {} for i in range(0,len(result)): k = re.search('(.*\")(.*)(\".*)',result[i][0]) v = re.search('(.*\")(.*)(\".*)',result[i][1]) key = k.group(2).strip(" ") val = v.group(2).strip(" ") key = key.strip("\t") val = val.strip("\t") workflow_config_dict.setdefault(key,val) except IOError as (errno ,strerror): print "Error: while Opening the file : " , config_file print " I/O error({0}): {1}".format(errno,strerror) exit() except : print "Error : encountered format error while creating the dictionary for file : ",config_file print """ Please check the format the configuration file enclose variable and corresponding values in double quotes("...") use (:=) in variable value assignment (eg "variable" := "value") to 
add comments,start the line with symbol(#) please refer the documentation for the configuration file formats """ exit() return workflow_config_dict def split_array(totnum,nprocs,lpp,myrank): """ Returns the indices of the chunk(of the input file) to be split based on the rank of the processor.""" totnum = totnum/lpp n2=int(math.floor(totnum/nprocs)) remain=int(math.fmod(totnum,nprocs)) if (myrank < remain) : n1=n2+1 ib1=(myrank)*n1 ie1=ib1+(n1-1) else : n1=n2 ib1=(myrank)*n1+remain ie1=ib1+n1-1 ib=ib1*lpp ie=((ie1+1)*lpp)-1 ret=[] ret.append(ib) ret.append(ie) return ret def get_file_size(inputfile_name): """ Returns the size of a given file using file seek""" try: file_size_on_disk=os.path.getsize(inputfile_name) cs=file_size_on_disk except Exception as e: print e,"Error in finding the file size" input_file = yap_file_io.create_openfile_handler(inputfile_name) file_size=0 input_file.seek(0) file_pos=0 try: file_pos = 0 end_file = 'False' file_pos1 = 0 while end_file == 'False': file_pos=file_pos+cs input_file.seek(file_pos) read_buffer = input_file.read(1) length_read_buffer = len(read_buffer) if length_read_buffer == 0 : file_size=input_file.tell() end_file = 'True' finally: return file_size def get_file_split_position_symbolbased(inputfile_name,nchunks,file_size ,input_file_format): """ Returns the split file indices based on the input file format and specified chunk size.""" input_file = create_openfile_handler(inputfile_name) chunk_size = file_size / nchunks n_parts = 1 chunk_size_arr = [] search_symbol = '' j = 0 input_file.seek(0) check_first_element = input_file.read(1) # file format checking if input_file_format == 'fastq': search_symbol = '@' if check_first_element != search_symbol: print " Error : Please specify the correct input file format in workflow configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() if input_file_format == 'fasta': search_symbol = '>' if check_first_element != search_symbol: print " 
Error : Please specify the correct input file format in workflow configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() if input_file_format == 'qseq' or input_file_format == 'tab': search_symbol = '\n' try: end_pos = 0 i = 0 buf = 0 end_pos_arr = [] file_end = 'False' file_pos = 0 input_file.seek(0) for i in range(1,nchunks+1): hit_pos = -1 input_file.seek(chunk_size,1) current_seek = input_file.tell() if i == nchunks or current_seek >= file_size: end_pos = file_size end_pos_arr.append(end_pos) break while hit_pos == -1: buf += 1024 read_buffer = input_file.read(buf) length_read_buffer = len(read_buffer) if length_read_buffer == buf: hit_pos = read_buffer.rfind(search_symbol) else: hit_pos = length_read_buffer if search_symbol == "\n": end_pos = current_seek + (hit_pos+1) else: end_pos = current_seek + hit_pos end_pos_arr.append(end_pos) if end_pos >= file_size: break begin_end_index = numpy.zeros([len(end_pos_arr),3],int) for kk in range(0,len(end_pos_arr)): if kk == 0: begin_end_index[kk][0] = 0 else: begin_end_index[kk][0] = end_pos_arr[kk-1] begin_end_index[kk][1] = end_pos_arr[kk] begin_end_index[kk][2] = kk file_split_info = [] file_split_info.append(inputfile_name) file_split_info.append(begin_end_index) finally: input_file.close() return file_split_info def convert_format(seqs_str): """ Converts formats between qseq, fastq, fasta & tab if the specified input and output formats are different. """ seqs_arr = seqs_str.splitlines(1) seqs_arr_len = len(seqs_arr) if seqs_arr_len <= 0: print "Empty data : no sequences read or empty input file" exit() input_file_format = yap_workflow_dict.input_file_format output_file_format = yap_workflow_dict.preprocess_output_file_format format_seqs_arr = [] if output_file_format == 'qseq': print "please specify the output file format as fastq or fasta). 
output file format given := ", output_file_format exit() if input_file_format == 'qseq': try: for i in range (0,len(seqs_arr)): record = seqs_arr[i].strip("\n").split('\t') machine_name = record[0] run_number = record[1] lane_number = record[2] tile = record[3] x = record[4] y = record[5] read = record[7] sequence = record[8] quality = record[9] pass_qc_msg = record[10] if pass_qc_msg == str(1): seq_id = '%s_%s:%s:%s:%s:%s#%s/%s' % (machine_name,run_number,lane_number,tile,x,y,read,pass_qc_msg) if output_file_format == 'fasta': format_seqs_arr.append(">" + seq_id + "\n" ) format_seqs_arr.append(sequence + "\n") elif output_file_format == 'tab': format_seqs_arr.append(seq_id + "\t" + sequence + "\n" ) else: format_seqs_arr.append("@" + seq_id + "\n" ) format_seqs_arr.append(sequence+ "\n") format_seqs_arr.append("+" + "\n") format_seqs_arr.append(quality + "\n") except Exception as e: print " Error : Please specify the correct input file format in workflow configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() if input_file_format == 'fastq': if seqs_arr[0][0] != "@": print " Error : Please specify the correct input file format in workflow configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() for i in range(0,seqs_arr_len,4): seq_id = seqs_arr[i].strip("\n") sequence = seqs_arr[i+1].strip("\n") desc = seqs_arr[i+2].strip("\n") quality = seqs_arr[i+3].strip("\n") if output_file_format == 'fasta': format_seqs_arr.append(">" + seq_id.lstrip('@') + "\n") format_seqs_arr.append(sequence + "\n") elif output_file_format == 'tab': format_seqs_arr.append(seq_id.lstrip('@') + "\t" + sequence + "\n" ) else: format_seqs_arr.append(seq_id + "\n") format_seqs_arr.append(sequence + "\n") format_seqs_arr.append(desc + "\n") format_seqs_arr.append(quality + "\n") if input_file_format == 'fasta': if seqs_arr[0][0] != ">": print " Error : Please specify the correct input file format in workflow 
configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() for i in range(0,seqs_arr_len,2): seq_id = seqs_arr[i].strip("\n") sequence = seqs_arr[i+1].strip("\n") if output_file_format == 'tab' : format_seqs_arr.append(seq_id.lstrip('>') + "\t" + sequence + "\n" ) elif output_file_format == 'fasta': format_seqs_arr.append(seq_id + "\n" ) format_seqs_arr.append(sequence + "\n" ) else: format_seqs_arr.append("@" + seq_id.lstrip('>') + "\n") format_seqs_arr.append(sequence + "\n") format_seqs_arr.append("+" + "\n") format_seqs_arr.append(" "+ "\n") if input_file_format == 'tab': try: for line in seqs_arr: record = line.strip('\n').split('\t') if len(record) != 2: print " Error : Please specify the correct input file format in workflow configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() seq_id = record[0] sequence = record[1] quality = " " if output_file_format == 'tab' : format_seqs_arr.append(seq_id + "\t" + sequence + "\n") elif output_file_format == 'fasta': format_seqs_arr.append(">"+seq_id + "\n") format_seqs_arr.append(sequence + "\n" ) else: format_seqs_arr.append("@" + seq_id + "\n") format_seqs_arr.append(sequence + "\n") format_seqs_arr.append("+" + "\n") format_seqs_arr.append(" "+ "\n") except Exception as e: print " Error : Please specify the correct input file format in workflow configuration file" print " Input Data is not in the given " , input_file_format , "format" exit() return ''.join(format_seqs_arr) def format_sequences(seqs_arr,output_dict): """Print the Sequence array into the corresponding format given in the Main Config File""" output_format = yap_workflow_dict.preprocess_output_file_format out_arr= [] if output_format == "fasta": for i in range(0,len(seqs_arr)): j = i out_arr.append('>' + seqs_arr[j][0] + "\n" + seqs_arr[j][1] + "\n") if output_format == "fastq": for i in range(0,len(seqs_arr)): j = i out_arr.append('@'+ seqs_arr[j][0]+ "\n" + 
seqs_arr[j][1] + "\n" + '+' + "\n" + seqs_arr[j][2] + "\n") if output_format == "tab": for i in range(0,len(seqs_arr)): j = i out_arr.append(seqs_arr[j][0] + "\t" + seqs_arr[j][1] + "\n") return out_arr def qc_basecount(seqs_str, workflow_prov): """ Returns the base count per read location """ bases_dict = {'A':0,'a':0,'C':1,'c':1,'T':2,'t':2,'G':3,'g':3,'N':4,'n':4} output_file_format = yap_workflow_dict.preprocess_output_file_format max_read_length = int(yap_workflow_dict.max_read_length) if output_file_format == "fasta": loop_increment = 2 elif output_file_format == "tab": loop_increment = 1 else: loop_increment = 4 alphabet_size = 5 base_count_per_read_location =numpy.zeros((max_read_length,alphabet_size),dtype = numpy.int) if seqs_str != '': seqs_arr = seqs_str.splitlines(1) for jj in range(0, len(seqs_arr), loop_increment): if output_file_format == 'tab': record = seqs_arr[jj].strip("\n").split("\t") str1 = record[1] else: str1 = seqs_arr[jj+1].strip("\n") read_length = len(str1) for i in range(read_length): ii = bases_dict[str1[i]] base_count_per_read_location[i,ii] = base_count_per_read_location[i,ii]+1 return base_count_per_read_location,workflow_prov def plot_base_counts(x,a,c,t,g,n,output_fig): """ Generates plots of frequency of basecounts per read location. """ import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt max_y=max(max(a),max(c),max(t),max(g),max(n)) min_y=min(min(a),min(c),min(t),min(g),min(n)) min_x=min(x) max_x=max(x) fig = plt.figure(figsize=(9,6)) ax = fig.add_subplot(111) frame = ax.get_frame() frame.set_facecolor('1.00') # set the frame face color to light gray ax.plot(x,a,'k-',x,c,'b-',x,t,'g-',x,g,'m-',x,n,'r-') leg=plt.legend(('A', 'C', 'T' , 'G' , 'N'),loc=(1.001,.01)) ax.set_ylim([min_y,max_y]) ax.set_xlim([min_x,max_x]) ax.grid(False) ax.set_xlabel('Read Position') ax.set_ylabel('Base Count') ax.set_title('Position vs. Base Count') # set some legend properties. All the code below is optional. 
The # defaults are usually sensible but if you need more control, this # shows you how # the matplotlib.patches.Rectangle instance surrounding the legend frame = leg.get_frame() frame.set_facecolor('1.00') # set the frame face color to white # matplotlib.text.Text instances for t in leg.get_texts(): t.set_fontsize('small') # the legend text fontsize # matplotlib.lines.Line2D instances for l in leg.get_lines(): l.set_linewidth(1.5) # the legend line width pdf_file=output_fig+'.pdf' eps_file=output_fig+'.eps' plt.savefig(pdf_file, facecolor='w', edgecolor='w', orientation='landscape', papertype='letter', format='pdf', transparent=False, bbox_inches=None, pad_inches=0.1) plt.savefig(eps_file, facecolor='w', edgecolor='w', orientation='landscape', papertype='letter', format='eps', transparent=False, bbox_inches=None, pad_inches=0.1) def create_dir(directoryname): """ Checks to see if a given directory exists, else creates it. """ if os.path.exists(directoryname)==True: pass else: os.system ("mkdir -p " + " " + directoryname) return directoryname def find_pair_end_files(file_list): """ Finds the corresponding paired end files in a given input list. Uses '_1' and '_2' to pair them. 
""" pairs=file_list pairs.sort() found_pairs=[] found_set=[] for i in range(len(pairs)): if pairs[i] in found_set: continue else: for j in range(len(pairs)): if i==j or pairs[j] in found_set: continue else: if pairs[i] not in found_set: if len(pairs[i]) == len(pairs[j]): diff = sum(ch1 != ch2 for ch1, ch2 in zip(pairs[i],pairs[j]) ) if diff==1: for ch1, ch2 in zip(pairs[i],pairs[j]): if ch1 != ch2: if ch1=='1' and ch2=='2' or ch1 == '2' and ch1 =='1': found=[] found.append(pairs[i]) found.append(pairs[j]) found_pairs.append(found) found_set.append(pairs[i]) found_set.append(pairs[j]) else: pass else : if j == len(pairs)-1: pass else : continue else: continue else: break not_found_set= list(set(pairs)-set(found_set)) return found_pairs,not_found_set def find_unique_set(s1,s2): """Reads sequence dictionaries for two pairs after preprocess step, retains only the reads which belong to the same read""" only_headers_s1list=[] only_headers_s1dict={} only_headers_s2list=[] only_headers_s2dict={} unique_S1=[] unique_S2=[] for i in range(0,len(s1),4): only_headers_s1=re.match('^@\S+',str(s1[i])) if only_headers_s1: header_1 = only_headers_s1.group().strip("\n").strip("1") only_headers_s1dict[header_1]=[] only_headers_s1dict[header_1].append(s1[i+1]) only_headers_s1dict[header_1].append(s1[i+2]) only_headers_s1dict[header_1].append(s1[i+3]) set_s1=set(only_headers_s1dict.keys()) for i in range(0,len(s2),4): only_headers_s2=re.match('^@\S+',str(s2[i])) if only_headers_s2: header_2 = only_headers_s2.group().strip("\n").strip("2") only_headers_s2dict[header_2]=[] only_headers_s2dict[header_2].append(s2[i+1]) only_headers_s2dict[header_2].append(s2[i+2]) only_headers_s2dict[header_2].append(s2[i+3]) set_s2=set(only_headers_s2dict.keys()) unique_set= list(set_s1 & set_s2) for i in range(len(unique_set)): header_4_s1=unique_set[i]+"1"+"\n" header_4_s2=unique_set[i]+"2"+"\n" unique_S1.append(header_4_s1) unique_S2.append(header_4_s2) temp_S1=only_headers_s1dict[unique_set[i]] 
temp_S2=only_headers_s2dict[unique_set[i]] for j in range(len(temp_S1)): unique_S1.append(temp_S1[j]) for k in range(len(temp_S2)): unique_S2.append(temp_S2[k]) return ''.join(unique_S1),''.join(unique_S2) def read_file_chunks(input_file,input_file2,chunk_number,nchunks,chunk_size,file_size ,format_specific_lines): """ Reads the indices of a file rather than physically chunking it. Uses this mechanism to split the input table.""" end_pos_arr = [] extra_lines = 0 input_file.seek(0,1) current_pos = input_file.tell() inp1 = '' inptemp2 ='' if input_file != '' : if chunk_number == nchunks-1: inp1 = input_file.read() tempcount = inp1.count("\n") current_seek = input_file.tell() else: inp1 = input_file.read(chunk_size) tempcount = inp1.count("\n") if tempcount < format_specific_lines : extra_lines = format_specific_lines - tempcount for k in range(0,extra_lines): inp1 += input_file.readline() tempcount = tempcount + extra_lines current_seek = input_file.tell() else : if tempcount % format_specific_lines == 0: if inp1[-1] != '\n': last_index = inp1.rindex('\n') back_steps = len(inp1) - last_index inp1 = inp1[0:(last_index+1)] input_file.seek(-(back_steps-1),1) current_seek = input_file.tell() else : extra_lines = ((tempcount-(tempcount % format_specific_lines))+format_specific_lines) - tempcount for k in range(0,extra_lines): inp1 += input_file.readline() tempcount = tempcount + extra_lines current_seek = input_file.tell() if input_file2 != '': inptemp2 = ''.join(list(itertools.islice(input_file2,tempcount))) tempcount2 = inptemp2.count("\n") seekpos2 = input_file2.tell() back_steps_temp2 = 0 diff = (tempcount2 - tempcount) if diff != 0 : print "Error: Number of lines in paired chunks do not match" end_pos = current_seek end_pos_arr.append(end_pos) return end_pos,inp1,inptemp2 def find_variable(var,cmd): """Given a string and varible prefix, extracts complete variable name, until while space is encountered""" var_found = '' for jj in range(cmd.find(var),len(cmd)): if 
cmd[jj] != ' ': var_found += cmd[jj] else: break return var_found def find_nth(str1, mystr, n): """ Finds a pattern in an input string and returns the starting index. """ start = str1.find(mystr) while start >= 0 and n > 1: start = str1.find(mystr, start+len(mystr)) n -=1 return start def run_function(cmd_string,inp_str,kk,output_dict,fh): """ Executes a function using the subprocess module. The funtion is called in the multiproc_function().""" try: P1=Popen(cmd_string,stdin=PIPE,stdout=PIPE,stderr=fh,shell=True) output_dict[str(kk)] = P1.communicate(inp_str)[0] except Exception as e: print cmd_string, " Failed!!" fh.write(str(e)+'\n') fh.close() def multiproc_function(cmd_string,inp_str,lpp,header,err_log,stat_log): """ Parallelizes the commands across cores per node using the multiprocessing module in python. """ out_str='' err_str='' procs=[] st = 0 en = 0 try: lock = Lock() manager=Manager() output_dict=manager.dict() nprocs=cpu_count() tot_nlines=inp_str.count("\n") for i in range(0,nprocs): fh=open(err_log+"_multiproc_"+str(i).zfill(4),'a') ret=split_array(tot_nlines,nprocs,lpp,i) ib=ret[0] ie=ret[1] nlines=ie-ib+1 n1=find_nth(inp_str[st:len(inp_str)], "\n", nlines) n1=n1+1 en=st+n1 if header != '': cmd_string_final = cmd_string + "_" + str(i) procs.append(Process(target=run_function,args=(cmd_string_final,header + inp_str[st:en],i,output_dict,fh))) else: if inp_str[st:en] != '': procs.append(Process(target=run_function,args=(cmd_string,inp_str[st:en],i,output_dict,fh))) st=en for i in range(0,len(procs)): procs[i].start() for i in range(0,len(procs)): procs[i].join() for i in range(0,len(procs)): exit_code=procs[i].exitcode yap_file_io.write_data("EXIT_CODE: "+str(exit_code)+"\n",err_log+"_multiproc_"+str(i).zfill(4)) for i in range(0,len(procs)): out_str += output_dict[str(i)] except Exception as e: print e del output_dict return out_str def begin_end_checker(data,filename): """ Checks for corresponding begin and end terms to parse command sections of the 
configuration. """ config_list=[] begin_list = [] COMMENT_CHAR='#' for i in range(len(data)): line = data[i].strip() if COMMENT_CHAR in line: line,comment=line.split(COMMENT_CHAR, 1) config_list.append(line) config_string= '\n'.join(config_list) command_begin_list = [":begin"] command_end_list = [":end"] # Syntax checker/Prilimnary checks dict_flag={} begin_status = 'off' end_status = 'off' results = [] for i in range(len(config_list)): if config_list[i] in command_begin_list: if begin_status == 'on': results.append(" Missing :end corresponding to :begin at line " + str(j+1)) j = i else: begin_status = 'on' j = i elif config_list[i] in command_end_list: end_status = 'on' j = i if begin_status == 'off': results.append("Missing :begin corresponding to :end at line " + str(j+1)) if begin_status == 'on' and end_status == 'on': begin_status = 'off' end_status = 'off' if begin_status == 'on': results.append("Missing :end corresponding to :begin at line " + str(len(config_list))) if len(results) > 0: results.append("Note:Use symbol(:begin) and (:end) to define command sections,enclose variable and corresponding values in double quotes(eg \"..\")") begin_list = [] else: begin_list=config_string.split(':begin') return begin_list,results def yap_tee(initial_pipe_commands,commands2,input_file,err_log,stat_log): """ Emulates a Unix tee like function in python. 
""" dir_path , input_file_name = os.path.split(input_file) input_file_name,format=os.path.splitext(input_file_name) random_number = str(random.random()) if 'JOB_ID' in os.environ.keys(): unique_jobid = os.environ['JOB_ID'] else: unique_jobid = '' def func0(fifos,initial_pipe_commands,input_file): try: format=os.path.splitext(input_file)[1] tee1=create_tee(fifos,mode='w') icommand=[] num_pipe_commands=len(initial_pipe_commands)+1 if num_pipe_commands > 1 : if format==".gz": p1=subprocess.Popen(["zcat", input_file], stdout=subprocess.PIPE) icommand.append(p1) elif format==".bz2": p1=subprocess.Popen(["bzcat", input_file], stdout=subprocess.PIPE) icommand.append(p1) else : p1=subprocess.Popen(["cat", input_file], stdout=subprocess.PIPE) icommand.append(p1) for i in range(1,num_pipe_commands): if( i == num_pipe_commands-1): p1=subprocess.Popen(shlex.split(initial_pipe_commands[i-1]),stdin=icommand[i-1].stdout,stdout=subprocess.PIPE) icommand.append(p1) else: p1=subprocess.Popen(shlex.split(initial_pipe_commands[i-1]),stdin=icommand[i-1].stdout,stdout=subprocess.PIPE) icommand.append(p1) else : if format==".gz": p1=subprocess.Popen(["zcat", input_file], stdout=subprocess.PIPE) icommand.append(p1) elif format==".bz2": p1=subprocess.Popen(["bzcat", input_file], stdout=subprocess.PIPE) icommand.append(p1) else : p1=subprocess.Popen(["cat", input_file], stdout=subprocess.PIPE) icommand.append(p1) tee1.write(icommand[num_pipe_commands-1].communicate()[0]) tee1.flush() tee1.close() rc=icommand[num_pipe_commands-1].poll() if rc==0 : pass except Exception as e: print "\nError: yap_tee execution failed for initial pipe commands",e yap_log.write_log(str(initial_pipe_commands).lstrip("[").rstrip("]"),input_file,'',str(e),err_log,stat_log) def func1(cmd1,fifoin,t_err_log): icommand=[] try: fh=open(t_err_log,'a') fh1=open(fifoin,'r') num_cmd=len(cmd1) cmd2=[] for i in range(num_cmd): tt=Template(cmd1[i]) cmd2.append(tt.substitute(os.environ)) for i in range(num_cmd): if (num_cmd == 
1) : p2=subprocess.Popen(cmd2[i],stdin=fh1,stdout=subprocess.PIPE,stderr=fh,shell=True) icommand.append(p2) elif (num_cmd > 1 and i == 0 ) : p2=subprocess.Popen(cmd2[i],stdin=fh1,stdout=subprocess.PIPE,stderr=fh,shell=True) icommand.append(p2) elif (num_cmd > 1 and i==num_cmd-1) : p2=subprocess.Popen(cmd2[i],stdin=subprocess.PIPE,stdout=subprocess.PIPE,stderr=fh,shell=True) icommand.append(p2) else : p2=subprocess.Popen(cmd2[i],stdin=icommand[i-1].stdout,stdout=subprocess.PIPE,stderr=fh,shell=True) icommand.append(p2) icommand[num_cmd-1].communicate(icommand[num_cmd-2].stdout.read()) fh1.close() icommand[num_cmd-2].stdout.close() icommand[num_cmd-2].wait() rc1=icommand[num_cmd-1].poll() if rc1==0: print cmd1, " Finished Successfully" , "return_code=", rc1 fh.close() except Exception as e: print "\nError: yap_tee execution failed for commands",cmd1 ,e yap_log.write_log(str(cmd1),input_file,'',str(e),err_log,stat_log) commands1=[] try: for i in range(len(commands2)): t1=commands2[i].split('|') commands1.append(t1) ncommands=len(commands1) nprocs=ncommands+1 commands=[] fifos=[] tmp_err_log=[] for i in range(0,ncommands): commands.append(commands1[i]) temp_dir = yap_workflow_dict.yap_temp_user_dir tt=Template(temp_dir) fifo_name = tt.substitute(os.environ)+"/"+ input_file_name + unique_jobid + "_" + random_number + "_fifo"+ str(i) fifos.append(fifo_name) if not os.path.exists(fifos[i]): os.mkfifo(fifos[i]) tmp_err_log.append(err_log+"_yap_tee_"+str(i).zfill(4)) lock=Lock() manager=Manager() procs=[] fh=[] procs.append(Process(target=func0,args=(fifos,initial_pipe_commands,input_file))) for i in range(1,nprocs): procs.append(Process(target=func1,args=(commands[i-1],fifos[i-1],tmp_err_log[i-1]))) for i in range(nprocs): procs[i].start() for i in range(nprocs): procs[i].join() for i in range(nprocs): exit_code=procs[i].exitcode yap_file_io.write_data("EXIT_CODE: "+str(exit_code)+"\n",tmp_err_log[i-1]) for i in (fifos): os.remove(i) 
yap_log.merge_tee_files(str(initial_pipe_commands).lstrip("[").rstrip("]")+","+str(commands2).lstrip("[").rstrip("]"),input_file,err_log,stat_log) except Exception as e: print "\nError: yap_tee execution failed at multiprocess call ",e yap_log.write_log(str(initial_pipe_commands).lstrip("[").rstrip("]")+","+str(commands2).lstrip("[").rstrip("]"),input_file,'',str(e),err_log,stat_log) def command_section_parser(data,filename,count): """ Parses command sections of the configuration and returns command arrays. """ config_string= '$$'.join(data) begin_list=config_string.split(':begin') error_list=[] # command validator object command_validator = yap_cmd_checks.yap_cmd_checks() if len(data) != 0: if len(begin_list[0])!=0: count=len(begin_list[0]) cmd_arr=[] for i in range(1,len(begin_list)): temp_list=[] temp_list=begin_list[i].split('$$') cmd='' cmd_name = '' blank = ' ' execute_flag = '' for j in range(0,(len(temp_list)-1)): count+=1 line =temp_list[j].strip().strip('\t') match_config='' match_config=re.match('\"(.*)\"[ \t]*\:\=[ \t]*\"(.*)\"',line) if match_config: key=match_config.group(1).strip().strip('\t') val=match_config.group(2).strip().strip('\t') if key == "execute_command": if val == "yes": execute_flag = 'yes' # enable command validator command_validator.enable = "yes" else: # disable command validator command_validator.enable = "no" elif key == "command_name": cmd += val + ' ' path , cmd_name = os.path.split(val) cmd_name,ext = os.path.splitext(cmd_name) # validate command name validation_output = command_validator.is_valid_cmd_name(val) error_msg ="Error: In file: "+ filename + " Invalid command name at line " +\ str(count) +\ ". Please check if command executable exists, executable "+\ "symbolic link is not broken, if any, and it is set as executable. 
==> " + line # check and print error if required check_cmd_validation_output(validation_output, error_msg, str(count), filename, cmd_name) elif val == "no": pass elif val == "yes": cmd += key + blank # validate command argument validation_output = command_validator.is_valid_cmd_args(key) error_msg ="Error: In file: "+ filename + " Invalid command argument at line " + \ str(count) + \ ". Please check if all file name and paths exist and any symbolic "+\ "links are not broken. ==> " + line # check and print error if required check_cmd_validation_output(validation_output, error_msg, str(count), filename, cmd_name) else: cmd += key + blank + val + blank # validate command argument validation_output = command_validator.is_valid_cmd_args(key + blank + val) error_msg ="Error: In file: "+ filename + " Invalid command argument at line " + \ str(count) + \ ". Please check if all file name and paths exist and any symbolic "+\ "links are not broken. ==> " + line # check and print error if required check_cmd_validation_output(validation_output, error_msg, str(count), filename, cmd_name) #extra check for empty invalid cuffdiff sample file name if key == 'list_of_samples_to_compare' and execute_flag == 'yes': if len(val.strip()) == 0: cuff_diff_error = "Error: In file: "+ filename + " Invalid command argument at line " +\ str(count) +\ ". Please provide a valid file name for argument: list_of_samples_to_compare. ==> " + line error_list.append(cuff_diff_error)#append error #extra check for empty cuff merge and cuff compare file name if key == 'list_of_samples' and execute_flag == 'yes': if len(val.strip()) == 0: cuff_diff_error = "Error: In file: "+ filename + " Invalid command argument at line " +\ str(count) +\ ". Please provide a valid file name for argument: list_of_samples. 
==> " + line error_list.append(cuff_diff_error)#append error else: if line ==':end': pass elif re.match('^[\s\t]+$',line) or len(line)==0: pass else: error_msg='' error_msg="Error: In file: "+ filename +\ " Check syntax at line " + str(count) + " ==> " + line error_list.append(error_msg) if execute_flag == 'yes': cmd_arr.append([cmd_name,cmd]) if len(error_list)!=0: cmd_arr = [] return cmd_arr,error_list def begin_end_checker1(data,filename,keywords): """ Checks for corresponding begin and end terms to parse command sections of the configuration. """ command_begin_sym = keywords[0] command_end_sym = keywords[1] config_list=[] begin_list = [] COMMENT_CHAR='#' for i in range(len(data)): line = data[i].strip() if COMMENT_CHAR in line: line,comment=line.split(COMMENT_CHAR, 1) config_list.append(line) config_string= '\n'.join(config_list) # Syntax checker/Prilimnary checks dict_flag={} begin_status = 'off' end_status = 'off' results = [] for i in range(len(config_list)): if config_list[i] == command_begin_sym: if begin_status == 'on': results.append(" Missing " + command_end_sym + " corresponding to " + command_begin_sym + " at line " + str(j+1)) j = i else: begin_status = 'on' j = i elif config_list[i] == command_end_sym: end_status = 'on' j = i if begin_status == 'off': results.append("Missing " + command_begin_sym + " corresponding to " + command_end_sym + " at line " + str(j+1)) if begin_status == 'on' and end_status == 'on': begin_status = 'off' end_status = 'off' if begin_status == 'on': results.append("Missing " + command_end_sym +" corresponding to " + command_begin_sym + " at line " + str(len(config_list))) return results def command_parser(file_arr,filename): """ Parses the configuration files based on begin..end segments. Returns syntactical and path errors if any. Else, returns the command list. 
""" syntax_arr = [[':begin',':end'],[':begin_tee',':end_tee']] meta_terms = ['input_file_type','input_directory'] error_arr = [] cmd_arr = [] for i in syntax_arr: keywords = i results = begin_end_checker1(file_arr,filename,keywords) error_arr += results if len(error_arr) > 0: error_arr.append("Note:Use symbol(:begin) and (:end) to define command sections,enclose variable and corresponding values in double quotes(eg \"..\")") else: count =0 tee_arr=[] tee_error_arr=[] cmd_arr=[] tmp_arr=[] tee_status='off' cmd_status='off' for i in range(len(file_arr)): cmd_meta_data = [] count=count+1 line = file_arr[i] line = line.partition('#')[0] line = line.lstrip().rstrip().rstrip('\n') if line == ":begin_tee": tee_status="on" tee_arr.append(":begin_tee") elif line == ":begin": tmp_arr.append(line) if tee_status=="off": tee_arr.append(":begin") cmd_status="on" elif line == ":end": tmp_arr.append(line) if tee_status=="off": cmd_metadata= [] tmp_arr,tee_error_arr=command_section_parser(tmp_arr,filename,(count-len(tmp_arr))) error_arr=error_arr+tee_error_arr if len(tmp_arr) >0: cmd_name = tmp_arr[0][0] cmd = tmp_arr[0][1] input_directory_obj = re.match( r'(.*) input_directory[\s\t]*([\S\T]*)[\s\t]*', cmd,re.M|re.I) input_file_ext_matchobj = re.match(r'(.*) input_file_type [\s\t]*([\S\T]*)[\s\t]*', cmd,re.M|re.I) input_file_extension = '' if input_file_ext_matchobj : input_file_extension = input_file_ext_matchobj.group(2) cmd_metadata.append('input_file_type' + ' ' + input_file_extension ) matchobj = re.match( r'(.*) input_directory (\w*)', cmd,re.M|re.I) input_directory = '' if matchobj: input_directory = matchobj.group(2) else: input_directory = "aligner_output" cmd_metadata.append('input_directory' + ' ' + input_directory ) cmd= cmd.replace('input_directory' + ' ' + input_directory + ' ' ,'') cmd = cmd.replace('input_file_type' + ' ' + input_file_extension + ' ' ,'') tmp_arr= [[cmd_name,cmd]] tee_arr.append(cmd_metadata) tee_arr.append(tmp_arr) cmd_arr.append(tee_arr) 
def workflow_parser(data, filename, nprocs):
    """Parse a workflow configuration into a list of workflow dictionaries.

    ``data`` is passed to ``begin_end_checker``, which is expected to return
    the configuration split into text segments together with the syntax
    errors it found.  Segment 0 (the text before the first ``:begin`` block)
    supplies experiment-level ``"key" := "value"`` pairs and becomes a plain
    dictionary.  Every later segment is one ``:begin`` ... ``:end`` block; its
    pairs are written on top of the defaults produced by
    ``yap_workflow_dict.workflow_dictionary().create_default_wf_dict()`` and
    the result is checked with ``validate_wf_dict``.

    Args:
        data: configuration content, forwarded unchanged to
            ``begin_end_checker``.
        filename: configuration file name; only used in error messages.
        nprocs: number of processors, stored as a string under ``"nprocs"``
            in every block dictionary.

    Returns:
        ``(workflow_struct, error_list)``.  ``workflow_struct`` is the list of
        dictionaries, or ``[]`` whenever any error was recorded.  Syntax
        errors take precedence over validation errors in ``error_list``.
    """
    begin_list, error_list = begin_end_checker(data, filename)
    validation_errors = []
    workflow_struct = []
    count = 0  # running line counter used in the error messages

    # Raw-string patterns: same matching behaviour as the historical
    # literals, without the invalid-escape warnings of non-raw strings.
    header_pair = re.compile(r'"(.*)"[\s\t]*:=[\s\t]*"(.*)"')
    block_pair = re.compile(r'"(.*)"[ \t]*:=[ \t]*"(.*)"')
    blank_line = re.compile(r'^[\s\t]+$')

    if len(begin_list) > 1:
        # Text found before the first :begin/:end block.
        expt_data = begin_list[0]
        if len(expt_data) != 0:
            workflow_dict = {}
            # The element after the final newline is never inspected.
            for line in expt_data.split('\n')[:-1]:
                count += 1
                pair = header_pair.match(line)
                if pair:
                    key = pair.group(1).strip()
                    val = pair.group(2).strip()
                    workflow_dict[key] = val
                elif len(line) != 0 and not blank_line.match(line):
                    error_list.append(
                        "Error at line " + str(count) + " ==> " + line)
            workflow_struct.append(workflow_dict)

        # Scan the configuration block by block (:begin ... :end).
        for block_text in begin_list[1:]:
            count += 1
            workflow_dict_obj = yap_workflow_dict.workflow_dictionary()
            workflow_dict = workflow_dict_obj.create_default_wf_dict()
            workflow_dict["nprocs"] = str(nprocs)
            seen_keys = set()
            # The first and last elements of the block are skipped.
            for raw_line in block_text.split('\n')[1:-1]:
                line = raw_line.strip()
                count += 1
                pair = block_pair.match(line)
                if pair:
                    key = pair.group(1).strip()
                    val = pair.group(2).strip()
                    if key in seen_keys:
                        error_list.append(
                            "Error: Replication of terms at line " +
                            str(count) +
                            " ==> Possible replication of terms: " + key)
                    else:
                        workflow_dict[key] = val
                        seen_keys.add(key)
                elif line and line != ':end':
                    # Anything that is neither a pair, ':end' nor blank.
                    error_list.append(
                        "Error at line " + str(count) + " ==> " + line)
            workflow_struct.append(workflow_dict)
            validation_errors.extend(
                workflow_dict_obj.validate_wf_dict(workflow_dict))

    if error_list:
        error_list.insert(0, "Errors found in " + filename)
        error_list.append(
            "Please follow the convention \"parameter\" := \"value\"\n")
        workflow_struct = []
    elif validation_errors:
        validation_errors.insert(0, "Errors found in " + filename)
        workflow_struct = []
        error_list = validation_errors
    return workflow_struct, error_list


def rename_barcode(barcode):
    """Map the "no barcode" sentinel to an empty string.

    Returns ``''`` when ``barcode`` equals ``"no_barcode_specified"`` and the
    barcode itself otherwise.
    """
    if barcode == "no_barcode_specified":
        return ''
    return barcode


def check_open_file_desc():
    """Return ``lsof``'s file-descriptor listing for the parent process.

    Runs ``lsof -w -Ff -p <parent pid>`` and returns whatever it wrote to
    standard output.  The command is passed as an argument list, so no shell
    is involved.  ``OSError`` propagates if ``lsof`` cannot be started.
    """
    parent_pid = os.getppid()
    lsof = Popen(["lsof", "-w", "-Ff", "-p", str(parent_pid)], stdout=PIPE)
    return lsof.communicate()[0]


def check_cmd_validation_output(output, error_msg, count, file, command_name):
    """Record the outcome of a command validation in the module-level lists.

    Args:
        output: validation result.  A value equal to ``False`` records
            ``error_msg`` in ``missing_path_errors``.  Otherwise, if its
            string form contains ``"basename"``, it is split on ``":"`` and
            one warning per non-empty piece is added to ``basename_warnings``.
        error_msg: message stored when ``output`` equals ``False``.
        count: line number used in the warning text; converted with ``str()``
            so integers are accepted as well as strings.
        file: name of the configuration file used in the warning text.
        command_name: name of the command the warning refers to.
    """
    global missing_path_errors, basename_warnings
    # Equality (not identity) is kept on purpose: it is the historical test.
    if output == False:  # noqa: E712
        missing_path_errors.append(error_msg)
        return
    output_text = str(output)
    if "basename" not in output_text:
        return
    for warning in output_text.split(":"):
        if warning:
            basename_warnings.append(
                "Warning: At Line: " + str(count) + " in file: " + file +
                ". Files were found using " + warning +
                ". Please make sure that command: " + command_name +
                " can work with basenames.")


def split_array_old(totnum, nprocs, myrank):
    """Return the inclusive index range ``(first, last)`` owned by a rank.

    ``totnum`` items are dealt out to ``nprocs`` ranks as evenly as possible;
    the first ``totnum % nprocs`` ranks receive one extra item.  A rank that
    receives nothing gets ``last == first - 1``.
    """
    # Floor division keeps the Python 2 integer semantics on Python 3 too.
    base = int(math.floor(totnum // nprocs))
    extra = int(math.fmod(totnum, nprocs))
    if myrank < extra:
        share = base + 1
        first = myrank * share
    else:
        share = base
        first = myrank * share + extra
    last = first + share - 1
    return int(first), int(last)


def split_files_each_proc(file_arr, nprocs):
    """Distribute the indices of ``file_arr`` across ``nprocs`` processors.

    Returns one list per processor holding the indices assigned to it by
    ``split_array_old``.  Lists shorter than the longest one are padded with
    the string ``"no file"`` so that all lists have the same length.
    """
    total = len(file_arr)
    longest = 0
    per_proc = []
    for rank in range(nprocs):
        first, last = split_array_old(total, nprocs, rank)
        assigned = list(range(first, last + 1))
        # Rank 0 always holds the largest share, so this fixes the width.
        longest = max(longest, len(assigned))
        assigned.extend(["no file"] * (longest - len(assigned)))
        per_proc.append(assigned)
    return per_proc


def split_array(totnum, nprocs, lpp, myrank):
    """Return ``[first, last]`` line indices of the chunk owned by a rank.

    ``totnum`` lines are grouped into records of ``lpp`` lines each (any
    incomplete trailing record is ignored); the records are dealt out with
    ``split_array_old`` and the record range is converted back to inclusive
    line indices.  This definition repeats the one earlier in the module and,
    being the later of the two, is the one bound at import time.
    """
    first_record, last_record = split_array_old(totnum // lpp, nprocs, myrank)
    return [first_record * lpp, (last_record + 1) * lpp - 1]


def get_filesplit_info(file_struct, inp_files_list, file_chunk_size, nprocs,
                       format_specific_lines):
    """Describe how each input file assigned to this processor is split.

    Args:
        file_struct: indices into ``inp_files_list``; ``'no file'`` entries
            are skipped.
        inp_files_list: sequence whose items carry the file name at index 0.
        file_chunk_size: requested chunk size in megabytes.
        nprocs: number of processors (converted with ``int()``).
        format_specific_lines: lines per record for the input format
            (for example 4 for fastq); stored unchanged.

    Returns:
        A list with one ``[file_size, chunk_size, nchunks,
        format_specific_lines]`` entry per real file.  ``nchunks`` is always a
        multiple of ``nprocs`` and never smaller than it, so the former
        "fewer chunks than processors" abort could not trigger and is gone.
    """
    nprocs = int(nprocs)
    requested_chunk_size = int(file_chunk_size) * 1024 * 1024
    file_split_struct = []
    for file_index in file_struct:
        if file_index == 'no file':
            continue
        each_file = inp_files_list[file_index][0]
        file_size = get_file_size(each_file)
        chunk_size = requested_chunk_size
        nchunks = file_size // chunk_size
        if nchunks < nprocs:
            # Small file: give every processor exactly one chunk.
            nchunks = nprocs
            chunk_size = file_size // nchunks
        if nchunks > nprocs:
            # Round the chunk count up to a whole number per processor.
            chunks_per_proc = -(-nchunks // nprocs)
            nchunks = nprocs * chunks_per_proc
            chunk_size = file_size // nchunks
        file_split_struct.append(
            [file_size, chunk_size, nchunks, format_specific_lines])
    return file_split_struct


def file_cleanup():
    """Remove the temporary directory and the temporary log directories.

    Runs ``rm -r`` through the shell on ``yap_workflow_dict.temp_dir_path``
    and on every ``*_log_temp`` entry below ``err_log_path`` and
    ``stat_log_path``.
    """
    # NOTE(review): the paths reach the shell unquoted, so a path containing
    # spaces or shell metacharacters would be misinterpreted.
    targets = (
        yap_workflow_dict.temp_dir_path,
        yap_workflow_dict.err_log_path + "/*_log_temp",
        yap_workflow_dict.stat_log_path + "/*_log_temp",
    )
    for target in targets:
        os.system("rm -r " + target)