Python re.sub() Examples
The following are 30 code examples of re.sub(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module re, or try the search function.

Example #1
Source File: tokenizer_udpipe_mod.py From Turku-neural-parser-pipeline with Apache License 2.0 | 8 votes |
def parse_text(self, txt):
    """Tokenize raw text with the UDpipe pipeline, passing comment lines through.

    Lines matching `comment_regex` are normalized to start with "# " and
    emitted verbatim; each run of ordinary lines between comments is
    tokenized as a single block.
    """
    err = udpipe.ProcessingError()
    pieces = []    # output fragments, concatenated at the end
    pending = []   # ordinary text lines not yet tokenized
    for raw_line in txt.split("\n"):
        stripped = raw_line.lstrip()
        if re.match(comment_regex, stripped):
            # Flush the accumulated text block before emitting the comment.
            if pending:
                pieces.append(self.pipeline.process("\n".join(pending), err))
                pending = []
            pieces.append(re.sub(comment_regex, "# ", stripped + "\n"))
        else:
            pending.append(raw_line)
    if pending:
        pieces.append(self.pipeline.process("\n".join(pending), err))
    return "".join(pieces)
Example #2
Source File: straight_dope_test_utils.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def _override_relative_paths(notebook):
    """Rewrite relative data/image paths in a notebook to point at NOTEBOOKS_DIR.

    Required because the notebooks are executed from a different directory
    hierarchy more suitable for testing.

    Args:
        notebook : string notebook name in folder/notebook format
    """
    notebook_path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb"
    # Read, rewrite the data-directory references, and write back in place.
    with io.open(notebook_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    rewritten = re.sub(RELATIVE_PATH_REGEX, NOTEBOOKS_DIR, contents)
    with io.open(notebook_path, 'w', encoding='utf-8') as handle:
        handle.write(rewritten)
Example #3
Source File: progress_bar.py From clikit with MIT License | 6 votes |
def display(self):
    """
    Output the current progress string (a no-op when the IO is quiet).
    """
    if self._io.is_quiet():
        return

    # Lazily resolve the format the first time we render.
    if self._format is None:
        self._set_real_format(
            self._internal_format or self._determine_best_format()
        )

    # Expand %placeholder% / %placeholder:format% tokens and redraw.
    rendered = re.sub(
        r"(?i)%([a-z\-_]+)(?::([^%]+))?%",
        self._overwrite_callback,
        self._format,
    )
    self._overwrite(rendered)
Example #4
Source File: pre_submission.py From MPContribs with MIT License | 6 votes |
def add_comp_one(compstr):
    """
    Adds stoichiometries of 1 to compstr that don't have them
    :param compstr: composition as a string (e.g. "LaMnO3")
    :return: composition with stoichiometries of 1 added (e.g. "La1Mn1O3")
    """
    # Split on capital letters, then re-join runs of purely alphabetic tokens
    # so each element symbol stays with its (possibly missing) stoichiometry.
    # (The previous implementation round-tripped through `pd.np.array`, an
    # alias removed in pandas 2.0; a plain list behaves identically here.)
    tokens = re.sub(r"([A-Z])", r" \1", compstr).split()
    sample = ["".join(g) for _, g in groupby(tokens, str.isalpha)]
    samp_new = ""
    for k in range(len(sample)):
        spl_samp = re.sub(r"([A-Z])", r" \1", sample[k]).split()
        for l in range(len(spl_samp)):
            # Append "1" to a bare element symbol; leave "x" placeholders alone.
            if spl_samp[l][-1].isalpha() and spl_samp[l][-1] != "x":
                spl_samp[l] = spl_samp[l] + "1"
            samp_new += spl_samp[l]
    return samp_new
Example #5
Source File: __init__.py From ALF with Apache License 2.0 | 6 votes |
def test_quo9(self):
    # Expected escaping:
    #   "<h5 id='id824837' onload='chat(\'id705147\',1,\' width=\\\'2pt\\\'\')'>"
    # as opposed to the wrong nesting:
    #   "<h5 id='id824837' onload='chat(\'id705147\',1,\\\' width=\\\'2pt\'\')'>"
    escape = lambda x: re.sub(r"('|\\)", r"\\\1", x)
    grammar = Grammar("@id 8\n"
                      "root \"<h5 id='\" id \"' onload='\" esc(func) \"'>\" #rclean\n"
                      "id 'id' [0-9]{6}\n"
                      "func \"chat('\" id \"',\" [0-9] \",'\" esc(\" width='2pt'\") \"')\"\n",
                      esc=escape)
    self.assertRegex(grammar.generate(),
                     r"^<h5 id='id[0-9]{6}' onload='chat\(\\'id[0-9]{6}"
                     r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$")
    # Same grammar, but chat() references '@id', so both ids must match.
    grammar = Grammar("@id 8\n"
                      "root \"<h5 id='\" id \"' onload='\" esc(func) \"'>\" #rclean\n"
                      "id 'id' [0-9]{6}\n"
                      "func \"chat('\" @id \"',\" [0-9] \",'\" esc(\" width='2pt'\") \"')\"\n",
                      esc=escape)
    self.assertRegex(grammar.generate(),
                     r"^<h5 id='(id[0-9]{6})' onload='chat\(\\'\1"
                     r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$")
Example #6
Source File: views.py From MPContribs with MIT License | 6 votes |
def add_comp_one(compstr):
    """
    Adds stoichiometries of 1 to compstr that don't have them
    :param compstr: composition as a string (e.g. "LaMnO3")
    :return: composition with stoichiometries of 1 added (e.g. "La1Mn1O3")
    """
    sample = re.sub(r"([A-Z])", r" \1", compstr).split()
    # BUGFIX: group the token *list*, not `str(sample)` — stringifying the
    # list made groupby iterate over the characters of its repr (brackets,
    # quotes, commas), corrupting the output. This matches the working
    # variant of this helper in pre_submission.py.
    sample = ["".join(g) for _, g in groupby(sample, str.isalpha)]
    samp_new = ""
    for k in range(len(sample)):
        spl_samp = re.sub(r"([A-Z])", r" \1", sample[k]).split()
        for l in range(len(spl_samp)):
            # Append "1" to a bare element symbol; leave "x" placeholders alone.
            if spl_samp[l][-1].isalpha() and spl_samp[l][-1] != "x":
                spl_samp[l] = spl_samp[l] + "1"
            samp_new += spl_samp[l]
    return samp_new
Example #7
Source File: grammr2_test.py From ALF with Apache License 2.0 | 6 votes |
def test_quo9(self):
    # Expected escaping:
    #   "<h5 id='id824837' onload='chat(\'id705147\',1,\' width=\\\'2pt\\\'\')'>"
    # as opposed to the wrong nesting:
    #   "<h5 id='id824837' onload='chat(\'id705147\',1,\\\' width=\\\'2pt\'\')'>"
    escape = lambda x: re.sub(r"('|\\)", r"\\\1", x)
    grammar = Grammar("root \"<h5 id='\" id \"' onload='\" esc(func) \"'>\"\n"
                      "id 'id' /[0-9]{6}/\n"
                      "func \"chat('\" id \"',\" /[0-9]/ \",'\" esc(\" width='2pt'\") \"')\"\n",
                      esc=escape)
    self.assertRegex(grammar.generate(),
                     r"^<h5 id='id[0-9]{6}' onload='chat\(\\'id[0-9]{6}"
                     r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$")
    # Same grammar, but chat() references '@id', so both ids must match.
    grammar = Grammar("root \"<h5 id='\" id \"' onload='\" esc(func) \"'>\"\n"
                      "id 'id' /[0-9]{6}/\n"
                      "func \"chat('\" @id \"',\" /[0-9]/ \",'\" esc(\" width='2pt'\") \"')\"\n",
                      esc=escape)
    self.assertRegex(grammar.generate(),
                     r"^<h5 id='(id[0-9]{6})' onload='chat\(\\'\1"
                     r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$")
Example #8
Source File: zmirror.py From zmirror with MIT License | 6 votes |
def response_cookie_rewrite(cookie_string):
    """
    Rewrite a response cookie's domain attribute to point at `my_host_name`.

    :type cookie_string: str
    """
    rewritten = regex_cookie_rewriter.sub('domain=' + my_host_name_no_port, cookie_string)
    return rewritten


# ################# End Server Response Handler #################


# ################# Begin Client Request Handler #################
Example #9
Source File: uninstall_distro.py From multibootusb with GNU General Public License v2.0 | 6 votes |
def update_sys_cfg_file(uninstall_distro_dir_name):
    """
    Remove the uninstalled distro's menu entries from syslinux.cfg.

    :param uninstall_distro_dir_name: directory name of the distro being removed.
    :return: None
    """
    sys_cfg_file = os.path.join(config.usb_mount, "multibootusb", "syslinux.cfg")
    if not os.path.exists(sys_cfg_file):
        gen.log("syslinux.cfg file not found for updating changes.")
    else:
        gen.log("Updating syslinux.cfg file...")
        # `with` ensures the handles are closed even on error (the previous
        # version leaked the read handle).
        with open(sys_cfg_file) as f:
            string = f.read()
        # Drop everything between this distro's '#start'/'#end' markers.
        string = re.sub(
            r'#start ' + re.escape(uninstall_distro_dir_name)
            + '.*?' + '#end ' + re.escape(uninstall_distro_dir_name) + r'\s*',
            '', string, flags=re.DOTALL)
        with open(sys_cfg_file, "w") as config_file:
            config_file.write(string)
Example #10
Source File: uninstall_distro.py From multibootusb with GNU General Public License v2.0 | 6 votes |
def update_grub_cfg_file(uninstall_distro_dir_name):
    """
    Remove the uninstalled distro's menu entries from grub.cfg.

    :param uninstall_distro_dir_name: directory name of the distro being removed.
    :return: None
    """
    grub_cfg_file = os.path.join(config.usb_mount, "multibootusb", "grub", "grub.cfg")
    if not os.path.exists(grub_cfg_file):
        gen.log("grub.cfg file not found for updating changes.")
    else:
        gen.log("Updating grub.cfg file...")
        # `with` ensures the handles are closed even on error (the previous
        # version leaked the read handle).
        with open(grub_cfg_file) as f:
            string = f.read()
        # Drop everything between this distro's '#start'/'#end' markers.
        string = re.sub(
            r'#start ' + re.escape(uninstall_distro_dir_name)
            + '.*?' + '#end ' + re.escape(uninstall_distro_dir_name) + r'\s*',
            '', string, flags=re.DOTALL)
        with open(grub_cfg_file, "w") as config_file:
            config_file.write(string)
Example #11
Source File: pagination.py From grlc with MIT License | 6 votes |
def buildPaginationHeader(resultCount, resultsPerPage, pageArg, url):
    """Build a Link header value for result pagination.

    :param resultCount: total number of results.
    :param resultsPerPage: page size.
    :param pageArg: requested page number as a string, or falsy if absent.
    :param url: request URL, possibly already carrying a `page=` parameter.
    :return: value for the HTTP `Link` response header.
    """
    # Ceiling division: a trailing partial page is still a page. The previous
    # `resultCount / resultsPerPage` produced a float under Python 3 (yielding
    # URLs like "page=9.5") and silently dropped the final partial page.
    lastPage = -(-resultCount // resultsPerPage)
    if pageArg:
        page = int(pageArg)
        next_url = re.sub("page=[0-9]+", "page={}".format(page + 1), url)
        prev_url = re.sub("page=[0-9]+", "page={}".format(page - 1), url)
        first_url = re.sub("page=[0-9]+", "page=1", url)
        last_url = re.sub("page=[0-9]+", "page={}".format(lastPage), url)
    else:
        page = 1
        next_url = url + "?page=2"
        prev_url = ""
        first_url = url + "?page=1"
        last_url = url + "?page={}".format(lastPage)

    if page == 1:
        headerLink = "<{}>; rel=next, <{}>; rel=last".format(next_url, last_url)
    elif page == lastPage:
        headerLink = "<{}>; rel=prev, <{}>; rel=first".format(prev_url, first_url)
    else:
        headerLink = "<{}>; rel=next, <{}>; rel=prev, <{}>; rel=first, <{}>; rel=last".format(
            next_url, prev_url, first_url, last_url)
    return headerLink
Example #12
Source File: mask_db.py From CAMISIM with Apache License 2.0 | 6 votes |
def removeLines(mg): removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt' #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt' srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.tax') dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.tax') #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' ) #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' ) pattern = r'.*ncbid:([0-9]+)$' #pattern = r'^([^\-]+)\-.*$' removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#')) col0 = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#') col1 = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#') out = csv.OutFileBuffer(dstFilePath) removed = 0 for col0,col1 in zip(col0,col1): if re.sub(pattern, r'\1', col0) not in removeSet: out.writeText(str(col0 + '\t' + col1 + '\n')) else: removed += 1 out.close() print mg, 'removeLines', removed
Example #13
Source File: analysis_mg.py From CAMISIM with Apache License 2.0 | 6 votes |
def parse(self, line): lineArray = line.split() if len(lineArray) != 2: print '_MothurOutFileParser: wrong line', line return name = re.sub(r'^([0-9]+_[0-9]+)_[0-9]+_[0-9]+_[pr]+[0-2]$',r'\1', lineArray[0]) tag = re.sub(r'^[0-9]+_[0-9]+_([0-9]+_[0-9]+_[pr]+[0-2])$',r'\1', lineArray[0]) placementList = lineArray[1].replace('unclassified;', '').rsplit(';') if len(placementList) < 2: #print '_MothurOutFileParser: skip line', line return placement = placementList[-2] try: clade = int(re.sub('([0-9]+)\(.*', r'\1' , placement)) except ValueError: return weight = float(re.sub('[0-9]+\(([0-9\.]+)\)', r'\1' , placement)) entry = str(str(name) + '\t' + str(clade) + '\t' + str(weight) + '\t' + str(self.source) + '\t' + str(tag)) if self.outBuffer.isEmpty(): self.outBuffer.writeText(entry) else: self.outBuffer.writeText(str('\n' + entry))
Example #14
Source File: gquery.py From grlc with MIT License | 6 votes |
def paginate_query(query, results_per_page, get_args):
    """Modify the given query so that it can be paginated. The paginated query
    will display a maximum of `results_per_page` results.

    :param query: SPARQL query string.
    :param results_per_page: page size, appended as LIMIT.
    :param get_args: request-args mapping; its `page` key selects the (1-based) page.
    :return: the query with LIMIT/OFFSET set for the requested page.
    """
    page = get_args.get('page', 1)
    glogger.info("Paginating query for page {}, {} results per page".format(page, results_per_page))
    # If contains LIMIT or OFFSET, remove them
    glogger.debug("Original query: " + query)
    # Raw-string pattern without the original surrounding `(...)*`, which also
    # matched the empty string at every position (a zero-width no-op for sub,
    # but wasted work) and triggered an invalid-escape warning for '\s'.
    no_limit_query = re.sub(r"(LIMIT|OFFSET)\s+[0-9]+", "", query)
    glogger.debug("No limit query: " + no_limit_query)
    # Append LIMIT results_per_page OFFSET (page-1)*results_per_page
    paginated_query = no_limit_query + " LIMIT {} OFFSET {}".format(
        results_per_page, (int(page) - 1) * results_per_page)
    glogger.debug("Paginated query: " + paginated_query)
    return paginated_query
Example #15
Source File: straight_dope_test_utils.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def _override_epochs(notebook):
    """Force the notebook to train for a single epoch. Idempotent.

    Args:
        notebook : string notebook name in folder/notebook format
    """
    path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb"
    with io.open(path, 'r', encoding='utf-8') as source:
        text = source.read()
    # Rewrite whatever `epochs = N` assignment EPOCHS_REGEX matches.
    text = re.sub(EPOCHS_REGEX, 'epochs = 1', text)
    with io.open(path, 'w', encoding='utf-8') as sink:
        sink.write(text)
Example #16
Source File: lint.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def get_header_guard_dmlc(filename):
    """Get Header Guard Convention for DMLC Projects.

    For headers in include, use the path directly; for headers in src,
    prefix with the project name.  E.g. with project-name = dmlc:
      include/dmlc/timer.h   -> DMLC_TIMTER_H_
      src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
    """
    path = cpplint.FileInfo(filename).RepositoryName()
    idx = path.find('src/')
    if idx != -1 and _HELPER.project_name is not None:
        # Replace everything up to and including "src" with the project name.
        path = _HELPER.project_name + path[idx + 3:]
    else:
        for root in ('include', 'api', 'wrapper'):
            prefix = root + os.sep
            if path.startswith(prefix):
                path = re.sub('^' + prefix, '', path)
                break
    return re.sub(r'[-./\s]', '_', path).upper() + '_'
Example #17
Source File: symbol_doc.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def _build_doc(func_name, desc, arg_names, arg_types, arg_desc, key_var_num_args=None, ret_type=None):
    """Build docstring for symbolic functions."""
    param_str = _build_param_doc(arg_names, arg_types, arg_desc)
    if key_var_num_args:
        desc += '\nThis function support variable length of positional input.'
    template = ('%s\n\n'
                '%s\n'
                'name : string, optional.\n'
                ' Name of the resulting symbol.\n\n'
                'Returns\n'
                '-------\n'
                'Symbol\n'
                ' The result symbol.')
    doc_str = template % (desc, param_str)
    # Append the docstring of any SymbolDoc subclass named '<func_name>Doc'.
    extra_doc = "\n" + '\n'.join([x.__doc__ for x in type.__subclasses__(SymbolDoc)
                                  if x.__name__ == '%sDoc' % func_name])
    # NOTE(review): as written this strips every single space from extra_doc;
    # the pattern looks like a multi-space indent collapsed by formatting —
    # confirm against the upstream source.
    doc_str += _re.sub(_re.compile(" "), "", extra_doc)
    doc_str = _re.sub('NDArray-or-Symbol', 'Symbol', doc_str)
    return doc_str
Example #18
Source File: lint.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def get_header_guard_dmlc(filename):
    """Get Header Guard Convention for DMLC Projects.

    For headers in include, use the path directly; for headers in src,
    prefix with the project name.  E.g. with project-name = dmlc:
      include/dmlc/timer.h   -> DMLC_TIMTER_H_
      src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
    """
    fileinfo = cpplint.FileInfo(filename)
    file_path_from_root = fileinfo.RepositoryName()
    inc_list = ['include', 'api', 'wrapper']
    if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
        idx = file_path_from_root.find('src/')
        file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3:]
    else:
        for spath in inc_list:
            prefix = spath + os.sep
            if file_path_from_root.startswith(prefix):
                # BUGFIX: strip the prefix by slicing instead of
                # re.sub('^' + prefix, ...) — on Windows os.sep is '\\',
                # which produced a broken (trailing-backslash) regex.
                file_path_from_root = file_path_from_root[len(prefix):]
                break
    return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
Example #19
Source File: data_helpers.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    # Ordered substitutions: drop odd characters, split contractions, then
    # pad punctuation with spaces and collapse runs of whitespace.
    rules = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
Example #20
Source File: data_helpers.py From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 | 6 votes |
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    substitutions = [
        (r"[^A-Za-z0-9(),!?\'\`]", " "),   # strip characters outside the vocabulary
        (r"\'s", " \'s"),                  # split contractions...
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),                     # ...pad punctuation with spaces...
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),                  # ...and collapse whitespace runs
    ]
    cleaned = string
    for pat, rep in substitutions:
        cleaned = re.sub(pat, rep, cleaned)
    return cleaned.strip().lower()
Example #21
Source File: DataLoader_NER.py From pytorch_NER_BiLSTM_CNN_CRF with Apache License 2.0 | 6 votes |
def _clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    transforms = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " \( "),
        (r"\)", " \) "),
        (r"\?", " \? "),
        (r"\s{2,}", " "),
    )
    # Apply the substitutions in order, then normalize case and edges.
    for pattern, replacement in transforms:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
Example #22
Source File: cluster.py From CAMISIM with Apache License 2.0 | 6 votes |
def __init__(self, line):
    """Parse one clustering line: "<threshold>\t<count>\t<groups>" where
    groups are comma-separated and members within a group are tab-separated."""
    chunks = line.split(',')
    # The first chunk carries "<threshold>\t<something>\t" before the first group.
    self._threshold = float(re.sub(r'^([^\t]+)\t[^\t]+\t.*', r'\1', chunks[0]))
    chunks[0] = re.sub(r'^[^\t]+\t[^\t]+\t(.*)', r'\1', chunks[0])
    self.groupIdCount = 0
    self.seqNameToGroupId = dict([])
    self.groupIdToSeqNameSet = dict([])
    for chunk in chunks:
        entries = chunk.split('\t')
        self.groupIdToSeqNameSet[self.groupIdCount] = set([])
        for entry in entries:
            # Only entries shaped like "<num>_<...>" are sequence names.
            if re.match(r'^[0-9]+_.*$', entry):
                seqName = re.sub(r'^([0-9]+_[0-9]+)_.*$', r'\1', entry)
                self.seqNameToGroupId[seqName] = self.groupIdCount
                self.groupIdToSeqNameSet[self.groupIdCount].add(seqName)
        self.groupIdCount += 1
Example #23
Source File: analysis_mg.py From CAMISIM with Apache License 2.0 | 6 votes |
def parse(self, line):
    """Parse one placement line "<scaffold>_<contig>\t<ncbid>\t<weight>\t<source>\t<tag>"
    and record it as a candidate taxonomy path."""
    if line.strip() == '':
        return
    # One anchored match extracts all six fields (the original ran the same
    # overall pattern once per field via re.sub).
    m = re.match(r'^([0-9]+)_([0-9]+)\t([0-9]+)\t([0-9\.]+)\t([^\t]+)\t([^\t]+)$', line)
    if m is None:
        return
    scaffoldId = int(m.group(1))
    contigId = int(m.group(2))
    ncbid = int(m.group(3))
    weight = float(m.group(4))
    source = str(m.group(5))
    tag = str(m.group(6))
    if ncbid != 1:
        taxPathDict = self.taxonomy.getPathToRoot(ncbid)
        # NOTE(review): `.keys() >= 1` relies on Python 2 cross-type
        # comparison (always True); under Python 3 it raises TypeError.
        if taxPathDict is not None and taxPathDict.keys() >= 1:
            self.sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
            self.assignedIdList.append(contigId)
        else:
            sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid)))
Example #24
Source File: mask_db.py From CAMISIM with Apache License 2.0 | 6 votes |
def removeSequences(mg): removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt' #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt' srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna') dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna') #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' ) #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' ) pattern = r'.*ncbid:([0-9]+)$' #pattern = r'^([^\-]+)\-.*$' removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#')) seqIdToSeq = fas.fastaFileToDict(srcFilePath) out = csv.OutFileBuffer(dstFilePath) removed = 0 for seqId in seqIdToSeq: if re.sub(pattern, r'\1', str(seqId)) not in removeSet: out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n')) else: removed += 1 out.close() print mg, 'removeSequences', removed
Example #25
Source File: pps.py From CAMISIM with Apache License 2.0 | 5 votes |
def toRealNames(config, sequences): """ Transforms a PPS file fileName.fas.PP.out that names sequences according to their ids to their real names. """ outIdsPPSFile = str(config.get('inputIdsFastaFile') + '.PP.out') outNamesPPSFile = outIdsPPSFile + '.n' #os.path.normpath print outNamesPPSFile try: fr = open(os.path.normpath(outIdsPPSFile),'r') fw = open(os.path.normpath(outNamesPPSFile),'w') except Exception: print "Cannot open one of the files:", outIdsPPSFile, "or", outNamesPPSFile raise else: for line in fr: if re.match(r'^[0-9]+_[0-9]+[^0-9].*$', line): id = re.sub(r'^[0-9]+_([0-9]+)[^0-9].*$',r'\1' , line) rest = re.sub(r'^[0-9]+_[0-9]+([^0-9].*)$',r'\1' , line) seq = sequences.getSequence(int(id)) fw.write(seq.name + rest) # seq.scaffold.name else: fw.write(line) finally: fr.close() fw.close()
Example #26
Source File: sequences.py From CAMISIM with Apache License 2.0 | 5 votes |
def replaceIdsWithNames(outputFileContigSubPattern, nameToIDsFile, targetFile, outFile):
    """
    @deprecated: NOT IMPLEMENTED YET!!! replace ids with names
    @param nameToIdsFile: file that contains lines: contigName tab contigID
    @param targetFile: file whose first column (scaffoldID_contigID) is to be replaced by its name
    @param outFile: output file with the name (optionally rewritten via outputFileContigSubPattern)
    """
    # NOTE(review): `dir([])` was presumably meant to be an empty dict; the
    # value is never used before the assert below, so behavior is unchanged.
    idToName = dir([])
    assert False, 'NOT IMPLEMENTED YET'
    # Sketch of the intended implementation (kept from the original draft):
    #   read nameToIDsFile line by line, skipping '#' comments;
    #   map contigID -> name (name filtered through outputFileContigSubPattern);
    #   then rewrite the first column of targetFile (^[0-9]+_[0-9]+...) into outFile.
Example #27
Source File: analysis16s.py From CAMISIM with Apache License 2.0 | 5 votes |
def _setCandidatePlacement(self, sequences, taxonomy, predFileName, source): assignedIdList = [] try: f = open(os.path.normpath(predFileName),'r') except Exception: print "Cannot open file:", predFileName raise else: for line in f: line = common.noNewLine(line) if re.match(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$', line): scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line)) contigId = int(re.sub(r'^[0-9]+_([0-9]+)\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line)) ncbid = int(re.sub(r'^[0-9]+_[0-9]+\t([0-9]+)\t[0-9\.]+\t[^\t]+$',r'\1' ,line)) weight = float(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t([0-9\.]+)\t[^\t]+$',r'\1' ,line)) tag = str(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t([^\t]+)$',r'\1' ,line)) if ncbid != 1: taxPathDict = taxonomy.getPathToRoot(ncbid) if taxPathDict is not None and taxPathDict.keys() >= 1: sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag) assignedIdList.append(contigId) else: sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid))) finally: f.close() return set(assignedIdList)
Example #28
Source File: sequences.py From CAMISIM with Apache License 2.0 | 5 votes |
def writePlacementsPPOut(self, outFile, taxaRanks, outputFileContigSubPattern): try: f = open(os.path.normpath(outFile), 'w') f.write('#Output of pPPS\n#\n'), header = str('#ID' + '\t' + 'root') for rank in taxaRanks: header += str('\t' + rank) f.write(header) for seq in self.sequences: entry = str('\n' + re.sub(outputFileContigSubPattern, r'\1' , seq.name)) taxPathDict = seq.getTaxonomyPath() if taxPathDict is None: entry += str('\t') else: entry += str('\t' + 'root') for rank in taxaRanks: if (taxPathDict is not None) and (rank in taxPathDict) and (not taxPathDict[rank].isCopy()): entry += str('\t' + taxPathDict[rank].name) else: entry += '\t' f.write(entry) except Exception: print "Cannot create a file or write to it:", outFile raise finally: f.close()
Example #29
Source File: pps.py From CAMISIM with Apache License 2.0 | 5 votes |
def readPPSOutput(sequences, taxonomy, inputFastaIdsPPSFile, overwriteAllPlacements=False): """ Reads the output file of PPS and for each sequence decides: if overwriteAllPlacements=True is, then the sequence is placed according to the PPS file regardless of its previous placement if overwriteAllPlacements=False then if a sequence is placed to a less specific rank, than PPS suggests then the sequence is placed according to the PPS file """ infile = str(inputFastaIdsPPSFile + '.out') try: f = open(os.path.normpath(infile),'r') except Exception: print "Cannot open file:", infile raise else: #i = 0 for line in f: line = common.noNewLine(line) if re.match(r'^[0-9]+_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$', line): scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$',r'\1' ,line)) contigId = int(re.sub(r'^[0-9]+_([0-9]+).*[^0-9]+[0-9]+[^0-9]*$',r'\1' ,line)) ncbid = int(re.sub(r'^[0-9]+_[0-9]+.*[^0-9]+([0-9]+)[^0-9]*$',r'\1' ,line)) weight = None # the weight is not yet defined !!! if ncbid != 1: #print line, ":", scaffoldId, contigId, ncbid taxPathDictPPS = taxonomy.getPathToRoot(ncbid) if taxPathDictPPS.keys() >= 1: taxPathDictCurrent = sequences.getSequence(contigId).getTaxonomyPath() if taxPathDictCurrent == None: sequences.setTaxonomyPath(contigId, scaffoldId, taxPathDictPPS, weight)#weight = None !!! #i += 1 else: if ((overwriteAllPlacements) or (taxPathDictPPS.keys() > taxPathDictCurrent.keys())): sequences.setTaxonomyPathOverride(contigId, scaffoldId, taxPathDictPPS, weight)#weight = None !!! #i += 1 #print "placed seq by PPS:", i finally: f.close()
Example #30
Source File: loader.py From Att-ChemdNER with Apache License 2.0 | 5 votes |
def augment_with_pretrained(dictionary, ext_emb_path, words): #{{{ """ Augment the dictionary with words that have a pretrained embedding. If `words` is None, we add every word that has a pretrained embedding to the dictionary, otherwise, we only add the words that are given by `words` (typically the words in the development and test sets.) """ print 'Loading pretrained embeddings from %s...' % ext_emb_path assert os.path.isfile(ext_emb_path) # Load pretrained embeddings from file pretrained = set([ line.rstrip().split()[0].strip() for line in codecs.open(ext_emb_path, 'r', 'utf-8') if len(ext_emb_path) > 0 ]) # We either add every word in the pretrained file, # or only words given in the `words` list to which # we can assign a pretrained embedding if words is None: for word in pretrained: if word not in dictionary: dictionary[word] = 0 else: for word in words: if any(x in pretrained for x in [ word, word.lower(), re.sub('\d', '0', word.lower()) ]) and word not in dictionary: dictionary[word] = 0 word_to_id, id_to_word = create_mapping(dictionary) return dictionary, word_to_id, id_to_word #}}}