Python re.sub() Examples

The following are 30 code examples of re.sub(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module re , or try the search function .
Example #1
Source File: tokenizer_udpipe_mod.py    From Turku-neural-parser-pipeline with Apache License 2.0 8 votes vote down vote up
def parse_text(self,txt):
        err=udpipe.ProcessingError()
        tokenized=""
        current_block=[]
        for line in txt.split("\n"):
            if re.match(comment_regex, line.lstrip()): # comment line
                if current_block:
                    tokenized+=self.pipeline.process("\n".join(current_block),err)
                    current_block=[]
                tokenized+=re.sub(comment_regex, "# ", line.lstrip()+"\n")
                continue
            # normal text line, save to current block to be tokenized
            current_block.append(line)
        if current_block:
            tokenized+=self.pipeline.process("\n".join(current_block),err)
        return tokenized 
Example #2
Source File: straight_dope_test_utils.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def _override_relative_paths(notebook):
    """Overrides the relative path for the data and image directories to point
    to the right places. This is required as we run the notebooks in a different
    directory hierarchy more suitable for testing.

    Args:
        notebook : string
            notebook name in folder/notebook format
    """
    notebook_path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb"

    # Read the notebook and set epochs to num_epochs.
    with io.open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = f.read()

    # Update the location for the data directory.
    modified_notebook = re.sub(RELATIVE_PATH_REGEX, NOTEBOOKS_DIR, notebook)

    # Replace the original notebook with the modified one.
    with io.open(notebook_path, 'w', encoding='utf-8') as f:
        f.write(modified_notebook) 
Example #3
Source File: views.py    From MPContribs with MIT License 6 votes vote down vote up
def add_comp_one(compstr):
    """
    Adds stoichiometries of 1 to compstr that don't have them
    :param compstr:  composition as a string
    :return:         compositon with stoichiometries of 1 added
    """
    sample = re.sub(r"([A-Z])", r" \1", compstr).split()
    sample = ["".join(g) for _, g in groupby(str(sample), str.isalpha)]
    samp_new = ""
    for k in range(len(sample)):
        spl_samp = re.sub(r"([A-Z])", r" \1", sample[k]).split()
        for l in range(len(spl_samp)):
            if spl_samp[l][-1].isalpha() and spl_samp[l][-1] != "x":
                spl_samp[l] = spl_samp[l] + "1"
            samp_new += spl_samp[l]

    return samp_new 
Example #4
Source File: __init__.py    From ALF with Apache License 2.0 6 votes vote down vote up
def test_quo9(self):
        #right: "<h5 id='id824837' onload='chat(\'id705147\',1,\' width=\\\'2pt\\\'\')'>"
        #                                                        ^  -- esc() --   ^
        #wrong: "<h5 id='id824837' onload='chat(\'id705147\',1,\\\' width=\\\'2pt\'\')'>"
        #                                                      ^  -- esc() --   ^
        w = Grammar("@id 8\n"
                    "root   \"<h5 id='\" id \"' onload='\" esc(func) \"'>\" #rclean\n"
                    "id     'id' [0-9]{6}\n"
                    "func   \"chat('\" id \"',\" [0-9] \",'\" esc(\" width='2pt'\") \"')\"\n"
                    , esc=lambda x:re.sub(r"('|\\)", r"\\\1", x))
        self.assertRegex(w.generate(), r"^<h5 id='id[0-9]{6}' onload='chat\(\\'id[0-9]{6}"
                                       r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$")
        # same grammar with '@id' in chat() instead of 'id'
        w = Grammar("@id 8\n"
                    "root   \"<h5 id='\" id \"' onload='\" esc(func) \"'>\" #rclean\n"
                    "id     'id' [0-9]{6}\n"
                    "func   \"chat('\" @id \"',\" [0-9] \",'\" esc(\" width='2pt'\") \"')\"\n"
                    , esc=lambda x:re.sub(r"('|\\)", r"\\\1", x))
        self.assertRegex(w.generate(), r"^<h5 id='(id[0-9]{6})' onload='chat\(\\'\1"
                                       r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$") 
Example #5
Source File: grammr2_test.py    From ALF with Apache License 2.0 6 votes vote down vote up
def test_quo9(self):
        #right: "<h5 id='id824837' onload='chat(\'id705147\',1,\' width=\\\'2pt\\\'\')'>"
        #                                                        ^  -- esc() --   ^
        #wrong: "<h5 id='id824837' onload='chat(\'id705147\',1,\\\' width=\\\'2pt\'\')'>"
        #                                                      ^  -- esc() --   ^
        w = Grammar("root   \"<h5 id='\" id \"' onload='\" esc(func) \"'>\"\n"
                    "id     'id' /[0-9]{6}/\n"
                    "func   \"chat('\" id \"',\" /[0-9]/ \",'\" esc(\" width='2pt'\") \"')\"\n"
                    , esc=lambda x: re.sub(r"('|\\)", r"\\\1", x))
        self.assertRegex(w.generate(), r"^<h5 id='id[0-9]{6}' onload='chat\(\\'id[0-9]{6}"
                                       r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$")
        # same grammar with '@id' in chat() instead of 'id'
        w = Grammar("root   \"<h5 id='\" id \"' onload='\" esc(func) \"'>\"\n"
                    "id     'id' /[0-9]{6}/\n"
                    "func   \"chat('\" @id \"',\" /[0-9]/ \",'\" esc(\" width='2pt'\") \"')\"\n"
                    , esc=lambda x: re.sub(r"('|\\)", r"\\\1", x))
        self.assertRegex(w.generate(), r"^<h5 id='(id[0-9]{6})' onload='chat\(\\'\1"
                                       r"\\',[0-9],\\' width=\\\\\\'2pt\\\\\\'\\'\)'>$") 
Example #6
Source File: progress_bar.py    From clikit with MIT License 6 votes vote down vote up
def display(self):
        """
        Output the current progress string.
        """
        if self._io.is_quiet():
            return

        if self._format is None:
            self._set_real_format(
                self._internal_format or self._determine_best_format()
            )

        self._overwrite(
            re.sub(
                r"(?i)%([a-z\-_]+)(?::([^%]+))?%",
                self._overwrite_callback,
                self._format,
            )
        ) 
Example #7
Source File: pre_submission.py    From MPContribs with MIT License 6 votes vote down vote up
def add_comp_one(compstr):
    """
    Adds stoichiometries of 1 to compstr that don't have them
    :param compstr:  composition as a string
    :return:         compositon with stoichiometries of 1 added
    """
    sample = pd.np.array(re.sub(r"([A-Z])", r" \1", compstr).split()).astype(str)
    sample = ["".join(g) for _, g in groupby(sample, str.isalpha)]
    samp_new = ""
    for k in range(len(sample)):
        spl_samp = re.sub(r"([A-Z])", r" \1", sample[k]).split()
        for l in range(len(spl_samp)):
            if spl_samp[l][-1].isalpha() and spl_samp[l][-1] != "x":
                spl_samp[l] = spl_samp[l] + "1"
            samp_new += spl_samp[l]
    return samp_new 
Example #8
Source File: uninstall_distro.py    From multibootusb with GNU General Public License v2.0 6 votes vote down vote up
def update_sys_cfg_file(uninstall_distro_dir_name):
    """
    Main function to remove uninstall distro specific operations.
    :return:
    """

    sys_cfg_file = os.path.join(config.usb_mount, "multibootusb", "syslinux.cfg")
    if not os.path.exists(sys_cfg_file):
        gen.log("syslinux.cfg file not found for updating changes.")
    else:
        gen.log("Updating syslinux.cfg file...")
        string = open(sys_cfg_file).read()
        string = re.sub(r'#start ' + re.escape(uninstall_distro_dir_name)
                        + '.*?' + '#end '
                        + re.escape(uninstall_distro_dir_name)
                        + r'\s*', '', string, flags=re.DOTALL)
        config_file = open(sys_cfg_file, "w")
        config_file.write(string)
        config_file.close() 
Example #9
Source File: uninstall_distro.py    From multibootusb with GNU General Public License v2.0 6 votes vote down vote up
def update_grub_cfg_file(uninstall_distro_dir_name):
    """
    Main function to remove uninstall distro name from the grub.cfg file.
    :return:
    """

    grub_cfg_file = os.path.join(config.usb_mount, "multibootusb",
                                 "grub", "grub.cfg")
    if not os.path.exists(grub_cfg_file):
        gen.log("grub.cfg file not found for updating changes.")
    else:
        gen.log("Updating grub.cfg file...")
        string = open(grub_cfg_file).read()
        string = re.sub(r'#start ' + re.escape(uninstall_distro_dir_name)
                        + '.*?' + '#end '
                        + re.escape(uninstall_distro_dir_name)
                        + r'\s*', '', string, flags=re.DOTALL)
        config_file = open(grub_cfg_file, "w")
        config_file.write(string)
        config_file.close() 
Example #10
Source File: pagination.py    From grlc with MIT License 6 votes vote down vote up
def buildPaginationHeader(resultCount, resultsPerPage, pageArg, url):
    """Build link header for result pagination"""
    lastPage = resultCount / resultsPerPage

    if pageArg:
        page = int(pageArg)
        next_url = re.sub("page=[0-9]+", "page={}".format(page + 1), url)
        prev_url = re.sub("page=[0-9]+", "page={}".format(page - 1), url)
        first_url = re.sub("page=[0-9]+", "page=1", url)
        last_url = re.sub("page=[0-9]+", "page={}".format(lastPage), url)
    else:
        page = 1
        next_url = url + "?page=2"
        prev_url = ""
        first_url = url + "?page=1"
        last_url = url + "?page={}".format(lastPage)

    if page == 1:
        headerLink = "<{}>; rel=next, <{}>; rel=last".format(next_url, last_url)
    elif page == lastPage:
        headerLink = "<{}>; rel=prev, <{}>; rel=first".format(prev_url, first_url)
    else:
        headerLink = "<{}>; rel=next, <{}>; rel=prev, <{}>; rel=first, <{}>; rel=last".format(next_url, prev_url, first_url, last_url)
    return headerLink 
Example #11
Source File: mask_db.py    From CAMISIM with Apache License 2.0 6 votes vote down vote up
def removeLines(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.tax')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.tax')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.tax' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    col0 = csv.getColumnAsList(srcFilePath, colNum=0, sep='\t', comment='#')
    col1 = csv.getColumnAsList(srcFilePath, colNum=1, sep='\t', comment='#')
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for col0,col1 in zip(col0,col1):
        if re.sub(pattern, r'\1', col0) not in removeSet:
            out.writeText(str(col0 + '\t' + col1 + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeLines', removed 
Example #12
Source File: mask_db.py    From CAMISIM with Apache License 2.0 6 votes vote down vote up
def removeSequences(mg):
    removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_ncbids.txt'
    #removeListFilePath = '/net/metagenomics/projects/PPSmg/data/V35/genome_accession_silva.txt'
    srcFilePath = str('/net/metagenomics/projects/PPSmg/data/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/markerGenes/db/' + mg + '_bact+arch_dnaV.noalign.fna')
    #srcFilePath = str('/net/metagenomics/projects/PPSmg/data/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    #dstFilePath = str('/net/metagenomics/projects/PPSmg/data/V35/genomesRemoved/silva/' + mg + '_silva106_ncbitax.bacteria+archaea.fna' )
    pattern = r'.*ncbid:([0-9]+)$'
    #pattern = r'^([^\-]+)\-.*$'

    removeSet = set(csv.getColumnAsList(removeListFilePath, colNum=0, comment='#'))
    seqIdToSeq = fas.fastaFileToDict(srcFilePath)
    out = csv.OutFileBuffer(dstFilePath)
    removed = 0
    for seqId in seqIdToSeq:
        if re.sub(pattern, r'\1', str(seqId)) not in removeSet:
            out.writeText(str('>' + str(seqId) + '\n' + str(seqIdToSeq[seqId]) + '\n'))
        else:
            removed += 1

    out.close()
    print mg, 'removeSequences', removed 
Example #13
Source File: analysis_mg.py    From CAMISIM with Apache License 2.0 6 votes vote down vote up
def parse(self, line):
        if line.strip() == '':
            return

        if re.match(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+\t[^\t]+$', line):
            scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+\t[^\t]+$',r'\1' ,line))
            contigId = int(re.sub(r'^[0-9]+_([0-9]+)\t[0-9]+\t[0-9\.]+\t[^\t]+\t[^\t]+$',r'\1' ,line))
            ncbid = int(re.sub(r'^[0-9]+_[0-9]+\t([0-9]+)\t[0-9\.]+\t[^\t]+\t[^\t]+$',r'\1' ,line))
            weight = float(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t([0-9\.]+)\t[^\t]+\t[^\t]+$',r'\1' ,line))
            source = str(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t([^\t]+)\t[^\t]+$',r'\1' ,line))
            tag = str(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+\t([^\t]+)$',r'\1' ,line))

        if ncbid != 1:
            taxPathDict = self.taxonomy.getPathToRoot(ncbid)
            if taxPathDict is not None and taxPathDict.keys() >= 1:
                self.sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
                self.assignedIdList.append(contigId)
            else:
                sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid))) 
Example #14
Source File: gquery.py    From grlc with MIT License 6 votes vote down vote up
def paginate_query(query, results_per_page, get_args):
    """Modify the given query so that it can be paginated. The paginated query will 
    split display a maximum of `results_per_page`."""
    page = get_args.get('page', 1)

    glogger.info("Paginating query for page {}, {} results per page".format(page, results_per_page))

    # If contains LIMIT or OFFSET, remove them
    glogger.debug("Original query: " + query)
    no_limit_query = re.sub("((LIMIT|OFFSET)\s+[0-9]+)*", "", query)
    glogger.debug("No limit query: " + no_limit_query)

    # Append LIMIT results_per_page OFFSET (page-1)*results_per_page
    paginated_query = no_limit_query + " LIMIT {} OFFSET {}".format(results_per_page,
                                                                    (int(page) - 1) * results_per_page)
    glogger.debug("Paginated query: " + paginated_query)

    return paginated_query 
Example #15
Source File: straight_dope_test_utils.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def _override_epochs(notebook):
    """Overrides the number of epochs in the notebook to 1 epoch. Note this operation is idempotent.

    Args:
        notebook : string
            notebook name in folder/notebook format
    """
    notebook_path = os.path.join(*([NOTEBOOKS_DIR] + notebook.split('/'))) + ".ipynb"

    # Read the notebook and set epochs to num_epochs.
    with io.open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = f.read()

    # Set number of epochs to 1.
    modified_notebook = re.sub(EPOCHS_REGEX, 'epochs = 1', notebook)

    # Replace the original notebook with the modified one.
    with io.open(notebook_path, 'w', encoding='utf-8') as f:
        f.write(modified_notebook) 
Example #16
Source File: lint.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def get_header_guard_dmlc(filename):
    """Get Header Guard Convention for DMLC Projects.
    For headers in include, directly use the path
    For headers in src, use project name plus path
    Examples: with project-name = dmlc
        include/dmlc/timer.h -> DMLC_TIMTER_H_
        src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
    """
    fileinfo = cpplint.FileInfo(filename)
    file_path_from_root = fileinfo.RepositoryName()
    inc_list = ['include', 'api', 'wrapper']

    if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
        idx = file_path_from_root.find('src/')
        file_path_from_root = _HELPER.project_name +  file_path_from_root[idx + 3:]
    else:
        for spath in inc_list:
            prefix = spath + os.sep
            if file_path_from_root.startswith(prefix):
                file_path_from_root = re.sub('^' + prefix, '', file_path_from_root)
                break
    return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' 
Example #17
Source File: symbol_doc.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def _build_doc(func_name,
               desc,
               arg_names,
               arg_types,
               arg_desc,
               key_var_num_args=None,
               ret_type=None):
    """Build docstring for symbolic functions."""
    param_str = _build_param_doc(arg_names, arg_types, arg_desc)
    if key_var_num_args:
        desc += '\nThis function support variable length of positional input.'
    doc_str = ('%s\n\n' +
               '%s\n' +
               'name : string, optional.\n' +
               '    Name of the resulting symbol.\n\n' +
               'Returns\n' +
               '-------\n' +
               'Symbol\n' +
               '    The result symbol.')
    doc_str = doc_str % (desc, param_str)
    extra_doc = "\n" + '\n'.join([x.__doc__ for x in type.__subclasses__(SymbolDoc)
                                  if x.__name__ == '%sDoc' % func_name])
    doc_str += _re.sub(_re.compile("    "), "", extra_doc)
    doc_str = _re.sub('NDArray-or-Symbol', 'Symbol', doc_str)
    return doc_str 
Example #18
Source File: lint.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def get_header_guard_dmlc(filename):
    """Get Header Guard Convention for DMLC Projects.

    For headers in include, directly use the path
    For headers in src, use project name plus path

    Examples: with project-name = dmlc
        include/dmlc/timer.h -> DMLC_TIMTER_H_
        src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_
    """
    fileinfo = cpplint.FileInfo(filename)
    file_path_from_root = fileinfo.RepositoryName()
    inc_list = ['include', 'api', 'wrapper']

    if file_path_from_root.find('src/') != -1 and _HELPER.project_name is not None:
        idx = file_path_from_root.find('src/')
        file_path_from_root = _HELPER.project_name +  file_path_from_root[idx + 3:]
    else:
        for spath in inc_list:
            prefix = spath + os.sep
            if file_path_from_root.startswith(prefix):
                file_path_from_root = re.sub('^' + prefix, '', file_path_from_root)
                break
    return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_' 
Example #19
Source File: data_helpers.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower() 
Example #20
Source File: data_helpers.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower() 
Example #21
Source File: DataLoader_NER.py    From pytorch_NER_BiLSTM_CNN_CRF with Apache License 2.0 6 votes vote down vote up
def _clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip().lower() 
Example #22
Source File: cluster.py    From CAMISIM with Apache License 2.0 6 votes vote down vote up
def __init__(self, line):
        tokens = line.split(',')
        self._threshold = float(re.sub(r'^([^\t]+)\t[^\t]+\t.*', r'\1', tokens[0]))
        tokens[0] = re.sub(r'^[^\t]+\t[^\t]+\t(.*)', r'\1', tokens[0])
        self.groupIdCount = 0
        self.seqNameToGroupId = dict([])
        self.groupIdToSeqNameSet = dict([])
        for token in tokens:
            names = token.split('\t')
            self.groupIdToSeqNameSet[self.groupIdCount] = set([])
            for name in names:
                #print name
                if re.match(r'^[0-9]+_.*$', name):
                    seqName = re.sub(r'^([0-9]+_[0-9]+)_.*$',r'\1', name)
                    self.seqNameToGroupId[seqName] = self.groupIdCount
                    self.groupIdToSeqNameSet[self.groupIdCount].add(seqName)
            self.groupIdCount += 1 
Example #23
Source File: analysis_mg.py    From CAMISIM with Apache License 2.0 6 votes vote down vote up
def parse(self, line):
        lineArray = line.split()
        if len(lineArray) != 2:
            print '_MothurOutFileParser: wrong line', line
            return
        name = re.sub(r'^([0-9]+_[0-9]+)_[0-9]+_[0-9]+_[pr]+[0-2]$',r'\1', lineArray[0])
        tag = re.sub(r'^[0-9]+_[0-9]+_([0-9]+_[0-9]+_[pr]+[0-2])$',r'\1', lineArray[0])
        placementList = lineArray[1].replace('unclassified;', '').rsplit(';')
        if len(placementList) < 2:
            #print '_MothurOutFileParser: skip line', line
            return

        placement = placementList[-2]
        try:
            clade = int(re.sub('([0-9]+)\(.*', r'\1' , placement))
        except ValueError:
            return
        weight = float(re.sub('[0-9]+\(([0-9\.]+)\)', r'\1' , placement))

        entry = str(str(name) + '\t' + str(clade) + '\t' + str(weight) + '\t' + str(self.source) + '\t' + str(tag))
        if self.outBuffer.isEmpty():
            self.outBuffer.writeText(entry)
        else:
            self.outBuffer.writeText(str('\n' + entry)) 
Example #24
Source File: loader.py    From Att-ChemdNER with Apache License 2.0 5 votes vote down vote up
def augment_with_pretrained(dictionary, ext_emb_path, words):
#{{{
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `words` is None, we add every word that has a pretrained embedding
    to the dictionary, otherwise, we only add the words that are given by
    `words` (typically the words in the development and test sets.)
    """
    print 'Loading pretrained embeddings from %s...' % ext_emb_path
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(ext_emb_path) > 0
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if words is None:
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        for word in words:
            if any(x in pretrained for x in [
                word,
                word.lower(),
                re.sub('\d', '0', word.lower())
            ]) and word not in dictionary:
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word 
#}}} 
Example #25
Source File: rooter.py    From ToonRooter with MIT License 5 votes vote down vote up
def patch_uboot(self):
        port = self._port

        log.info("Patching U-Boot")
        port.reset_input_buffer()
        sleep(0.1)
        port.write("printenv\n")
        port.flush()
        add_misc_match = re.compile(r'^addmisc=(.+)$')
        add_misc_val = None

        sleep(0.5)

        lines = port.read_until("U-Boot>")
        log.debug(lines)
        for line in lines.split('\n'):
            line = line.strip()
            log.debug(line)
            match = add_misc_match.match(line)
            if match:
                add_misc_val = match.group(1)

        if add_misc_val is None:
            log.error("Could not find value for addmisc environment variable")
            return

        cmd = "setenv addmisc " + re.sub(r'([\$;])',r'\\\1', add_misc_val + " init=/bin/sh")
        port.write(cmd + "\n")
        port.flush()
        log.debug(port.read_until("U-Boot>"))
        port.write("run boot_nand\n")
        port.flush() 
Example #26
Source File: pps.py    From CAMISIM with Apache License 2.0 5 votes vote down vote up
def readPPSOutput(sequences, taxonomy, inputFastaIdsPPSFile, overwriteAllPlacements=False):
    """
        Reads the output file of PPS and for each sequence decides:
        if overwriteAllPlacements=True is, then the sequence is placed according to the PPS file regardless of its
        previous placement
        if overwriteAllPlacements=False then if a sequence is placed to a less specific rank, than PPS suggests then
        the sequence is placed according to the PPS file
    """

    infile = str(inputFastaIdsPPSFile + '.out')
    try:
        f = open(os.path.normpath(infile),'r')
    except Exception:
            print "Cannot open file:", infile
            raise
    else:
        #i = 0
        for line in f:
            line = common.noNewLine(line)
            if re.match(r'^[0-9]+_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$', line):
                scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+.*[^0-9]+[0-9]+[^0-9]*$',r'\1' ,line))
                contigId = int(re.sub(r'^[0-9]+_([0-9]+).*[^0-9]+[0-9]+[^0-9]*$',r'\1' ,line))
                ncbid = int(re.sub(r'^[0-9]+_[0-9]+.*[^0-9]+([0-9]+)[^0-9]*$',r'\1' ,line))
                weight = None # the weight is not yet defined !!!
                if ncbid != 1:
                    #print line, ":", scaffoldId, contigId, ncbid
                    taxPathDictPPS = taxonomy.getPathToRoot(ncbid)
                    if taxPathDictPPS.keys() >= 1:
                        taxPathDictCurrent = sequences.getSequence(contigId).getTaxonomyPath()
                        if taxPathDictCurrent == None:
                            sequences.setTaxonomyPath(contigId, scaffoldId, taxPathDictPPS, weight)#weight = None !!!
                            #i += 1
                        else:
                            if ((overwriteAllPlacements) or (taxPathDictPPS.keys() > taxPathDictCurrent.keys())):
                                sequences.setTaxonomyPathOverride(contigId, scaffoldId, taxPathDictPPS, weight)#weight = None !!!
                                #i += 1
        #print "placed seq by PPS:", i

    finally:
        f.close() 
Example #27
Source File: pps.py    From CAMISIM with Apache License 2.0 5 votes vote down vote up
def toRealNames(config, sequences):
    """
        Transforms a PPS file fileName.fas.PP.out that names sequences according to their ids to their real names.
    """
    outIdsPPSFile = str(config.get('inputIdsFastaFile') + '.PP.out')
    outNamesPPSFile = outIdsPPSFile + '.n'
    #os.path.normpath
    print outNamesPPSFile

    try:
        fr = open(os.path.normpath(outIdsPPSFile),'r')
        fw = open(os.path.normpath(outNamesPPSFile),'w')
    except Exception:
        print "Cannot open one of the files:", outIdsPPSFile, "or", outNamesPPSFile
        raise
    else:
        for line in fr:
            if re.match(r'^[0-9]+_[0-9]+[^0-9].*$', line):
                id = re.sub(r'^[0-9]+_([0-9]+)[^0-9].*$',r'\1' , line)
                rest = re.sub(r'^[0-9]+_[0-9]+([^0-9].*)$',r'\1' , line)
                seq = sequences.getSequence(int(id))
                fw.write(seq.name + rest) # seq.scaffold.name
            else:
                fw.write(line)
    finally:
        fr.close()
        fw.close() 
Example #28
Source File: analysis16s.py    From CAMISIM with Apache License 2.0 5 votes vote down vote up
def _setCandidatePlacement(self, sequences, taxonomy, predFileName, source):
        assignedIdList = []
        try:
            f = open(os.path.normpath(predFileName),'r')
        except Exception:
            print "Cannot open file:", predFileName
            raise
        else:
            for line in f:
                line = common.noNewLine(line)
                if re.match(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$', line):
                    scaffoldId = int(re.sub(r'^([0-9]+)_[0-9]+\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    contigId = int(re.sub(r'^[0-9]+_([0-9]+)\t[0-9]+\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    ncbid = int(re.sub(r'^[0-9]+_[0-9]+\t([0-9]+)\t[0-9\.]+\t[^\t]+$',r'\1' ,line))
                    weight = float(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t([0-9\.]+)\t[^\t]+$',r'\1' ,line))
                    tag = str(re.sub(r'^[0-9]+_[0-9]+\t[0-9]+\t[0-9\.]+\t([^\t]+)$',r'\1' ,line))
                    if ncbid != 1:
                        taxPathDict = taxonomy.getPathToRoot(ncbid)
                        if taxPathDict is not None and taxPathDict.keys() >= 1:
                            sequences.setCandidateTaxonomyPath(contigId, scaffoldId, taxPathDict, weight, source, tag)
                            assignedIdList.append(contigId)
                        else:
                            sys.stderr.write(str('No taxonomic path found for ncbid: ' + str(ncbid)))
        finally:
            f.close()

        return set(assignedIdList) 
Example #29
Source File: sequences.py    From CAMISIM with Apache License 2.0 5 votes vote down vote up
def replaceIdsWithNames(outputFileContigSubPattern, nameToIDsFile, targetFile, outFile):
    """
        @deprecated: NOT IMPLEMENTED YET!!!
        replace ids with names
        @param nameToIdsFile: file that contains lines: contigName tab contigID
        @param targetFile: file that contain in the first column scaffoldID_contigID which will be replaced by its name
        @param outFile: file that contain the first column in the form scaffoldID_contigID with the name
        (that can be modified by substitution defined in the config file .. according to outputFileContigSubPattern)
    """
    idToName = dir([])
    assert False, 'NOT IMPLEMENTED YET'
    #try:
    #    f = open(os.path.normpath(nameToIDsFile), 'r')
    #    for line in f:
    #        if re.match('^#', line):
    #            continue
    #        name = re.sub(outputFileContigSubPattern, r'\1' , noNewLine(re.sub(r'^([^ \t]+)\t[0-9]+$',r'\1', line)))
    #        id = int(noNewLine(re.sub(r'^[^ \t]+\t([0-9]+)$',r'\1', line)))
    #        idToName[id] = name
    #except Exception:
    #    print "Cannot create a file or write to it:", outFile
    #    raise
    #finally:
    #    f.close()

    #now: go through the targetFile and for each line do:
    #    extract contigID and the rest of the line ^[0-9]+_[0-9]+([^0-9].*)$
    #    write name + rest of the line + \n to the outFile !!!!!!!!!!



#compare two sequences according to their length
#def seqLenCmp(seq1, seq2):
#    return seq1.seqBp - seq2.seqBp 
Example #30
Source File: sequences.py    From CAMISIM with Apache License 2.0 5 votes vote down vote up
def _writeSequencesOrScaffolds(self, outFile, writeSeq=True, writeIds=True, outputFileContigSubPattern=None):
        try:
            f = open(os.path.normpath(outFile), 'w')
            k = 0
            if writeSeq:
                list = self.sequences  # write sequences
            else:
                list = self.scaffolds  # write scaffolds

            for seq in list:
                if seq.seqBp == 0:
                    continue

                if writeIds:
                    if writeSeq:
                        name = str(str(seq.scaffold.id) + '_' + str(seq.id))  # sequence
                    else:
                        name = str(seq.id)  # scaffold
                else:
                    if outputFileContigSubPattern is None:
                        name = seq.name
                    else:
                        name = re.sub(outputFileContigSubPattern, r'\1' , seq.name)
                if k == 0:
                    f.write('>' + name)
                    k += 1
                else:
                    f.write('\n>' + name)
                s = seq.getSeq()
                l = seq.seqBp
                i = 0
                range = self._fastaLineMaxChar
                while i < l:
                    j = i + range
                    f.write('\n' + s[i:j])
                    i += range
        except Exception:
            print "Cannot create a file or write to it:", outFile
            raise
        finally:
            f.close()