Python nltk.tree() Examples

The following are 30 code examples for showing how to use nltk.tree(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.

You may check out the related API usage on the sidebar.

You may also want to check out all available functions/classes of the module nltk , or try the search function .

Example 1
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 2
Project: razzy-spinner   Author: rafasashi   File: relextract.py    License: GNU General Public License v3.0 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 3
Project: luscan-devel   Author: blackye   File: util.py    License: GNU General Public License v2.0 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.node
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError, "Tree is too deeply nested to be printed in CoNLL format"
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 4
Project: luscan-devel   Author: blackye   File: viterbi.py    License: GNU General Public License v2.0 6 votes vote down vote up
def _trace_production(self, production, p, span, width):
        """
        Print trace output indicating that a given production has been
        applied at a given location.

        :param production: The production that has been applied
        :type production: Production
        :param p: The probability of the tree produced by the production.
        :type p: float
        :param span: The span of the production
        :type span: tuple
        :rtype: None
        """

        str = '|' + '.' * span[0]
        str += '=' * (span[1] - span[0])
        str += '.' * (width - span[1]) + '| '
        str += '%s' % production
        if self._trace > 2: str = '%-40s %12.10f ' % (str, p)

        print str 
Example 5
Project: luscan-devel   Author: blackye   File: relextract.py    License: GNU General Public License v2.0 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print "IEER: First 20 Headlines"
    print "=" * 45

    trees = [doc.headline for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print
        print "%s:\n%s" % (doc.docno, tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 6
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: util.py    License: Creative Commons Attribution Share Alike 4.0 International 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError(
                        "Tree is too deeply nested to be printed in CoNLL format"
                    )
                tags.append((contents[0], contents[1], prefix + category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 7
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    ]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 8
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 9
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree 
Example 10
Project: razzy-spinner   Author: rafasashi   File: chunked.py    License: GNU General Public License v3.0 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 11
Project: razzy-spinner   Author: rafasashi   File: relextract.py    License: GNU General Public License v3.0 5 votes vote down vote up
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). 

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 12
Project: py-nltk-svo   Author: klintan   File: svo.py    License: MIT License 5 votes vote down vote up
def get_object(self, sub_tree):
        """
        Returns an Object with all attributes of an object
        """
        siblings = self.pred_verb_phrase_siblings
        Object = None
        for each_tree in sub_tree:
            if each_tree.label() in ["NP", "PP"]:
                sub_nodes = each_tree.subtrees()
                sub_nodes = [each for each in sub_nodes if each.pos()]

                for each in sub_nodes:
                    if each.label() in self.noun_types:
                        Object = each.leaves()
                        break
                break
            else:
                sub_nodes = each_tree.subtrees()
                sub_nodes = [each for each in sub_nodes if each.pos()]
                for each in sub_nodes:
                    if each.label() in self.adjective_types:
                        Object = each.leaves()
                        break
                # Get first noun in the tree
        self.pred_verb_phrase_siblings = None
        return {'object': Object} 
Example 13
Project: luscan-devel   Author: blackye   File: util.py    License: GNU General Public License v2.0 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 14
Project: luscan-devel   Author: blackye   File: util.py    License: GNU General Public License v2.0 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   top_node='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(top_node, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].node != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree 
Example 15
Project: luscan-devel   Author: blackye   File: viterbi.py    License: GNU General Public License v2.0 5 votes vote down vote up
def parse(self, tokens):
        # Inherit docs from ParserI

        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # The most likely constituent table.  This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens.  For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value.  For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text.
        if self._trace: print ('Inserting tokens into the most likely'+
                               ' constituents table...')
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index,index+1,token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        for length in range(1, len(tokens)+1):
            if self._trace:
                print ('Finding the most likely constituents'+
                       ' spanning %d text elements...' % length)
            for start in range(len(tokens)-length+1):
                span = (start, start+length)
                self._add_constituents_spanning(span, constituents,
                                                tokens)

        # Return the tree that spans the entire text & have the right cat
        return constituents.get((0, len(tokens), self._grammar.start())) 
Example 16
Project: luscan-devel   Author: blackye   File: viterbi.py    License: GNU General Public License v2.0 5 votes vote down vote up
def _find_instantiations(self, span, constituents):
        """
        :return: a list of the production instantiations that cover a
            given span of the text.  A "production instantiation" is
            a tuple containing a production and a list of children,
            where the production's right hand side matches the list of
            children; and the children cover ``span``.  :rtype: list
            of ``pair`` of ``Production``, (list of
            (``ProbabilisticTree`` or token.

        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find production instantiations.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be covered by
            the production instantiation; and the second integer is
            the index of the first token that should not be covered by
            the production instantiation.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        rv = []
        for production in self._grammar.productions():
            childlists = self._match_rhs(production.rhs(), span, constituents)

            for childlist in childlists:
                rv.append( (production, childlist) )
        return rv 
Example 17
Project: luscan-devel   Author: blackye   File: viterbi.py    License: GNU General Public License v2.0 5 votes vote down vote up
def _match_rhs(self, rhs, span, constituents):
        """
        :return: a set of all the lists of children that cover ``span``
            and that match ``rhs``.
        :rtype: list(list(ProbabilisticTree or token)

        :type rhs: list(Nonterminal or any)
        :param rhs: The list specifying what kinds of children need to
            cover ``span``.  Each nonterminal in ``rhs`` specifies
            that the corresponding child should be a tree whose node
            value is that nonterminal's symbol.  Each terminal in ``rhs``
            specifies that the corresponding child should be a token
            whose type is that terminal.
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find child lists.  The span is specified as a
            pair of integers, where the first integer is the index of
            the first token that should be covered by the child list;
            and the second integer is the index of the first token
            that should not be covered by the child list.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        (start, end) = span

        # Base case
        if start >= end and rhs == (): return [[]]
        if start >= end or rhs == (): return []

        # Find everything that matches the 1st symbol of the RHS
        childlists = []
        for split in range(start, end+1):
            l=constituents.get((start,split,rhs[0]))
            if l is not None:
                rights = self._match_rhs(rhs[1:], (split,end), constituents)
                childlists += [[l]+r for r in rights]

        return childlists 
Example 18
Project: luscan-devel   Author: blackye   File: knbc.py    License: GNU General Public License v2.0 5 votes vote down vote up
def __init__(self, root, fileids, encoding=None, morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str 
Example 19
Project: luscan-devel   Author: blackye   File: knbc.py    License: GNU General Public License v2.0 5 votes vote down vote up
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print knbc.fileids()[:10]
    print ''.join( knbc.words()[:100] )

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)"%(m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ).encode('utf-8')

    print '\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] )

    print '\n'.join( ' '.join("%s/%s"%(w[0], w[1].split(' ')[2]) for w in sent)
                     for sent in knbc.tagged_sents()[0:2] ) 
Example 20
Project: luscan-devel   Author: blackye   File: chunked.py    License: GNU General Public License v2.0 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 21
Project: luscan-devel   Author: blackye   File: relextract.py    License: GNU General Public License v2.0 5 votes vote down vote up
def mk_pairs(tree):
    """
    Group a chunk structure into a list of pairs of the form (list(str), ``Tree``)

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    pairs = []
    pair = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            pair[0].append(dtr)
        else:
            # dtr is a Tree
            pair[1] = dtr
            pairs.append(pair)
            pair = [[], None]
    return pairs 
Example 22
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: util.py    License: Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

    #    print 'GOLD:', gold_tags[:50]
    #    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <yoavg@cs.bgu.ac.il>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 23
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: util.py    License: Creative Commons Attribution Share Alike 4.0 International 5 votes vote down vote up
def conlltags2tree(
    sentence, chunk_types=('NP', 'PP', 'VP'), root_label='S', strict=False
):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word, postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word, postag)]))
        elif chunktag.startswith('I-'):
            if (
                len(tree) == 0
                or not isinstance(tree[-1], Tree)
                or tree[-1].label() != chunktag[2:]
            ):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word, postag)]))
            else:
                tree[-1].append((word, postag))
        elif chunktag == 'O':
            tree.append((word, postag))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree 
Example 24
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 25
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 26
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0 4 votes vote down vote up
def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
                source_tagset=None, target_tagset=None):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]

### CONLL 
Example 27
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 28
Project: luscan-devel   Author: blackye   File: util.py    License: GNU General Public License v2.0 4 votes vote down vote up
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]

### CONLL 
Example 29
Project: luscan-devel   Author: blackye   File: util.py    License: GNU General Public License v2.0 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param top_node: The node label to use for the root.
    :type top_node: str
    :rtype: Tree
    """

    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError, 'Error on line %d' % lineno
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 30
Project: V1EngineeringInc-Docs   Author: V1EngineeringInc   File: util.py    License: Creative Commons Attribution Share Alike 4.0 International 4 votes vote down vote up
def tagstr2tree(
    s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None
):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]


### CONLL