Python nltk.tree() Examples

The following are code examples for showing how to use nltk.tree(). They are from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.

Example 1
Project: razzy-spinner   Author: rafasashi   File: util.py    GNU General Public License v3.0 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 2
Project: razzy-spinner   Author: rafasashi   File: relextract.py    GNU General Public License v3.0 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 3
Project: robot-navigation   Author: ronaldahmed   File: tree_edit.py    MIT License 6 votes vote down vote up
def display(self):
        """
        Effectively, this is like an update function.  Clear the
        screen, draw it, then set the scrollable region to include
        everything.
        """
        
        self.c.delete(ALL)

        # call drawtree on the root of the tree
        right = self.drawtree(self.tree, PADDING)

        bbox = self.c.bbox(ALL)
        print bbox
        self.c['scrollregion']=(bbox[0]-PADDING, bbox[1]-50,
                                bbox[2]+PADDING, bbox[3]+50)
        self.c['width'] = min(bbox[2]-bbox[0]+(2*PADDING), 800)
        self.c['height'] = min(bbox[3]-bbox[1], 300) 
Example 4
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 5
Project: OpenBottle   Author: xiaozhuchacha   File: relextract.py    MIT License 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 6
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 7
Project: Health-Checker   Author: KriAga   File: util.py    MIT License 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 8
Project: Health-Checker   Author: KriAga   File: relextract.py    MIT License 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 9
Project: essay-rater-web   Author: alexandrablooky   File: essay_feature.py    MIT License 6 votes vote down vote up
def __fleschKincaidGradeLevel(self):
        """Compute the Flesch Kincaid Grade Level of the essay

        :return: 0.39 * mean_sentence_length + 11.8 * mean_syllable_length - 15.95
        """
        # if the number of sentences is 0, then we set the flesch_kincaid_grade_level2 to be -15.95
        if self.sentence_num == 0:
            self.essay_feature_dict['flesch_kincaid_grade_level'] = - 15.95
        else:
            total_syllables = 0
            total_words = 0
            for sentence in self.wordss:
                # words = sentence.split()
                total_words += len(sentence)
                for word in sentence:
                    total_syllables += self.__numOfSyllable(word)

            if total_words == 0:
                self.essay_feature_dict['mean_syllable_length'] = 0
            else:
                self.essay_feature_dict['mean_syllable_length'] = total_syllables / total_words
            flesch_kincaid_grade_level = 0.39 * self.essay_feature_dict['mean_sentence_length'] + 11.8 * self.essay_feature_dict['mean_syllable_length'] - 15.95
            self.essay_feature_dict['flesch_kincaid_grade_level'] = flesch_kincaid_grade_level

    # parse a sentence to syntax tree by corenlp parser 
Example 10
Project: text-cfg-parser   Author: Samurais   File: recursive_descent_parser_model.py    MIT License 6 votes vote down vote up
def _parse(self, remaining_text, tree, frontier):
        """Parsing function need to check all possibility that own investigating tree can have"""

        if len(remaining_text) == 0 and len(frontier) == 0:
            # Means that there are no nodes need to parsed and no nodes can be expanded or checked
            # => Our tree is completed
            # "Return" the tree
            yield tree
        elif len(frontier) == 0:
            # Mean that it still have word need to parsed. However, there are no nodes can be expanded or checked
            # Cannot expanded and checked => This tree is failed
            pass
        elif isinstance(tree[frontier[0]], Tree):
            # Mean that it still have node can be expanded and checked
            # So there will be 2 possibility:
            # - If considered node is a nonterminal => can be expanded
            for result in self._expand(remaining_text, tree, frontier):
                yield result
        else:
            # - If considered node is a terminal => can be checked
            for result in self._match(remaining_text, tree, frontier):
                yield result 
Example 11
Project: text-cfg-parser   Author: Samurais   File: recursive_descent_parser_model.py    MIT License 6 votes vote down vote up
def _production_to_tree(self, production):
        """This function try to convert a production to a Tree
            A tree has father node is Left Hand Side of the production,
            and children is a Right hand side of the production
            Example: S -> NP VP
            Left hand side :S
            Right hand side :NP, VP

            =>Return tree: (S (NP )(VP))
        """
        children = []
        for elt in production.rhs():
            if isinstance(elt, Nonterminal):
                # Check if each RHS is a nonterminal
                children.append(Tree(elt, []))
            else:
                # Check if each RHS is a terminal
                children.append(elt)
        return Tree(production.lhs(), children) 
Example 12
Project: FancyWord   Author: EastonLee   File: util.py    GNU General Public License v3.0 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.label()
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 13
Project: FancyWord   Author: EastonLee   File: relextract.py    GNU General Public License v3.0 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 14
Project: nltk-on-gae   Author: sivu22   File: util.py    Apache License 2.0 6 votes vote down vote up
def tree2conlltags(t):
    """
    Return a list of 3-tuples containing ``(word, tag, IOB-tag)``.
    Convert a tree to the CoNLL IOB tag format.

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: list(tuple)
    """

    tags = []
    for child in t:
        try:
            category = child.node
            prefix = "B-"
            for contents in child:
                if isinstance(contents, Tree):
                    raise ValueError("Tree is too deeply nested to be printed in CoNLL format")
                tags.append((contents[0], contents[1], prefix+category))
                prefix = "I-"
        except AttributeError:
            tags.append((child[0], child[1], "O"))
    return tags 
Example 15
Project: nltk-on-gae   Author: sivu22   File: viterbi.py    Apache License 2.0 6 votes vote down vote up
def _trace_production(self, production, p, span, width):
        """
        Print trace output indicating that a given production has been
        applied at a given location.

        :param production: The production that has been applied
        :type production: Production
        :param p: The probability of the tree produced by the production.
        :type p: float
        :param span: The span of the production
        :type span: tuple
        :rtype: None
        """

        str = '|' + '.' * span[0]
        str += '=' * (span[1] - span[0])
        str += '.' * (width - span[1]) + '| '
        str += '%s' % production
        if self._trace > 2: str = '%-40s %12.10f ' % (str, p)

        print(str) 
Example 16
Project: nltk-on-gae   Author: sivu22   File: relextract.py    Apache License 2.0 6 votes vote down vote up
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [doc.headline for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % (doc.docno, tree))



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG
############################################# 
Example 17
Project: NLIDB   Author: VV123   File: tagger.py    MIT License 6 votes vote down vote up
def treeLCAdepth(deptree, value_position, field_position):
    ''' Measure the depth of Lowest Common Ancestor (LCA) in a tree
        arguments --- deptree: dependency tree of a certain parse_query
                      value_position (int): the index of value in original query
                      field_position (int): the index of field in original query
        call function nltk.tree.treeposition_spanning_leaves(start, end), return the sequence of common ancestors
        return --- length of common ancestor sequence, the depth of LCA
    '''
    if value_position > field_position:
        start = field_position
        end = value_position
    else:
        start = value_position
        end = field_position
    treespan = deptree.treeposition_spanning_leaves(start, end)
    return len(treespan) 
Example 18
Project: razzy-spinner   Author: rafasashi   File: util.py    GNU General Public License v3.0 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 19
Project: razzy-spinner   Author: rafasashi   File: util.py    GNU General Public License v3.0 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree 
Example 20
Project: razzy-spinner   Author: rafasashi   File: chunked.py    GNU General Public License v3.0 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 21
Project: razzy-spinner   Author: rafasashi   File: relextract.py    GNU General Public License v3.0 5 votes vote down vote up
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). 

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 22
Project: robot-navigation   Author: ronaldahmed   File: tree_edit.py    MIT License 5 votes vote down vote up
def display_node(self, canvas, x, depth, font_dict, app_dict):
        """
        draws the node to the canvas.  Basically this can be done in one of two
        senarios.  If we already have the xloc and yloc of the node (i.e. it
        has already been drawn) then just draw the node there, ignoring the
        arguments passed to you.  If however, one is drawing the nodes anew
        (for example, for the first time) then we draw the node at the x
        position and at 50*the depth (which is basically a variable relating
        to what level in the tree it is at)
        """
        
        text_color = app_dict['text']

        if 1 < 0:
            self.label = label = Text(canvas, self.xloc, self.yloc,
                                      font_dict,
                                      text=self.node,
                                      fill=text_color,
                                      style='bold')
        else:
            shift = int(Y_SPACING*font_dict['ratio'])
            self.label = label = Text(canvas, x, depth*shift,
                                      font_dict,
                                      text=self.node,
                                      fill=text_color,
                                      style='bold')
            # now that it has been drawn, save the xloc and yloc
            self.xloc = x
            self.yloc = depth*shift

        """
        if not self.width:
            bbox = canvas.bbox(label.tag, self.dependancies())
            self.width = bbox[2]-bbox[0]
        """
        
        return label 
Example 23
Project: robot-navigation   Author: ronaldahmed   File: tree_edit.py    MIT License 5 votes vote down vote up
def move(self, canvas, dx, dy):
        """
        move the node and all of the members of the tree below it
        dx places in the x direction and dy in the y direction
        """
        # if it is collaped, just move the label, the triangle, and the
        # collapsed string
        if self.collapsed == 1:
            self.label.move(canvas, dx, dy)
            self.collapsed_string.move(canvas, dx, dy)
            canvas.move(self.triangle, dx, dy)

        # else, go through the children and recursively call move on them, then
        # move all the arrows the requisite amount as well.
        else: 
            for child in self.children:
                child.move(canvas, dx, dy)

            self.label.move(canvas, dx, dy)

            for arrow in self.arrow_list:
                canvas.move(arrow, dx, dy)

        # update the xloc and yloc of the node
        self.xloc += dx
        self.yloc += dy 
Example 24
Project: robot-navigation   Author: ronaldahmed   File: tree_edit.py    MIT License 5 votes vote down vote up
def convert_tree(tree):
    """
    this function will recursively go through a Tree and return a tree
    in Draw_Node/Draw_Leaf/Draw_Token form
    """
    if isinstance(tree, Tree) or isinstance(tree, TreeToken):
        root = Draw_Node(tree)
        for child in tree:
            root.children.append(convert_tree(child))
    elif isinstance(tree, Token):
        root = Draw_Token(tree)
    else: 
        root = Draw_Leaf(tree)  
 
    return root 
Example 25
Project: robot-navigation   Author: ronaldahmed   File: tree_edit.py    MIT License 5 votes vote down vote up
def drawtreetoken(tree):
    """
    display the given tree_token tree
    """
    
    w = Window(tree) 
Example 26
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 27
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree 
Example 28
Project: OpenBottle   Author: xiaozhuchacha   File: chunked.py    MIT License 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 29
Project: OpenBottle   Author: xiaozhuchacha   File: relextract.py    MIT License 5 votes vote down vote up
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). 

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 30
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 31
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree 
Example 32
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 5 votes vote down vote up
def tree2conllstr(t):
    """
    Return a multiline string where each line contains a word, tag and IOB tag.
    Convert a tree to the CoNLL IOB string format

    :param t: The tree to be converted.
    :type t: Tree
    :rtype: str
    """
    lines = [" ".join(token) for token in tree2conlltags(t)]
    return '\n'.join(lines)

### IEER 
Example 33
Project: OpenBottle   Author: xiaozhuchacha   File: chunked.py    MIT License 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 34
Project: OpenBottle   Author: xiaozhuchacha   File: relextract.py    MIT License 5 votes vote down vote up
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). 

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 35
Project: Health-Checker   Author: KriAga   File: util.py    MIT License 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 36
Project: Health-Checker   Author: KriAga   File: util.py    MIT License 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag {0!r}".format(chunktag))
    return tree 
Example 37
Project: Health-Checker   Author: KriAga   File: chunked.py    MIT License 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 38
Project: Health-Checker   Author: KriAga   File: relextract.py    MIT License 5 votes vote down vote up
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``).

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 39
Project: py-nltk-svo   Author: klintan   File: svo.py    MIT License 5 votes vote down vote up
def get_object(self, sub_tree):
        """
        Returns an Object with all attributes of an object
        """
        siblings = self.pred_verb_phrase_siblings
        Object = None
        for each_tree in sub_tree:
            if each_tree.label() in ["NP", "PP"]:
                sub_nodes = each_tree.subtrees()
                sub_nodes = [each for each in sub_nodes if each.pos()]

                for each in sub_nodes:
                    if each.label() in self.noun_types:
                        Object = each.leaves()
                        break
                break
            else:
                sub_nodes = each_tree.subtrees()
                sub_nodes = [each for each in sub_nodes if each.pos()]
                for each in sub_nodes:
                    if each.label() in self.adjective_types:
                        Object = each.leaves()
                        break
                # Get first noun in the tree
        self.pred_verb_phrase_siblings = None
        return {'object': Object} 
Example 40
Project: essay-rater-web   Author: alexandrablooky   File: essay_feature.py    MIT License 5 votes vote down vote up
def __syntaxTreesAnalysis(self):
        avg_tree_height = 0
        avg_leaf_depth = 0
        for syntax_tree in self.syntax_trees:
            # get height of each syntax tree 
            avg_tree_height += syntax_tree.height()

            # get depth of all leaves 
            n_leaves = len(syntax_tree.leaves())
            leavepos = set(syntax_tree.leaf_treeposition(n)
                           for n in range(n_leaves))
            leave_depth = []
            for pos in leavepos:
                leave_depth.append(len(pos))
            avg_depth = 0
            for pos in leave_depth:
                avg_depth += pos
            avg_leaf_depth += avg_depth
        if len(self.syntax_trees) == 0:
            avg_tree_height /= 0
            avg_leaf_depth /= 0
        else:
            avg_tree_height /= len(self.syntax_trees)
            avg_leaf_depth /= len(self.syntax_trees)
        self.essay_feature_dict['avg_tree_height'] = avg_tree_height
        self.essay_feature_dict['avg_leaf_depth']  = avg_leaf_depth

    # S: Simple declarative clause 简单陈述句
    # SBAR: Subordinate clause 从属子句
    
    # get clause statistics include length and number of clauses 
Example 41
Project: essay-rater-web   Author: alexandrablooky   File: essay_feature.py    MIT License 5 votes vote down vote up
def get_clause_statistic(self, sent_tree):
        sent_tree = nltk.tree.ParentedTree.fromstring(str(sent_tree))
        
        clause_level_list = ["S","SBAR","SBARQ","SINV","SQ"]
        num_clauses = 0
        length_clauses = 0
        sub_trees = []

        # break the tree into subtrees of clauses using
        # clause levels "S","SBAR","SBARQ","SINV","SQ"
        for sub_tree in reversed(list(sent_tree.subtrees())):
            if sub_tree.label() in clause_level_list:
                if sub_tree.parent().label() in clause_level_list:
                    continue

                if (len(sub_tree) == 1 and sub_tree.label() == "S" and sub_tree[0].label() == "VP"
                    and not sub_tree.parent().label() in clause_level_list):
                    continue

                sub_trees.append(sub_tree)
                del sent_tree[sub_tree.treeposition()]

        for t in sub_trees:
            num_clauses += 1
            length_clauses += len(t.leaves())
        return num_clauses, length_clauses

    # get paragraph and structure feature 
Example 42
Project: text-cfg-parser   Author: Samurais   File: recursive_descent_parser_model.py    MIT License 5 votes vote down vote up
def parse(self, sentence):
        """This function try to initialize all essential parameters for parser:
        1/ Investigating Tree: A tree that is estimated to find the most appropriate tree
        2/ Remaining list
        3/ Frontier : List of locations of nodes that can be expanded(if Nonterminal) or can be matched/checked(if Terminal)
        """
        start = self._grammar.start().symbol()
        initial_tree = Tree(start, [])
        frontier = [()]
        remaining_list = list(sentence)
        return self._parse(
            remaining_text=remaining_list,
            tree=initial_tree,
            frontier=frontier) 
Example 43
Project: text-cfg-parser   Author: Samurais   File: recursive_descent_parser_model.py    MIT License 5 votes vote down vote up
def _expand(self, remaining_text, tree, frontier):
        """This function try to replace a considered node with a  appropriate tree if possible
        Example: my simple production is
        S -> NP VP
        NP -> DT NN

        I assume that my tree is : (S (NP )(VP ))
        I saw that NP can be expanded by DT NN
        => I try to make my tree become : (S (NP (DT )(NN ))(VP ))

        """

        productions = self._grammar.productions()

        for production in productions:
            lhs = production.lhs().symbol()
            if lhs == str(tree[frontier[0]].label()):
                subtree = self._production_to_tree(production)
                if frontier[0] == ():
                    newtree = subtree
                else:
                    newtree = tree.copy()
                    newtree[frontier[0]] = subtree
                new_frontier = [frontier[0] +
                                (i,) for i in range(len(production.rhs()))]

                for result in self._parse(
                        remaining_text, newtree, new_frontier + frontier[1:]):
                    yield result 
Example 44
Project: text-cfg-parser   Author: Samurais   File: recursive_descent_parser_model.py    MIT License 5 votes vote down vote up
def _match(self, remaining_text, tree, frontier):
        """This function try to check whether a terminal node is correct"""
        tree_leaves = tree[frontier[0]]
        if len(remaining_text) > 0 and tree_leaves == remaining_text[0]:
            new_tree = tree.copy()
            new_tree[frontier[0]] = tree_leaves
            for result in self._parse(
                    remaining_text[1:], new_tree, frontier[1:]):
                yield result 
Example 45
Project: text-cfg-parser   Author: Samurais   File: sample.py    MIT License 5 votes vote down vote up
def test_nltk_cfg_qtype(self):
        print("test_nltk_cfg_qtype")
        gfile = os.path.join(
            curdir,
            os.path.pardir,
            "config",
            "grammar.question-type.cfg")
        question_grammar = nltk.data.load('file:%s' % gfile)

        def get_missing_words(grammar, tokens):
            """
            Find list of missing tokens not covered by grammar
            """
            missing = [tok for tok in tokens
                       if not grammar._lexical_index.get(tok)]
            return missing

        # sentence = "do i need code when i dep now ?"
        sentence = "what is your name"

        # check type
        sent = sentence.split()
        missing = get_missing_words(question_grammar, sent)
        target = []
        for x in sent:
            if x in missing:
                continue
            target.append(x)

        rd_parser = RecursiveDescentParser(question_grammar)
        result = []
        print("target: ", target)
        for tree in rd_parser.parse(target):
            result.append(x)
            print("Question Type\n", tree)

        if len(result) == 0:
            print("Not Question Type") 
Example 46
Project: text-cfg-parser   Author: Samurais   File: sample.py    MIT License 5 votes vote down vote up
def test_nltk_cfg_en(self):
        print("test_nltk_cfg_en")
        grammar = nltk.CFG.fromstring("""
         S -> NP VP
         VP -> V NP | V NP PP
         V -> "saw" | "ate"
         NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
         Det -> "a" | "an" | "the" | "my"
         N -> "dog" | "cat" | "cookie" | "park"
         PP -> P NP
         P -> "in" | "on" | "by" | "with"
         """)

        # Make your POS sentence into a list of tokens.
        sent = "Mary saw Bob".split()

        # Load the grammar into the RecursiveDescentParser.
        rd_parser = RecursiveDescentParser(grammar)

        result = []

        for i, tree in enumerate(rd_parser.parse(sent)):
            result.append(tree)
            save_tree_png(
                str(tree),
                os.path.join(
                    curdir,
                    os.path.pardir,
                    "tmp",
                    "nltk_cfg_en_%s.png" %
                    i))

        assert len(result) > 0, "Can not recognize CFG tree."

        print(result) 
Example 47
Project: text-cfg-parser   Author: Samurais   File: sample.py    MIT License 5 votes vote down vote up
def test_nltk_cfg_zh(self):
        print("test_nltk_cfg_zh")
        grammar = nltk.CFG.fromstring("""
         S -> N VP
         VP -> V NP | V NP | V N
         V -> "尊敬"
         N -> "我们" | "老师"
         """)

        # Make your POS sentence into a list of tokens.
        sent = "我们 尊敬 老师".split()

        # Load the grammar into the RecursiveDescentParser.
        rd_parser = RecursiveDescentParser(grammar)

        result = []

        for i, tree in enumerate(rd_parser.parse(sent)):
            result.append(tree)
            print("Tree [%s]: %s" % (i + 1, tree))

        assert len(result) > 0, "Can not recognize CFG tree."
        if len(result) == 1 :
            print("Draw tree with Display ...")
            result[0].draw()
        else:
            print("WARN: Get more then one trees.")

        print(result) 
Example 48
Project: FancyWord   Author: EastonLee   File: util.py    GNU General Public License v3.0 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 49
Project: FancyWord   Author: EastonLee   File: util.py    GNU General Public License v3.0 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   root_label='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(root_label, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].label() != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree 
Example 50
Project: FancyWord   Author: EastonLee   File: chunked.py    GNU General Public License v3.0 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 51
Project: FancyWord   Author: EastonLee   File: relextract.py    GNU General Public License v3.0 5 votes vote down vote up
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). 

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    semi_rels = []
    semi_rel = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            semi_rel[0].append(dtr)
        else:
            # dtr is a Tree
            semi_rel[1] = dtr
            semi_rels.append(semi_rel)
            semi_rel = [[], None]
    return semi_rels 
Example 52
Project: english-pddl-translator   Author: yu-peng   File: parseSentence.py    MIT License 5 votes vote down vote up
def parseEnglish(sent,parsertype):
            
    if parsertype == 3:
        print 'using Viterbi parser and customized grammar'
        tree = parser.parse(sent.split())
        return tree
    
    elif parsertype == 1:
        print 'using Stanford online parser'
        treeString = treeString = fetchStanfordOnline.getTreeString(sent)
        tree = Tree(treeString)
        return tree
    
    #treeview.draw_tree(tree) 
Example 53
Project: nltk-on-gae   Author: sivu22   File: util.py    Apache License 2.0 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 54
Project: nltk-on-gae   Author: sivu22   File: util.py    Apache License 2.0 5 votes vote down vote up
def conlltags2tree(sentence, chunk_types=('NP','PP','VP'),
                   top_node='S', strict=False):
    """
    Convert the CoNLL IOB format to a tree.
    """
    tree = Tree(top_node, [])
    for (word, postag, chunktag) in sentence:
        if chunktag is None:
            if strict:
                raise ValueError("Bad conll tag sequence")
            else:
                # Treat as O
                tree.append((word,postag))
        elif chunktag.startswith('B-'):
            tree.append(Tree(chunktag[2:], [(word,postag)]))
        elif chunktag.startswith('I-'):
            if (len(tree)==0 or not isinstance(tree[-1], Tree) or
                tree[-1].node != chunktag[2:]):
                if strict:
                    raise ValueError("Bad conll tag sequence")
                else:
                    # Treat as B-*
                    tree.append(Tree(chunktag[2:], [(word,postag)]))
            else:
                tree[-1].append((word,postag))
        elif chunktag == 'O':
            tree.append((word,postag))
        else:
            raise ValueError("Bad conll tag %r" % chunktag)
    return tree 
Example 55
Project: nltk-on-gae   Author: sivu22   File: viterbi.py    Apache License 2.0 5 votes vote down vote up
def parse(self, tokens):
        # Inherit docs from ParserI

        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # The most likely constituent table.  This table specifies the
        # most likely constituent for a given span and type.
        # Constituents can be either Trees or tokens.  For Trees,
        # the "type" is the Nonterminal for the tree's root node
        # value.  For Tokens, the "type" is the token's type.
        # The table is stored as a dictionary, since it is sparse.
        constituents = {}

        # Initialize the constituents dictionary with the words from
        # the text.
        if self._trace: print(('Inserting tokens into the most likely'+
                               ' constituents table...'))
        for index in range(len(tokens)):
            token = tokens[index]
            constituents[index,index+1,token] = token
            if self._trace > 1:
                self._trace_lexical_insertion(token, index, len(tokens))

        # Consider each span of length 1, 2, ..., n; and add any trees
        # that might cover that span to the constituents dictionary.
        for length in range(1, len(tokens)+1):
            if self._trace:
                print(('Finding the most likely constituents'+
                       ' spanning %d text elements...' % length))
            for start in range(len(tokens)-length+1):
                span = (start, start+length)
                self._add_constituents_spanning(span, constituents,
                                                tokens)

        # Return the tree that spans the entire text & have the right cat
        return constituents.get((0, len(tokens), self._grammar.start())) 
Example 56
Project: nltk-on-gae   Author: sivu22   File: viterbi.py    Apache License 2.0 5 votes vote down vote up
def _find_instantiations(self, span, constituents):
        """
        :return: a list of the production instantiations that cover a
            given span of the text.  A "production instantiation" is
            a tuple containing a production and a list of children,
            where the production's right hand side matches the list of
            children; and the children cover ``span``.  :rtype: list
            of ``pair`` of ``Production``, (list of
            (``ProbabilisticTree`` or token.

        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find production instantiations.  The span is
            specified as a pair of integers, where the first integer
            is the index of the first token that should be covered by
            the production instantiation; and the second integer is
            the index of the first token that should not be covered by
            the production instantiation.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        rv = []
        for production in self._grammar.productions():
            childlists = self._match_rhs(production.rhs(), span, constituents)

            for childlist in childlists:
                rv.append( (production, childlist) )
        return rv 
Example 57
Project: nltk-on-gae   Author: sivu22   File: viterbi.py    Apache License 2.0 5 votes vote down vote up
def _match_rhs(self, rhs, span, constituents):
        """
        :return: a set of all the lists of children that cover ``span``
            and that match ``rhs``.
        :rtype: list(list(ProbabilisticTree or token)

        :type rhs: list(Nonterminal or any)
        :param rhs: The list specifying what kinds of children need to
            cover ``span``.  Each nonterminal in ``rhs`` specifies
            that the corresponding child should be a tree whose node
            value is that nonterminal's symbol.  Each terminal in ``rhs``
            specifies that the corresponding child should be a token
            whose type is that terminal.
        :type span: tuple(int, int)
        :param span: The section of the text for which we are
            trying to find child lists.  The span is specified as a
            pair of integers, where the first integer is the index of
            the first token that should be covered by the child list;
            and the second integer is the index of the first token
            that should not be covered by the child list.
        :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree)
        :param constituents: The most likely constituents table.  This
            table records the most probable tree representation for
            any given span and node value.  See the module
            documentation for more information.
        """
        (start, end) = span

        # Base case
        if start >= end and rhs == (): return [[]]
        if start >= end or rhs == (): return []

        # Find everything that matches the 1st symbol of the RHS
        childlists = []
        for split in range(start, end+1):
            l=constituents.get((start,split,rhs[0]))
            if l is not None:
                rights = self._match_rhs(rhs[1:], (split,end), constituents)
                childlists += [[l]+r for r in rights]

        return childlists 
Example 58
Project: nltk-on-gae   Author: sivu22   File: knbc.py    Apache License 2.0 5 votes vote down vote up
def __init__(self, root, fileids, encoding=None, morphs2str=_morphs2str_default):
        """
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self.morphs2str = morphs2str 
Example 59
Project: nltk-on-gae   Author: sivu22   File: knbc.py    Apache License 2.0 5 votes vote down vote up
def demo():

    import nltk
    from nltk.corpus.util import LazyCorpusLoader

    root = nltk.data.find('corpora/knbc/corpus1')
    fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*")
               if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)]

    def _knbc_fileids_sort(x):
        cells = x.split('-')
        return (cells[0], int(cells[1]), int(cells[2]), int(cells[3]))

    knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader,
                            sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp')

    print(knbc.fileids()[:10])
    print(''.join( knbc.words()[:100] ))

    print('\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] ))

    knbc.morphs2str = lambda morphs: '/'.join(
        "%s(%s)"%(m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS'
        ).encode('utf-8')

    print('\n\n'.join( '%s' % tree for tree in knbc.parsed_sents()[:2] ))

    print('\n'.join( ' '.join("%s/%s"%(w[0], w[1].split(' ')[2]) for w in sent)
                     for sent in knbc.tagged_sents()[0:2] )) 
Example 60
Project: nltk-on-gae   Author: sivu22   File: chunked.py    Apache License 2.0 5 votes vote down vote up
def _untag(self, tree):
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError('expected child to be Tree or tuple')
        return tree 
Example 61
Project: nltk-on-gae   Author: sivu22   File: relextract.py    Apache License 2.0 5 votes vote down vote up
def mk_pairs(tree):
    """
    Group a chunk structure into a list of pairs of the form (list(str), ``Tree``)

    In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this
    identifies pairs whose first member is a list (possibly empty) of terminal
    strings, and whose second member is a ``Tree`` of the form (NE_label, terminals).

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """

    from nltk.tree import Tree

    pairs = []
    pair = [[], None]

    for dtr in tree:
        if not isinstance(dtr, Tree):
            pair[0].append(dtr)
        else:
            # dtr is a Tree
            pair[1] = dtr
            pairs.append(pair)
            pair = [[], None]
    return pairs 
Example 62
Project: FAParser   Author: baoy-nlp   File: ptb_utils.py    GNU General Public License v3.0 5 votes vote down vote up
def load_trees(path, strip_top=True, strip_spmrl_features=True):
    trees = []
    with open(path) as infile:
        for line in infile:
            trees.append(Tree.fromstring(line))

    if strip_top:
        for i, tree in enumerate(trees):
            if tree.label() in ("TOP", "ROOT"):
                assert len(tree) == 1
                trees[i] = tree[0]
    return trees 
Example 63
Project: IHP   Author: lasigeBioTM   File: ddi_kernels.py    MIT License 5 votes vote down vote up
def trainSVMTK(docs, pairs, dditype, model="svm_tk_classifier.model", excludesentences=[]):
    if os.path.isfile("ddi_models/" + model):
        os.remove("ddi_models/" + model)
    if os.path.isfile("ddi_models/" + model + ".txt"):
        os.remove("ddi_models/" + model + ".txt")

    #docs = use_external_data(docs, excludesentences, dditype)
    xerrors = 0
    with open("ddi_models/" + model + ".txt", 'w') as train:
        #print pairs
        for p in pairs:
            if dditype != "all" and pairs[p][relations.PAIR_DDI] and pairs[p][relations.PAIR_TYPE] != dditype:
                continue
            sid = relations.getSentenceID(p)
            if sid not in excludesentences:
                tree = pairs[p][relations.PAIR_DEP_TREE][:]
                #print "tree1:", tree
                #if len(docs[sid][ddi.SENTENCE_ENTITIES]) > 20:
                    #print line
                #    line = "1 |BT| (ROOT (NP (NN candidatedrug) (, ,) (NN candidatedrug))) |ET|"
                #    xerrors += 1
                #else:
                line = get_svm_train_line(tree, pairs[p], sid,
                                              docs[sid][relations.SENTENCE_PAIRS][p])
                if not pairs[p][relations.PAIR_DDI]:
                    line = '-' + line
                elif pairs[p][relations.PAIR_TYPE] != dditype and dditype != "all":
                    line = '-' + line

                train.write(line)
    #print "tree errors:", xerrors
    svmlightcall = Popen(["./svm-light-TK-1.2/svm-light-TK-1.2.1/svm_learn", "-t", "5",
                          "-L", "0.4", "-T", "2", "-S", "2", "-g", "10",
                          "-D", "0", "-C", "T", basedir + model + ".txt", basedir + model],
                         stdout = PIPE, stderr = PIPE)
    res  = svmlightcall.communicate()
    if not os.path.isfile("ddi_models/" + model):
        print "failed training model " + basedir + model
        print res
        sys.exit() 
Example 64
Project: luscan-devel   Author: blackye   File: util.py    GNU General Public License v2.0 5 votes vote down vote up
def accuracy(chunker, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunk information from the gold standard and rechunk it using
    the chunker, then compute the accuracy score.

    :type chunker: ChunkParserI
    :param chunker: The chunker being evaluated.
    :type gold: tree
    :param gold: The chunk structures to score the chunker on.
    :rtype: float
    """

    gold_tags = []
    test_tags = []
    for gold_tree in gold:
        test_tree = chunker.parse(gold_tree.flatten())
        gold_tags += tree2conlltags(gold_tree)
        test_tags += tree2conlltags(test_tree)

#    print 'GOLD:', gold_tags[:50]
#    print 'TEST:', test_tags[:50]
    return _accuracy(gold_tags, test_tags)


# Patched for increased performance by Yoav Goldberg <[email protected]>, 2006-01-13
#  -- statistics are evaluated only on demand, instead of at every sentence evaluation
#
# SB: use nltk.metrics for precision/recall scoring?
# 
Example 65
Project: razzy-spinner   Author: rafasashi   File: util.py    GNU General Public License v3.0 4 votes vote down vote up
def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
                source_tagset=None, target_tagset=None):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]

### CONLL 
Example 66
Project: razzy-spinner   Author: rafasashi   File: util.py    GNU General Public License v3.0 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 67
Project: robot-navigation   Author: ronaldahmed   File: tree_edit.py    MIT License 4 votes vote down vote up
def __init__(self, node):
        """
        __init__ -> takes in a node from the original tree and, depending on
        whether it is a Tree node or a Tree_Token node, adjusts the attributes
        accordingly
        """

        self.children = []
        self.width = None
        
        # self.collapsed keeps track of whether the node has been collapsed
        # into a single node.  To begin with, it should be uncollapsed
        self.collapsed = 0

        
        if isinstance(node,Tree):
            self.node = node.node()
            self.type = 'Normal'
        else:
            self.node = str((node.type()).node())
            self.type = 'Tree Token'
            # tree_tokens have these two extra pieces of data which we
            # need to keep track of
            self.loc = node.loc()
            #self.span = None
            

        # the label attribute will store the text that is drawn to canvas
        self.label = None

        # the location of the node.  This can be stored to facilitate
        # collapsing the node and (given time) methods like moving the node
        self.xloc = None
        self.yloc = None

        # a list for all the arrows going from the node to all the children
        self.arrow_list = []

        # the next two are only used if the node is collapsed
        # but its easier and more streamlined to include them
        # in this class than it would be to make another class
        self.collapsed_string = None
        self.triangle = None 
Example 68
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 4 votes vote down vote up
def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
                source_tagset=None, target_tagset=None):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]

### CONLL 
Example 69
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line {:d}'.format(lineno))
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 70
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License 4 votes vote down vote up
def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
                source_tagset=None, target_tagset=None):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]

### CONLL 
Example 71
Project: Health-Checker   Author: KriAga   File: util.py    MIT License 4 votes vote down vote up
def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/',
                source_tagset=None, target_tagset=None):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char {:d}'.format(match.start()))
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char {:d}'.format(match.start()))
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                word, tag = str2tuple(text, sep)
                if source_tagset and target_tagset:
                    tag = map_tag(source_tagset, target_tagset, tag)
                stack[-1].append((word, tag))

    if len(stack) != 1:
        raise ValueError('Expected ] at char {:d}'.format(len(s)))
    return stack[0]

### CONLL 
Example 72
Project: Health-Checker   Author: KriAga   File: util.py    MIT License 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line {:d}'.format(lineno))
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 73
Project: FancyWord   Author: EastonLee   File: util.py    GNU General Public License v3.0 4 votes vote down vote up
def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/'):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_label: The label to use for chunk nodes
    :type chunk_label: str
    :param root_label: The label to use for the root of the tree
    :type root_label: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(root_label, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_label, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]

### CONLL 
Example 74
Project: FancyWord   Author: EastonLee   File: util.py    GNU General Public License v3.0 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param root_label: The node label to use for the root.
    :type root_label: str
    :rtype: Tree
    """

    stack = [Tree(root_label, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].label()
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 75
Project: nltk-on-gae   Author: sivu22   File: util.py    Apache License 2.0 4 votes vote down vote up
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]

### CONLL 
Example 76
Project: nltk-on-gae   Author: sivu22   File: util.py    Apache License 2.0 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param top_node: The node label to use for the root.
    :type top_node: str
    :rtype: Tree
    """

    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0] 
Example 77
Project: IHP   Author: lasigeBioTM   File: ddi_kernels.py    MIT License 4 votes vote down vote up
def get_svm_train_line(tree, pair, sid):
    lmtzr = WordNetLemmatizer()
    e1id = compact_id(pair.eids[0])
    e2id = compact_id(pair.eids[1])
    tree = tree.replace(pair.entities[0].tokens[0].text, 'candidatedrug')
    tree = tree.replace(pair.entities[1].tokens[0].text, 'candidatedrug')
    #tree = tree.replace(sid.replace('.', '').replace('-', '') + 'e', 'otherdrug')
    sid2 = compact_id(sid) + 'e'
    # TODO: replace other entities
    #tree = rext.sub(sid2 + r'\d+', 'otherdrug', tree)
    #print "tree2:", tree
    if tree[0] != '(':
        tree = '(S (' + tree + ' NN))'
    #this depends on the version of nlkt
    ptree = Tree.fromstring(tree)
    #ptree = Tree.parse(tree)
    leaves = list(ptree.pos())
    lemmaleaves = []
    for t in leaves:
        pos = get_wordnet_pos(t[1])
        lemma = lmtzr.lemmatize(t[0].lower(), pos)
        lemmaleaves.append(lemma)
    #lemmaleaves = [ for t in leaves)]
    logging.debug("tree:" + tree)
    line = '1 '
    line += '|BT|'  + tree
    #bowline = '(BOW (' + ' *)('.join(lemmaleaves) + ' *)) '
    #ptree = Tree.parse(bowline)
    #ptree = ptree.pprint(indent=-1000)
    #bowline = ptree.replace('\n', ' ')
    #bowline = '|BT| ' + bowline
    #if not bowline.count("otherdrug") > 8:
    #    line += bowline
    #else:
        #print "problem with BOW!"
    #line += bowline
    line += '|ET| '
    
    #i = 1
    #for m in docsp[ddi.PAIR_SSM_VECTOR]:
    #    line += " %s:%s" % (i, m)
    #    i += 1
    #line += " 2:" + str()
    #line += " |EV|"
    line += '\n'
    return line 
Example 78
Project: IHP   Author: lasigeBioTM   File: ddi_kernels.py    MIT License 4 votes vote down vote up
def testSVMTK(sentence, pairs, pairs_list, model="svm_tk_classifier.model", tag=""):
    if os.path.isfile(basedir + tag + "svm_test_data.txt"):
            os.remove(basedir + tag + "svm_test_data.txt")
    if os.path.isfile(basedir + tag + "svm_test_output.txt"):
            os.remove(basedir + tag + "svm_test_output.txt")
    #docs = use_external_data(docs, excludesentences, dditype)
    #pidlist = pairs.keys()
    total = 0
    with open(temp_dir + tag + "svm_test_data.txt", 'w') as test:
        for pid in pairs:
            sid = pairs[pid].sid
            tree = sentence.parsetree
            
            #if len(docs[sid][ddi.SENTENCE_ENTITIES]) > 30:
                #print line
                #line = reparse_tree(line)
            #    line = "1 |BT| (ROOT (NP (NN candidatedrug) (, ,) (NN candidatedrug))) |ET|\n"
            #    xerrors += 1
            #else:
            line = get_svm_train_line(tree, pairs[pid], sid)
            line = '-' + line
            test.write(line)
            total += 1
    #print "tree errors:", xerrors, "total:", total
    svmtklightargs = ["./bin/svm-light-TK-1.2/svm-light-TK-1.2.1/svm_classify",
                          temp_dir + tag + "svm_test_data.txt",  basedir + model,
                          temp_dir + tag + "svm_test_output.txt"]
    svmlightcall = Popen(svmtklightargs, stdout=PIPE, stderr=PIPE)
    res  = svmlightcall.communicate()
    # logging.debug(res[0].split('\n')[-3:])
    #os.system(' '.join(svmtklightargs))
    if not os.path.isfile(temp_dir + tag + "svm_test_output.txt"):
        print "something went wrong with SVM-light-TK"
        print res
        sys.exit()
    with open(temp_dir + tag + "svm_test_output.txt", 'r') as out:
        lines = out.readlines()
    if len(lines) != len(pairs_list):
        print "check " + tag + "svm_test_output.txt! something is wrong"
        print res
        sys.exit()
    for p, pid in enumerate(pairs):
        score = float(lines[p])
        if float(score) < 0:
            pairs[pid].recognized_by[relations.SST_PRED] = -1
        else:
            pairs[pid].recognized_by[relations.SST_PRED] = 1
        logging.info("{} - {} SST: {}".format(pairs[pid].entities[0], pairs[pid].entities[0], score))
    return pairs 
Example 79
Project: luscan-devel   Author: blackye   File: util.py    GNU General Public License v2.0 4 votes vote down vote up
def tagstr2tree(s, chunk_node="NP", top_node="S", sep='/'):
    """
    Divide a string of bracketted tagged text into
    chunks and unchunked tokens, and produce a Tree.
    Chunks are marked by square brackets (``[...]``).  Words are
    delimited by whitespace, and each word should have the form
    ``text/tag``.  Words that do not contain a slash are
    assigned a ``tag`` of None.

    :param s: The string to be converted
    :type s: str
    :param chunk_node: The label to use for chunk nodes
    :type chunk_node: str
    :param top_node: The label to use for the root of the tree
    :type top_node: str
    :rtype: Tree
    """

    WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+')

    stack = [Tree(top_node, [])]
    for match in WORD_OR_BRACKET.finditer(s):
        text = match.group()
        if text[0] == '[':
            if len(stack) != 1:
                raise ValueError('Unexpected [ at char %d' % match.start())
            chunk = Tree(chunk_node, [])
            stack[-1].append(chunk)
            stack.append(chunk)
        elif text[0] == ']':
            if len(stack) != 2:
                raise ValueError('Unexpected ] at char %d' % match.start())
            stack.pop()
        else:
            if sep is None:
                stack[-1].append(text)
            else:
                stack[-1].append(str2tuple(text, sep))

    if len(stack) != 1:
        raise ValueError('Expected ] at char %d' % len(s))
    return stack[0]

### CONLL 
Example 80
Project: luscan-devel   Author: blackye   File: util.py    GNU General Public License v2.0 4 votes vote down vote up
def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), top_node="S"):
    """
    Return a chunk structure for a single sentence
    encoded in the given CONLL 2000 style string.
    This function converts a CoNLL IOB string into a tree.
    It uses the specified chunk types
    (defaults to NP, PP and VP), and creates a tree rooted at a node
    labeled S (by default).

    :param s: The CoNLL string to be converted.
    :type s: str
    :param chunk_types: The chunk types to be converted.
    :type chunk_types: tuple
    :param top_node: The node label to use for the root.
    :type top_node: str
    :rtype: Tree
    """

    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(s.split('\n')):
        if not line.strip(): continue

        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            raise ValueError, 'Error on line %d' % lineno
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2: stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]