Python nltk.chunk Examples

The following are 6 code examples of the nltk.chunk module. You can go to the original project or source file by following the link above each example. You may also want to check out all the available functions/classes of the module nltk, or try the search function.
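nltk.chunk is a package rather than a single callable: the examples below rely mainly on its helpers such as RegexpParser, tree2conlltags and conllstr2tree. A minimal sketch of grammar-based chunking with the package (the grammar and the tagged sentence here are illustrative):

from nltk.chunk import RegexpParser
from nltk.chunk.util import tree2conlltags

# A toy grammar that chunks determiner/adjective/noun sequences into NPs.
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
chunker = RegexpParser(grammar)

sent = [("the", "DT"), ("quick", "JJ"), ("fox", "NN"), ("jumps", "VBZ")]
tree = chunker.parse(sent)    # an nltk.Tree containing an NP subtree
print(tree2conlltags(tree))   # [('the', 'DT', 'B-NP'), ('quick', 'JJ', 'I-NP'), ('fox', 'NN', 'I-NP'), ('jumps', 'VBZ', 'O')]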
Example #1
Source File: transformer.py    From atap with Apache License 2.0
def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the document
        represented as a list of its keyphrases.
        """
        for sent in sents:
            sent = self.normalize(sent)
            if not sent: continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(
                    chunks, lambda term: term[-1] != 'O'
                ) if key
            ]
            for phrase in phrases:
                yield phrase 
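The method above assumes a self.chunker and self.normalize defined elsewhere in the transformer. A standalone sketch of the same extraction logic, using an illustrative keyphrase grammar and an already-tagged sentence (both are assumptions, not the atap project's exact setup):

from itertools import groupby
from nltk.chunk import RegexpParser
from nltk.chunk.util import tree2conlltags

# Illustrative keyphrase grammar: noun phrases optionally linked by a preposition.
GRAMMAR = r"KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}"
chunker = RegexpParser(GRAMMAR)

tagged = [("effective", "JJ"), ("chunking", "NN"), ("of", "IN"),
          ("noun", "NN"), ("phrases", "NNS"), ("is", "VBZ"), ("useful", "JJ")]

chunks = tree2conlltags(chunker.parse(tagged))
phrases = [
    " ".join(word for word, pos, chunk in group).lower()
    for key, group in groupby(chunks, lambda term: term[-1] != "O")
    if key
]
print(phrases)  # e.g. ['effective chunking of noun phrases']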
Example #2
Source File: token_merger.py    From Parsivar with MIT License
def train_merger(self, train_file_path, test_split=0.1):
        print("Loading Data...")
        with open(train_file_path, "r", encoding='utf-8') as file:
            file_content = file.read()
        file_content = file_content.split("\n\n")

        data_list = []
        for block in file_content:
            tree = nltk.chunk.util.conllstr2tree(block, chunk_types=('NP',), root_label='S')
            if len(tree) > 0:
                data_list.append(tree)

        # train_sents, test_sents = train_test_split(data_list, test_size=test_split, random_state=91)
        train_sents = data_list
        test_sents = []

        print("Training the model ...")

        # Transform the trees into IOB-annotated sentences [(word, pos, chunk), ...]
        chunked_sents = [tree2conlltags(sent) for sent in train_sents]

        # Transform the triplets into pairs to make them compatible with the tagger interface [((word, pos), chunk), ...]
        def triplets2tagged_pairs(iob_sent):
            return [((word, pos), chunk) for word, pos, chunk in iob_sent]

        chunked_sents = [triplets2tagged_pairs(sent) for sent in chunked_sents]

        self.feature_detector = self.features
        self.tagger = ClassifierBasedTagger(
            train=chunked_sents,
            feature_detector=self.features)

        token_merger_model = self.tagger

        if len(test_sents) > 0:
            print("evaluating...")
            print(token_merger_model.evaluate(test_sents))

        return token_merger_model 
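conllstr2tree expects blocks of one token per line in word/POS/IOB-chunk columns, with blank lines separating sentences (Parsivar's own training data is Persian; English tokens are used here only for readability). A minimal sketch of the expected format:

from nltk.chunk.util import conllstr2tree

conll_block = '''he PRP B-NP
accepted VBD O
the DT B-NP
position NN I-NP
. . O'''

tree = conllstr2tree(conll_block, chunk_types=('NP',), root_label='S')
print(tree)  # (S (NP he/PRP) accepted/VBD (NP the/DT position/NN) ./.)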
Example #3
Source File: entity_recognizer_mod.py    From nlp-services with MIT License
def structure_ne(self, ne_tree):
        ne = []
        for subtree in ne_tree:
            if isinstance(subtree, Tree):  # subtree is a named-entity chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join([token for token, pos in subtree.leaves()])
                ne.append((ne_string, ne_label))
        return ne

    # Nltk Named Entity Recognizer 
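structure_ne expects the kind of tree produced by nltk.ne_chunk, where each named entity is a subtree labelled with its entity type and everything else remains a flat (token, pos) pair. A hedged usage sketch (the sentence and the printed entities are illustrative, and the required NLTK data packages must be downloaded first):

import nltk
from nltk import Tree

# requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words data packages
sentence = "Mark works at SingularityNET in Amsterdam."
tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
ne_tree = nltk.ne_chunk(tagged)

# Same logic as structure_ne: collect (entity string, entity label) pairs.
entities = []
for subtree in ne_tree:
    if isinstance(subtree, Tree):
        label = subtree.label()
        string = " ".join(token for token, pos in subtree.leaves())
        entities.append((string, label))
print(entities)  # e.g. [('Mark', 'PERSON'), ('SingularityNET', 'ORGANIZATION'), ('Amsterdam', 'GPE')]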
Example #4
Source File: do_benchmark.py    From PyRATA with Apache License 2.0
def measure_pattern_time_v2(iteration_number, size, pattern):
  gw      = execnet.makegateway("popen//python=python2.7")
  channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
from pattern.search import search
def measure_pattern_search():
  global pattern_search_result    # make measure_pattern_search able to modify the value
  pattern_search_result = search("%s", text_tree)
  #print ("clip.pattern len(result)="+str(len(pattern_search_result)))
from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')
def pattern_search_timeit():
  runtimes = [pattern_search_time.timeit(number=1) for i in range (0, %s)]
  average = sum(runtimes)/len(runtimes)
#  return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average),' ; min=', str(min(runtimes))])
  return [runtimes, average, min(runtimes), len(pattern_search_result)]
channel.send(pattern_search_timeit())
  """ % (size, pattern, iteration_number, iteration_number))
  channel.send([])
  return channel.receive() 
Example #5
Source File: do_benchmark.py    From PyRATA with Apache License 2.0
def write_pattern_v2(iteration_number, size, pattern):
  gw      = execnet.makegateway("popen//python=python2.7")
  channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
def backslash(string):
  for ch in [' ','?', '+', '*', '.', '[', ']', '~' , '{', '}', '|', '"', "'", ',', ':', '<', '>']:
    if ch in string:
      string=string.replace(ch,'_')
  return string  
from pattern.search import search
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
  print>>thefile, item
thefile.close()
channel.send([filename, size, len(pattern_search_result)])
  """ % (size, pattern, iteration_number, iteration_number))
  channel.send([])
  return channel.receive() 
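Both benchmark helpers above follow the same execnet pattern: spawn a Python 2.7 subprocess (pattern.en targets Python 2 here), remote-exec a source string built by % interpolation, and read the result back over the channel. A minimal sketch of that round trip, assuming a python2.7 interpreter is available on the host:

import execnet

gw = execnet.makegateway("popen//python=python2.7")  # spawn the Python 2.7 worker
channel = gw.remote_exec("""
import sys
channel.send(sys.version)   # the remote snippet pushes its result back on the channel
""")
print(channel.receive())     # block until the remote send arrives
gw.exit()                    # shut the worker down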
Example #6
Source File: nltk.py    From PyRATA with Apache License 2.0
def pyrata2conll (dictList, **kwargs):
  """ 
  See 3.1   Reading IOB Format and the CoNLL 2000 Corpus http://www.nltk.org/book/ch07.html
  
  can be used with
  nltk.chunk.conllstr2tree(text, chunk_types=['NP']).draw()
  """

  if 'raw' in kwargs.keys(): 
    rawFeatureName = kwargs['raw']
  if 'pos' in kwargs.keys(): 
    posFeatureName = kwargs['pos']   
  if 'chunk' in kwargs.keys(): 
    chunkFeatureName = kwargs['chunk']

  lines = []
  for e in dictList:
    lines.append(' '.join([e[rawFeatureName], e[posFeatureName], e[chunkFeatureName]]))

  return '\n'.join(lines)


# extend a given dictList 

# merge dictList

# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Run all the tests
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""