python source code of do

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# PyRATA
#
# Authors: 
#         Nicolas Hernandez <nicolas.hernandez@gmail.com>
#         Guan Gui 2014-08-10 13:20:03 https://www.guiguan.net/a-beautiful-linear-time-python-regex-matcher-via-nfa
# URL: 
#         https://github.com/nicolashernandez/PyRATA/
#
#
# Copyright 2017 Nicolas Hernandez
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License. 
#
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

"""
part-of-speech (POS) patterns to find and extract NPs

compare the time performance of pyrata with

pattern
http://www.clips.ua.ac.be/pages/pattern-search
The search() function takes a string (e.g., a word or a sequence of words) and returns a list of non-overlapping matches in the given sentence. 
The match() function returns the first match, or None.

nltk.chunk
http://www.nltk.org/book/ch07.html
http://www.nltk.org/howto/chunk.html
http://www.nltk.org/_modules/nltk/chunk.html

spacy
https://spacy.io/

textblob
http://rwet.decontextualize.com/book/textblob/

re

Reference
=========

Justeson, J., & Katz, S. (1995). Technical terminology: Some linguistic properties and an algorithm for identification in text. Natural Language Engineering, 1(1), 9-27.
https://brenocon.com/JustesonKatz1995.pdf


"""

import logging
from timeit import Timer

import nltk
from nltk.corpus import brown
import nltk.chunk
from nltk.chunk.regexp import *




import pyrata.re as pyrata_re




# time pattern v1
# https://stackoverflow.com/questions/27863832/calling-python-2-script-from-python-3
# ---------------------------------------------------------
import subprocess
def measure_pattern_time_v1(iteration_number, size, pattern):
  """Time a pattern search by running the helper script in a child process.

  Launches ``more/measure_pattern_time_v1.py`` with the ``python`` executable
  and returns whatever the script writes to its standard output.

  Args:
    iteration_number: forwarded to the script as its first argument.
    size: forwarded to the script as its second argument.
    pattern: forwarded to the script as its third argument, kept as ONE
      argument even when it contains spaces.

  Returns:
    The raw data captured from the child's standard output.
  """
  # Build argv explicitly: splitting a formatted command line on whitespace
  # would break a pattern such as "DT? JJ|NN?+ NN" into several arguments.
  command = [
    'python',
    'more/measure_pattern_time_v1.py',
    str(iteration_number),
    str(size),
    str(pattern),
  ]
  process = subprocess.Popen(command, stdout=subprocess.PIPE)
  output, _ = process.communicate()  # only stdout is piped
  return output


# time pattern v2
# https://stackoverflow.com/questions/27863832/calling-python-2-script-from-python-3
# ---------------------------------------------------------
import execnet

def measure_pattern_time_v2(iteration_number, size, pattern):
  """Time ``pattern.search.search`` inside a python2.7 interpreter via execnet.

  The remote code joins the first `size` Brown words into a text, parses it
  with ``pattern.en.parsetree`` and times `iteration_number` single runs of
  ``search(pattern, text_tree)``.

  Args:
    iteration_number: number of single-run timings collected remotely.
    size: number of leading Brown corpus words to analyse.
    pattern: search pattern; it is pasted between double quotes in the
      remote source, so it must not itself contain a double quote.

  Returns:
    The list sent back by the remote code:
    ``[runtimes, average, min(runtimes), len(search_result)]``.
  """
  gw      = execnet.makegateway("popen//python=python2.7")
  try:
    channel = gw.remote_exec("""
from nltk.corpus import brown
words = brown.words()[:%s]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
from pattern.search import search
def measure_pattern_search():
  global pattern_search_result    #Make measure_me able to modify the value
  pattern_search_result = search("%s", text_tree)
  #print ("clip.pattern len(result)="+str(len(pattern_search_result)))
from timeit import Timer
pattern_search_time = Timer(measure_pattern_search)
#print ('pattern_search_time')
def pattern_search_timeit():
  runtimes = [pattern_search_time.timeit(number=1) for i in range (0, %s)]
  average = sum(runtimes)/len(runtimes)
#  return ''.join(['timit: #runs=', str(%s), ' ; average=', str(average),' ; min=', str(min(runtimes))])
  return [runtimes, average, min(runtimes), len(pattern_search_result)]
channel.send(pattern_search_timeit())
  """ % (size, pattern, iteration_number, iteration_number))
    # NOTE(review): the remote code never receives anything, so this send
    # looks unnecessary; kept to preserve the original message exchange.
    channel.send([])
    return channel.receive()
  finally:
    # Shut the python2.7 gateway down so repeated calls do not pile up
    # child interpreters.
    gw.exit()



def measure_time (Function, iteration_number):
  """Collect single-execution timings of a callable and summarise them.

  Args:
    Function: statement or zero-argument callable accepted by timeit.Timer.
    iteration_number: how many timings to collect (one execution each).

  Returns:
    A tuple ``(runtimes, average, fastest)`` where ``runtimes`` is the list
    of measured durations in seconds.

  Raises:
    ZeroDivisionError: when no timing is collected (iteration_number < 1).
  """
  # repeat(n, number=1) performs n independent timeit(number=1) measurements.
  samples = Timer(Function).repeat(repeat=iteration_number, number=1)
  mean_runtime = sum(samples) / len(samples)
  fastest = min(samples)
  return samples, mean_runtime, fastest


def nltk_regex_parser():
  """Timed callback: parse the tagged tokens with the current regex parser.

  Reads the module globals ``regex_parser`` and ``pos_tags``, stores the
  parse in the global ``nltk_regex_parser_result`` and prints its length.
  """
  global nltk_regex_parser_result
  parsed = regex_parser.parse(pos_tags)
  nltk_regex_parser_result = parsed
  print(len(parsed))

def nltk_regex_chunker_parser():
  """Timed callback: parse the tagged tokens with the current chunk parser.

  Reads the module globals ``chunk_parser`` and ``pos_tags``, stores the
  parse in the global ``nltk_regex_chunk_parser_result`` and prints its
  length.
  """
  global nltk_regex_chunk_parser_result
  chunked = chunk_parser.parse(pos_tags)
  nltk_regex_chunk_parser_result = chunked
  print(len(chunked))


def spacy_rule_based_matcher():
  """Timed callback: apply the current matcher to the current document.

  Calls the module global ``matcher`` on the module global ``doc``, stores
  the matches in the global ``spacy_rule_based_matcher_result`` and prints
  how many there are.
  """
  global spacy_rule_based_matcher_result
  matches = matcher(doc)
  spacy_rule_based_matcher_result = matches
  print(len(matches))


def measure_pyrata_findall():
  """Timed callback: run pyrata's findall with the current grammar and data.

  Reads the module globals ``pyrata_grammar`` and ``dictlist``, stores the
  matches in the global ``pyrata_findall_result`` and prints their count.
  """
  global pyrata_findall_result
  found = pyrata_re.findall(pyrata_grammar, dictlist)
  pyrata_findall_result = found
  print(len(found))

def measure_nfa_pyrata_findall():
  """Timed callback: run pyrata's findall and keep the result separately.

  Makes the same ``pyrata_re.findall(pyrata_grammar, dictlist)`` call as
  measure_pyrata_findall, but stores the matches in the global
  ``nfa_pyrata_findall_result`` before printing their count.
  """
  global nfa_pyrata_findall_result
  found = pyrata_re.findall(pyrata_grammar, dictlist)
  nfa_pyrata_findall_result = found
  print(len(found))

def time_noun_phrase_recognizers(size, analysers):
  """Time each selected noun-phrase recognizer on the first `size` Brown words.

  For every analyzer named in `analysers`, each of its POS-pattern grammars
  is timed (through measure_pattern_time_v2 for clips.pattern, through
  measure_time and the matching timed callback otherwise) and one
  tab-separated row is printed:
  ``analyzer, grammar, average time, min time, result length``.

  Args:
    size: number of leading tokens of ``brown.words()`` to analyse.
    analysers: container of analyzer names to run; the names tested are
      'clips.pattern', 'nltk_regex_parser', 'nltk_regex_chunk_parser',
      'spaCy' and 'pyrata'.

  Side effects:
    Reads the module-level ``iteration_number`` (not defined in the visible
    part of this file) and rebinds the module globals read by the timed
    callbacks: pos_tags, regex_parser, chunk_parser, matcher, doc, dictlist
    and pyrata_grammar.
  """
  global pos_tags, regex_parser, chunk_parser, matcher, doc, dictlist, pyrata_grammar

  print('Measuring time performance on # {} words over # {} iterations for recognizing Noun Phrases'.format(size, iteration_number))
  print('analyzer_name,\tpattern_grammar,\taveragetime,\tmintime')
  row = '{}\t{}\t{}\t{}\t{}'

  # ----------------------------------------------------
  # pattern (runs in a python2.7 interpreter, see measure_pattern_time_v2)
  # http://www.clips.ua.ac.be/pages/pattern-search
  # | ADJP|ADVP Separator for different options.
  # * JJ* Used as a wildcard character.
  # ? JJ? Used as a suffix, constraint is optional.
  # + RB|JJ+ or JJ?+ or *+  Used as a suffix, constraint can span multiple words.
  # ----------------------------------------------------
  analyzer_name = 'clips.pattern'
  if analyzer_name in analysers:
    for pattern_grammar in ('DT? JJ?+ NN+', 'DT? JJ|NN?+ NN|NNS', 'DT? JJ|NN?+ NN|NNS+'):
      _, averagetime, mintime, len_matches = measure_pattern_time_v2(iteration_number, size, pattern_grammar)
      print(row.format(analyzer_name, pattern_grammar, averagetime, mintime, len_matches))

  # ----------------------------------------------------
  # data: the first `size` Brown tokens, POS-tagged once for all analyzers
  # ----------------------------------------------------
  tokens = brown.words()[:size]
  pos_tags = nltk.pos_tag(tokens)

  # The result lengths below are read from the module-level variables written
  # by the timed callbacks. Assigning same-named locals here would shadow
  # them and always report 0.

  # ----------------------------------------------------
  # nltk chunker regexparser
  # ----------------------------------------------------
  analyzer_name = 'nltk_regex_parser'
  if analyzer_name in analysers:
    for nltk_grammar in ("NP: {<DT>?<JJ>*<NN>+}",
                         "NP: {<DT>?<JJ|NN>*<NN|NNS>}",
                         "NP: {<DT>?<JJ|NN>*<NN.*>}"):
      regex_parser = nltk.RegexpParser(nltk_grammar)
      _, averagetime, mintime = measure_time(nltk_regex_parser, iteration_number)
      print(row.format(analyzer_name, nltk_grammar, averagetime, mintime, len(nltk_regex_parser_result)))

  # ----------------------------------------------------
  # nltk chunker regex_chunker_parser
  # ----------------------------------------------------
  analyzer_name = 'nltk_regex_chunk_parser'
  if analyzer_name in analysers:
    for nltk_regex_chunk_grammar in ("<DT>?<JJ>*<NN>+",
                                     "<DT>?<JJ|NN>*<NN|NNS>",
                                     "<DT>?<JJ|NN>*<NN.*>"):
      chunk_rule = ChunkRule(nltk_regex_chunk_grammar, "Noun Phrase")
      chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
      _, averagetime, mintime = measure_time(nltk_regex_chunker_parser, iteration_number)
      print(row.format(analyzer_name, nltk_regex_chunk_grammar, averagetime, mintime, len(nltk_regex_chunk_parser_result)))

  # ----------------------------------------------------
  # spacy v2.0.
  # https://spacy.io/usage/v2#features-matcher
  # ----------------------------------------------------
  analyzer_name = 'spaCy'
  if analyzer_name in analysers:
    # Imported only when requested so the other analyzers run without spaCy.
    import spacy
    from spacy.matcher import Matcher
    from spacy.attrs import POS

    text = ' '.join(tokens)
    print("Debug: done - text = ' '.join(tokens) ")

    nlp = spacy.load('en',  disable=['ner', 'parser', 'textcat'])
    spacy_grammar = [{POS: "DET", 'OP':"?"}, {POS: "ADJ", 'OP':"*"}, {POS: "NOUN", 'OP':"+"}]
    matcher = Matcher(nlp.vocab)
    matcher.add("NounPhrase", None, spacy_grammar)
    doc = nlp(text)
    print("Debug: done - doc = nlp(text)")

    _, averagetime, mintime = measure_time(spacy_rule_based_matcher, iteration_number)
    print(row.format(analyzer_name, spacy_grammar, averagetime, mintime, len(spacy_rule_based_matcher_result)))

  # ----------------------------------------------------
  # pyrata
  # ----------------------------------------------------
  analyzer_name = 'pyrata'
  if analyzer_name in analysers:
    dictlist = [{'raw':w, 'pos':p} for (w, p) in pos_tags]
    for pyrata_grammar in ('pos="DT"? pos="JJ"* pos="NN"+',
                           'pos="DT"? [pos="NN" | pos="JJ"]* [pos="NN" | pos="NNS"]',
                           'pos="DT"? [pos~"NN|JJ"]* pos~"NN.*"'):
      _, averagetime, mintime = measure_time(measure_nfa_pyrata_findall, iteration_number)
      print(row.format(analyzer_name, pyrata_grammar, averagetime, mintime, len(nfa_pyrata_findall_result)))




# output pattern v2
# https://stackoverflow.com/questions/27863832/calling-python-2-script-from-python-3
# ---------------------------------------------------------

def write_pattern_v2(iteration_number, size, pattern):
  """Run a clips.pattern search in python2.7 and dump its matches to a file.

  The remote code joins the first `size` Brown words into a text, parses it
  with ``pattern.en.parsetree``, searches it for `pattern` and writes one
  match per line to a ``/tmp/benchmark_clips.pattern_...`` file.

  Args:
    iteration_number: unused; kept so the signature mirrors
      measure_pattern_time_v2.
    size: number of leading Brown corpus words to analyse.
    pattern: search pattern; it is pasted between double quotes in the
      remote source, so it must not itself contain a double quote.

  Returns:
    The list sent back by the remote code:
    ``[filename, size, number_of_matches]``.
  """
  gw      = execnet.makegateway("popen//python=python2.7")
  try:
    # The template has exactly two placeholders (size, pattern). The remote
    # code defines analyzer_name itself and no longer calls a function that
    # does not exist on the remote side.
    channel = gw.remote_exec("""
from nltk.corpus import brown
size = %s
words = brown.words()[:size]
text = ' '.join(words)
from pattern.en import parsetree
text_tree = parsetree(text,
 tokenize = True,         # Split punctuation marks from words?
     tags = True,         # Parse part-of-speech tags? (NN, JJ, ...)
   chunks = False,         # Parse chunks? (NP, VP, PNP, ...)
relations = False,        # Parse chunk relations? (-SBJ, -OBJ, ...)
  lemmata = False,        # Parse lemmata? (ate => eat)
 encoding = 'utf-8',       # Input string encoding.
   tagset = None)         # Penn Treebank II (default) or UNIVERSAL.
def backslash(string):
  for ch in [' ','?', '+', '*', '.', '[', ']', '~' , '{', '}', '|', '"', "'", ',', ':', '<', '>']:
    if ch in string:
      string=string.replace(ch,'_')
  return string  
from pattern.search import search
analyzer_name = 'clips.pattern'
pattern = "%s"
pattern_search_result = search(pattern, text_tree)
filename = '/tmp/benchmark_'+analyzer_name+'_'+str(size)+"_"+str(len(pattern_search_result))+'_'+backslash(pattern)
thefile = open(filename, 'w')
for item in pattern_search_result:
  print>>thefile, item    # Python 2 print-to-file syntax (runs under python2.7)
thefile.close()
channel.send([filename, size, len(pattern_search_result)])
  """ % (size, pattern))
    # NOTE(review): the remote code never receives anything, so this send
    # looks unnecessary; kept to preserve the original message exchange.
    channel.send([])
    return channel.receive()
  finally:
    # Shut the python2.7 gateway down so repeated calls do not pile up
    # child interpreters.
    gw.exit()

def _grammar_to_filename_part(grammar):
  """Return str(grammar) with characters awkward in file names replaced by '_'."""
  awkward = ' ?+*.[]~{}|"\',:<>'
  return str(grammar).translate(str.maketrans(awkward, '_' * len(awkward)))


def _benchmark_filename(analyzer_name, size, result_len, grammar):
  """Build the /tmp path encoding analyzer, corpus size, result length and grammar."""
  return ('/tmp/benchmark_' + analyzer_name + '_' + str(size) + '_'
          + str(result_len) + '_' + _grammar_to_filename_part(grammar))


def _write_text(filename, text):
  """Write `text` to `filename`, closing the file even if the write fails."""
  with open(filename, 'w') as handle:
    handle.write(text)


def _noun_phrase_leaves(tree):
  """Return str(leaves) of every subtree of `tree` that is labelled "NP"."""
  return [str(subtree.leaves()) for subtree in tree.subtrees() if subtree.label() == "NP"]


def output_noun_phrase_recognizers(size):
  """Run every noun-phrase recognizer once and dump its matches to /tmp.

  For each analyzer (clips.pattern, nltk_regex_parser,
  nltk_regex_chunk_parser, spaCy, pyrata) and each of its grammars, the
  matches found in the first `size` Brown words are written one per line to
  a ``/tmp/benchmark_<analyzer>_<size>_<len(result)>_<grammar>`` file and a
  tab-separated row ``analyzer, grammar, number of matches, filename`` is
  printed.

  Args:
    size: number of leading tokens of ``brown.words()`` to analyse.

  Side effects:
    Reads the module-level ``iteration_number`` (not defined in the visible
    part of this file) and rebinds the module globals grammar, pos_tags,
    regex_parser, chunk_parser, matcher, doc, dictlist and pyrata_grammar.
  """
  global grammar, pos_tags, regex_parser, chunk_parser, matcher, doc, dictlist, pyrata_grammar

  print('Measuring time performance on # {} words over # {} iterations for recognizing Noun Phrases'.format(size, iteration_number))
  # Column header matching the rows actually printed below.
  print('analyzer_name,\tpattern_grammar,\tresult_size,\tfilename')
  row = '{}\t{}\t{}\t{}'

  # ----------------------------------------------------
  # pattern (runs in a python2.7 interpreter, see write_pattern_v2)
  # http://www.clips.ua.ac.be/pages/pattern-search
  # ----------------------------------------------------
  analyzer_name = 'clips.pattern'
  for pattern_grammar in ('DT? JJ?+ NN+', 'DT? JJ|NN?+ NN|NNS', 'DT? JJ|NN?+ NN|NNS+'):
    filename, _, result_size = write_pattern_v2(iteration_number, size, pattern_grammar)
    print(row.format(analyzer_name, pattern_grammar, result_size, filename))

  # ----------------------------------------------------
  # data: the first `size` Brown tokens, POS-tagged once for all analyzers
  # ----------------------------------------------------
  tokens = brown.words()[:size]
  pos_tags = nltk.pos_tag(tokens)

  # ----------------------------------------------------
  # nltk chunker regexparser
  # ----------------------------------------------------
  analyzer_name = 'nltk_regex_parser'
  for grammar in ("NP: {<DT>?<JJ>*<NN>+}",
                  "NP: {<DT>?<JJ|NN>*<NN|NNS>}",
                  "NP: {<DT>?<JJ|NN>*<NN.*>}"):
    regex_parser = nltk.RegexpParser(grammar)
    result = regex_parser.parse(pos_tags)
    result_list = _noun_phrase_leaves(result)
    filename = _benchmark_filename(analyzer_name, size, len(result), grammar)
    _write_text(filename, "\n".join(result_list))
    print(row.format(analyzer_name, grammar, len(result_list), filename))

  # ----------------------------------------------------
  # nltk chunker regex_chunker_parser
  # ----------------------------------------------------
  analyzer_name = 'nltk_regex_chunk_parser'
  for grammar in ("<DT>?<JJ>*<NN>+",
                  "<DT>?<JJ|NN>*<NN|NNS>",
                  "<DT>?<JJ|NN>*<NN.*>"):
    chunk_rule = ChunkRule(grammar, "Noun Phrase")
    chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP')
    result = chunk_parser.parse(pos_tags)
    result_list = _noun_phrase_leaves(result)
    filename = _benchmark_filename(analyzer_name, size, len(result), grammar)
    _write_text(filename, "\n".join(result_list))
    print(row.format(analyzer_name, grammar, len(result_list), filename))

  # ----------------------------------------------------
  # spacy v2.0.
  # https://spacy.io/usage/v2#features-matcher
  # https://spacy.io/api/matcher
  # ----------------------------------------------------
  import spacy
  from spacy.matcher import Matcher
  from spacy.attrs import POS

  text = ' '.join(tokens)
  print("Debug: done - text = ' '.join(tokens) ")

  nlp = spacy.load('en',  disable=['ner', 'parser', 'textcat'])
  analyzer_name = 'spaCy'
  grammar = [{POS: "DET", 'OP':"?"}, {POS: "ADJ", 'OP':"*"}, {POS: "NOUN", 'OP':"+"}]
  matcher = Matcher(nlp.vocab)
  matcher.add("NounPhrase", None, grammar)
  doc = nlp(text)
  print('Debug: done - spacy doc = nlp(text) ')

  result = matcher(doc)
  # Each match is (id, start, end); keep the text of the matched span.
  result_list = [doc[start:end].text for _, start, end in result]
  # The grammar is a list here; it is stringified and sanitised like the others.
  filename = _benchmark_filename(analyzer_name, size, len(result), grammar)
  _write_text(filename, "\n".join(result_list))
  print(row.format(analyzer_name, str(grammar), len(result_list), filename))

  # ----------------------------------------------------
  # pyrata
  # ----------------------------------------------------
  analyzer_name = 'pyrata'
  dictlist = [{'raw':w, 'pos':p} for (w, p) in pos_tags]
  for grammar in ('pos="DT"? pos="JJ"* pos="NN"+',
                  'pos="DT"? [pos="NN" | pos="JJ"]* [pos="NN" | pos="NNS"]',
                  'pos="DT"? [pos~"NN|JJ"]* pos~"NN.*"'):
    # Search with the grammar being reported, not a stale module-level one.
    pyrata_grammar = grammar
    result = pyrata_re.findall(pyrata_grammar, dictlist)
    filename = _benchmark_filename(analyzer_name, size, len(result), grammar)
    # One match per line, each line newline-terminated.
    _write_text(filename, ''.join('{}\n'.format(item) for item in result))
    print(row.format(analyzer_name, grammar, len(result), filename))


def nltk_parse_clause(sentence):
  """
  Parse one sentence with the cascaded NP/PP/VP/CLAUSE chunk grammar.
  Natural Language Toolkit: code_cascaded_chunker
  http://www.nltk.org/book/ch07.html#code-cascaded-chunker

  A new nltk.RegexpParser is built on every call and the parse is
  discarded, so the function returns None.
  """
  cascade_rules = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
  # Example input:
  # [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
  #  ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
  nltk.RegexpParser(cascade_rules).parse(sentence)

def nltk_parse_clause_in_the_whole_text():
  """Call nltk_parse_clause on every item of the global ``brown_pos_tag_sents``."""
  for tagged_sentence in brown_pos_tag_sents:
    nltk_parse_clause(tagged_sentence)

def pyrata_recognize_clause_in_short(sentence_dict_list):
  """Emulate the NLTK cascaded chunker with successive pyrata annotations.

  http://www.nltk.org/book/ch07.html # Building Nested Structure with Cascaded Chunkers

  Six ``pyrata_re.annotate`` calls are chained, each with action 'extend',
  group [0] and IOB tagging enabled: NP (ch1), PP (ch2), VP (ch4) and
  CLAUSE (ch3), then a second pass adding VP (ch5) and CLAUSE (ch6).
  The last annotation result is discarded; the function returns None.

  Args:
    sentence_dict_list: data handed to the first annotate call; the
      patterns refer to a 'pos' feature.
  """
  mode = 'extend'
  whole_match = [0]
  use_iob = True

  # NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  with_np = pyrata_re.annotate(
    'pos~"DT|JJ|NN.*"+', {'ch1':'NP'},
    sentence_dict_list, whole_match, mode, use_iob)

  # PP: {<IN><NP>}               # Chunk prepositions followed by NP
  with_pp = pyrata_re.annotate(
    'pos="IN" (ch1="B-NP" ch1="I-NP"*)', {'ch2':'PP'},
    with_np, whole_match, mode, use_iob)

  # VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  with_vp = pyrata_re.annotate(
    'pos~"VB.*" (ch1="B-NP" ch1="I-NP"*|ch2="B-PP" ch2="I-PP"*|ch3="B-CLAUSE" ch3="I-CLAUSE"*)+$', {'ch4':'VP'},
    with_pp, whole_match, mode, use_iob)

  # CLAUSE: {<NP><VP>}           # Chunk NP, VP
  pyrata_re.annotate(
    '(ch1="B-NP" ch1="I-NP"*) (ch4="B-VP" ch4="I-VP"*)', {'ch3':'CLAUSE'},
    with_vp, whole_match, mode, use_iob)

  # Second pass: a VP made of a verb followed by CLAUSE chunks only.
  # NOTE(review): this step is fed the PP-level result, not the output of the
  # CLAUSE step above, although its pattern reads ch3 - confirm this relies
  # on annotate updating the data in place.
  with_vp_pass2 = pyrata_re.annotate(
    'pos~"VB.*" (ch3="B-CLAUSE" ch3="I-CLAUSE"*)+$', {'ch5':'VP'},
    with_pp, whole_match, mode, use_iob)

  pyrata_re.annotate(
    '(ch1="B-NP" ch1="I-NP"*) (ch5="B-VP" ch5="I-VP"*)', {'ch6':'CLAUSE'},
    with_vp_pass2, whole_match, mode, use_iob)

def pyrata_recognize_clause(sentence_dict_list):
  """Annotate one sentence with cascaded NP / PP / VP / CLAUSE chunk layers.

  PyRATA counterpart of the cascaded chunk grammar from the NLTK book
  (http://www.nltk.org/book/ch07.html, "Building Nested Structure with
  Cascaded Chunkers"). Every stage calls ``pyrata_re.annotate`` with the
  'extend' action, group 0 and IOB labelling switched on.

  Layers written, in order: ch1 (NP), ch2 (PP), ch4 (VP), ch3 (CLAUSE),
  then a second pass writing ch5 (VP) and ch6 (CLAUSE).

  Sample inputs used for the traces recorded after this function:
    [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
     ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
    [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"), ("saw", "VBD"),
     ("the", "DT"), ("cat", "NN"), ("sit", "VB"), ("on", "IN"),
     ("the", "DT"), ("mat", "NN")]
  each converted with: [{'raw': w, 'pos': p} for (w, p) in sentence]

  :param sentence_dict_list: one sentence as a list of token dicts; the
    patterns below match on the 'pos' feature of each token.
  :return: None. The annotated results are not handed back to the caller
    (the function is only timed by the benchmark).
  """
  # One group list shared by every stage, exactly as a single `group` variable would be.
  whole_match = [0]

  def add_layer(chunk_pattern, layer, tokens):
    # extend the tokens with a new IOB-labelled feature for each match
    return pyrata_re.annotate (chunk_pattern, layer, tokens, whole_match, 'extend', True)

  # NP: {<DT|JJ|NN.*>+}            chunk sequences of DT, JJ, NN
  np_tokens = add_layer('pos~"DT|JJ|NN.*"+', {'ch1':'NP'}, sentence_dict_list)

  # PP: {<IN><NP>}                 chunk prepositions followed by an NP
  pp_tokens = add_layer('pos="IN" (ch1="B-NP" ch1="I-NP"*)', {'ch2':'PP'}, np_tokens)

  # VP: {<VB.*><NP|PP|CLAUSE>+$}   chunk verbs and their arguments
  vp_tokens = add_layer(
    'pos~"VB.*" (ch1="B-NP" ch1="I-NP"*|ch2="B-PP" ch2="I-PP"*|ch3="B-CLAUSE" ch3="I-CLAUSE"*)+$',
    {'ch4':'VP'},
    pp_tokens)

  # CLAUSE: {<NP><VP>}             chunk an NP followed by a VP
  clause_tokens = add_layer(
    '(ch1="B-NP" ch1="I-NP"*) (ch4="B-VP" ch4="I-VP"*)',
    {'ch3':'CLAUSE'},
    vp_tokens)

  # Second pass (NLTK "loop 2"). Per the original author's note, the
  # alternation is not all-inclusive: the first alternative that matches is
  # taken, so only the CLAUSE alternative is kept in this VP pattern.
  # NOTE(review): this stage is fed the PP-stage result, not the CLAUSE-stage
  # one; the recorded traces still show ch3/ch4 on the tokens, presumably
  # because annotate extends the shared token dicts in place -- confirm.
  vp_tokens = add_layer(
    'pos~"VB.*" (ch3="B-CLAUSE" ch3="I-CLAUSE"*)+$',
    {'ch5':'VP'},
    pp_tokens)

  clause_tokens = add_layer(
    '(ch1="B-NP" ch1="I-NP"*) (ch5="B-VP" ch5="I-VP"*)',
    {'ch6':'CLAUSE'},
    vp_tokens)

  # Debug: [{'raw': 'Mary', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'raw': 'the', 'pos': 'DT'}, {'raw': 'cat', 'pos': 'NN'}, {'raw': 'sit', 'pos': 'VB'}, {'raw': 'on', 'pos': 'IN'}, {'raw': 'the', 'pos': 'DT'}, {'raw': 'mat', 'pos': 'NN'}]
  # Debug: ch1 NP= [{'ch1': 'B-NP', 'raw': 'Mary', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT'}, {'ch1': 'I-NP', 'raw': 'cat', 'pos': 'NN'}, {'raw': 'sit', 'pos': 'VB'}, {'raw': 'on', 'pos': 'IN'}, {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT'}, {'ch1': 'I-NP', 'raw': 'mat', 'pos': 'NN'}]
  # Debug: ch2 PP= [{'ch1': 'B-NP', 'raw': 'Mary', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT'}, {'ch1': 'I-NP', 'raw': 'cat', 'pos': 'NN'}, {'raw': 'sit', 'pos': 'VB'}, {'raw': 'on', 'pos': 'IN', 'ch2': 'B-PP'}, {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT', 'ch2': 'I-PP'}, {'ch1': 'I-NP', 'raw': 'mat', 'pos': 'NN', 'ch2': 'I-PP'}]
  # Debug: ch4 VP= [{'ch1': 'B-NP', 'raw': 'Mary', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT'}, {'ch1': 'I-NP', 'raw': 'cat', 'pos': 'NN'}, {'ch4': 'B-VP', 'raw': 'sit', 'pos': 'VB'}, {'ch4': 'I-VP', 'raw': 'on', 'pos': 'IN', 'ch2': 'B-PP'}, {'ch4': 'I-VP', 'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT', 'ch2': 'I-PP'}, {'ch4': 'I-VP', 'ch1': 'I-NP', 'raw': 'mat', 'pos': 'NN', 'ch2': 'I-PP'}]
  # [{'ch1': 'B-NP', 'raw': 'Mary', 'pos': 'NN'}, 
  # {'raw': 'saw', 'pos': 'VBD'}, 
  # {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT'}, {'ch1': 'I-NP', 'raw': 'cat', 'pos': 'NN'}, 
  # {'ch4': 'B-VP', 'raw': 'sit', 'pos': 'VB'}, 
  # {'ch4': 'I-VP', 'raw': 'on', 'pos': 'IN', 'ch2': 'B-PP'}, 
  # {'ch4': 'I-VP', 'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT', 'ch2': 'I-PP'}, 
  # {'ch4': 'I-VP', 'ch1': 'I-NP', 'raw': 'mat', 'pos': 'NN', 'ch2': 'I-PP'}]

  # Debug: ch3 CLAUSE= [{'pos': 'NN', 'raw': 'Mary', 'ch1': 'B-NP'}, {'pos': 'VBD', 'raw': 'saw'}, {'pos': 'DT', 'raw': 'the', 'ch1': 'B-NP', 'ch3': 'B-CLAUSE'}, {'pos': 'NN', 'raw': 'cat', 'ch1': 'I-NP', 'ch3': 'I-CLAUSE'}, {'pos': 'VB', 'raw': 'sit', 'ch4': 'B-VP', 'ch3': 'I-CLAUSE'}, {'pos': 'IN', 'raw': 'on', 'ch2': 'B-PP', 'ch3': 'I-CLAUSE', 'ch4': 'I-VP'}, {'ch2': 'I-PP', 'ch4': 'I-VP', 'ch3': 'I-CLAUSE', 'pos': 'DT', 'raw': 'the', 'ch1': 'B-NP'}, {'ch2': 'I-PP', 'ch4': 'I-VP', 'ch3': 'I-CLAUSE', 'pos': 'NN', 'raw': 'mat', 'ch1': 'I-NP'}]
  # (S
  #   (NP Mary/NN)
  #   saw/VBD
  #   (CLAUSE
  #     (NP the/DT cat/NN)
  #     (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))

  # [{'pos': 'NN', 'raw': 'Mary', 'ch1': 'B-NP'}, 
  # {'pos': 'VBD', 'raw': 'saw'}, 
  # {'pos': 'DT', 'raw': 'the', 'ch1': 'B-NP', 'ch3': 'B-CLAUSE'}, 
  # {'pos': 'NN', 'raw': 'cat', 'ch1': 'I-NP', 'ch3': 'I-CLAUSE'}, 
  # {'pos': 'VB', 'raw': 'sit', 'ch4': 'B-VP', 'ch3': 'I-CLAUSE'}, 
  # {'pos': 'IN', 'raw': 'on', 'ch2': 'B-PP', 'ch3': 'I-CLAUSE', 'ch4': 'I-VP'},
  # {'ch2': 'I-PP', 'ch4': 'I-VP', 'ch3': 'I-CLAUSE', 'pos': 'DT', 'raw': 'the', 'ch1': 'B-NP'}, 
  # {'ch2': 'I-PP', 'ch4': 'I-VP', 'ch3': 'I-CLAUSE', 'pos': 'NN', 'raw': 'mat', 'ch1': 'I-NP'}]

  #Debug: ch4 (loop 2) VP= [{'raw': 'Mary', 'ch1': 'B-NP', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch3': 'B-CLAUSE', 'raw': 'the', 'ch1': 'B-NP', 'pos': 'DT'}, {'ch3': 'I-CLAUSE', 'raw': 'cat', 'ch1': 'I-NP', 'pos': 'NN'}, {'ch3': 'I-CLAUSE', 'raw': 'sit', 'ch4': 'B-VP', 'pos': 'VB'}, {'ch3': 'I-CLAUSE', 'raw': 'on', 'ch4': 'I-VP', 'ch2': 'B-PP', 'pos': 'IN'}, {'ch3': 'I-CLAUSE', 'raw': 'the', 'ch2': 'I-PP', 'ch4': 'I-VP', 'pos': 'DT', 'ch1': 'B-NP'}, {'ch3': 'I-CLAUSE', 'raw': 'mat', 'ch2': 'I-PP', 'ch4': 'I-VP', 'pos': 'NN', 'ch1': 'I-NP'}]
  #Debug: ch3 (loop 2) CLAUSE= [{'raw': 'Mary', 'ch1': 'B-NP', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch3': 'B-CLAUSE', 'raw': 'the', 'ch1': 'B-NP', 'pos': 'DT'}, {'ch3': 'I-CLAUSE', 'raw': 'cat', 'ch1': 'I-NP', 'pos': 'NN'}, {'ch3': 'I-CLAUSE', 'raw': 'sit', 'ch4': 'B-VP', 'pos': 'VB'}, {'ch3': 'I-CLAUSE', 'raw': 'on', 'ch4': 'I-VP', 'ch2': 'B-PP', 'pos': 'IN'}, {'ch3': 'I-CLAUSE', 'raw': 'the', 'ch2': 'I-PP', 'ch4': 'I-VP', 'pos': 'DT', 'ch1': 'B-NP'}, {'ch3': 'I-CLAUSE', 'raw': 'mat', 'ch2': 'I-PP', 'ch4': 'I-VP', 'pos': 'NN', 'ch1': 'I-NP'}]

  # [{'raw': 'Mary', 'ch1': 'B-NP', 'pos': 'NN'}, 
  # {'raw': 'saw', 'pos': 'VBD'}, 
  # {'ch3': 'B-CLAUSE', 'raw': 'the', 'ch1': 'B-NP', 'pos': 'DT'}, 
  # {'ch3': 'I-CLAUSE', 'raw': 'cat', 'ch1': 'I-NP', 'pos': 'NN'}, 
  # {'ch3': 'I-CLAUSE', 'raw': 'sit', 'ch4': 'B-VP', 'pos': 'VB'}, 
  # {'ch3': 'I-CLAUSE', 'raw': 'on', 'ch4': 'I-VP', 'ch2': 'B-PP', 'pos': 'IN'}, 
  # {'ch3': 'I-CLAUSE', 'raw': 'the', 'ch2': 'I-PP', 'ch4': 'I-VP', 'pos': 'DT', 'ch1': 'B-NP'}, 
  # {'ch3': 'I-CLAUSE', 'raw': 'mat', 'ch2': 'I-PP', 'ch4': 'I-VP', 'pos': 'NN', 'ch1': 'I-NP'}]

  # sentence 2
  #Debug: [{'pos': 'NNP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'pos': 'NN', 'raw': 'Mary'}, {'pos': 'VBD', 'raw': 'saw'}, {'pos': 'DT', 'raw': 'the'}, {'pos': 'NN', 'raw': 'cat'}, {'pos': 'VB', 'raw': 'sit'}, {'pos': 'IN', 'raw': 'on'}, {'pos': 'DT', 'raw': 'the'}, {'pos': 'NN', 'raw': 'mat'}]
  #Debug: ch1 NP= [{'pos': 'NNP', 'ch1': 'B-NP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'pos': 'NN', 'ch1': 'B-NP', 'raw': 'Mary'}, {'pos': 'VBD', 'raw': 'saw'}, {'pos': 'DT', 'ch1': 'B-NP', 'raw': 'the'}, {'pos': 'NN', 'ch1': 'I-NP', 'raw': 'cat'}, {'pos': 'VB', 'raw': 'sit'}, {'pos': 'IN', 'raw': 'on'}, {'pos': 'DT', 'ch1': 'B-NP', 'raw': 'the'}, {'pos': 'NN', 'ch1': 'I-NP', 'raw': 'mat'}]
  #Debug: ch2 PP= [{'pos': 'NNP', 'ch1': 'B-NP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'pos': 'NN', 'ch1': 'B-NP', 'raw': 'Mary'}, {'pos': 'VBD', 'raw': 'saw'}, {'pos': 'DT', 'ch1': 'B-NP', 'raw': 'the'}, {'pos': 'NN', 'ch1': 'I-NP', 'raw': 'cat'}, {'pos': 'VB', 'raw': 'sit'}, {'pos': 'IN', 'ch2': 'B-PP', 'raw': 'on'}, {'pos': 'DT', 'ch1': 'B-NP', 'ch2': 'I-PP', 'raw': 'the'}, {'pos': 'NN', 'ch1': 'I-NP', 'ch2': 'I-PP', 'raw': 'mat'}]
  #Debug: ch4 VP= [{'pos': 'NNP', 'ch1': 'B-NP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'pos': 'NN', 'ch1': 'B-NP', 'raw': 'Mary'}, {'pos': 'VBD', 'raw': 'saw'}, {'pos': 'DT', 'ch1': 'B-NP', 'raw': 'the'}, {'pos': 'NN', 'ch1': 'I-NP', 'raw': 'cat'}, {'pos': 'VB', 'raw': 'sit', 'ch4': 'B-VP'}, {'pos': 'IN', 'ch2': 'B-PP', 'raw': 'on', 'ch4': 'I-VP'}, {'pos': 'DT', 'ch1': 'B-NP', 'ch2': 'I-PP', 'raw': 'the', 'ch4': 'I-VP'}, {'pos': 'NN', 'ch1': 'I-NP', 'ch2': 'I-PP', 'raw': 'mat', 'ch4': 'I-VP'}]
  #Debug: ch3 CLAUSE= [{'pos': 'NNP', 'ch1': 'B-NP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'pos': 'NN', 'ch1': 'B-NP', 'raw': 'Mary'}, {'pos': 'VBD', 'raw': 'saw'}, {'pos': 'DT', 'ch1': 'B-NP', 'raw': 'the', 'ch3': 'B-CLAUSE'}, {'pos': 'NN', 'ch1': 'I-NP', 'raw': 'cat', 'ch3': 'I-CLAUSE'}, {'pos': 'VB', 'raw': 'sit', 'ch3': 'I-CLAUSE', 'ch4': 'B-VP'}, {'pos': 'IN', 'ch2': 'B-PP', 'raw': 'on', 'ch3': 'I-CLAUSE', 'ch4': 'I-VP'}, {'ch1': 'B-NP', 'ch2': 'I-PP', 'ch3': 'I-CLAUSE', 'ch4': 'I-VP', 'pos': 'DT', 'raw': 'the'}, {'ch1': 'I-NP', 'ch2': 'I-PP', 'ch3': 'I-CLAUSE', 'ch4': 'I-VP', 'pos': 'NN', 'raw': 'mat'}]
  #Debug: ch5 (loop 2) VP= [{'ch1': 'B-NP', 'raw': 'John', 'pos': 'NNP'}, {'raw': 'thinks', 'pos': 'VBZ'}, {'ch1': 'B-NP', 'raw': 'Mary', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch1': 'B-NP', 'raw': 'the', 'pos': 'DT', 'ch3': 'B-CLAUSE'}, {'ch1': 'I-NP', 'raw': 'cat', 'pos': 'NN', 'ch3': 'I-CLAUSE'}, {'ch4': 'B-VP', 'raw': 'sit', 'ch5': 'B-VP', 'pos': 'VB', 'ch3': 'I-CLAUSE'}, {'ch4': 'I-VP', 'pos': 'IN', 'ch2': 'B-PP', 'raw': 'on', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}, {'ch4': 'I-VP', 'ch1': 'B-NP', 'pos': 'DT', 'ch2': 'I-PP', 'raw': 'the', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}, {'ch4': 'I-VP', 'ch1': 'I-NP', 'pos': 'NN', 'ch2': 'I-PP', 'raw': 'mat', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}]
  #Debug: ch6 (loop 2) CLAUSE= [{'ch1': 'B-NP', 'raw': 'John', 'pos': 'NNP'}, {'raw': 'thinks', 'pos': 'VBZ'}, {'ch1': 'B-NP', 'raw': 'Mary', 'pos': 'NN'}, {'raw': 'saw', 'pos': 'VBD'}, {'ch1': 'B-NP', 'raw': 'the', 'ch6': 'B-CLAUSE', 'pos': 'DT', 'ch3': 'B-CLAUSE'}, {'ch1': 'I-NP', 'raw': 'cat', 'ch6': 'I-CLAUSE', 'pos': 'NN', 'ch3': 'I-CLAUSE'}, {'ch4': 'B-VP', 'pos': 'VB', 'ch6': 'I-CLAUSE', 'raw': 'sit', 'ch5': 'B-VP', 'ch3': 'I-CLAUSE'}, {'ch4': 'I-VP', 'ch6': 'I-CLAUSE', 'pos': 'IN', 'ch2': 'B-PP', 'raw': 'on', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}, {'ch4': 'I-VP', 'ch1': 'B-NP', 'ch6': 'I-CLAUSE', 'pos': 'DT', 'ch2': 'I-PP', 'raw': 'the', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}, {'ch4': 'I-VP', 'ch1': 'I-NP', 'ch6': 'I-CLAUSE', 'pos': 'NN', 'ch2': 'I-PP', 'raw': 'mat', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}]
  #Debug: ch5 (loop 2) VP= [{'ch1': 'B-NP', 'pos': 'NNP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'ch1': 'B-NP', 'pos': 'NN', 'raw': 'Mary'}, {'ch5': 'B-VP', 'pos': 'VBD', 'raw': 'saw'}, {'ch1': 'B-NP', 'ch3': 'B-CLAUSE', 'pos': 'DT', 'raw': 'the', 'ch5': 'I-VP'}, {'ch1': 'I-NP', 'ch3': 'I-CLAUSE', 'pos': 'NN', 'raw': 'cat', 'ch5': 'I-VP'}, {'ch3': 'I-CLAUSE', 'pos': 'VB', 'raw': 'sit', 'ch4': 'B-VP', 'ch5': 'I-VP'}, {'ch2': 'B-PP', 'pos': 'IN', 'raw': 'on', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}, {'ch1': 'B-NP', 'ch2': 'I-PP', 'pos': 'DT', 'raw': 'the', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}, {'ch1': 'I-NP', 'ch2': 'I-PP', 'pos': 'NN', 'raw': 'mat', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE'}]
  #Debug: ch6 (loop 2) CLAUSE= [{'ch1': 'B-NP', 'pos': 'NNP', 'raw': 'John'}, {'pos': 'VBZ', 'raw': 'thinks'}, {'ch1': 'B-NP', 'pos': 'NN', 'raw': 'Mary', 'ch6': 'B-CLAUSE'}, {'ch5': 'B-VP', 'pos': 'VBD', 'raw': 'saw', 'ch6': 'I-CLAUSE'}, {'ch1': 'B-NP', 'ch5': 'I-VP', 'pos': 'DT', 'ch3': 'B-CLAUSE', 'raw': 'the', 'ch6': 'I-CLAUSE'}, {'ch1': 'I-NP', 'ch5': 'I-VP', 'pos': 'NN', 'ch3': 'I-CLAUSE', 'raw': 'cat', 'ch6': 'I-CLAUSE'}, {'ch5': 'I-VP', 'pos': 'VB', 'ch3': 'I-CLAUSE', 'ch4': 'B-VP', 'raw': 'sit', 'ch6': 'I-CLAUSE'}, {'ch2': 'B-PP', 'pos': 'IN', 'raw': 'on', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE', 'ch6': 'I-CLAUSE'}, {'ch1': 'B-NP', 'ch2': 'I-PP', 'pos': 'DT', 'raw': 'the', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE', 'ch6': 'I-CLAUSE'}, {'ch1': 'I-NP', 'ch2': 'I-PP', 'pos': 'NN', 'raw': 'mat', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE', 'ch6': 'I-CLAUSE'}]


  # (S
  #   (NP John/NNP)
  #   thinks/VBZ
  #   (CLAUSE
  #     (NP Mary/NN)
  #     (VP
  #       saw/VBD
  #       (CLAUSE
  #         (NP the/DT cat/NN)
  #         (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))))

  # [{'ch1': 'B-NP', 'pos': 'NNP', 'raw': 'John'}, 
  # {'pos': 'VBZ', 'raw': 'thinks'}, 
  # {'ch1': 'B-NP', 'pos': 'NN', 'raw': 'Mary', 'ch6': 'B-CLAUSE'}, 
  # {'ch5': 'B-VP', 'pos': 'VBD', 'raw': 'saw', 'ch6': 'I-CLAUSE'}, 
  # {'ch1': 'B-NP', 'ch5': 'I-VP', 'pos': 'DT', 'ch3': 'B-CLAUSE', 'raw': 'the', 'ch6': 'I-CLAUSE'}, 
  # {'ch1': 'I-NP', 'ch5': 'I-VP', 'pos': 'NN', 'ch3': 'I-CLAUSE', 'raw': 'cat', 'ch6': 'I-CLAUSE'}, 
  # {'ch5': 'I-VP', 'pos': 'VB', 'ch3': 'I-CLAUSE', 'ch4': 'B-VP', 'raw': 'sit', 'ch6': 'I-CLAUSE'}, 
  # {'ch2': 'B-PP', 'pos': 'IN', 'raw': 'on', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE', 'ch6': 'I-CLAUSE'}, 
  # {'ch1': 'B-NP', 'ch2': 'I-PP', 'pos': 'DT', 'raw': 'the', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE', 'ch6': 'I-CLAUSE'}, 
  # {'ch1': 'I-NP', 'ch2': 'I-PP', 'pos': 'NN', 'raw': 'mat', 'ch4': 'I-VP', 'ch5': 'I-VP', 'ch3': 'I-CLAUSE', 'ch6': 'I-CLAUSE'}]





def pyrata_recognize_clause_in_the_whole_text():
  """Run pyrata_recognize_clause on every sentence of the benchmark text.

  Reads the module-level ``sentences_dict_list_list`` (a list of sentences,
  each a list of token dicts) and returns nothing; it exists so that the
  whole pass can be handed to the timing helper as a zero-argument callable.
  """
  for sentence_dict_list in sentences_dict_list_list:
    pyrata_recognize_clause(sentence_dict_list)


def test_clause():
  """Time clause recognition with the NLTK parser and with PyRATA.

  Reads the module-level ``size`` (number of Brown sentences to load) and
  ``iteration_number`` (passed to ``measure_time``). Publishes two globals
  for the timed callables: ``brown_pos_tag_sents`` (POS-tagged sentences)
  and ``sentences_dict_list_list`` (the same sentences as lists of
  {'raw': word, 'pos': tag} dicts). Prints one header line, then one
  tab-separated row per analyzer: name, grammar, average time, minimum time.
  """
  global brown_pos_tag_sents, sentences_dict_list_list

  print ('Measuring time performance on # {} sentences over # {} iterations for recognizing Clause'.format(size, iteration_number))

  from nltk.corpus import brown
  import nltk

  grammar = "clause"
  row_format = '{}\t{}\t{}\t{}'

  # Tag the first `size` Brown sentences once; both analyzers start from this.
  brown_pos_tag_sents = [nltk.pos_tag(tokens) for tokens in brown.sents()[:size]]

  # ----------------------------------------------------
  # nltk_parser
  # ----------------------------------------------------
  # presumably nltk_parse_clause_in_the_whole_text reads brown_pos_tag_sents -- verify
  _times, averagetime, mintime = measure_time(nltk_parse_clause_in_the_whole_text, iteration_number)
  print (row_format.format('nltk_parser', grammar, averagetime, mintime))

  # ----------------------------------------------------
  # pyrata
  # ----------------------------------------------------
  # PyRATA works on token dicts, so convert each (word, tag) pair; the
  # sentence structure is kept (the text is not flattened into one list).
  sentences_dict_list_list = [
    [{'raw':word, 'pos':tag} for (word, tag) in tagged_sentence]
    for tagged_sentence in brown_pos_tag_sents
  ]

  _times, averagetime, mintime = measure_time(pyrata_recognize_clause_in_the_whole_text, iteration_number)
  print (row_format.format('pyrata', grammar, averagetime, mintime))



# Characters neutralised by backslash(): whitespace, quotes and punctuation
# that carry a meaning in pattern syntax.
_BACKSLASH_SPECIAL_CHARS = ' ?+*.[]~{}|"\',:<>'
# Translation table mapping each special character to an underscore.
_BACKSLASH_TABLE = str.maketrans(dict.fromkeys(_BACKSLASH_SPECIAL_CHARS, '_'))


def backslash(string):
  """Return *string* with every special character replaced by '_'.

  Despite its name, the function does not escape with a backslash: each of
  the characters  space ? + * . [ ] ~ { } | " ' , : < >  is substituted by
  an underscore. All other characters (including '_' and the backslash
  itself) are left untouched.

  :param string: the text to sanitise (a ``str``).
  :return: a new ``str`` of the same length as the input.
  """
  # Single pass over the string instead of one replace() per special character.
  return string.translate(_BACKSLASH_TABLE)


# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# Run benchmark
# """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if __name__ == '__main__':

  # Send log records to benchmark.log, then disable the root logger so that
  # logging does not run during the timed sections.
  logging.basicConfig(
    filename='benchmark.log',
    level=logging.INFO,
    format='%(levelname)s:\t%(message)s',
  )
  logger = logging.getLogger()
  logger.disabled = True

  # SET: iteration count handed to the benchmark (previously tried: 10)
  iteration_number = 1

  # Analysers to compare. Full list of known names:
  #   'clips.pattern', 'nltk_regex_parser', 'nltk_regex_chunk_parser', 'spaCy', 'pyrata'
  analysers = ['clips.pattern', 'nltk_regex_chunk_parser', 'pyrata']

  # Input sizes to benchmark. The Brown corpus holds 1 161 192 words; the
  # size can also be interpreted as a number of sentences.
  # Alternative: sizes = [10000*i for i in range (2, iteration_number)]
  sizes = [10000, 50000, 100000, 200000, 300000, 500000, 750000, 1000000]

  for size in sizes:
    print(size)
    time_noun_phrase_recognizers(size, analysers)
    # To print the recognised phrases instead: output_noun_phrase_recognizers(size)

  # Clause benchmark (disabled). To run it, SET the two globals below and call
  # test_clause():
  #size = 1 # up to 1161192 (Brown corpus word count; can also be read as a sentence count)
  #iteration_number = 1
  #test_clause()