Python nltk.tokenize.TreebankWordTokenizer() Examples

The following are 15 code examples of nltk.tokenize.TreebankWordTokenizer(), collected from open-source projects. The original project and source file are noted above each example. You may also want to check out all available functions and classes of the module nltk.tokenize.
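For orientation, here is a minimal standalone sketch of the tokenizer itself (not taken from any of the projects below; the sample sentence and the expected output are illustrative and may vary slightly between NLTK versions). TreebankWordTokenizer implements the Penn Treebank tokenization rules as regular expressions, splitting contractions and sentence-final punctuation into separate tokens.

from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()
# Contractions and the final period become separate tokens; '$' is split from the amount.
print(tokenizer.tokenize("Good muffins cost $3.88 in New York. Please don't cut me."))
# e.g. ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'do', "n't", 'cut', 'me', '.']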
Example #1
Source File: phrasemachine.py    From scattertext with Apache License 2.0
def __init__(self):
		import nltk
		from nltk.tag import PerceptronTagger
		from nltk.tokenize import TreebankWordTokenizer
		#return pkgutil.get_data('scattertext',
		#                        'data/viz/semiotic_new.html').decode('utf-8')
		path = os.path.dirname(sys.modules['scattertext'].__file__)+'/data/'
		tokenizer_fn = path + 'punkt.english.pickle'
		tagger_fn = path + 'averaged_perceptron_tagger.pickle'
		#tokenizer_fn = os.path.abspath(resource_filename('scattertext.data', 'punkt.english.pickle'))
		#tagger_fn = os.path.abspath(resource_filename('scattertext.data', 'averaged_perceptron_tagger.pickle'))
		# Load the tagger
		self.tagger = PerceptronTagger(load=False)
		self.tagger.load(tagger_fn)

		# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
		#       Calling the TreebankWordTokenizer like this allows skipping the downloader.
		#       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
		#       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
		self.tokenize = TreebankWordTokenizer().tokenize
		self.sent_detector = nltk.data.load(tokenizer_fn)

	# http://www.nltk.org/book/ch05.html 
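The note in the snippet above about skipping the downloader is the main point of this example: instantiating TreebankWordTokenizer directly avoids the NLTK data downloader, which nltk.word_tokenize would otherwise pull in for sentence splitting. A minimal sketch of the contrast (not part of scattertext; the resource name shown is the usual 'punkt' sentence model):

from nltk.tokenize import TreebankWordTokenizer

# Works offline: the Treebank rules are plain regular expressions, no model files needed.
tokens = TreebankWordTokenizer().tokenize("No downloader needed for PTB-style tokenization.")

# By contrast, nltk.word_tokenize() first runs the 'punkt' sentence splitter,
# which needs a one-off nltk.download('punkt') or, as above, a bundled pickle.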
Example #2
Source File: phrasemachine.py    From phrasemachine with MIT License
def __init__(self):
        import nltk
        from nltk.tag import PerceptronTagger
        from nltk.tokenize import TreebankWordTokenizer
        tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
        tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
        # Load the tagger
        self.tagger = PerceptronTagger(load=False)
        self.tagger.load(tagger_fn)

        # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
        #       Calling the TreebankWordTokenizer like this allows skipping the downloader.
        #       It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
        #       https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
        self.tokenize = TreebankWordTokenizer().tokenize
        self.sent_detector = nltk.data.load(tokenizer_fn)


    # http://www.nltk.org/book/ch05.html 
Example #3
Source File: msrvtt_tagging.py    From Semantics-AssistedVideoCaptioning with MIT License
def main(tag_gt, word2idx, zipname):
    with zf.ZipFile(zipname) as myzip:
        namelist = myzip.namelist()
        print('namelist:', namelist)
        datainfo = myzip.open(namelist[-1], 'r')
        info_dict = json.load(datainfo)
        sentences = info_dict['sentences']
        tokenizer = TreebankWordTokenizer()
        for sentence in sentences:
            video_id = sentence['video_id']
            video_idx = int(video_id[5:])
            caption = sentence['caption']
            words = tokenizer.tokenize(caption)
            for word in words:
                if word in word2idx:
                    tag_gt[video_idx, word2idx[word]] = 1 
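A minimal sketch of how the arguments to main() might be prepared; the names, sizes, and zip filename below are placeholders rather than part of the Semantics-AssistedVideoCaptioning code, which also relies on import zipfile as zf, import json, import numpy as np, and the TreebankWordTokenizer import at module level.

import numpy as np

# Hypothetical tag vocabulary: one column per tag word.
tag_words = ['man', 'woman', 'dog', 'singing', 'playing']
word2idx = {w: i for i, w in enumerate(tag_words)}

# One row per video; main() sets entry (video, word) to 1 when the word
# appears in any caption of that video.
tag_gt = np.zeros((10000, len(word2idx)), dtype=np.int32)

# main(tag_gt, word2idx, 'videodatainfo.zip')  # zip filename is a placeholder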
Example #4
Source File: preprocessing.py    From question-generation with MIT License
def tokenise(text, asbytes=True, append_eos=False):

    text = text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]

        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
    else:
        for char in string.punctuation+'()-–':
            text = text.replace(char, ' '+char+' ')
        tokens = text.lower().split(' ')
    tokens = [w.encode() if asbytes else w for w in tokens if w.strip() != '']
    if append_eos:
        tokens.append(EOS.encode() if asbytes else EOS)
    # tokens = np.asarray(tokens)
    # return np.asarray(tokens)
    return tokens 
Example #5
Source File: lang_proc.py    From SearchingReddit with MIT License
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms) 
Example #6
Source File: UDParser.py    From PredPatt with BSD 3-Clause "New" or "Revised" License
def tokenize(sentence):
    "Tokenize sentence the way parser expects."
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character replacements
    s = ''.join(REPLACEMENTS_R.get(x,x) for x in s)
    return s 
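REPLACEMENTS_R is defined elsewhere in UDParser.py and is not shown here; it maps individual characters of the space-joined token string to parser-friendly replacements. Below is a self-contained sketch with a made-up replacement table (the table entries are assumptions, not the project's real values):

from nltk.tokenize import TreebankWordTokenizer

# Hypothetical per-character replacement table; the real REPLACEMENTS_R lives in UDParser.py.
REPLACEMENTS_R = {'(': '-LRB-', ')': '-RRB-'}

def tokenize(sentence):
    "Tokenize sentence the way parser expects."
    s = ' '.join(TreebankWordTokenizer().tokenize(sentence))
    return ''.join(REPLACEMENTS_R.get(x, x) for x in s)

print(tokenize("He said (quietly) that it's fine."))
# e.g. "He said -LRB- quietly -RRB- that it 's fine ."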
Example #7
Source File: UDParser.py    From PredPatt with BSD 3-Clause "New" or "Revised" License
def fresh(self, s, tokenized=False):
        """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).

        Pass in `tokenized=True` if `s` has already been tokenized, otherwise we
        apply `nltk.tokenize.TreebankWordTokenizer`.

        """
        if self.process is None:
            self._start_subprocess()
        s = str(s.strip())
        if not tokenized:
            s = tokenize(s)
        s = s.strip()
        assert '\n' not in s, "No newline characters allowed %r" % s
        try:
            self.process.stdin.write(s.encode('utf-8'))
        except IOError as e:
            #if e.errno == 32:          # broken pipe
            #    self.process = None
            #    return self(s)  # retry will restart process
            raise e
        self.process.stdin.write(b'\n')
        self.process.stdin.flush()
        out = self.process.stdout.readline()
        if sys.version_info[0] == 3:
            out = out.decode()
        return self.to_ud(out) 
Example #8
Source File: bridge.py    From castor with Apache License 2.0
def parse(self, sentence):
        s_toks = TreebankWordTokenizer().tokenize(sentence)
        sentence = ' '.join(s_toks).lower()
        return sentence 
Example #9
Source File: reader.py    From variational-text-tensorflow with MIT License
def get(self, text=["medical"]):
    if type(text) == str:
      text = text.lower()
      text = TreebankWordTokenizer().tokenize(text)

    try:
      data = np.array(list(map(self.vocab.get, text)))  # list() so the lookup also works on Python 3
      return self.onehot(data), data
    except:
      unknowns = []
      for word in text:
        if self.vocab.get(word) == None:
          unknowns.append(word)
      raise Exception(" [!] unknown words: %s" % ",".join(unknowns)) 
Example #10
Source File: preprocessing.py    From question-generation with MIT License
def char_pos_to_word(text, tokens, char_pos, asbytes=True):
    ix=0
    text=text.decode() if asbytes else text
    if use_nltk:
        sents = [s for s in sent_tokenize(text)]
        spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
        # lens = [len(sent)+1  for sent in sents]
        offsets = []
        for i,sent in enumerate(sents):
            offsets.append(text.find(sent, offsets[i-1]+len(sents[i-1]) if i>0 else 0)) # can we do this faster?
        spans = [(span[0]+offsets[i], span[1]+offsets[i]) for i,sent in enumerate(spans) for span in sent]
        # print(char_pos)
        for ix,s in enumerate(spans):
            # print(s, tokens[ix])
            if s[1] > char_pos:
                return ix
        print('couldnt find the char pos via nltk')
        print(text, char_pos, len(text))
    else:
        tokens = [t.decode() for t in tokens]
        if char_pos>len(text):
            print('Char pos doesnt fall within size of text!')

        for t,token in enumerate(tokens):
            for char in token:
                ix = text.find(char, ix)
                ix += 1
                if ix >= char_pos:
                    return t
        print('couldnt find the char pos')
        print(text, tokens, char_pos, len(text))

# Filter a complete context down to the sentence containing the start of the answer span 
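span_tokenize() is what makes this mapping possible: instead of token strings it yields (start, end) character offsets into the sentence, which the code above then shifts by each sentence's offset in the full text. A minimal standalone illustration (not from the question-generation code):

from nltk.tokenize import TreebankWordTokenizer

text = "The answer starts somewhere in here."
spans = list(TreebankWordTokenizer().span_tokenize(text))
tokens = [text[a:b] for a, b in spans]

char_pos = text.index("somewhere")
word_ix = next(i for i, (a, b) in enumerate(spans) if b > char_pos)
print(word_ix, tokens[word_ix])  # index of the token containing char_pos -> 'somewhere'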
Example #11
Source File: preprocessing.py    From question-generation with MIT License
def filter_context(ctxt, char_pos, window_size_before=0, window_size_after=0, max_tokens=-1):
    sents = [s for s in sent_tokenize(ctxt)]
    spans = [[s for s in TreebankWordTokenizer().span_tokenize(sent)] for sent in sents]
    # lens = [len(sent)+1  for sent in sents]
    offsets = []
    for i,sent in enumerate(sents):
        # print(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i>0 else 0))
        # print(len(sents[i-1]) if i>0 else 0)
        # print(offsets[i-1] if i>0 else 0)
        # print(offsets[i-1]+len(sents[i-1]) if i>0 else 0)
        offsets.append(ctxt.find(sent, offsets[i-1]+len(sents[i-1]) if i>0 else 0)) # can we do this faster?
    spans = [[(span[0]+offsets[i], span[1]+offsets[i]) for span in sent] for i,sent in enumerate(spans) ]
    for ix,sent in enumerate(spans):
        # print(sent[0][0], sent[-1][1], char_pos)
        if char_pos >= sent[0][0] and char_pos < sent[-1][1]:
            start=max(0, ix-window_size_before)
            end = min(len(sents)-1, ix+window_size_after)
            # print(start, end, start, offsets[start])
            # new_ix=char_pos-offsets[start]
            # print(new_ix)
            # print(" ".join(sents[start:end+1])[new_ix:new_ix+10])
            flat_spans=[span for sen in spans for span in sen]
            if max_tokens > -1 and len([span for sen in spans[start:end+1] for span in sen]) > max_tokens:
                for i,span in enumerate(flat_spans):
                    if char_pos < span[1]:
                        tok_ix =i
                        # print(span, char_pos)
                        break
                start_ix = max(spans[start][0][0], flat_spans[max(tok_ix-max_tokens,0)][0])
                end_ix = min(spans[end][-1][1], flat_spans[min(tok_ix+max_tokens, len(flat_spans)-1)][1])

                # if len(flat_spans[start_tok:end_tok+1]) > 21:
                # print(start_tok, end_tok, tok_ix)
                # print(flat_spans[tok_ix])
                # print(flat_spans[start_tok:end_tok])
                # print(ctxt[flat_spans[start_tok][0]:flat_spans[end_tok][1]])
                return ctxt[start_ix:end_ix], char_pos-start_ix
            else:
                return " ".join(sents[start:end+1]), char_pos - offsets[start]
    print('couldnt find the char pos')
    print(ctxt, char_pos, len(ctxt)) 
Example #12
Source File: loader.py    From question-generation with MIT License
def get_vocab(corpus, vocab_size=2000):
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens
    vocab = {PAD:0,OOV:1, SOS:2, EOS:3}
    word_count = defaultdict(float)
    for l in corpus:
        # for w in l.lower().split():
        for w in tokenise(l):
            word_count[w] +=1
    vocab_list = sorted(word_count, key=word_count.__getitem__,reverse=True)[:min(vocab_size,len(word_count))]
    for w in vocab_list:
        vocab[w] = len(vocab)
    return vocab 
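A small usage sketch for get_vocab() (the corpus and the PAD/OOV/SOS/EOS values below are placeholders; in loader.py they are module-level special-token constants, and sent_tokenize, TreebankWordTokenizer, and defaultdict are imported at the top of the file):

# Placeholder special tokens standing in for loader.py's module constants.
PAD, OOV, SOS, EOS = '<PAD>', '<OOV>', '<SOS>', '<EOS>'

corpus = ["Where is the station?", "The station is over there."]
vocab = get_vocab(corpus, vocab_size=50)
# vocab maps the four special tokens to ids 0-3 and the most frequent
# lower-cased Treebank tokens ('the', 'station', 'is', ...) to later ids.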
Example #13
Source File: loader.py    From question-generation with MIT License
def get_glove_vocab(path, size=2000, d=200, variant='6B', filter_to_squad=False):
    # this is a copy of the function in preprocessing.py - but we can't use it as we'd get a circular import!
    def tokenise(text):
        sents = [s for s in sent_tokenize(text)]
        tokens = [tok.lower() for sent in sents for tok in TreebankWordTokenizer().tokenize(sent)]
        return tokens

    vocab = {PAD:0,OOV:1, SOS:2, EOS:3}
    if filter_to_squad:
        squad_words = set()
        squad_train = load_squad_triples(path, dev=False)
        squad_dev = load_squad_triples(path, dev=True)
        for triple in squad_train+squad_dev:
            squad_words |= set(tokenise(triple[0]))
            squad_words |= set(tokenise(triple[1]))
            squad_words |= set(tokenise(triple[2]))
    with open(path+'glove.'+variant+'/glove.'+variant+'.'+str(d)+'d.txt') as fp:
        entries = fp.readlines()
    for i,row in enumerate(entries):
        if len(vocab)-4>= size and size > 0:
            break
        cols = row.strip().split(' ')
        if len(cols) < d+1:
            print(row)
        if (filter_to_squad and cols[0] in squad_words) or not filter_to_squad:
            vocab[cols[0]] = len(vocab)
    return vocab

# def get_vocab(corpus, vocab_size=1000):
#     lines = [re.sub(r'([\,\?\!\.]+)',r' \1 ', line).lower() for line in corpus]
#     # lines = re.split('[\n]+',raw_data.lower())
#     vocab = {PAD:0,OOV:1, SOS:2, EOS:3}
#     word_count = defaultdict(float)
#     for l in lines:
#         for w in l.split():
#             word_count[w] +=1
#     vocab_list = sorted(word_count, key=word_count.__getitem__,reverse=True)[:min(vocab_size,len(word_count))]
#     for w in vocab_list:
#         vocab[w] = len(vocab)
#     return vocab 
Example #14
Source File: bbn2conll.py    From entity-recognition-datasets with MIT License
def tokenizeit(store):
    #NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P ?  Note this
    # seems to be done differently in different corpora.
    tokenizer = TreebankWordTokenizer()

    do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
    'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
    'Sgt.',
    'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
    'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
    'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
    'Ltda.','E.U.','I.B.M.','D.T.',
    'Nov.', 'Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
    'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
    'S.C.','Neb.',
    'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
    'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
    'Mo.','Vt.',
    'Blvd.','Ave.','Ln.','Rd.',
    'No.']
    pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')

    for i,x in enumerate(store):
        if x[0] == '\n':
            store[i] =  ([x[0]], store[i][1])
        #elif any([i in x[0] for i in do_not_tokenize]) and
        #elif '$' not in x[0] and '%' not in x[0]: #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
        elif x[0] in do_not_tokenize:
            toks = [x[0]]
            store[i] =  (toks, store[i][1])
        elif shall_use_split(x[0], do_not_tokenize):
            #x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
            toks = x[0].split(' ')
            #print 'Plain split on: ', x[0]
            store[i] =  (toks, store[i][1])
        else:
            toks = tokenizer.tokenize(x[0])
#            if '$' not in x[0] and '%' not in x[0] and "'" not in x[0] and "`" not in x[0] and x[0][-1]!='.' and not pat.match(x[0]):
#                toks = regtok(x[0])
#            elif x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
#                toks = x[0].split(' ')
#            elif x[0][0:4] in do_not_tokenize:
#                toks = [x[0][0:4]]
#                toks.extend(x[0][4:].split(' '))
#                toks = [i for i in toks if i!='']
#                print toks
#            else:
#                toks = word_tokenize(x[0])
            store[i] =  (toks, store[i][1])
    return store 
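The do_not_tokenize list appears to guard against one particular behaviour: TreebankWordTokenizer treats a trailing period as sentence-final punctuation, so an abbreviation passed in on its own gets its period split off. A quick illustration of the behaviour being avoided (illustrative output; exact results can vary slightly across NLTK versions):

from nltk.tokenize import TreebankWordTokenizer

tok = TreebankWordTokenizer()
print(tok.tokenize("Mr."))             # ['Mr', '.']  -- trailing period split off
print(tok.tokenize("Mr. Smith won."))  # ['Mr.', 'Smith', 'won', '.']  -- abbreviation kept mid-sentence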
Example #15
Source File: i2b2toconll.py    From entity-recognition-datasets with MIT License
def tokenizeit(store):
	#NOTE: how to tokenize stuff with &, like AT&T, S&L or S&P ?  Note this
	# seems to be done differently in different corpora.
	tokenizer = TreebankWordTokenizer()

	do_not_tokenize = ['Mr.','Dr.','Mrs.','Ms.','Prof.','Jr.','Sr.','Rep.',
	'Sen.','Rev.','St.','Lt.','Gov.','Gen.','Brig.','Maj.','Col.','Capt.',
	'Sgt.','M.D.',
	'U.S.','U.K.','U.N.','L.A.','U.S.S.R.','U.S.A.','B.C.',
	'N.V.','G.m.b.H.','S.p.A.','B.V.','N.A.',
	'Pty.','S.A.','Ltd.','Inc.','Bros.','Corp.','Co.','CORP.','L.P.','A.G.',
	'Ltda.','E.U.','I.B.M.','D.T.',
	'Nov.', 'Dec.','Jan.','Feb.','Aug.','Sept.','Sep.','Oct.','a.m.','p.m.',
	'Mass.','Calif.','N.J.','N.M.','N.Y.','N.C.','N.H.','R.I.','Ky.','Va.',
	'S.C.','Neb.',
	'Wash.','Mich.','Conn.','D.C.','Ark.','Pa.','Ind.','Ariz.','Miss.','Fla.',
	'Del.','Nev.','Ore.','Tenn.','Mont.','Ill.','Ala.','Wis.','Ga.','La.',
	'Mo.','Vt.',
	'Blvd.','Ave.','Ln.','Rd.',
	'No.']
	pat = re.compile(r'[0-9][.,]{0,1}[0-9]*')

	for i,x in enumerate(store):
		if x[0] == '\n':
			store[i] =  ([x[0]], store[i][1])
		#elif any([i in x[0] for i in do_not_tokenize]) and
		#elif '$' not in x[0] and '%' not in x[0]: #x[0] in do_not_tokenize: #{'Mr.','Dr.'}:
		elif x[0] in do_not_tokenize:
			toks = [x[0]]
			store[i] =  (toks, store[i][1])
		elif shall_use_split(x[0], do_not_tokenize):
			#x[0][-4:] in {'Inc.','N.V.','Ltd.'} or x[0][-5:] in {'Corp.'} or x[0][-6:] in {'S.p.A.'} or x[0][-3:] in {'Co.'}:
			toks = x[0].split(' ')
			#print 'Plain split on: ', x[0]
			store[i] =  (toks, store[i][1])
		elif '%' in x[0]:
			toks = tokenizer.tokenize(x[0])
			store[i] = (toks, store[i][1] )
		else:
			#NOTE It seems like this is already tokenized inline in the xml,
			# so this way (just splitting spaces) may be best here.
			toks = x[0].split(' ')
			store[i] =  (toks, store[i][1])
	return store