Python nltk.tokenize.word_tokenize() Examples

The following are 30 code examples of nltk.tokenize.word_tokenize(), collected from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out the other available functions and classes of the module nltk.tokenize.
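Before the project examples, here is a minimal usage sketch of the tokenizer itself (an illustrative snippet, not taken from any of the projects below; it assumes the Punkt models are installed via nltk.download):

import nltk
from nltk.tokenize import word_tokenize

# word_tokenize relies on the Punkt tokenizer models; download them once if needed
nltk.download('punkt')

tokens = word_tokenize("NLTK's word_tokenize splits text into tokens, including punctuation.")
print(tokens)
# ['NLTK', "'s", 'word_tokenize', 'splits', 'text', 'into', 'tokens', ',', 'including', 'punctuation', '.']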
Example #1
Source File: summarize.py    From Django-Bookworm with MIT License
def get_summary(self, number_of_sentences=5):
        '''
            generates summary based on weighted word frequencies

            :param number_of_sentences: total number of sentences to return in summary
            :return: string of summary
        '''
        sentence_value = {}
        for sentence in self.__sentence:
            for word in self.__word_freq.keys():
                if word in word_tokenize(sentence.lower()):
                    if sentence in sentence_value:
                        sentence_value[sentence] += self.__word_freq.get(word)
                    else:
                        sentence_value[sentence] = self.__word_freq.get(word, 0)
        
        summary_sentences = heapq.nlargest(number_of_sentences, sentence_value, key=sentence_value.get)
        summary = ' '.join(summary_sentences)
        return summary 
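The method above depends on class state (self.__sentence and self.__word_freq). For context, a rough standalone sketch of the same frequency-weighted sentence scoring might look like this (the summarize helper and its variable names are hypothetical, not part of the original project):

import heapq
from nltk.tokenize import sent_tokenize, word_tokenize

def summarize(text, number_of_sentences=5):
    sentences = sent_tokenize(text)

    # crude word-frequency table over the whole document
    word_freq = {}
    for word in word_tokenize(text.lower()):
        if word.isalpha():
            word_freq[word] = word_freq.get(word, 0) + 1

    # score each sentence by the summed frequency of the words it contains
    sentence_value = {}
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        for word, freq in word_freq.items():
            if word in tokens:
                sentence_value[sentence] = sentence_value.get(sentence, 0) + freq

    best = heapq.nlargest(number_of_sentences, sentence_value, key=sentence_value.get)
    return ' '.join(best)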
Example #2
Source File: pre_processing.py    From TextLevelGCN with GNU General Public License v3.0
def clean_text(text):
        # stop_words = stopwords.words('english')
        stop_words = []
        stop_words.extend(['!', ',' ,'.' ,'?' ,'-s' ,'-ly' ,'</s> ', 's'])
        stemmer = WordNetLemmatizer()

        text = remove_short(text)
        text = clean_str(text)

        text = word_tokenize(text)

        text = [word for word in text if word not in stop_words]

        text = [stemmer.lemmatize(word) for word in text]

        return ' '.join(text) 
Example #3
Source File: ded_detAttn.py    From tf-var-attention with MIT License
def validate(self, sess, x_val, y_val, true_val):
        # Calculate BLEU on validation data
        hypotheses_val = []
        references_val = []
        symbol=[]
        if self.config['experiment'] == 'qgen':
            symbol.append('?')
        for batch_i, (input_batch, output_batch, source_sent_lengths, tar_sent_lengths) in enumerate(
                data_utils.get_batches(x_val, y_val, self.batch_size)):
            answer_logits = sess.run(self.inference_logits,
                                     feed_dict={self.input_data: input_batch,
                                                self.source_sentence_length: source_sent_lengths,
                                                self.keep_prob: 1.0})

            for k, pred in enumerate(answer_logits):
                hypotheses_val.append(
                    word_tokenize(" ".join([self.decoder_idx_word[i] for i in pred if i not in [self.pad, -1, self.eos]])) + symbol)
                references_val.append([word_tokenize(true_val[batch_i * self.batch_size + k])])

        bleu_scores = eval_utils.calculate_bleu_scores(references_val, hypotheses_val)
        self.epoch_bleu_score_val['1'].append(bleu_scores[0])
        self.epoch_bleu_score_val['2'].append(bleu_scores[1])
        self.epoch_bleu_score_val['3'].append(bleu_scores[2])
        self.epoch_bleu_score_val['4'].append(bleu_scores[3]) 
Example #4
Source File: word2vec.py    From Bidirectiona-LSTM-for-text-summarization- with MIT License
def createCorpus(t):
    corpus = []
    all_sent = []
    for k in t:
        for p in t[k]:
            corpus.append(st(p))
    for sent in range(len(corpus)):
        for k in corpus[sent]:
            all_sent.append(k)
    for m in range(len(all_sent)):
        all_sent[m] = wt(all_sent[m])
    
    all_words=[]
    for sent in all_sent:
        hold=[]
        for word in sent:
            hold.append(word.lower())
        all_words.append(hold)
    return all_words 
Example #5
Source File: vqa.py    From visual_question_answering with MIT License
def filter_by_ans_len(self, max_ans_len, min_freq=5):
                print("Filtering the answers by length...")
                keep_ques = {}
                for ann in tqdm(self.dataset['annotations']):
                    if len(word_tokenize(ann['best_answer'])) <= max_ans_len \
                        and ann['best_answer_count']>=min_freq:
                        keep_ques[ann['question_id']] = \
                            keep_ques.get(ann['question_id'], 0) + 1

                self.dataset['annotations'] = \
                    [ann for ann in self.dataset['annotations'] \
                    if keep_ques.get(ann['question_id'],0)>0]
                self.questions['questions'] = \
                    [ques for ques in self.questions['questions'] \
                    if keep_ques.get(ques['question_id'],0)>0]

                self.createIndex() 
Example #6
Source File: prepro.py    From visDial.pytorch with MIT License
def tokenize_data(data):
    '''
    Tokenize captions, questions and answers
    Also maintain word count if required
    '''
    ques_toks, ans_toks, caption_toks = [], [], []

    print(data['split'])
    print('Tokenizing captions...')
    for i in data['data']['dialogs']:
        caption = word_tokenize(i['caption'])
        caption_toks.append(caption)

    print('Tokenizing questions...')
    for i in data['data']['questions']:
        ques_tok = word_tokenize(i + '?')
        ques_toks.append(ques_tok)

    print('Tokenizing answers...')
    for i in data['data']['answers']:
        ans_tok = word_tokenize(i)
        ans_toks.append(ans_tok)

    return ques_toks, ans_toks, caption_toks 
Example #7
Source File: vqa.py    From visual_question_answering with MIT License
def createIndex(self):
                # create index
                print('creating index...')
                imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
                qa =  {ann['question_id']: [] for ann in self.dataset['annotations']}
                qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
                max_ques_len = 0
                for ann in self.dataset['annotations']:
                        imgToQA[ann['image_id']] += [ann]
                        qa[ann['question_id']] = ann
                for ques in self.questions['questions']:
                        qqa[ques['question_id']] = ques
                        max_ques_len = max(max_ques_len,
                            len(word_tokenize(ques['question'])))
                print('index created!')

                # create class members
                self.qa = qa
                self.qqa = qqa
                self.imgToQA = imgToQA
                self.max_ques_len = max_ques_len 
Example #8
Source File: vqa.py    From visual_question_answering with MIT License
def filter_by_ques_len(self, max_ques_len):
                print("Filtering the questions by length...")
                keep_ques = {}
                for ques in tqdm(self.questions['questions']):
                    if len(word_tokenize(ques['question'])) <= max_ques_len:
                        keep_ques[ques['question_id']] = \
                            keep_ques.get(ques['question_id'], 0) + 1

                self.dataset['annotations'] = \
                    [ann for ann in self.dataset['annotations'] \
                    if keep_ques.get(ann['question_id'],0)>0]
                self.questions['questions'] = \
                    [ques for ques in self.questions['questions'] \
                    if keep_ques.get(ques['question_id'],0)>0]

                self.createIndex() 
Example #9
Source File: data_utils.py    From dgm_latent_bow with MIT License
def quora_read(file_path, bleu_baseline=False):
  """Read the quora dataset"""
  print("Reading quora raw data .. ")
  print("  data path: %s" % file_path)
  with open(file_path) as fd:
    lines = fd.readlines()
  sentence_sets = []
  for l in tqdm(lines):
    p0, p1 = l[:-1].lower().split("\t")
    sentence_sets.append([word_tokenize(p0), word_tokenize(p1)])

  if(bleu_baseline):
    print("calculating bleu ... ")
    hypothesis = [s[0] for s in sentence_sets]
    references = [s[1:] for s in sentence_sets]
    bleu = corpus_bleu(references, hypothesis)
    print("bleu on the training set: %.4f" % bleu)
  return sentence_sets 
Example #10
Source File: prepare_clc_fce_data.py    From NLP_Toolkit with Apache License 2.0
def main():
    fce = convert_fce(args.fce_dataset_path)
    with open(args.output + "/fce-original.txt", 'w', encoding='utf-8') as out_original, \
            open(args.output + "/fce-applied.txt", 'w', encoding='utf-8') as out_applied:
        for doc in tqdm(fce, unit='doc'):
            sents = re.split(r"\n +\n", doc)
            for sent in sents:
                tokenized_sents = sent_tokenize(sent)
                for i in range(len(tokenized_sents)):
                    if re.search(r"[{>][.?!]$", tokenized_sents[i]):
                        tokenized_sents[i + 1] = tokenized_sents[i] + " " + tokenized_sents[i + 1]
                        tokenized_sents[i] = ""
                    regexp = r'{([^{}]*?)=>([^{}]*?)}'
                    original = re.sub(regexp, r"\1", tokenized_sents[i])
                    applied = re.sub(regexp, r"\2", tokenized_sents[i])
                    # filter out nested alerts
                    if original != "" and applied != "" and not re.search(r"[{}=]", original) \
                            and not re.search(r"[{}=]", applied):
                        out_original.write(" ".join(word_tokenize(original)) + "\n")
                        out_applied.write(" ".join(word_tokenize(applied)) + "\n") 
Example #11
Source File: utils.py    From text-summarization-tensorflow with MIT License
def build_dataset(step, word_dict, article_max_len, summary_max_len, toy=False):
    if step == "train":
        article_list = get_text_list(train_article_path, toy)
        title_list = get_text_list(train_title_path, toy)
    elif step == "valid":
        article_list = get_text_list(valid_article_path, toy)
    else:
        raise NotImplementedError

    x = [word_tokenize(d) for d in article_list]
    x = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in x]
    x = [d[:article_max_len] for d in x]
    x = [d + (article_max_len - len(d)) * [word_dict["<padding>"]] for d in x]
    
    if step == "valid":
        return x
    else:        
        y = [word_tokenize(d) for d in title_list]
        y = [[word_dict.get(w, word_dict["<unk>"]) for w in d] for d in y]
        y = [d[:(summary_max_len - 1)] for d in y]
        return x, y 
Example #12
Source File: dont_run_me_run_the_other_script_instead.py    From punctuator2 with MIT License
def process_line(line):

    tokens = word_tokenize(line)
    output_tokens = []

    for token in tokens:

        if token in INS_PUNCTS:
            output_tokens.append(INS_PUNCTS[token])
        elif token in EOS_PUNCTS:
            output_tokens.append(EOS_PUNCTS[token])
        elif is_number(token):
            output_tokens.append(NUM)
        else:
            output_tokens.append(token.lower())

    return untokenize(" ".join(output_tokens) + " ") 
Example #13
Source File: sumbasic.py    From ns with MIT License
def sum_basic(lines, word_limit, update_non_redundency=True):
    def weight(sents, distribution):
        def _weight_sent(sent):
            tokens = preprocess(word_tokenize(sent))
            return reduce(lambda x,y: x+y, [distribution.get(x) for x in tokens]) / len(tokens)
            
        return [_weight_sent(sent) for sent in sents]
    
    def probability_distribution(tokens):
        N = len(tokens)
        distinct_words = set(tokens)
        
        probabilities = [tokens.count(w) / N for w in distinct_words]
        return dict(list(zip(distinct_words, probabilities)))
    
    sents = to_sents(lines)
    tokens = to_tokens(sents)
    tokens = preprocess(tokens)
    
    pd = probability_distribution(tokens)
    
    summary = "" 
    
    while len(word_tokenize(summary)) < word_limit:
        weights = weight(sents, pd)
        highest_weight_sentence = max(list(zip(sents, weights)), key=itemgetter(1))[0]
        summary += " " + highest_weight_sentence
        if update_non_redundency:
            for token in preprocess(word_tokenize(highest_weight_sentence)):
                pd[token] = pd[token] * pd[token]
        else:
            sents.remove(highest_weight_sentence)

    return summary
Example #14
Source File: sumbasic.py    From ns with MIT License
def leading(lines, word_limit):
    sents = to_sents(lines)
    summary = ""
    while len(word_tokenize(summary)) < word_limit:
        summary += " " + sents.pop(0)
    return summary

# main methods 
Example #15
Source File: reviews_data.py    From company-reviews with MIT License
def preprocess(reviews, stop, MIN_WORDS):
    docs = []
    doc_indexes = []
    for i,review in enumerate(reviews):
        rev_words = []
        words = [word for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]
        stripped_words = []
        for word in words:
            new_words = strip_and_split(word) # some words aren't separated correctly or have numbers
            stripped_words += [nw for nw in new_words if nw not in stop]
        if len(stripped_words) < MIN_WORDS: continue
        docs.append(stripped_words)
        doc_indexes.append(i)
    return docs, doc_indexes 
Example #16
Source File: reviews_data.py    From company-reviews with MIT License
def get_combined_lower(indeed_reviews_db, glassdoor_reviews_db):
    combined = get_combined_reviews(indeed_reviews_db,glassdoor_reviews_db)
    combined_lower = []
    for review in combined:
        combined_lower.append(review.lower())#' '.join([word for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))
    return combined_lower


# Make the ratings keys standard 
Example #17
Source File: reviews_data.py    From company-reviews with MIT License
def get_stemmed_separate(indeed_reviews_db, glassdoor_reviews_db):
    separate = get_separate_reviews(indeed_reviews_db, glassdoor_reviews_db)
    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in separate:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))
    return stemmed_reviews 
Example #18
Source File: reviews_data.py    From company-reviews with MIT License
def get_stemmed_combined_reviews(indeed_reviews_db, glassdoor_reviews_db):
    combined = get_combined_reviews(indeed_reviews_db, glassdoor_reviews_db)

    stemmer = PorterStemmer()
    stemmed_reviews = []
    for review in combined:
        stemmed_reviews.append(' '.join([stemmer.stem(word) for sent in sent_tokenize(review) for word in word_tokenize(sent.lower())]))

    return stemmed_reviews 
Example #19
Source File: utilities.py    From KDD2018_MPCN with GNU General Public License v3.0
def tylib_tokenize(x, setting='split', lower=False,
                tweet_process=False):
    ''' All tokenizers in one. A convenient wrapper

    Supported - 'split','nltk_tweet'

    TODO:'treebank','nltk_word'

    Args:
        x: `str`. the input string to tokenize
        setting: `str` supports different tokenizers

    Returns:
        Tokenized output `list`

    '''
    if(setting=='split'):
        tokens = x.split(' ')
    elif(setting=='nltk_tweet'):
        tokens = tweet_tokenizer.tokenize(x)
    elif(setting=='nltk'):
        tokens = word_tokenize(x)
    if(lower):
        tokens = [x.lower() for x in tokens]
    if(tweet_process):
        tokens = [tweet_processer(x) for x in tokens]
    return tokens 
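For instance, calling tylib_tokenize("Hello there!", setting='nltk', lower=True) would return ['hello', 'there', '!'], since the 'nltk' branch delegates to word_tokenize before lowercasing.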
Example #20
Source File: preprocess_trec.py    From sentence_classification with MIT License
def preprocess(text):
    """
    Preprocess text for encoder
    """
    X = []
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for t in text:
        sents = sent_detector.tokenize(t)
        result = ''
        for s in sents:
            tokens = word_tokenize(s)
            result += ' ' + ' '.join(tokens)
        X.append(result)
    return X 
Example #21
Source File: evaluator.py    From NLP_Toolkit with Apache License 2.0
def yelp_ppl(self, texts_transfered):
        texts_transfered = [' '.join(word_tokenize(itm.lower().strip())) for itm in texts_transfered]
        total_score = 0
        length = 0
        for line in texts_transfered:
            length += len(line.split())
            total_score += self.yelp_ppl_model.score(line)
        return math.pow(10, -total_score / length)
Example #22
Source File: evaluator.py    From NLP_Toolkit with Apache License 2.0
def nltk_bleu(self, texts_origin, text_transfered):
        texts_origin = [word_tokenize(text_origin.lower().strip()) for text_origin in texts_origin]
        text_transfered = word_tokenize(text_transfered.lower().strip())
        return sentence_bleu(texts_origin, text_transfered) * 100 
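The same pattern works outside the class; a hedged standalone sketch using NLTK's sentence_bleu directly (the example sentences are arbitrary):

from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu

# one or more tokenized references and a single tokenized hypothesis
references = [word_tokenize("the cat sat on the mat.".lower().strip())]
hypothesis = word_tokenize("the cat is sitting on the mat.".lower().strip())

print(sentence_bleu(references, hypothesis) * 100)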
Example #23
Source File: evaluator.py    From NLP_Toolkit with Apache License 2.0
def yelp_style_check(self, text_transfered, style_origin):
        text_transfered = ' '.join(word_tokenize(text_transfered.lower().strip()))
        if text_transfered == '':
            return False
        label = self.classifier_yelp.predict([text_transfered])
        style_transfered = label[0][0] == '__label__positive'
        return (style_transfered != style_origin) 
Example #24
Source File: reader.py    From SEDST with MIT License
def _get_tokenized_data(self, raw_data, db_data, construct_vocab):
        tokenized_data = []
        vk_map = self._value_key_map(db_data)
        for dial_id, dial in enumerate(raw_data):
            tokenized_dial = []
            for turn in dial['dial']:
                turn_num = turn['turn']
                constraint = []
                for slot in turn['usr']['slu']:
                    if slot['act'] == 'inform':
                        s = slot['slots'][0][1]
                        if s not in ['dontcare', 'none']:
                            constraint.extend(word_tokenize(s))
                degree = len(self.db_search(constraint))
                constraint.append('EOS_Z1')
                user = word_tokenize(turn['usr']['transcript']) + ['EOS_U']
                response = word_tokenize(self._replace_entity(turn['sys']['sent'], vk_map, constraint)) + ['EOS_M']
                tokenized_dial.append({
                    'dial_id': dial_id,
                    'turn_num': turn_num,
                    'user': user,
                    'response': response,
                    'constraint': constraint,
                    'degree': degree,
                })
                if construct_vocab:
                    for word in user + response + constraint:
                        self.vocab.add_item(word)
            tokenized_data.append(tokenized_dial)
        return tokenized_data 
Example #25
Source File: ngrams-example.py    From python-examples with MIT License
def get_ngrams(text, n):
    ngramnums = word_tokenize(text)
    ll = [x for x in ngramnums if not re.fullmatch('[' + string.punctuation + ']+', x)]
    ll = ngrams(ll, n)
    return [' '.join(grams) for grams in ll] 
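A quick usage sketch of the same steps (assuming the imports the original module relies on: re, string, word_tokenize, and ngrams from nltk.util):

import re
import string
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

text = "The quick brown fox jumps over the lazy dog."
tokens = word_tokenize(text)
# drop punctuation-only tokens, as get_ngrams() does above
words = [t for t in tokens if not re.fullmatch('[' + string.punctuation + ']+', t)]
print([' '.join(g) for g in ngrams(words, 2)][:3])
# ['The quick', 'quick brown', 'brown fox']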
Example #26
Source File: disintegrator.py    From quantified-self with MIT License
def convert2simple(self, sentence=""):
        tokenized = word_tokenize(sentence)
        tokenized = self.__filter_punctuation(tokenized)
        tokenized = self.__filter_stopwords(tokenized)
        return " ".join(self.__lemmatize(tokenized)) 
Example #27
Source File: word_counter.py    From wordcounter with Apache License 2.0
def append_ext(words):
    new_words = []
    for item in words:
        word, count = item
        tag = nltk.pos_tag(word_tokenize(word))[0][1] # tag is like [('bigger', 'JJR')]
        new_words.append((word, count, tag))
    return new_words 
Example #28
Source File: word_counter.py    From wordcounter with Apache License 2.0
def merge(words):
    new_words = []
    for word in words:
        if word:
            tag = nltk.pos_tag(word_tokenize(word)) # tag is like [('bigger', 'JJR')]
            pos = get_wordnet_pos(tag[0][1])
            if pos:
                lemmatized_word = lmtzr.lemmatize(word, pos)
                new_words.append(lemmatized_word)
            else:
                new_words.append(word)
    return new_words 
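For reference, a small hedged sketch of the tag-then-lemmatize step in isolation (get_wordnet_pos and lmtzr belong to the original module; here the WordNet POS constant is passed directly and the required NLTK data is downloaded explicitly):

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# the tagger model and WordNet data are needed once
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lmtzr = WordNetLemmatizer()
tags = nltk.pos_tag(word_tokenize("He was running fast"))  # e.g. ('running', 'VBG') for the verb
print(lmtzr.lemmatize("running", wordnet.VERB))            # 'run'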
Example #29
Source File: data_utils.py    From dgm_latent_bow with MIT License
def mscoco_read_json(file_path, bleu_baseline=False):
  """Read the mscoco dataset

  Args:
    file_path: path to the raw data, a string

  Returns:
    sentence_sets: the sentence sets, a list of paraphrase lists
  """
  print("Reading mscoco raw data .. ")
  print("  data path: %s" % file_path)
  with open(file_path, "r") as fd:
    data = json.load(fd)

  print("%d sentences in total" % len(data["annotations"]))
  
  # aggregate all sentences of the same images
  image_idx = set([d["image_id"] for d in data["annotations"]])
  paraphrases = {}
  for im in image_idx: paraphrases[im] = []
  for d in tqdm(data["annotations"]):
    im = d["image_id"]
    sent = d["caption"]
    paraphrases[im].append(word_tokenize(sent))

  sentence_sets = [paraphrases[im] for im in paraphrases]

  # bleu on the training set, a baseline / upper bound
  if(bleu_baseline):
    print("calculating bleu ... ")
    hypothesis = [s[0] for s in sentence_sets]
    references = [s[1:] for s in sentence_sets]
    bleu = corpus_bleu(references, hypothesis)
    print("bleu on the training set: %.4f" % bleu)
  return sentence_sets 
Example #30
Source File: reader.py    From ConvLab with MIT License
def _tokenize(self, sent):
        return ' '.join(word_tokenize(sent))
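As a quick check, passing "I'm here." through this _tokenize() helper returns "I 'm here .", since word_tokenize splits contractions and punctuation into separate tokens before they are re-joined with spaces.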