Python nltk.word_tokenize() Examples

The following code examples show how to use nltk.word_tokenize(). They are drawn from open source Python projects.
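
A minimal usage sketch (not tied to any of the projects below), assuming the Punkt tokenizer models have been downloaded:

import nltk
# nltk.download('punkt')  # one-time download of the Punkt tokenizer models

tokens = nltk.word_tokenize("They'll tokenize this, won't they?")
print(tokens)
# typically: ['They', "'ll", 'tokenize', 'this', ',', 'wo', "n't", 'they', '?']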

Example 1
Project: Snowball   Author: davidsbatista   File: VectorSpaceModel.py    GNU General Public License v3.0
def __init__(self, sentences_file, stopwords):
        self.dictionary = None
        self.corpus = None
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        documents = list()
        count = 0
        print("Gathering sentences and removing stopwords")
        for line in f_sentences:
            line = re.sub('<[A-Z]+>[^<]+</[A-Z]+>', '', line)

            # remove stop words and tokenize
            document = [word for word in nltk.word_tokenize(line.lower()) if word not in stopwords]
            documents.append(document)
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")

        f_sentences.close()

        self.dictionary = corpora.Dictionary(documents)
        self.corpus = [self.dictionary.doc2bow(text) for text in documents]
        self.tf_idf_model = TfidfModel(self.corpus)

        print(len(documents), "documents read")
        print(len(self.dictionary), "unique tokens") 
Example 2
Project: domain_discovery_API   Author: VIDA-NYU   File: preprocess.py    GNU General Public License v3.0
def preprocess(self,text):
        #text = text.split(" ");
        text = word_tokenize(text)
        if self.display:
            print "After Tokenizing"
            print text
            print "\n\n"

        text=[w.strip().lower() for w in text if not w.strip() in ENGLISH_STOPWORDS and len(w.strip())>2]
        
        tc = TextCollection([text])
        words = list(set(tc))
        
        word_tf = {word: tc.tf(word, text) * len(text) for word in words}

        return word_tf 
Example 3
Project: ConvLab   Author: ConvLab   File: Mem2Seq.py    MIT License
def predict(self, query):
        usr = query
        print('Mem2Seq usr:', usr)
        #example input: 'please find a restaurant called nusha .'
        self.t += 1
        print('Mem2Seq turn:', self.t)
        usr = ' '.join(word_tokenize(usr.lower()))
        self.memory += generate_memory(usr, '$u', self.t)
        src_plain = (self.memory+[['$$$$']*MEM_TOKEN_SIZE],)
        src_seqs = plain2tensor(self.lang.word2index, src_plain[0])
        words = self.model.evaluate_batch(1, src_seqs, [len(src_plain[0])], None, None, None, None, src_plain)
        row = np.transpose(words)[0].tolist()
        if '<EOS>' in row:
            row = row[:row.index('<EOS>')]
        sys = ' '.join(row)
        sys = denormalize(sys)
        print('Mem2Seq sys:', sys)
        self.memory += generate_memory(sys, '$s', self.t)
        return sys 
Example 4
Project: VMED   Author: thaihungle   File: data_util.py    MIT License
def load_lines_from_file(fpath, str2tok):
    all_sens=[]
    with open(fpath) as f:
        for line in f:
            sen=[]
            line = ''.join([ch if ch in EN_WHITELIST else ' ' for ch in line.lower()])
            tokens = nltk.word_tokenize(line)
            for tok in tokens:
                if tok in str2tok:
                    sen.append(str2tok[tok])
                else:
                    sen.append(str2tok['<unknown>'])
            all_sens.append([sen,[1]*10])
    # print(all_sens)
    # raise False
    return all_sens 
Example 5
Project: Machine-Translation   Author: foamliu   File: analyze_data.py    Apache License 2.0
def analyze_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        seg_list = list(jieba.cut(sentence.strip()))
        # Update word frequency
        sent_lengths.append(len(seg_list))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show() 
Example 6
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-5-document-classification.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
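
The CountVectorizer-followed-by-TfidfTransformer pattern above can also be written with scikit-learn's TfidfVectorizer, which combines the two steps; a roughly equivalent sketch under the same assumptions (same imports and stop-word list):

from sklearn import feature_extraction
import nltk

def extract_features_tfidf(corpus, stop_words):
    '''Roughly equivalent TF-IDF extraction in a single vectorizer'''
    vectorizer = feature_extraction.text.TfidfVectorizer(
        lowercase=True,
        tokenizer=nltk.word_tokenize,
        min_df=2,
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    return vectorizer.fit_transform(corpus)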
Example 7
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.4-tfidf-svm.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    stop_words = nltk.corpus.stopwords.words("english")

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 8
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 9
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 10
Project: formality_emnlp19   Author: jimth001   File: multi_process_tokenizer.py    MIT License
def tokenizer(strings:'list of string',type:str='word',join:bool=True)->'list of results':
    assert type=='word' or type=='sen'
    results=[]
    if type=='word':
        for s in strings:
            if join:
                results.append(' '.join(nltk.word_tokenize(s)))
            else:
                results.append(nltk.word_tokenize(s))
    else:
        for s in strings:
            if join:
                results.append(' '.join(nltk.sent_tokenize(s)))
            else:
                results.append(nltk.sent_tokenize(s))
    return results 
Example 11
Project: formality_emnlp19   Author: jimth001   File: classifier_em.py    MIT License
def preprocess(informal_src_list,formal_src_list,embedding_path,output_path=None,shuffle=True):
    vectors,vocab_hash=embedding_api.load_word_embedding(embedding_path)
    all_data=[]
    for src in informal_src_list:
        with open(src,'r',encoding='utf-8') as f:
            for line in f:
                d=Data(nltk.word_tokenize(line.strip()), 0, line.strip())
                d.str2index(vocab_hash,with_unk=False)
                all_data.append(d)
    for src in formal_src_list:
        with open(src,'r',encoding='utf-8') as f:
            for line in f:
                d=Data(nltk.word_tokenize(line.strip()), 1, line.strip())
                d.str2index(vocab_hash,with_unk=False)
                all_data.append(d)
    if shuffle:
        random.shuffle(all_data)
    if output_path is not None:
        pickle.dump(all_data,open(output_path,'wb'),protocol=True)
    return all_data 
Example 12
Project: Health-Checker   Author: KriAga   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 13
Project: branchLSTM   Author: kochkinaelena   File: preprocessing.py    MIT License
def str_to_wordlist(tweettext, tweet, remove_stopwords=False):

    #  Remove non-letters
    # NOTE: Is it helpful or not to remove non-letters?
    # str_text = re.sub("[^a-zA-Z]"," ", str_text)
    tweettext = cleantweet(tweettext, tweet)
    str_text = re.sub("[^a-zA-Z]", " ", tweettext)
    # Convert words to lower case and split them
    # words = str_text.lower().split()
    words = nltk.word_tokenize(str_text.lower())
    # Optionally remove stop words (false by default)
    # NOTE: generic list of stop words, should i remove them or not?
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    # 5. Return a list of words
    return(words)
#%%
# Turn tweet into average of word vectors 
Example 14
Project: qb   Author: Pinafore   File: jmlr.py    MIT License
def qanta_2012_stats():
    """
    This computes and prints dataset statistics for prior versions from EMNLP 2012.
    Published results use private NAQT data, these stats are computed using only public data.
    Use nltk for word tokenization to be consistent with prior analysis.
    Use spacy for sentence tokenization to be consistent with qanta dataset preprocessing.
    (We don't use word tokenizations in dataset preprocessing, we consider it a model detail.)
    """
    with open('data/external/emnlp_2012_questions.csv') as f:
        questions_2012 = list(csv.reader(f))

    eprint('N EMNLP 2012 Questions', len(questions_2012))
    questions_2012 = [q[4] for q in questions_2012]
    tokenized_2012 = pseq(questions_2012).map(nltk.word_tokenize).list()
    n_tokens_2012 = sum(len(q) for q in tokenized_2012)
    eprint('N EMNLP 2012 Tokens', n_tokens_2012)
    n_sentences = [len(nlp(q)) for q in tqdm(questions_2012)]
    eprint('N EMNLP 2012 Sentences', sum(n_sentences)) 
Example 15
Project: cmu-ammml-project   Author: jayanthkoushik   File: extract_transc_feats.py    MIT License
def get_toks(fname):
    with open(fname) as f:
        transc_str = f.read()
    transc_str = transc_str.replace("\r\n", " ")
    transc_str = transc_str.replace("\r\r", " ")
    transc_str = transc_str.replace("\n", " ")
    transc_str = transc_str.replace("-", " ")

    # Convert the transcription to lower case, but make
    # filler markers (like "umm", "uhh" etc.) upper case.
    transc_str = transc_str.lower()
    m = re.findall("[{(]([^\s]*?)[)}]", transc_str)
    for word in m:
        transc_str = re.sub("[({{]{}[)}}]".format(word),
                            " " + word.upper() + " ", transc_str)

    # Construct the feature vector.
    return nltk.word_tokenize(transc_str) 
Example 16
Project: who-are-you   Author: PawelPamula   File: __init__.py    MIT License
def tweets2tags(text, hasht):
    tx=[]
    for line in text:
        tokens=word_tokenize(line)
        tags=nltk.pos_tag(tokens)
        text= [s[0] for s in tags if s[1].startswith('NN')]
        tx.extend(text)
    vectorizer = TfidfVectorizer(stop_words="english",min_df=1)
    X = vectorizer.fit_transform(tx)
    idf = vectorizer.idf_
    size=len(idf)
    idf[:size//5]=2
    idf[size//5:2*size//5]=3
    idf[2*size//5:3*size//5]=4
    idf[3*size//5:4*size//5]=5
    idf[4*size//5:]=7
    tags =  dict(zip(vectorizer.get_feature_names(), idf))
    for i in hasht:
        tags[i] = 6
    return tags 
Example 17
Project: humanitiesTutorial   Author: vierth   File: 20_corpusrep5_fast.py    Apache License 2.0
def tokenizeText(text):
    words = nltk.word_tokenize(text)
    filtered = [word for word in words if word.isalnum()]
    return filtered

# Create vectorizer object. Include use_idf=False to use frequencies 
Example 18
Project: Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs   Author: kamalkraj   File: ner.py    GNU General Public License v3.0
def predict(self,Sentence):
        Sentence = words =  word_tokenize(Sentence)
        Sentence = self.addCharInformation(Sentence)
        Sentence = self.padding(self.createTensor(Sentence,self.word2Idx,self.case2Idx,self.char2Idx))
        tokens, casing,char = Sentence
        tokens = np.asarray([tokens])     
        casing = np.asarray([casing])
        char = np.asarray([char])
        pred = self.model.predict([tokens, casing,char], verbose=False)[0]   
        pred = pred.argmax(axis=-1)
        pred = [self.idx2Label[x].strip() for x in pred]
        return list(zip(words,pred)) 
Example 19
Project: weather_report   Author: deniederhut   File: classifiers.py    BSD 2-Clause "Simplified" License
def classify(self, text):
        """Count number/kind of emotion terms"""
        if type(text) == str:
            text = [text]
        if type(text) == tuple:
            text = list(text)
        for item in text:
            self.items += 1
            self.terms += len(item.split())
            for term in word_tokenize(item):
                for syn in wn.synsets(term):
                    for path in syn.hypernym_paths():
                        if self.emotion in path:
                            self.update_from_path(path)
        return self 
Example 20
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_positive_reviews_without_sw(self):
        positive_array_without_sw=[]
        for review in self.get_positive_reviews(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            positive_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return positive_array_without_sw 
Example 21
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_negative_reviews_without_sw(self):
        negative_array_without_sw=[]
        for review in self.get_negative_reviews(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            negative_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return negative_array_without_sw 
Example 22
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_positive_array_without_sw(self):
        test_positive_array_without_sw=[]
        for review in self.get_test_positive_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            test_positive_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return test_positive_array_without_sw 
Example 23
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_negative_array_without_sw(self):
        test_negative_array_without_sw = []
        for review in self.get_test_negative_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            test_negative_array_without_sw.append(" ".join(str(x) for x in review_words_without_sw))
        return test_negative_array_without_sw 
Example 24
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_positive_reviews_stemmed_without_sw(self):
        positive_array_stemmed_without_sw = []
        stemmer = ISRIStemmer()
        for review in self.get_positive_reviews(self):
            review_words_stemmed_without_sw=[]
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            review_words_stemmed_without_sw=[]
            for word in review_words_without_sw:
                review_words_stemmed_without_sw .append(stemmer.stem(word))
            positive_array_stemmed_without_sw.append(" ".join(str(x) for x in review_words_stemmed_without_sw))
        return positive_array_stemmed_without_sw 
Example 25
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_positive_array_stemmed_without_sw(self):
        stemmer = ISRIStemmer()
        test_positive_array_stemmed_without_sw=[]
        review_words_stemmed_without_sw=[]
        for review in self.get_test_positive_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            review_words_stemmed_without_sw = []
            for word in review_words_without_sw:
                review_words_stemmed_without_sw .append(stemmer.stem(word))
            test_positive_array_stemmed_without_sw.append(" ".join(str(x) for x in review_words_stemmed_without_sw))
        return test_positive_array_stemmed_without_sw 
Example 26
Project: textmining_arabic   Author: IbrahimAlsharif   File: LoadData.py    Apache License 2.0
def get_test_negative_array_stemmed_without_sw(self):
        stemmer = ISRIStemmer()
        test_negative_array_stemmed_without_sw = []
        review_words_stemmed_without_sw = []
        for review in self.get_test_negative_array(self):
            review_words = nltk.word_tokenize(review)
            review_words_without_sw = [i for i in review_words if not i in self.get_arabic_sw(self)]
            review_words_stemmed_without_sw = []
            for word in review_words_without_sw:
                review_words_stemmed_without_sw .append(stemmer.stem(word))
            test_negative_array_stemmed_without_sw.append(" ".join(str(x) for x in review_words_stemmed_without_sw))
        return test_negative_array_stemmed_without_sw 
Example 27
Project: semantic-pdf-splitter   Author: MtnFranke   File: extractor.py    MIT License
def normalize(text):
    return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) 
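
stem_tokens and remove_punctuation_map are defined elsewhere in that project; a plausible sketch of such helpers (an assumption for illustration, not the project's actual code), using NLTK's Porter stemmer:

import string
import nltk

# Assumed helpers, for illustration only.
remove_punctuation_map = dict((ord(ch), None) for ch in string.punctuation)
stemmer = nltk.stem.PorterStemmer()

def stem_tokens(tokens):
    # Reduce each token to its stem, e.g. 'running' -> 'run'
    return [stemmer.stem(token) for token in tokens]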
Example 28
Project: botbuilder-python   Author: microsoft   File: bidaf_model_runtime.py    MIT License
def _preprocess(text: str) -> Tuple[np.ndarray, np.ndarray]:
        tokens = word_tokenize(text)
        # split into lower-case word tokens, in numpy array with shape of (seq, 1)
        words = np.asarray([w.lower() for w in tokens]).reshape(-1, 1)
        # split words into chars, in numpy array with shape of (seq, 1, 1, 16)
        chars = [[c for c in t][:16] for t in tokens]
        chars = [cs + [""] * (16 - len(cs)) for cs in chars]
        chars = np.asarray(chars).reshape(-1, 1, 1, 16)
        return words, chars 
Example 29
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def read_corpus():
    """open corpus file and create word and sentence tokens
    corpus file is the base brain for Tobot which contains words/sentences
    used by nltk and sklearn to help Tobot respond to questions"""
    f = open('tobot_corpus.txt', 'r', errors='ignore')
    raw = f.read()
    f.close()
    raw = raw.lower()
    #nltk.download('punkt')
    #nltk.download('wordnet')
    #nltk.download('stopwords')
    sent_tokens = nltk.sent_tokenize(raw)
    word_tokens = nltk.word_tokenize(raw)

    return sent_tokens, word_tokens 
Example 30
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def is_question(text):
    sentences = get_sentences(text)
    isquestion = False
    for sent in sentences:
        words = nltk.word_tokenize(sent)
        if words[-1] == '?' or words[0].lower() in QUESTION_START_WORDS:
            isquestion = True
            break
    return isquestion

# end language function


# text cleaning functions 
Example 31
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def get_words(text):
    """Retrieve the words present in a given string of text.
    Filter out the most common and stop words.
    The return value is a list of tuples where the first member is a lowercase word,
    and the second member the number of times it is present in the text."""
    wordsList = nltk.word_tokenize(text)
    # remove most common words
    fdist = nltk.probability.FreqDist(wordsList)
    most_common_words = fdist.most_common(2)
    for word, count in most_common_words:
        word = word.lower()
        if word in wordsList:
            wordsList.remove(word)
    # remove stop words
    stop_words = nltk.corpus.stopwords.words("english")
    filtered_wordsList = []
    for word in wordsList:
        word = word.lower()
        if word not in stop_words:
            filtered_wordsList.append(word)
    wordsList = filtered_wordsList[:]
    del filtered_wordsList[:]
    # perform lemmatization
    text = " ".join(wordsList)
    filtered_wordsList = lem_normalize(text)
    return Counter(filtered_wordsList).items() 
Example 32
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def clean_words(text):
    """Simple text cleaner that removes all tokens that are not alphabetic
    Returns list of clean words."""
    text = text.split('.')
    text = '. '.join(text).strip()
    words = nltk.word_tokenize(text)
    words_clean = [word for word in words if word.isalpha()]
    return words_clean 
Example 33
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0
def lem_normalize(text):
    remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
    return lem_tokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

# end text cleaning


# database functions 
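
The lem_tokens helper called in lem_normalize above is defined elsewhere in airbnb_bot.py; a minimal sketch of what such a helper can look like (an assumption, not the project's code), using NLTK's WordNet lemmatizer:

import nltk
# nltk.download('wordnet')  # required once for the lemmatizer

lemmer = nltk.stem.WordNetLemmatizer()

def lem_tokens(tokens):
    # Assumed helper: map each token to its lemma, e.g. 'cats' -> 'cat'
    return [lemmer.lemmatize(token) for token in tokens]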
Example 34
Project: Pesquisas   Author: danilopcarlotti   File: statistical_analysis.py    Apache License 2.0
def ngramFreq(self,text,n):
		''' Returns a Counter object with the frequency of each ngram in the text '''
		token = nltk.word_tokenize(text)
		Ngram = ngrams(token,n)
		return Counter(Ngram) 
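
The method does not use self, so the same logic works standalone; a brief sketch, assuming ngrams is nltk.util.ngrams and Counter is collections.Counter, as in that project's imports:

from collections import Counter
import nltk
from nltk.util import ngrams

tokens = nltk.word_tokenize("to be or not to be")
print(Counter(ngrams(tokens, 2)))
# e.g. Counter({('to', 'be'): 2, ('be', 'or'): 1, ('or', 'not'): 1, ('not', 'to'): 1})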
Example 35
Project: cdc   Author: ckbjimmy   File: MLPipeline.py    MIT License
def tokenize(text):
    import re
    tokens = word_tokenize(text)
    output = [i for i in tokens if i not in string.punctuation and not re.match("^[0-9.].*$", i) and len(i) > 2]
    output = stem_tokens(output, stemmer)
    return output 
Example 36
Project: razzy-spinner   Author: rafasashi   File: textcat.py    GNU General Public License v3.0
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 37
Project: RTX   Author: RTXteam   File: Question.py    MIT License
def find_edge_type(string):
	"""
	Extract edge type from string
	:param string: input string (chunk of text)
	:param edge_types: edge types in the KG (see dumpdata.py)
	:return: one of the edge types
	"""
	p = nltk.stem.snowball.SnowballStemmer("english")
	st_words = set(stopwords.words('english'))
	res = None
	# remove underscores
	edge_types_space = []
	for et in edge_types:
		edge_types_space.append(et.replace('_', ' '))
	# standardize the string by making it lowercase and removing stop words
	query = string.lower()
	query_tokens = nltk.word_tokenize(query, "english")
	query_no_stop = [w for w in query_tokens if w not in st_words]
	query_clean = ""
	for word in query_no_stop:
		query_clean += p.stem(word) + " "
	# see if it matches any of the standardized edge types
	for i in range(len(edge_types_space)):
		et = edge_types_space[i]
		et_tokens = nltk.word_tokenize(et)
		et_no_stop = [w for w in et_tokens if w not in st_words]
		et_clean = ""
		for word in et_no_stop:
			if word == "assoc":
				word = "associated"
			et_clean += p.stem(word) + " "
		if query_clean == et_clean:
			res = edge_types[i]
	return res

################################################
# The Question class to store the question templates
################################################ 
Example 38
Project: Machine-Translation   Author: foamliu   File: pre_process.py    Apache License 2.0
def build_wordmap_en():
    translation_path = os.path.join(train_translation_folder, train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    word_freq = Counter()

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
        # Update word frequency
        word_freq.update(tokens)

    # Create word map
    # words = [w for w in word_freq.keys() if word_freq[w] > min_word_freq]
    words = word_freq.most_common(input_lang_vocab_size - 4)
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}
    word_map['<pad>'] = 0
    word_map['<start>'] = 1
    word_map['<end>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:10])

    with open('data/WORDMAP_en.json', 'w') as file:
        json.dump(word_map, file, indent=4) 
Example 39
Project: Machine-Translation   Author: foamliu   File: pre_process.py    Apache License 2.0
def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))

    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder, train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder, train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder, valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder, valid_translation_zh_filename)
            filename = 'data/samples_valid.json'

        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()

        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()

        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            input_zh = encode_text(word_map_zh, list(seg_list))

            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en) if len(normalizeString(s)) > 0]
            output_en = encode_text(word_map_en, tokens)

            if len(input_zh) <= max_len and len(
                    output_en) <= max_len and UNK_token not in input_zh and UNK_token not in output_en:
                samples.append({'input': list(input_zh), 'output': list(output_en)})

        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)

        print('{} {} samples created at: {}.'.format(len(samples), usage, filename)) 
Example 40
Project: chattR   Author: patrickstocklin   File: np_extractors.py    GNU General Public License v2.0
def _tokenize_sentence(self, sentence):
        '''Split the sentence into single words/tokens'''
        tokens = nltk.word_tokenize(sentence)
        return tokens 
Example 41
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-6.1-nlp-pipeline.py    MIT License
def tokenize_words(targets):
    while True:
        sentence = (yield)
        words = nltk.word_tokenize(sentence)
        for target in targets:
            target.send(words) 
Example 42
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-3-sentiment-analysis.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency, i.e. the word must appear more than once.
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 43
Project: Hands-on-NLP-with-NLTK-and-scikit-learn-   Author: PacktPublishing   File: nlp-4-ngrams.py    MIT License
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''

    sa_stop_words = nltk.corpus.stopwords.words("english")

    # words that might invert a sentence's meaning
    white_list = [
        'what', 'but', 'if', 'because', 'as', 'until', 'against',
        'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
        'further', 'then', 'once', 'here', 'there', 'why', 'how', 'all', 'any',
        'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'can', 'will', 'just', 'don', 'should']

    # take these out of the standard NLTK stop word list
    sa_stop_words = [sw for sw in sa_stop_words if sw not in white_list]

    # vectorize means we turn non-numerical data into an array of numbers
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,  # for demonstration, True by default
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        min_df=2,  # minimum document frequency, i.e. the word must appear more than once.
        ngram_range=(1, 2),
        stop_words=sa_stop_words
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus 
Example 44
Project: NSCL-PyTorch-Release   Author: vacancy   File: datasets.py    MIT License
def _get_metainfo(self, index):
        question = gdef.translate_question(self.questions[index])
        scene = gdef.translate_scene(self.scenes[question['image_index']])
        question['scene'] = scene

        question['image_index'] = question['image_index']
        question['image_filename'] = gdef.get_image_filename(scene)
        question['question_index'] = index
        question['question_tokenized'] = nltk.word_tokenize(question['question'])

        # program section
        has_program = False
        if 'program_nsclseq' in question:
            question['program_raw'] = question['program_nsclseq']
            question['program_seq'] = question['program_nsclseq']
            has_program = True
        elif 'program' in question:
            question['program_raw'] = question['program']
            question['program_seq'] = gdef.program_to_nsclseq(question['program'], question)
            has_program = True

        if has_program:
            question['program_tree'] = nsclseq_to_nscltree(question['program_seq'])
            question['program_qsseq'] = nsclseq_to_nsclqsseq(question['program_seq'])
            question['program_qstree'] = nscltree_to_nsclqstree(question['program_tree'])
            question['question_type'] = question['program_seq'][-1]['op']
        else:
            question['question_type'] = None

        return question 
Example 45
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq.py    MIT License
def process_query_file(self):
        source=[]
        with open(self.query_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                source.append(self.sentence2indices(words))
        return source 
Example 46
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq.py    MIT License
def process_reply_file(self):
        source=[]
        with open(self.reply_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                source.append(self.sentence2indices(words))
        return source 
Example 47
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq.py    MIT License
def process_target_file(self):
        target_output = []
        target_input = []
        with open(self.target_file, 'r', encoding='utf-8') as f:
            for line in f:
                words = nltk.word_tokenize(line.strip())
                target_input.append(self.sentence2indices(words,with_sos=True))
                target_output.append(self.sentence2indices(words,with_eos=True))
        return target_input,target_output 
Example 48
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq_with_attention.py    MIT License
def process_query_file(self,max_len):
        source=[]
        total_num = 0
        cut_num = 0
        with open(self.query_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                if len(words)>max_len:
                    words=words[:max_len]
                    cut_num += 1
                total_num+=1
                source.append(self.sentence2indices(words))
        print('Total sentences:', total_num)
        print('Truncated sentences:', cut_num)
        return source 
Example 49
Project: Bi-Seq2Seq   Author: jimth001   File: biseq2seq_with_attention.py    MIT License
def process_reply_file(self,max_len):
        source=[]
        total_num = 0
        cut_num = 0
        with open(self.reply_file,'r',encoding='utf-8') as f:
            for line in f:
                words=nltk.word_tokenize(line.strip())
                if len(words)>max_len:
                    words=words[:max_len]
                    cut_num += 1
                total_num += 1
                source.append(self.sentence2indices(words))
        print('Total sentences:', total_num)
        print('Truncated sentences:', cut_num)
        return source 
Example 50
Project: social_mind   Author: byeongkyu   File: train_classifier.py    Apache License 2.0
def dialogue_act_features(sentence):
    features = {}
    sentence_filtered = re.sub("[\'.,#!?:-]", '', sentence)

    for word in word_tokenize(sentence_filtered):
        if word not in STOPWORDS:
            features['contains({})'.format(word.lower())] = True
    return features 
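
A brief usage sketch for the function above (STOPWORDS is defined elsewhere in the project; here it is assumed to be NLTK's English stop-word list):

from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))  # assumed definition
print(dialogue_act_features("What time is it?"))
# e.g. {'contains(what)': True, 'contains(time)': True}
# ('is' and 'it' are dropped as stop words; 'What' survives because the
#  stop-word check happens before lower-casing)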
Example 51
Project: social_mind   Author: byeongkyu   File: sentence_classifier.py    Apache License 2.0
def dialogue_act_features(sentence):
    features = {}
    sentence_filtered = re.sub("[\'.,#!?:-]", '', sentence)

    for word in word_tokenize(sentence_filtered):
        if word not in STOPWORDS:
            features['contains({})'.format(word.lower())] = True
    return features 
Example 52
Project: formality_emnlp19   Author: jimth001   File: pinc.py    MIT License
def load_file_and_tokenize(file):
    sens=[]
    with open(file,'r',encoding='utf-8') as f:
        for line in f:
            sens.append(nltk.word_tokenize(line.strip()))
    return sens 
Example 53
Project: formality_emnlp19   Author: jimth001   File: tools.py    MIT License
def tokenizer(sentence,join=False,only_split=True):
    if only_split:
        if join:
            return sentence
        else:
            return sentence.split()
    else:
        if join:
            return ' '.join(nltk.word_tokenize(sentence))
        else:
            return nltk.word_tokenize(sentence) 
Example 54
Project: formality_emnlp19   Author: jimth001   File: tools.py    MIT License
def break_sen_and_tokernize(para,break_sen=False):
    if break_sen:
        return nltk.word_tokenize(' '.join(break_sentence(para)))
    else:
        return nltk.word_tokenize(para) 
Example 55
Project: formality_emnlp19   Author: jimth001   File: tokenizer.py    MIT License
def file_tokenize(input,output):
    with open(input,'r',encoding='utf-8') as f:
        with open(output,'w',encoding='utf-8') as fw:
            for line in f:
                fw.write(' '.join(nltk.word_tokenize(line.strip()))+'\n') 
Example 56
Project: RottenCrawler   Author: kevin940726   File: main.py    MIT License
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems 
Example 57
Project: mipsqa   Author: google   File: squad_prepro.py    Apache License 2.0
def _word_tokenize(text):
  # TODO(seominjoon): Consider using Stanford Tokenizer or othe tokenizers.
  return [
      word.replace('``', '"').replace("''", '"')
      for word in nltk.word_tokenize(text)
  ] 
Example 58
Project: ChatBot   Author: subpath   File: Chatbot.py    MIT License
def reply(self, input_text):
        """
        Takes input_text and returns the predicted response
        :param input_text: string
        :return: predicted_text: string
        """
        if input_text == self.ultimate_question:
            return '42'
        input_seq = []
        input_wids = []
        for word in nltk.word_tokenize(input_text.lower()):
            idx = 1
            if word in self.input_word2idx:
                idx = self.input_word2idx[word]
            input_wids.append(idx)
        input_seq.append(input_wids)
        input_seq = pad_sequences(input_seq, self.max_encoder_seq_length)
        states_value = self.encoder_model.predict(input_seq)
        target_seq = np.zeros((1, 1, self.num_decoder_tokens))
        target_seq[0, 0, self.target_word2idx['START']] = 1
        target_text = ''
        target_text_len = 0
        terminated = False
        while not terminated:
            output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value)
            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = self.target_idx2word[sample_token_idx]
            target_text_len += 1

            if sample_word != 'START' and sample_word != 'END':
                target_text += ' ' + sample_word

            if sample_word == 'END' or target_text_len >= self.max_decoder_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, self.num_decoder_tokens))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip().replace('UNK', '') 
Example 59
Project: qb   Author: Pinafore   File: jmlr.py    MIT License
def qanta_2014_stats():
    """
    This computes and prints dataset statistics for prior versions from EMNLP 2014.
    Published results use private NAQT data, these stats are computed using only public data.
    Use nltk for word tokenization to be consistent with prior analysis.
    Use spacy for sentence tokenization to be consistent with qanta dataset preprocessing.
    (We don't use word tokenizations in dataset preprocessing, we consider it a model detail.)
    """
    questions_2014 = pseq.jsonl('data/external/emnlp_2014_questions.jsonl').cache()
    eprint('N EMNLP 2014 Questions', questions_2014.len())
    n_tokens_2014 = questions_2014.map(lambda q: q['question']).map(nltk.word_tokenize).map(len).sum()
    eprint('N EMNLP 2014 Tokens', n_tokens_2014)
    n_sentences = [len(nlp(q['question'])) for q in tqdm(questions_2014.list())]
    eprint('N EMNLP 2014 Sentences', sum(n_sentences)) 
Example 60
Project: qb   Author: Pinafore   File: preprocess.py    MIT License
def tokenize_question(text: str) -> List[str]:
    return word_tokenize(clean_question(text)) 
Example 61
Project: qb   Author: Pinafore   File: dataset.py    MIT License
def create_qb_tokenizer(
        unigrams=True, bigrams=False, trigrams=False,
        zero_length_token='zerolengthunk', strip_qb_patterns=True):
    def tokenizer(text):
        if strip_qb_patterns:
            text = re.sub(
                '\s+', ' ',
                re.sub(regex_pattern, ' ', text, flags=re.IGNORECASE)
            ).strip().capitalize()
        import nltk
        tokens = nltk.word_tokenize(text)
        if len(tokens) == 0:
            return [zero_length_token]
        else:
            ngrams = []
            if unigrams:
                ngrams.extend(tokens)
            if bigrams:
                ngrams.extend([f'{w0}++{w1}' for w0, w1 in nltk.bigrams(tokens)])
            if trigrams:
                ngrams.extend([f'{w0}++{w1}++{w2}' for w0, w1, w2 in nltk.trigrams(tokens)])

            if len(ngrams) == 0:
                ngrams.append(zero_length_token)
            return ngrams

    return tokenizer 
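
A brief usage sketch (regex_pattern is defined elsewhere in the qb code base, so pattern stripping is disabled here):

tokenize = create_qb_tokenizer(unigrams=True, bigrams=True, strip_qb_patterns=False)
print(tokenize('name this capital of France'))
# ['name', 'this', 'capital', 'of', 'France',
#  'name++this', 'this++capital', 'capital++of', 'of++France']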
Example 62
Project: box_embeddings   Author: Lorraine333   File: DataLoader.py    Apache License 2.0
def read_entail(self, s, word_to_index):
        s1 = []
        s2 = []
        labels = []
        l1 = []
        l2 = []
        max_length = 0
        for line in open(s):
            if line.startswith("gold_label") or line.startswith("-"):
                continue
            tokens = line.strip().split("\t")
            t1 = []
            for a in word_tokenize(tokens[5]):
                a = a.lower()
                if a in word_to_index:
                    t1.append(word_to_index[a])
                else:
                    t1.append(word_to_index['oov'])
            s1.append(t1)
            t2 = []
            for a in word_tokenize(tokens[6]):
                a = a.lower()
                if a in word_to_index:
                    t2.append(word_to_index[a])
                else:
                    t2.append(word_to_index['oov'])
            s2.append(t2)
            l1.append(len(t1))
            l2.append(len(t2))
            labels.append(label_map[tokens[0]])
            max_length = max(max_length, len(t1))
            max_length = max(max_length, len(t2))
        return s1, s2, np.array(labels), np.array(l1), np.array(l2), max_length

    # Read predicted probabilities for pairs of phrases/sentences 
Example 63
Project: cs224n-win18-squad   Author: abisee   File: squad_preprocess.py    Apache License 2.0
def tokenize(sequence):
    tokens = [token.replace("``", '"').replace("''", '"').lower() for token in nltk.word_tokenize(sequence)]
    return tokens 
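
The ``/'' replacement undoes the Treebank-style quote conversion that nltk.word_tokenize applies to straight double quotes; a brief usage sketch:

print(tokenize('He said "Hello" to me.'))
# e.g. ['he', 'said', '"', 'hello', '"', 'to', 'me', '.']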
Example 64
Project: Snowball   Author: davidsbatista   File: Snowball.py    GNU General Public License v3.0
def generate_tuples(self, sentences_file):
        """
        Generate tuples instances from a text file with sentences
        where named entities are already tagged
        """
        try:
            os.path.isfile("processed_tuples.pkl")
            f = open("processed_tuples.pkl", "r")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        except IOError:
            print("\nGenerating relationship instances from sentences")
            f_sentences = codecs.open(sentences_file, encoding='utf-8')
            count = 0
            for line in f_sentences:
                count += 1
                if count % 10000 == 0:
                    sys.stdout.write(".")
                sentence = Sentence(line.strip(), self.config.e1_type, self.config.e2_type, self.config.max_tokens_away,
                                    self.config.min_tokens_away, self.config.context_window_size)

                for rel in sentence.relationships:
                    if rel.arg1type == self.config.e1_type and rel.arg2type == self.config.e2_type:
                        bef_tokens = word_tokenize(rel.before)
                        bet_tokens = word_tokenize(rel.between)
                        aft_tokens = word_tokenize(rel.after)
                        if not (bef_tokens == 0 and bet_tokens == 0 and aft_tokens == 0):
                            t = Tuple(rel.ent1, rel.ent2, rel.sentence, rel.before, rel.between, rel.after, self.config)
                            self.processed_tuples.append(t)
            f_sentences.close()

            print("\n", len(self.processed_tuples), "relationships generated")
            print("Dumping relationships to file")
            f = open("processed_tuples.pkl", "wb")
            pickle.dump(self.processed_tuples, f)
            f.close() 
Example 65
Project: Snowball   Author: davidsbatista   File: Tuple.py    GNU General Public License v3.0
def tokenize(self, text):
        return [word for word in word_tokenize(text.lower())
                if word not in self.config.stopwords] 
Example 66
Project: Snowball   Author: davidsbatista   File: Tuple.py    GNU General Public License v3.0
def construct_words_vectors(self, words, config):
        # split text into tokens and tag them using NLTK's default English tagger
        # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        text_tokens = word_tokenize(words)
        tags_ptb = pos_tag(text_tokens)
        pattern = [t[0] for t in tags_ptb if
                   t[0].lower() not in config.stopwords and t[1] not in self.filter_pos]
        if len(pattern) >= 1:
            vect_ids = self.config.vsm.dictionary.doc2bow(pattern)
            return self.config.vsm.tf_idf_model[vect_ids] 
Example 67
Project: tweets-preprocessor   Author: vasisouv   File: twitter_preprocessor.py    GNU General Public License v3.0
def remove_stopwords(self, extra_stopwords=None):
        if extra_stopwords is None:
            extra_stopwords = []
        text = nltk.word_tokenize(self.text)
        stop_words = set(stopwords.words('english'))

        new_sentence = []
        for w in text:
            if w not in stop_words and w not in extra_stopwords:
                new_sentence.append(w)
        self.text = ' '.join(new_sentence)
        return self 
Example 68
Project: geograpy2   Author: Corollarium   File: extraction.py    MIT License
def named_entities(self):
        # word_tokenize should work well for most non-CJK languages
        text = nltk.word_tokenize(self.text)
        
        # TODO: this works only for english. Stanford's pos tagger supports
        # more languages
        # http://www.nltk.org/api/nltk.tag.html#module-nltk.tag.stanford
        # http://stackoverflow.com/questions/1639855/pos-tagging-in-german
        # PT corpus http://aelius.sourceforge.net/manual.html
        # 
        pos_tag = nltk.pos_tag(text)
        
        nes = nltk.ne_chunk(pos_tag)
        return nes 
Example 69
Project: who-are-you   Author: PawelPamula   File: tst.py    MIT License
def tweets2tags(text,hasht):
        tx=[]
        for line in text:
            tokens=word_tokenize(line)
            tags=nltk.pos_tag(tokens)
            text= [s[0] for s in tags if s[1].startswith('NN')]
            tx.extend(text) 
Example 70
Project: EliIE   Author: Tian312   File: word2vec.py    MIT License
def tokenize_train(train_directory,tokenized_directory):
    with codecs.open(train_directory, "r", "utf-8") as file:
        with codecs.open(tokenized_directory, "w", "utf-8") as writer:
            new_sens = []
            for line in file:
                sentences = sent_tokenize(line.strip())
                for sen in sentences:
                    sen = word_tokenize(sen.lower())
                    new_sen = ' '.join(sen)
                    new_sens.append(new_sen)
                    writer.write(new_sen)
                    writer.write("\n")
    sentences = gensim.models.word2vec.LineSentence(tokenized_directory)
    return sentences 
Example 71
Project: corpus-to-graph-ml   Author: CatalystCode   File: data_preparation_tools.py    MIT License
def remove_stop_words(sent, context=None):
    processed_tokens = []
    tokens = nltk.word_tokenize(sent)
    for t in tokens:
        # ignore stop words
        if (t in nltk.corpus.stopwords.words('english') or len(t) < 2):
            continue
        processed_tokens.append(t)

    return " ".join(processed_tokens)

# digits removal 
Example 72
Project: ConvLab   Author: ConvLab   File: Sequicity.py    MIT License
def predict(self, usr):            
        print('usr:', usr)
        usr = word_tokenize(usr.lower())
        usr_words = usr + ['EOS_U']
        u_len = np.array([len(usr_words)])
        usr_indices = self.m.reader.vocab.sentence_encode(usr_words)
        u_input_np = np.array(usr_indices)[:, np.newaxis]
        u_input = cuda_(Variable(torch.from_numpy(u_input_np).long()))
        m_idx, z_idx, degree = self.m.m(mode='test', degree_input=None, z_input=None,
                                        u_input=u_input, u_input_np=u_input_np, u_len=u_len,
                                        m_input=None, m_input_np=None, m_len=None,
                                        turn_states=None, **self.kw_ret)
        venue = random.sample(degree, 1)[0] if degree else dict()
        l = [self.m.reader.vocab.decode(_) for _ in m_idx[0]]
        if 'EOS_M' in l:
            l = l[:l.index('EOS_M')]
        l_origin = []
        for word in l:
            if 'SLOT' in word:
                word = word[:-5]
                if word in venue.keys():
                    value = venue[word]
                    if value != '?':
                        l_origin.append(value)
            elif word.endswith('reference]'):
                if 'ref' in venue:
                    l_origin.append(venue['ref'])
            else:
                l_origin.append(word)
        sys = ' '.join(l_origin)
        sys = denormalize(sys)
        print('sys:', sys)
        if cfg.prev_z_method == 'separate':
            eob = self.m.reader.vocab.encode('EOS_Z2')
            if eob in z_idx[0] and z_idx[0].index(eob) != len(z_idx[0]) - 1:
                idx = z_idx[0].index(eob)
                z_idx[0] = z_idx[0][:idx + 1]
            for j, word in enumerate(z_idx[0]):
                if word >= cfg.vocab_size:
                    z_idx[0][j] = 2 #unk
            prev_z_input_np = pad_sequences(z_idx, cfg.max_ts, padding='post', truncating='pre').transpose((1, 0))
            prev_z_len = np.array([len(_) for _ in z_idx])
            prev_z_input = cuda_(Variable(torch.from_numpy(prev_z_input_np).long()))
            self.kw_ret['prev_z_len'] = prev_z_len
            self.kw_ret['prev_z_input'] = prev_z_input
            self.kw_ret['prev_z_input_np'] = prev_z_input_np
        return sys 
Example 73
Project: hora-de-decir-bye-bye   Author: lusy   File: annotate.py    GNU General Public License v3.0
def main():
    '''
    import dictionaries and tokenize them
    run annotate() for all articles in data/articles-plain/

    '''
    #esp_tokenizer = nltk.data.load('tokenizers/punkt/spanish.pickle')

    # punctuation list; we need it later to annotate punctuation as such
    punctuation = [u'.', u'!', u'?', u':', u',', u';', u'-', u'"', u"'", u'(', u')']

    # ignore from English dictionary (words that are far more probable to be in esp)
    ignore_en = [u'para', u'las', u'nos', u'con', u'vive', u'persona', u'peso', u'lee', u'mis', u'dos', u'etc', u'hombre',
                 u'placer', u'hoy', u'perfecta', u'mil', u'dolor']

    # import dictionaries (as utf-8!) --> should be done here, not importing dictionaries 2000 times
    with codecs.open('data/dictionaries-common/es_ALL', encoding='utf-8') as common_dict_file:
        common_dict = common_dict_file.read()
        tokens_common = nltk.word_tokenize(common_dict)

    with codecs.open('data/dictionaries-common/en_US', encoding='utf-8') as en_dict_file:
        en_dict = en_dict_file.read()
        tokens_en = nltk.word_tokenize(en_dict)

    # import all the regional dicts
    reg_dicts_ids = [f for f in os.listdir('data/dictionaries-common/')
                    if path.isfile(path.join('data/dictionaries-common/', f)) and f.endswith('reg')]

    reg_dicts_tokens = {}
    for dict_id in reg_dicts_ids:
        path_reg_dict = 'data/dictionaries-common/%s' % dict_id
        with codecs.open(path_reg_dict, encoding='utf-8') as reg_dict_file:
            reg_dict = reg_dict_file.read()
            tokens_reg = nltk.word_tokenize(reg_dict)
            reg_dicts_tokens[dict_id] = tokens_reg # maybe prettify key: es_AR_reg -> es_AR

    #print reg_dicts_tokens.keys()
    #print reg_dicts_tokens['es_AR_reg']

    # annotate all articles
    articles_ids = [f for f in os.listdir('data/articles-plain/') if path.isfile(path.join('data/articles-plain/', f))]
    for a_id in articles_ids:
        print "Annotating article #%s" % a_id
        a_path = 'data/articles-plain/%s' % a_id
        annotate(a_path, punctuation, ignore_en, tokens_en, tokens_common, reg_dicts_tokens) 
Example 74
Project: RTX   Author: RTXteam   File: WordnetDistance.py    MIT License
def sentence_similarity(sentence1, sentence2):
	"""
	Compute sentence similarity based on WordNet
	:param sentence1: input string
	:param sentence2: input string
	:return: float between 0 and 1 giving similarity of sentences
	"""
	# Tokenize and tag
	sentence1_tagged = pos_tag(word_tokenize(sentence1))
	sentence2_tagged = pos_tag(word_tokenize(sentence2))

	# Get the synsets for the tagged words
	synsets1 = [tagged_to_synset(*tagged_word) for tagged_word in sentence1_tagged]
	synsets2 = [tagged_to_synset(*tagged_word) for tagged_word in sentence2_tagged]

	# Filter out the Nones
	synsets1 = [ss for ss in synsets1 if ss]
	synsets2 = [ss for ss in synsets2 if ss]

	score, count = 0.0, 0

	# For each word in the first sentence
	for synset in synsets1:
		# Get the similarity value of the most similar word in the other sentence
		vals = [synset.path_similarity(ss) for ss in synsets2]
		best_score = -1
		# Take the max, ignoring Nones
		for val in vals:
			if val:
				if val > best_score:
					best_score = val
		if best_score == -1:
			best_score = None

		# Check that the similarity could have been computed
		if best_score is not None:
			score += best_score
			count += 1

	# Average the values
	if count != 0:
		score /= count
		#score /= (len(sentence1) + len(sentence2)) / 2.0  # divide by the mean sentence length
	else:
		score = 0.0

	# If the number of synsets is small, we have little confidence in the similarity
	if count <= 3:
		sentence1_set = set([i.lower() for i in word_tokenize(sentence1)])
		sentence2_set = set([i.lower() for i in word_tokenize(sentence2)])
		jaccard = len(sentence1_set.intersection(sentence2_set)) / float(len(sentence1_set.union(sentence2_set)))
		score = jaccard
	#return max(score, jaccard)
	return score 
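
When count ends up at 3 or less, the function above falls back to the Jaccard overlap of the lowercased tokens. A self-contained sketch of just that fallback, which needs word_tokenize but not WordNet:

from nltk import word_tokenize

def jaccard_similarity(sentence1, sentence2):
    set1 = set(w.lower() for w in word_tokenize(sentence1))
    set2 = set(w.lower() for w in word_tokenize(sentence2))
    if not set1 or not set2:
        return 0.0
    return len(set1 & set2) / float(len(set1 | set2))

print(jaccard_similarity("what genes are related to asthma ?",
                         "which genes are associated with asthma ?"))
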
Example 75
Project: robot-navigation   Author: ronaldahmed   File: utils.py    MIT License 4 votes vote down vote up
def get_data():
	map_data = {
		'grid'  : MapData("grid" ,getMapGrid()),
		'jelly' : MapData("jelly",getMapJelly()),
		'l'	  : MapData("l",getMapL())
	}

	for map_name, data_obj in map_data.items():
		filename = map_name + '.settrc'
		sample_id = ''
		flag_toggle = False
		toggle = 0
		actions = path = tokens = []
		start_pos = end_pos = -1
		for line in open( os.path.join(data_dir,filename) ):
			line=line.strip("\n")
			if line=='':
				#ipdb.set_trace()
				# reset variables
				flag_toggle = False
				toggle = 0
				actions = path = tokens = []
				start_pos = end_pos = -1
				sample_id = ''
				continue
			if line.startswith("Cleaned-"):
				prex = "Cleaned-"
				sample_id = line[len(prex):]
			if line.find('map=')!=-1:
				# header line: y=...  map=...  x=...; record start/end positions, don't tokenize
				flag_toggle=True
				temp = line.split('\t')
				start_pos = int(temp[0][2:])	# y=...
				end_pos = int(temp[-1][2:])	# x=...
				continue
			if flag_toggle:
				if toggle==0:
					# read instructions
					tokens = word_tokenize(line)
				else:
					# read actions and path
					actions,path = get_actions_and_path(line,data_obj.map)
					# save new single-sentence sample
					data_obj.add_sample(tokens, actions, path, sample_id, start_pos, end_pos, map_name)
					# reset variables
					actions = path = tokens = []
				toggle = (toggle+1)%2
			#END-IF-TOGGLE
		#END-FOR-READ-FILE
	#END-FOR-MAPS

	return map_data

##########################################################################################
########################################################################################## 
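
A minimal sketch of the toggle pattern used above, on a hypothetical two-line block: even turns are natural-language instructions tokenized with word_tokenize, odd turns are the matching action/path line.

from nltk import word_tokenize

lines = [
    "turn left and walk to the end of the hall",  # toggle == 0: instruction
    "LEFT FORWARD FORWARD STOP",                  # toggle == 1: actions (hypothetical format)
]

toggle = 0
tokens = []
for line in lines:
    if toggle == 0:
        tokens = word_tokenize(line)
    else:
        actions = line.split()
        print(tokens, actions)
    toggle = (toggle + 1) % 2
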
Example 76
Project: chat-simulator   Author: Shikib   File: shikib_bot.py    MIT License 4 votes vote down vote up
def train_punctuation(self):
        # Initialize POS graph
        self.punctuation_graph = {} 

        def _add_message_to_punctuation(message):
            score = message[1]
            message = message[0]

            # Remove contractions and potentially other characters
            message = \
                "".join([ch for ch in message if ch not in "'"])

            words = word_tokenize(message)
            tagged_words = pos_tag(words)

            for gram_len in range(1, self.ngram_len+1):
                # Include the final gram position as well; its follower is
                # recorded as the ENDMSG marker below
                for i in range(len(tagged_words)-gram_len+1):
                    gram = tagged_words[i:i+gram_len]
                    
                    # Turn the gram into a hashable string.
                    tags = " ".join([t[1] for t in gram])

                    next_word = None
                    
                    if i == len(tagged_words) - gram_len:
                        next_word = 'ENDMSG'
                    else:                   
                        # Identify the type of the word that comes after the gram
                        next_word = tagged_words[i+gram_len][1]

                    if tags not in self.punctuation_graph:
                        self.punctuation_graph[tags] = {}

                    if next_word not in self.punctuation_graph[tags]:
                        self.punctuation_graph[tags][next_word] = 0

                    self.punctuation_graph[tags][next_word] += score
                    
        # Need to turn the text into the right format
        messages = self.extract_messages(self.punctuation_dataset, self.user)

        for message in messages:
            _add_message_to_punctuation(message) 
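
A minimal, unigram-only sketch of the counting step above: each POS tag maps to a weighted count of the tag (or the ENDMSG marker) that follows it.

from nltk import word_tokenize, pos_tag

def tag_transitions(message, score=1.0):
    graph = {}
    tagged = pos_tag(word_tokenize(message))
    for i, (_, tag) in enumerate(tagged):
        nxt = 'ENDMSG' if i == len(tagged) - 1 else tagged[i + 1][1]
        graph.setdefault(tag, {}).setdefault(nxt, 0)
        graph[tag][nxt] += score
    return graph

print(tag_transitions("are you coming tonight ?"))
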
Example 77
Project: chat-simulator   Author: Shikib   File: shikib_bot.py    MIT License 4 votes vote down vote up
def train_style(self):
        # Initialize POS graph
        self.style_graph = {} 

        def _add_message_to_style(message):
            score = message[1]
            message = message[0]

            # Remove contractions and potentially other characters
            message = \
                "".join([ch for ch in message if ch not in "'"])

            words = word_tokenize(message)
            tagged_words = pos_tag(words)

            for gram_len in range(1, self.ngram_len):
                # Include the final gram position as well; its follower is
                # recorded as the ENDMSG marker below
                for i in range(len(tagged_words)-gram_len+1):
                    gram = tagged_words[i:i+gram_len]
                    
                    # Turn the gram into a hashable tuple.
                    words = " ".join([t[0] for t in gram])
                    tags = " ".join([t[1] for t in gram])
                    gram_tuple = (words,tags)

                    if i == len(tagged_words) - gram_len:
                        next_word = ('ENDMSG', 'ENDMSG')
                    else:                   
                        # Identify the type of the word that comes after the gram
                        next_word = tagged_words[i+gram_len]

                    if gram_tuple not in self.style_graph:
                        self.style_graph[gram_tuple] = {}

                    if next_word not in self.style_graph[gram_tuple]:
                        self.style_graph[gram_tuple][next_word] = 0

                    self.style_graph[gram_tuple][next_word] += score
                    
        # Need to turn the text into the right format
        messages = self.extract_messages(self.style_dataset, self.user)

        for message in messages:
            _add_message_to_style(message) 
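
Unlike train_punctuation(), the style graph keys on (words, tags) pairs, so it can propose concrete next words. A minimal sketch of querying such a graph; the graph literal and its weights are hypothetical.

style_graph = {
    ("how are", "WRB VBP"): {("you", "PRP"): 3.0, ("things", "NNS"): 1.0},
}

def most_likely_next(graph, gram_tuple):
    # return the highest-weighted successor of the gram, or None if unseen
    successors = graph.get(gram_tuple, {})
    if not successors:
        return None
    return max(successors, key=successors.get)

print(most_likely_next(style_graph, ("how are", "WRB VBP")))  # ('you', 'PRP')
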
Example 78
Project: RMDL   Author: eric-erki   File: text_feature_extraction.py    GNU General Public License v3.0 4 votes vote down vote up
def text_cleaner(text,
                 deep_clean=False,
                 stem= True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </div>
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # blank line after </p> and heading tags
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning

    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.rstrip()
            text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub("(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if not w in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.rstrip()
            text = text.strip()
    return text.lower() 
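
A minimal sketch of the deep_clean stopword step in isolation, assuming the NLTK 'punkt' and 'stopwords' data have been downloaded.

from nltk import word_tokenize
from nltk.corpus import stopwords

def drop_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join(w for w in word_tokenize(text) if w.lower() not in stop_words)

print(drop_stopwords("this is a short example of the cleaning step"))
# -> "short example cleaning step"
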
Example 79
Project: social_mind   Author: byeongkyu   File: sentence_classifier.py    Apache License 2.0 4 votes vote down vote up
def handle_domain_reply(self, msg):
        sents = self.sent_detector.tokenize(msg.reply.strip())

        msg = ReplyAnalyzed()
        msg.header.stamp = rospy.Time.now()

        for sent in sents:
            # separate tags and text
            sent_tags = re.findall('(%[^}]+%)', sent)
            sent_text = re.sub('(%[^}]+%)', '', sent).strip()

            # if the task manager selected an intent, use it; otherwise use the classifier to pick one
            result = ''
            remain_tags = ''
            if not any('sm=' in tag for tag in sent_tags):
                feature = dialogue_act_features(sent_text)
                result = self.classifier.classify(feature)

                if sent_tags != []:
                    remain_tags = sent_tags[0]
            else:
                tag_text = sent_tags[0].strip('{}').split('|')
                matching = [s for s in tag_text if "sm=" in s]
                if len(matching) > 1:
                    rospy.logwarn('Only one sm tags allowed...')
                result = matching[0].split('=')[1]
                for s in tag_text:
                    if not "sm=" in s:
                        remain_tags += s + '|'
                if remain_tags != '':
                    remain_tags = '{' + remain_tags.rstrip('|') + '}'

            # select entities
            entity = EntitiesIndex()
            for i in pos_tag(word_tokenize(sent_text)):
                if(i[1] in ['RB', 'PRP', 'NN', 'PRP$']):
                    entity.entity.append(i[0])
                    entity.entity_index.append(sent_text.index(i[0]))

            msg.entities.append(entity)
            msg.sents.append(remain_tags + ' ' + sent_text)
            msg.act_type.append(result + '/%d'%len(sent_text))

        self.pub_reply_analyzed.publish(msg) 
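
A minimal, ROS-free sketch of the entity-selection step above: keep tokens tagged RB, PRP, NN or PRP$ together with their character offset (index() returns the first occurrence, as in the original).

from nltk import word_tokenize, pos_tag

def select_entities(sent_text):
    entities = []
    for word, tag in pos_tag(word_tokenize(sent_text)):
        if tag in ('RB', 'PRP', 'NN', 'PRP$'):
            entities.append((word, sent_text.index(word)))
    return entities

print(select_entities("I really like your robot"))
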
Example 80
Project: Digital-Aristotle   Author: Gabighz   File: processing.py    GNU General Public License v3.0 4 votes vote down vote up
def pre_processing(raw_data):
    # Converts raw_data to a numpy array
    raw_data = np.array(raw_data, dtype=object)

    # Iterates through raw XML data and concatenates all text to a string
    sentence = ' '.join(raw_data[:, TEXT_INDEX])

    # Converts all words to lowercase to prevent duplication of same word with different cases.
    lowercase_string = sentence.lower()

    # Cleans each word of non-alphanumeric characters
    # e.g. so 'sensors)' and 'sensors' are not considered different words
    filtered_string = re.sub("[^a-zA-Z]", " ", lowercase_string)

    # Further filtering to keep only nouns; thus filtering stopwords as well
    # Also filters out words which have a character count of 1 or less.
    tokens = nltk.word_tokenize(filtered_string)
    tags = nltk.pos_tag(tokens)

    filtered_string = ' '.join([word for word, pos in tags
                                if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')
                                and (len(word) > 1)])

    # Compiles the filtered words to an array which contains
    # each word and its XML features
    filtered_data = []
    clean_words = filtered_string.split()

    # Appends only filtered words from raw data to an array which contains
    # each word and its XML features that were in raw data
    for word_array in raw_data:

        filtered_sentence = ""
        raw_sentence = re.sub("[^a-zA-Z]", " ", word_array[0]).split()

        for i in range(len(raw_sentence)):
            if raw_sentence[i].lower() in clean_words:
                filtered_sentence = filtered_sentence + " " + raw_sentence[i]
        # !to be improved
        if len(filtered_sentence.lstrip()) > 0:
            filtered_data.append([filtered_sentence.lstrip().lower(), word_array[1], word_array[2], word_array[3]])

    return filtered_data
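
A minimal sketch of the lowercasing, cleaning and noun-filtering steps above applied to a single string; the XML-feature bookkeeping is omitted.

import re
import nltk

raw_text = "The (cheap) sensors measure temperature and humidity!"
filtered = re.sub("[^a-zA-Z]", " ", raw_text.lower())
tags = nltk.pos_tag(nltk.word_tokenize(filtered))
nouns = [word for word, pos in tags
         if pos in ('NN', 'NNP', 'NNS', 'NNPS') and len(word) > 1]
print(nouns)  # e.g. ['sensors', 'temperature', 'humidity']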


# Computes the F1-score of our classifier
# Takes in a 2D array which contains each observation and their label
# Compares that to the ground truth (correct) value of each observation