Python nltk.FreqDist() Examples

The following code examples show how to use nltk.FreqDist(). They are drawn from open source Python projects.
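Before the project code, here is a minimal, self-contained sketch of the core FreqDist API (building a distribution from an iterable of tokens, looking up counts, and ranking them); the sample sentence is purely illustrative.

import nltk
# nltk.download('punkt')  # one-time download of tokenizer data, if not already installed

text = "the cat sat on the mat and the dog sat too"
tokens = nltk.word_tokenize(text)

fdist = nltk.FreqDist(tokens)   # counts every token in the iterable
print(fdist['the'])             # 3  (unseen samples return 0)
print(fdist.N())                # 11, total number of counted tokens
print(fdist.most_common(2))     # [('the', 3), ('sat', 2)]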

Example 1
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
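Note that trigrams must come from nltk as well (added to the import above), and that FreqDist returns 0 for unseen keys, so the if/else can be collapsed. A condensed sketch of the same fingerprinting idea, with hypothetical start/end markers standing in for the class attributes _START_CHAR and _END_CHAR:

from nltk import word_tokenize, FreqDist, trigrams

def char_trigram_profile(text, start='<', end='>'):
    # start/end are illustrative padding characters, not the class's real markers
    fingerprint = FreqDist()
    for token in word_tokenize(text):
        for tri in trigrams(start + token + end):
            fingerprint[''.join(tri)] += 1   # missing keys default to 0
    return fingerprint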
Example 2
Project: OpenBottle   Author: xiaozhuchacha   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 3
Project: Health-Checker   Author: KriAga   File: textcat.py    MIT License
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 4
Project: lisc   Author: lisc-tools   File: articles_all.py    Apache License 2.0
def create_freq_dist(in_lst, exclude):
        """Create a frequency distribution.

        Parameters
        ----------
        in_lst : list of str
            Words to create the frequency distribution from.
        exclude : list of str
            Words to exclude from the frequency distribution.

        Returns
        -------
        freqs : nltk.FreqDist
            Frequency distribution of the words.
        """

        freqs = nltk.FreqDist(in_lst)

        for excl in exclude:
            try:
                freqs.pop(excl.lower())
            except KeyError:
                pass

        return freqs 
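The try/except above can also be written with dict.pop's default argument; a brief sketch of the same exclude-then-count pattern on a made-up word list:

import nltk

words = ['brain', 'brain', 'neuron', 'the', 'the', 'the']
freqs = nltk.FreqDist(words)
for excl in ['the', 'of']:
    freqs.pop(excl.lower(), None)   # a default value avoids the KeyError handling
print(freqs.most_common())          # [('brain', 2), ('neuron', 1)]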
Example 5
Project: deep_throat   Author: wdbm   File: deep_throat.py    GNU General Public License v3.0
def most_frequent_Brown_Corpus_words():
    import nltk
    import nltk.corpus
    words = []
    for word in nltk.corpus.brown.words():
        if word not in [
            ",",
            ".",
            "``",
            "''",
            ";",
            "?",
            "--",
            ")",
            "(",
            ":",
            "!"
        ]:
            words.append(word.lower())
    frequencies_words = nltk.FreqDist(words).most_common()
    words_most_frequent = [word[0] for word in frequencies_words]
    return words_most_frequent 
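A usage sketch for the helper above; the Brown corpus must be downloaded once, and the slice size is arbitrary:

import nltk
# nltk.download('brown')   # one-time corpus download

top_words = most_frequent_Brown_Corpus_words()
print(top_words[:10])       # the ten most frequent lower-cased Brown words, with the listed punctuation tokens excluded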
Example 6
Project: StabiHacks   Author: elektrobohemian   File: fulltext_statistics.py    Apache License 2.0
def creatStatisticFiles(statFilePath, resultTxt):
    statFile = open(statFilePath, "w")
    # standard NLP workflow
    # 1) tokenize the text
    tokens = nltk.word_tokenize(resultTxt)
    nltkText=nltk.Text(tokens)
    # 2) normalize tokens
    words = [w.lower() for w in tokens]
    # 3) create vocabulary
    vocab = sorted(set(words))

    # calculate token frequencies
    fdist = nltk.FreqDist(nltkText)
    fTxt=""
    for (word,freq) in fdist.most_common(100):
        fTxt+=str(word)+"\t"+str(freq)+"\n"
    statFile.write(fTxt)
    statFile.close() 
Example 7
Project: FancyWord   Author: EastonLee   File: textcat.py    GNU General Public License v3.0
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 8
Project: webapp-bench   Author: edgedb   File: textgen.py    Apache License 2.0
def make_TFD(self, tagged_words):
        # setup some frequency distribution dicts
        fd_by_tag = {None: nltk.FreqDist(w[0] for w in tagged_words)}

        words_by_tag = {}

        # classify words by tags
        for word, tag in tagged_words:
            fd_by_tag[tag] = fd_by_tag.get(tag, nltk.FreqDist())
            fd_by_tag[tag][word] += 1

        # set up frequency dist by tags
        for tag, fd in fd_by_tag.items():
            vals = []
            freq = []
            for v, f in fd.most_common():
                vals.append(v)
                freq.append(f)

            words_by_tag[tag] = (vals, freq)

        return words_by_tag 
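The parallel value/frequency lists returned above lend themselves to frequency-weighted sampling; one possible way to draw words from them (random.choices is standard library, and the tag key is whatever tags appeared in the input):

import random

def sample_words(words_by_tag, tag=None, k=1):
    vals, freq = words_by_tag[tag]
    return random.choices(vals, weights=freq, k=k)   # draws proportionally to observed counts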
Example 9
Project: retrieval_chatbot   Author: ricosr   File: util.py    MIT License
def count_none_frequency(file_name):   # only keep nouns (words whose jieba POS flag contains 'n')
    frequency_dict = {}
    with open(file_name, "rb") as fp:
        chat_ls = pickle.load(fp)
    for each_pair in chat_ls:
        for each_sentence in each_pair:
            words_ls = []
            cut_words = dict(pseg.cut(each_sentence))
            for word, flag in cut_words.items():
                if 'n' in flag:
                    words_ls.append(word)
            freq_dict_tmp = nltk.FreqDist(words_ls)
            for word, freq in freq_dict_tmp.items():
                if word in frequency_dict:
                    frequency_dict[word] = frequency_dict[word] + freq
                else:
                    frequency_dict[word] = freq
    with open("frequency_domain.py", "w", encoding="utf-8") as fwp:
        fwp.write("frequency_dict = {}".format(str(frequency_dict))) 
Example 10
Project: poetic-inner-join   Author: emdaniels   File: train.py    Apache License 2.0
def tokenize_sentences(self):
        # tokenize the sentences into words and count the word frequencies
        # get most common words, build index_to_word and word_to_index vectors
        self.tokenized_sentences = [nltk.word_tokenize(sent) for sent in
                                    self.sentences]
        word_freq = nltk.FreqDist(itertools.chain(*self.tokenized_sentences))
        print("Found %d unique word tokens." % len(word_freq.items()))

        vocab = word_freq.most_common(self.vocabulary_size - 1)
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict(
            [(w, i) for i, w in enumerate(self.index_to_word)])

        print("Using vocabulary size %d." % self.vocabulary_size)
        print(
            "The least frequent word is '%s' appearing %d times." % (
            vocab[-1][0], vocab[-1][1]))

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(self.tokenized_sentences):
            self.tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token for w in
                sent] 
Example 11
Project: qa-scrapers   Author: collab-uniba   File: discretizer.py    MIT License
def build_distribution_matrix(self, stems):
        distrib_matrix_filename = '{0}_distrib_matrix.txt'.format(self.db_name)
        if os.path.isfile(distrib_matrix_filename):  # load matrix from file
            self.log('Loading existing distribution matrix from {0}'.format(distrib_matrix_filename), logging.INFO)
            distrib_matrix = dict()
            with open(distrib_matrix_filename, 'rt') as f:
                csvrreader = csv.DictReader(f, delimiter=' ', lineterminator=self.linesep)
                for row in csvrreader:
                    distrib_matrix.update({row['w']: row['P(w|M)']})
                f.close()
        else:  # create matrix and save file
            self.log('Creating new distribution matrix into {0}. Please wait, this may take some time'.
                     format(distrib_matrix_filename), logging.INFO)
            distrib_matrix = FreqDist(stems)

            with open(distrib_matrix_filename, 'wt') as f:
                writer = csv.DictWriter(f, fieldnames=['w', 'P(w|M)'], delimiter=' ', lineterminator=self.linesep)
                writer.writeheader()
                for k in distrib_matrix.keys():
                    writer.writerow({'w': k, 'P(w|M)': distrib_matrix[k]})
                f.close()

        distrib_matrix = Discretizer.reduce_distribution_matrix(distrib_matrix, cutoff=1)
        return distrib_matrix 
Example 12
Project: qa-scrapers   Author: collab-uniba   File: discretization.py    MIT License
def build_distribution_matrix(self, stems):
        distrib_matrix_filename = '{0}_distrib_matrix.txt'.format(self.db_name)
        if os.path.isfile(distrib_matrix_filename):  # load matrix from file
            self.log('Loading existing distribution matrix from {0}'.format(distrib_matrix_filename), logging.INFO)
            distrib_matrix = dict()
            with open(distrib_matrix_filename, 'rt') as f:
                csvrreader = csv.DictReader(f, delimiter=' ', lineterminator=self.linesep)
                for row in csvrreader:
                    distrib_matrix.update({row['w']: row['P(w|M)']})
                f.close()
        else:  # create matrix and save file
            self.log('Creating new distribution matrix into {0}. Please wait, this may take some time'.
                     format(distrib_matrix_filename), logging.INFO)
            distrib_matrix = FreqDist(stems)

            with open(distrib_matrix_filename, 'wt') as f:
                writer = csv.DictWriter(f, fieldnames=['w', 'P(w|M)'], delimiter=' ', lineterminator=self.linesep)
                writer.writeheader()
                for k in distrib_matrix.keys():
                    writer.writerow({'w': k, 'P(w|M)': distrib_matrix[k]})
                f.close()

        distrib_matrix = Discretizer.reduce_distribution_matrix(distrib_matrix, cutoff=1)
        return distrib_matrix 
Example 13
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            for sent in para:
                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Return data structure with information
        return {
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
        } 
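The 'lexdiv' figure above is the classic tokens-per-type ratio; computed standalone with FreqDist it looks like this (the sample sentence is illustrative):

import nltk

tokens = nltk.word_tokenize("to be or not to be that is the question")
fd = nltk.FreqDist(tokens)
lexdiv = fd.N() / fd.B()    # total tokens divided by distinct tokens
print(round(lexdiv, 2))     # 1.25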
Example 14
Project: news-crawler   Author: nolram   File: news_item.py    MIT License
def as_debug_array(self, guess):
        l = []
        l.append('---')
        #l.append('lookup_key:   %s' % (self.lookup_key()))
        l.append('Categoria:     %s' % (self.category))
        l.append('Palpite:        %s' % (guess))
        l.append('URL:     %s' % (self.url))
        l.append('Titulos:   %s' % (self.title))
        l.append('')
        l.append('Todas as palavras por contagem')
        freqs = nltk.FreqDist([w.lower() for w in self.all_words])
        for w in freqs.keys():
            l.append("%-20s  %d" % (w, freqs.get(w)))
        l.append('')
        l.append('all_words, sequentially:')
        for w in self.all_words:
            l.append(w)
        return l 
Example 15
Project: razzy-spinner   Author: rafasashi   File: textcat.py    GNU General Public License v3.0
def profile(self, text):
        ''' Create FreqDist of trigrams within text '''
        from nltk import word_tokenize, FreqDist, trigrams

        clean_text = self.remove_punctuation(text)
        tokens = word_tokenize(clean_text)
        
        fingerprint = FreqDist()
        for t in tokens:
            token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR)
            token_trigrams = [''.join(tri) for tri in token_trigram_tuples]

            for cur_trigram in token_trigrams:
                if cur_trigram in fingerprint:
                    fingerprint[cur_trigram] += 1
                else:
                    fingerprint[cur_trigram] = 1

        return fingerprint 
Example 16
Project: RottenCrawler   Author: kevin940726   File: sentiment.py    MIT License
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features 
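In NLTK 3 a FreqDist's keys() come back in insertion order rather than frequency order, so if the features are meant to be frequency-ranked, most_common() makes that explicit; a hedged variant of the helper above:

import nltk

def get_word_features_ranked(wordlist, top_n=None):
    fdist = nltk.FreqDist(wordlist)
    return [w for w, _ in fdist.most_common(top_n)]   # most frequent first; top_n=None keeps every word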
Example 17
Project: hierarchical-attention-networks   Author: tqtg   File: data_prepare.py    MIT License
def build_vocab(docs, save_path):
  print('Building vocab ...')

  sents = itertools.chain(*[text.split('<sssss>') for text in docs])
  tokenized_sents = [sent.split() for sent in sents]

  # Count the word frequencies
  word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))
  print("%d unique words found" % len(word_freq.items()))

  # Cut-off
  retained_words = [w for (w, f) in word_freq.items() if f > WORD_CUT_OFF]
  print("%d words retained" % len(retained_words))

  # Get the most common words and build index_to_word and word_to_index vectors
  # Word index starts from 2, 1 is reserved for UNK, 0 is reserved for padding
  word_to_index = {'PAD': 0, 'UNK': 1}
  for i, w in enumerate(retained_words):
    word_to_index[w] = i + 2
  index_to_word = {i: w for (w, i) in word_to_index.items()}

  print("Vocabulary size = %d" % len(word_to_index))

  with open('{}-w2i.pkl'.format(save_path), 'wb') as f:
    pickle.dump(word_to_index, f)

  with open('{}-i2w.pkl'.format(save_path), 'wb') as f:
    pickle.dump(index_to_word, f)

  return word_to_index 
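FreqDist has a couple of helpers that are convenient for cut-offs like WORD_CUT_OFF above; a small sketch on a toy corpus:

import nltk

fd = nltk.FreqDist("a quick brown fox jumps over a lazy dog a".split())
print(fd.hapaxes())      # words occurring exactly once: ['quick', 'brown', 'fox', ...]
print(fd.B())            # number of distinct words (bins), here 8
print(fd['a'])           # 3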
Example 18
Project: wordvectors   Author: Kyubyong   File: make_wordvectors.py    MIT License
def get_min_count(sents):
    '''
    Args:
      sents: A list of lists. E.g., [["I", "am", "a", "boy", "."], ["You", "are", "a", "girl", "."]]
     
    Returns:
      min_count: A uint. Should be set as the parameter value of word2vec `min_count`.   
    '''
    global vocab_size
    from itertools import chain
     
    fdist = nltk.FreqDist(chain.from_iterable(sents))
    min_count = fdist.most_common(vocab_size)[-1][1]  # the count of the top-kth word
    
    return min_count 
Example 19
Project: Python   Author: Ajinkya-Sonawane   File: sentiment.py    MIT License
def get_word_features(wordList):
    wordList = nltk.FreqDist(wordList)
    features = wordList.keys()
    return features

#Function to extract words based on document features 
Example 20
Project: lisc   Author: lisc-tools   File: test_plts_words.py    Apache License 2.0
def test_plot_wordcloud():

    freq_dist = FreqDist(['lots', 'of', 'words', 'words'])
    plot_wordcloud(freq_dist, 5) 
Example 21
Project: lisc   Author: lisc-tools   File: test_plts_wordcloud.py    Apache License 2.0
def test_conv_freqs():

    freq_dist = FreqDist(['lots', 'of', 'words', 'words'])
    out = conv_freqs(freq_dist, 2)

    assert isinstance(out, dict) 
Example 22
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def get_isf_idf_dict(vocab):
    isf_fn = os.path.join(PREPROC_DATA_DIR,"squad","isf_score_dict")
    idf_fn = os.path.join(PREPROC_DATA_DIR,"squad","idf_score_dict")
    if os.path.exists(isf_fn + ".pickle") and os.path.exists(idf_fn + ".pickle"):
        isf_dict = uploadObject(isf_fn)
        idf_dict = uploadObject(idf_fn)
        return isf_dict,idf_dict

    data_gen = read_data("training")
    total_counts = nltk.FreqDist()
    total_counts_doc = nltk.FreqDist()
    nsents = 0
    ndocs = 0
    count = 0
    for sample in data_gen:
        sample_id,sents,question,labels = sample.unpack()
        ref_sents = words_to_id(sents,vocab)
        nsents += len(sents)
        ndocs += 1
        doc_set = set()
        for sent in ref_sents:
            total_counts.update(set(sent))
            doc_set.update(sent)
        total_counts_doc.update(doc_set)
        if count%10000 == 0:
            print "-->isf_dict_count:",count
        count +=1
    isf_dict = {}
    idf_dict = {}
    for wid,freq in total_counts.items():
        isf_dict[wid] = isf_score(nsents,freq)
    for wid,freq in total_counts_doc.items():
        idf_dict[wid] = isf_score(ndocs,freq)
    saveObject(isf_dict,isf_fn)
    saveObject(idf_dict,idf_fn)
    return isf_dict,idf_dict 
Example 23
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def eval_cnts(question,sents,isf_dict,idf_dict,stopwords):
    '''
    question: [wids] of question
    sents: [... [wids]] for every sent
    '''
    question = set(question)
    idf_scores = []
    cnt_scores = []
    isf_scores = []
    local_isf_scores = []
    local_cnt_dict = nltk.FreqDist()
    n_sents = len(sents)
    for sent in sents:
        local_cnt_dict.update(set(sent))

    for sent in sents:
        sent_set = set(sent)
        inters = [x for x in question.intersection(sent_set) if x not in stopwords]
        cnt_scores.append(len(inters))
        local_isf_score = sum([isf_score(n_sents,local_cnt_dict[wid]) for wid in inters])
        
        _isf_score = sum([isf_dict[wid] if wid in isf_dict else isf_dict[UNK_ID] for wid in inters])
        idf_score = sum([idf_dict[wid] if wid in idf_dict else idf_dict[UNK_ID] for wid in inters])
        idf_scores.append(idf_score)
        isf_scores.append(_isf_score)
        local_isf_scores.append(local_isf_score)
        
    return cnt_scores,isf_scores,idf_scores,local_isf_scores 
Example 24
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def eval_cnts(question,sents,isf_dict,idf_dict,stopwords):
    '''
    question: [wids] of question
    sents: [... [wids]] for every sent
    '''
    question = set(question)
    idf_scores = []
    cnt_scores = []
    isf_scores = []
    local_isf_scores = []
    local_cnt_dict = nltk.FreqDist()
    n_sents = len(sents)
    for sent in sents:
        local_cnt_dict.update(set(sent))

    for sent in sents:
        sent_set = set(sent)
        inters = [x for x in question.intersection(sent_set) if x not in stopwords]
        cnt_scores.append(len(inters))
        local_isf_score = sum([isf_score(n_sents,local_cnt_dict[wid]) for wid in inters])
        
        _isf_score = sum([isf_dict[wid] if wid in isf_dict else isf_dict[UNK_ID] for wid in inters])
        idf_score = sum([idf_dict[wid] if wid in idf_dict else idf_dict[UNK_ID] for wid in inters])
        idf_scores.append(idf_score)
        isf_scores.append(_isf_score)
        local_isf_scores.append(local_isf_score)
        
    return cnt_scores,isf_scores,idf_scores,local_isf_scores


#################################################################################### 
Example 25
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def get_isf_idf_dict(vocab):
    isf_fn = os.path.join(PREPROC_DATA_DIR,"squad","isf_score_dict")
    idf_fn = os.path.join(PREPROC_DATA_DIR,"squad","idf_score_dict")
    if os.path.exists(isf_fn + ".pickle") and os.path.exists(idf_fn + ".pickle"):
        isf_dict = uploadObject(isf_fn)
        idf_dict = uploadObject(idf_fn)
        return isf_dict,idf_dict

    data_gen = read_data("training")
    total_counts = nltk.FreqDist()
    total_counts_doc = nltk.FreqDist()
    nsents = 0
    ndocs = 0
    count = 0
    for sample in data_gen:
        sample_id,sents,question,labels = sample.unpack()
        ref_sents = words_to_id(sents,vocab)
        nsents += len(sents)
        ndocs += 1
        doc_set = set()
        for sent in ref_sents:
            total_counts.update(set(sent))
            doc_set.update(sent)
        total_counts_doc.update(doc_set)
        if count%10000 == 0:
            print "-->isf_dict_count:",count
        count +=1
    isf_dict = {}
    idf_dict = {}
    for wid,freq in total_counts.items():
        isf_dict[wid] = isf_score(nsents,freq)
    for wid,freq in total_counts_doc.items():
        idf_dict[wid] = isf_score(ndocs,freq)
    saveObject(isf_dict,isf_fn)
    saveObject(idf_dict,idf_fn)
    return isf_dict,idf_dict 
Example 26
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def get_isf_idf_dict(vocab,force=False):
    isf_fn = os.path.join(PREPROC_DATA_DIR,"wikiqa","isf_score_dict")
    idf_fn = os.path.join(PREPROC_DATA_DIR,"wikiqa","idf_score_dict")
    if os.path.exists(isf_fn + ".pickle") and os.path.exists(idf_fn + ".pickle") and not force:
        isf_dict = uploadObject(isf_fn)
        idf_dict = uploadObject(idf_fn)
        return isf_dict,idf_dict

    total_counts = nltk.FreqDist()
    total_counts_doc = nltk.FreqDist()
    nsents = 0
    ndocs = 0
    count = 0
    
    sentById,questionsById,labelsById = read_data("training")
    for _id,question in questionsById.items():
        sents = sentById[_id]
        labels = labelsById[_id]
        nsents += len(sents)
        ndocs += 1
        ref_sents = words_to_id(sents,vocab)
        doc_set = set()
        for sent in ref_sents:
            total_counts.update(set(sent))
            doc_set.update(sent)
        total_counts_doc.update(doc_set)
        if count%500 == 0:
            print "-->isf_dict_count:",count
        count +=1
    isf_dict = {}
    idf_dict = {}
    for wid,freq in total_counts.items():
        isf_dict[wid] = isf_score(nsents,freq)
    for wid,freq in total_counts_doc.items():
        idf_dict[wid] = isf_score(ndocs,freq)
    saveObject(isf_dict,isf_fn)
    saveObject(idf_dict,idf_fn)
    return isf_dict,idf_dict 
Example 27
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License
def eval_cnts(question,sents,isf_dict,idf_dict,stopwords):
    '''
    question: [wids] of question
    sents: [... [wids]] for every sent
    '''
    question_set = set(question)

    count_scores = []
    idf_scores = []
    isf_scores = []
    local_isf_scores = []
    
    local_cnt_dict = nltk.FreqDist()
    n_sents = len(sents)
    for sent in sents:
        local_cnt_dict.update(set(sent))
    
    for sent in sents:
        sent_set = set(sent)
        _isf_score = idf_score = local_isf_score = 0.0
        cnt_sc = 0
        for wid in question_set:
            if wid in sent_set and wid not in stopwords:
                if wid in isf_dict:
                    _isf_score += isf_dict[wid]
                else:
                    _isf_score += isf_dict[UNK_ID]
                if wid in idf_dict:
                    idf_score += idf_dict[wid]
                    cnt_sc += 1
                else:
                    idf_score += idf_dict[UNK_ID]
                    cnt_sc += 1
                local_isf_score += isf_score(n_sents,local_cnt_dict[wid])
        idf_scores.append(idf_score)
        isf_scores.append(_isf_score)
        local_isf_scores.append(local_isf_score)
        count_scores.append(cnt_sc)
        
    return isf_scores,idf_scores,local_isf_scores,count_scores 
Example 28
Project: BERT   Author: yyht   File: utils.py    Apache License 2.0
def bigram_counts(word_list):
	bgs = nltk.bigrams(word_list)
	fdist = nltk.FreqDist(bgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
Example 29
Project: BERT   Author: yyht   File: utils.py    Apache License 2.0
def trigram_counts(word_list):
	tgs = nltk.trigrams(word_list)
	fdist = nltk.FreqDist(tgs)
	d = Counter()
	for k, v in fdist.items():
		d[k] = v
	return d 
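Since FreqDist subclasses collections.Counter in NLTK 3, the copy loops in the two helpers above can be shortened; a sketch that covers both cases via nltk.ngrams:

import nltk
from collections import Counter

def ngram_counts(word_list, n=2):
    fdist = nltk.FreqDist(nltk.ngrams(word_list, n))   # n=2 gives bigrams, n=3 trigrams
    return Counter(fdist)                               # copies the counts into a plain Counter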
Example 30
Project: may142016   Author: ftrain   File: app.py    MIT License
def get_words(tweets):
    """Given a set of tweets, return the most frequently-used words."""
    tweets = filter(lambda x: not(x.is_rt), tweets)
    tokenized = [nltk.word_tokenize(handle_strip(t.tweet_text))
                 for t in tweets]
    words = [item for sublist in tokenized for item in sublist]
    longwords = filter(lambda x: len(x) > 6, words)
    lcwords = map(lambda x: x.lower(), longwords)
    fdist = nltk.FreqDist(lcwords)
    common = fdist.most_common(100)
    common = filter(lambda x: x[1] > 4, common)
    common = map(lambda x: [x[0], 6 + int(x[1]/3)], common)
    return common 
Example 31
Project: Question-Answering   Author: arianhosseini   File: utils.py    MIT License
def generate_squad_vocab(path, vocabulary_size=30000):
    import json
    import itertools
    # from operator import itemgetter
    from nltk.probability import FreqDist
    d = json.load(open(path))
    tokenized_sentences = []
    for reading in d['data']:
        for paragraph in reading['paragraphs']:
            sentence = paragraph['context'].lower()
            tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))
            for question in paragraph['qas']:
                sentence = question['question'].lower()     #TODO later check whether to add answer as well or not
                tokenized_sentences.append(nltk.tokenize.word_tokenize(sentence))

    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print('total uniq words:', len(word_freq))
    # sorted_freq = sorted(dict(word_freq).items(), key=itemgetter(1))[::-1]
    full_vocab = word_freq.most_common(len(word_freq))
    vocab = open('vocab_full.txt','w')
    for w in full_vocab:
        vocab.write(w[0]+'\t'+str(w[1])+'\n')
    vocab.close()
    shorted_vocab = word_freq.most_common(vocabulary_size-1)
    vocab = open('vocab.txt','w')
    for w in shorted_vocab:
        vocab.write(w[0]+'\t'+str(w[1])+'\n')
    vocab.close() 
Example 32
Project: truecaser   Author: nreimers   File: TrainFunctions.py    Apache License 2.0
def checkSentenceSanity(sentence):
    """ Checks the sanity of the sentence. If the sentence is for example all uppercase, it is recjected"""
    caseDist = nltk.FreqDist()
    
    for token in sentence:
        caseDist[getCasing(token)] += 1
    
    if caseDist.most_common(1)[0][0] != 'allLower':        
        return False
    
    return True 
Example 33
Project: ML-Term-Project-Team-Pikachu   Author: Chinmoy07   File: sentiment_analyzer.py    MIT License
def get_word_features( all_words ):
    wordlist = nltk.FreqDist( all_words )
    word_features = wordlist.keys()
    return word_features 
Example 34
Project: jroc   Author: domenicosolazzo   File: PosManager.py    GNU General Public License v3.0
def __commonWords(self, pos,  number=100):
        """
        Find common words in the text.
        """
        from nltk import FreqDist

        vocab = FreqDist(pos)

        common = [(word[0], index) for (word, index) in vocab.most_common(100) if word[1] == 'NN' or word[1] == 'NNS'  or word[1] == 'NNP'  or word[1] == 'NNPS']
        return common 
Example 35
Project: jroc   Author: domenicosolazzo   File: NLTKTagger.py    GNU General Public License v3.0
def __commonWords(self, pos,  number=100):
        """
        Find common words in the text.
        """
        from nltk import FreqDist

        vocab = FreqDist(pos)
        common = [word[0] for (word, _) in vocab.most_common(100) if word[1] == 'NN' or word[1] == 'NNS'  or word[1] == 'NNP'  or word[1] == 'NNPS']
        return common 
Example 36
Project: jroc   Author: domenicosolazzo   File: NLTKTagger.py    GNU General Public License v3.0
def __commonWords(self, pos,  number=100):
        """
        Find common words in the text.
        """
        from nltk import FreqDist

        vocab = FreqDist(pos)
        common = [word[0] for (word, _) in vocab.most_common(100) if word[1] == 'NN' or word[1] == 'NNS'  or word[1] == 'NNP'  or word[1] == 'NNPS']
        return common 
Example 37
Project: Natural-Language-Processing-with-Python-Cookbook   Author: PacktPublishing   File: Similarity.py    MIT License
def TF(self, sentence):
        words = nltk.word_tokenize(sentence.lower())
        freq = nltk.FreqDist(words)
        dictionary = {}
        for key in freq.keys():
            norm = freq[key]/float(len(words))
            dictionary[key] = norm
        return dictionary 
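FreqDist can produce the same normalized term frequencies directly through freq(), which divides each count by the total; an equivalent sketch on an illustrative sentence:

import nltk

words = nltk.word_tokenize("the quick brown fox jumps over the lazy dog".lower())
freq = nltk.FreqDist(words)
tf = {w: freq.freq(w) for w in freq}   # freq(w) == freq[w] / freq.N()
print(tf['the'])                       # 0.222... (2 of 9 tokens)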
Example 38
Project: NBA-Basketball-Web-Crawler   Author: ThomasDang93   File: webCrawler.py    Apache License 2.0
def extract_terms(text):
    """Write a function to extract at least 10 important terms from the pages using an importance measure
    such as term frequency. First, it’s a good idea to lower-case everything, remove stopwords and punctuation.
    Then build a vocabulary of unique terms. Create a dictionary of unique terms where the key is the token and
    the value is the count across all documents.  Print the top 25-40 terms."""
    translate_table = dict((ord(char), None) for char in string.punctuation)
    text = text.translate(translate_table)
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in tokens if t not in stop_words]
    fdist = nltk.FreqDist(tokens)

    return set(fdist.most_common(TOP_TERMS_PER_PAGE)) 
Example 39
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: topic_type.py    GNU General Public License v3.0
def generate_common_pos(all_pos):
    return [pos for pos, freq in nltk.FreqDist(all_pos).most_common()] 
Example 40
Project: dsw-ont-ijcai   Author: anonymous-ijcai   File: topic_type.py    GNU General Public License v3.0
def generate_common_suffixes(all_suffixes):
    return [suffix for suffix, freq in nltk.FreqDist(all_suffixes).most_common()] 
Example 41
Project: hierarchical-attention-networks   Author: chritter   File: data_prepare.py    MIT License
def build_vocab(docs, save_path):
  '''
  Create index to vocab (and inverse) dicts and pickle to disk.
  :param docs:
  :param save_path:
  :return: word to index lookup
  '''
  print('Building vocab ...')

  # tokenize corpus
  sents = itertools.chain(*[text.split('<sssss>') for text in docs])
  tokenized_sents = [sent.split() for sent in sents]

  # Count the word frequencies
  word_freq = nltk.FreqDist(itertools.chain(*tokenized_sents))
  print("%d unique words found" % len(word_freq.items()))

  # Cut-off words with less frequency then WORD_CUT_OFF; not mentioned in paper!
  retained_words = [w for (w, f) in word_freq.items() if f > WORD_CUT_OFF]
  print("%d words retained" % len(retained_words))

  # Get the most common words and build index_to_word and word_to_index vectors
  # Word index starts from 2, 1 is reserved for UNK, 0 is reserved for padding
  word_to_index = {'PAD': 0, 'UNK': 1}
  for i, w in enumerate(retained_words):
    word_to_index[w] = i + 2
  index_to_word = {i: w for (w, i) in word_to_index.items()}

  print("Vocabulary size = %d" % len(word_to_index))

  with open('{}-w2i.pkl'.format(save_path), 'wb') as f:
    pickle.dump(word_to_index, f)

  with open('{}-i2w.pkl'.format(save_path), 'wb') as f:
    pickle.dump(index_to_word, f)

  return word_to_index 
Example 42
Project: augmented_seq2seq   Author: suriyadeepan   File: data.py    GNU General Public License v3.0
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    vocab = [ item for item in vocab if item[1] > 1 ]
    # index2word
    index2word = ['_'] + ['UNK'] + list(POS_TAGS.keys()) + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist 
Example 43
Project: augmented_seq2seq   Author: suriyadeepan   File: data.py    GNU General Public License v3.0
def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist 
Example 44
Project: apachecn_ml   Author: ys1305   File: 文本摘要.py    GNU General Public License v3.0
def summaryScoredtxt(self, text):
        # Split the text into sentences
        sentences = self._split_sentences(text)

        # Tokenize the sentences into words
        words = [
            w for sentence in sentences for w in jieba.cut(sentence)
            if w not in self.stopwrods if len(w) > 1 and w != '\t'
        ]
        # words = []
        # for sentence in sentences:
        #     for w in jieba.cut(sentence):
        #         if w not in stopwords and len(w) > 1 and w != '\t':
        #             words.append(w)

        # Count word frequencies
        wordfre = nltk.FreqDist(words)

        # Take the N most frequent words
        topn_words = [
            w[0]
            for w in sorted(
                wordfre.items(), key=lambda d: d[1], reverse=True)
        ][:self.N]

        # Score sentences against the top-N keywords
        scored_sentences = self._score_sentences(sentences, topn_words)

        # Filter out unimportant sentences using the mean and standard deviation
        avg = numpy.mean([s[1] for s in scored_sentences])  # mean
        std = numpy.std([s[1] for s in scored_sentences])  # standard deviation
        summarySentences = []
        for (sent_idx, score) in scored_sentences:
            if score > (avg + 0.5 * std):
                summarySentences.append(sentences[sent_idx])
                print(sentences[sent_idx])
        return summarySentences 
Example 45
Project: apachecn_ml   Author: ys1305   File: 文本摘要.py    GNU General Public License v3.0
def summaryTopNtxt(self, text):
        # Split the text into sentences
        sentences = self._split_sentences(text)

        # Build the word list from the sentence list
        words = [
            w for sentence in sentences for w in jieba.cut(sentence)
            if w not in self.stopwrods if len(w) > 1 and w != '\t'
        ]
        # words = []
        # for sentence in sentences:
        #     for w in jieba.cut(sentence):
        #         if w not in stopwords and len(w) > 1 and w != '\t':
        #             words.append(w)

        # Count word frequencies
        wordfre = nltk.FreqDist(words)

        # Take the N most frequent words
        topn_words = [
            w[0]
            for w in sorted(
                wordfre.items(), key=lambda d: d[1], reverse=True)
        ][:self.N]

        # Score sentences against the top-N keywords
        scored_sentences = self._score_sentences(sentences, topn_words)

        top_n_scored = sorted(
            scored_sentences, key=lambda s: s[1])[-self.TOP_SENTENCES:]
        top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
        summarySentences = []
        for (idx, score) in top_n_scored:
            print(sentences[idx])
            summarySentences.append(sentences[idx])

        return summarySentences
Example 46
Project: message-analyzer   Author: glasses-n-contacts   File: analyzer.py    MIT License
def word_frequencies(self):
        tokens = self.word_tokenize(True)
        return nltk.FreqDist(tokens) 
Example 47
Project: chameleon_recsys   Author: gabrielspmoreira   File: tokenization.py    MIT License
def get_words_freq(tokenized_articles):
    words_freq = FreqDist([word for article in tokenized_articles for word in article])
    return words_freq 
Example 48
Project: combine-FEVER-NSMN   Author: easonnie   File: nltk_utils.py    MIT License
def get_nltk_freq_words():
    """Use Brown corpus frequent words
    More corpora: https://www.nltk.org/book/ch02.html
    """
    freq_dict = nltk.FreqDist(brown.words())

    for fileid in gutenberg.fileids():
        freq_dict.update(nltk.FreqDist(gutenberg.words(fileid)))

    freq_words = [k for k, v in freq_dict.items() if v > 10]
    return freq_words, freq_dict 
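FreqDist.update() adds counts in place, which is what lets the Gutenberg counts accumulate on top of the Brown counts above; a tiny illustration:

import nltk

fd = nltk.FreqDist("a b a".split())
fd.update(nltk.FreqDist("a c".split()))
print(fd['a'], fd['b'], fd['c'])   # 3 1 1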
Example 49
Project: nltk-on-gae   Author: sivu22   File: nltk_utils.py    Apache License 2.0
def getTagsAndFreqDist(inputText):
    # Take out some punctuation to avoid cases like: grandmother - grandmother's
    # or door - cottage-door
    inputText = re.sub("[.\-'']", " ", inputText)

    tokenizedText = nltk.word_tokenize(inputText)
    tags = nltk.pos_tag(tokenizedText)
    freqDist = nltk.FreqDist(tokenizedText)

    return (tags, freqDist) 
Example 50
Project: minke   Author: DistrictDataLabs   File: corpus.py    MIT License
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        } 
Example 51
Project: minke   Author: DistrictDataLabs   File: corpus.py    MIT License
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                for word, tag in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self._resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self._resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        } 
Example 52
Project: A-Hierarchical-Latent-Structure-for-Variational-Conversation-Modeling   Author: ctr4si   File: vocab.py    MIT License
def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object"""

        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer 
Example 53
Project: sparv-pipeline   Author: spraakbanken   File: train_nst_comp_model.py    MIT License
def make_model(nst_infile, picklefile, protocol=-1):
    """ Train a POS probability model on the NST lexicon and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has compounded POS tags (SUC set) as keys (e.g. "NN+NN")
    and smoothed probabilities as values."""
    # Collect all compounds from nst data
    nst_full_compounds = set()
    with open(nst_infile, encoding='UTF-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            comp = fields[3].replace("!", "")
            pos = fields[4]
            if "+" in comp and "_" not in word and not (comp.startswith("+") or comp.startswith("-")):
                nst_full_compounds.add((word, comp, pos))

    # Build POS probability model
    pos_fdist = FreqDist()
    for _w, _c, pos in nst_full_compounds:
        if '+' in pos:
            pos = re.sub(r"\+LN", "", pos)
            pos_fdist[pos] += 1

    pd = LidstoneProbDist(pos_fdist, 0.001, pos_fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as f:
        pickle.dump(pd, f, protocol=protocol) 
Example 54
Project: sparv-pipeline   Author: spraakbanken   File: train_stats_model.py    MIT License
def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.
    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag) as keys
    and smoothed probabilities as values."""
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            fields = line[:-1].split('\t')
            word = fields[0]
            # Skip word forms that occur fewer times than min_freq
            if int(fields[4]) < min_freq:
                break
            # Get rid of all urls
            if word.startswith("http://"):
                continue
            # # Words that only occur once may only contain letters and hyphens
            # if fields[4] == '1' and any(not (c.isalpha() or c == "-") for c in word):
            #     continue
            # if len(word) > 100:
            #     continue
            simple_msd = fields[1][:fields[1].find('.')] if '.' in fields[1] else fields[1]
            fdist[(word, simple_msd)] += int(fields[4])

    pd = LidstoneProbDist(fdist, smoothingparam, fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as p:
        pickle.dump(pd, p, protocol=protocol) 
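Once either pickle above has been written, the probability model can be loaded back and queried; a minimal sketch with a placeholder path and an illustrative (wordform, MSD) key:

import pickle

with open("model.pickle", "rb") as f:   # placeholder path for the pickle written above
    pd = pickle.load(f)                 # a LidstoneProbDist
print(pd.prob(("hus", "NN")))           # smoothed probability of an illustrative key
print(pd.max())                         # the most probable key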
Example 55
Project: roorkee-bot   Author: abhishekjiitr   File: roorkee-bot.py    GNU General Public License v3.0
def classify(self,features):
		votes=[]
		for c in self.classifiers:
			v=c.classify(features)
			votes.append(v)
		#otes=nltk.FreqDist(votes)

		return most_common(votes) 
Example 56
Project: neural_chat   Author: natashamjaques   File: vocab.py    MIT License
def __init__(self, tokenizer=None, max_size=None, min_freq=1):
        """Basic Vocabulary object"""

        self.vocab_size = 0
        self.freqdist = FreqDist()
        self.tokenizer = tokenizer 
Example 57
Project: youtube-sentiment-helper   Author: dillonmabry   File: utility.py    MIT License
def top_freq_words(corpus, topwords):
    """
    Method to return frequency distribution of words from corpus text
    Args:
        corpus: the corpus of comments as a single string
    """
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(corpus)
    swords = stopwords.words('english')
    freq_words = FreqDist(w.lower() for w in words if w not in swords)
    return freq_words.most_common(topwords) 
Example 58
Project: RealtimeSentimentAnalysis   Author: zHaytam   File: sentiment_analysis.py    MIT License
def __load_most_used_words(self):
        positive_file = open('./data/positive_reviews.txt', 'r')
        negative_file = open('./data/negative_reviews.txt', 'r')

        positive_reviews = [(review, 'pos') for review in positive_file.readlines()]
        negative_reviews = [(review, 'neg') for review in negative_file.readlines()]
        all_reviews = positive_reviews + negative_reviews

        all_words = []
        for review in all_reviews:
            all_words.extend(self.extract_words(review[0]))

        all_words = nltk.FreqDist(all_words)
        self.most_used_words = [w[0] for w in sorted(all_words.items(), key=lambda x: x[1], reverse=True)][:10000] 
Example 59
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and returns a dictionary with a
        variety of metrics concerning the state of the corpus.
        """
        # Structures to perform counting.
        counts = nltk.FreqDist()
        tokens = nltk.FreqDist()
        started = time.time()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in self._sent_tokenizer.tokenize(para):
                counts['sents'] += 1

                for word in self._word_tokenizer.tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files': n_fileids,
            'topics': n_topics,
            'paras': counts['paras'],
            'sents': counts['sents'],
            'words': counts['words'],
            'vocab': len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc': float(counts['paras']) / float(n_fileids),
            'sppar': float(counts['sents']) / float(counts['paras']),
            'secs': time.time() - started,
        } 
Example 60
Project: atap   Author: foxbook   File: reader.py    Apache License 2.0
def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        started = time.time()

        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in sent_tokenize(para):
                counts['sents'] += 1

                for word in wordpunct_tokenize(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        } 
Example 61
Project: Focus_Locality_Extraction   Author: openeventdata   File: SIFpreprocessing.py    MIT License
def parse_files(trainlist):
    
    corpus= ''
    for trainl in trainlist:
        text = trainl.lower().replace('\n', ' ')    
        text = unicode(text, errors='ignore')
        corpus += text.replace('\n', ' ') +'\n'

#    with open (testFile, 'r') as f: 
#        text = f.read().lower().replace('\n', ' ')
#        text = unicode(text, errors='ignore')
#        corpus += text.replace('\n', ' ') +'\n'
    
    vocabDic = nltk.FreqDist(w.lower() for w in nltk.tokenize.word_tokenize(corpus))
   # vocabDic = word_features.keys()
    
    vocabDic1 = [(w,v) for (w,v) in vocabDic.items() if (w not in to_filter and not w.isdigit())]
#    vocabulary = [w for w in vocabDic if (w not in to_filter and not w.isdigit())]
    
    vocabulary = [w for (w,v) in vocabDic1] 
    vocabFreq = [v for (w,v) in vocabDic1] 

#    f= open ('vocab.txt','w')
#    f.write("\n".join(map(lambda x: str(x), vocabulary)) )
    
    return corpus, vocabulary, vocabFreq 
Example 62
Project: Focus_Locality_Extraction   Author: openeventdata   File: SIFpreprocessing_test.py    MIT License
def parse_files(trainlist):
    
    corpus= ''
    for trainl in trainlist:
        text = trainl.lower().replace('\n', ' ')    
        #text = unicode(text, errors='ignore')
        corpus += text.replace('\n', ' ') +'\n'
   
    vocabDic = nltk.FreqDist(w.lower() for w in nltk.tokenize.word_tokenize(corpus))
   
    vocabDic1 = [(w,v) for (w,v) in vocabDic.items() if (w not in to_filter and not w.isdigit())]
    vocabulary = [w for (w,v) in vocabDic1] 
    vocabFreq = [v for (w,v) in vocabDic1] 
    
    return corpus, vocabulary, vocabFreq 
Example 63
Project: homer   Author: wyounas   File: analyzer.py    MIT License
def get_n_most_repeated_words(self, n):
        """Gets us n most repeated words in the text. """
        words = nltk.word_tokenize(self.text)
        stopwords = nltk.corpus.stopwords.words('english')
        all_words_except_stop = nltk.FreqDist(w.lower() for w in words if w[0].isalpha() and w not in stopwords)
        return [word for word, freq in all_words_except_stop.most_common(n)] 
Example 64
Project: codenn   Author: sriniiyer   File: SVM.py    MIT License
def getFeat(self, line):
        listItem = [0]*self.noFeat
        fileFreqDist = nltk.FreqDist(SVM.tokenize(line))

        i = 0
        for key in self.trainKeys:
            if fileFreqDist.has_key(key):
                listItem[i] = fileFreqDist.get(key)
            i = i + 1
        return listItem 
Example 65
Project: vanilla-neural-nets   Author: cavaunpeu   File: training_data.py    MIT License
def _remove_uncommon_words(cls, tokenized_corpus, vocabulary_size):
        word_count = nltk.FreqDist( itertools.chain(*tokenized_corpus) )
        word_count = [cls.WORD_COUNT_ITEM(word=word, count=count) for word, count in word_count.items()]
        word_count = sorted(word_count, key=lambda item: (item.count, item.word), reverse=True)
        most_common_words = [word_count_item.word for word_count_item in word_count[:vocabulary_size - \
            cls.NUMBER_OF_WORDS_TO_ADD_IN_MANUALLY + 1]]

        tokenized_corpus = [
            [word if word in most_common_words else cls.UNKNOWN_TOKEN for word in sentence]\
            for sentence in tokenized_corpus
        ]
        return tokenized_corpus 
Example 66
Project: news-crawler   Author: nolram   File: nltk_classificador.py    MIT License
def identify_top_words(self, all_words):
        freq_dist = nltk.FreqDist(w.lower() for w in all_words)
        return list(freq_dist)[:1000] 
Example 67
Project: news-crawler   Author: nolram   File: news_item.py    MIT License
def word_freq_dist(self):
        freqs = nltk.FreqDist()  # class nltk.probability.FreqDist
        for w in self.all_words:
            freqs.inc(w, 1)
        return freqs 
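FreqDist.inc() belongs to the old NLTK 2 API and is no longer available in NLTK 3, where the same counting is done with item assignment; a sketch of the modern equivalent of the method above:

import nltk

def word_freq_dist(all_words):
    freqs = nltk.FreqDist()   # nltk.probability.FreqDist
    for w in all_words:
        freqs[w] += 1         # NLTK 3 replacement for freqs.inc(w, 1)
    return freqs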
Example 68
Project: daftpunk   Author: nicr9   File: worker.py    MIT License
def description_and_tokens(self, id_, timestamp, soup):
        overview = soup.find(id="description")
        for scr in overview.find_all('script'):
            scr.clear()

        desc = overview.text
        tokens = word_tokenize(desc)
        freqdist = FreqDist(tokens)

        self.redis.set('daftpunk:%s:description' % id_, desc)
        for token, freq in freqdist.iteritems():
            self.redis.zadd('daftpunk:%s:tokens' % id_, freq, token) 
Example 69
Project: newsclouds-engine   Author: inmagik   File: clouds.py    MIT License
def compute_frequencies(
    text,
    encoding="latin-1", language='italian', min_len=3):

    # NLTK's default stopwords. musst be loaded if not present
    default_stopwords = set(nltk.corpus.stopwords.words(language))
    words = nltk.word_tokenize(text)
    # default_stopwords = set(nltk.corpus.stopwords.words(language))
    # fp = codecs.open(input_file, 'r', encoding)
    # words = nltk.word_tokenize(fp.read())

    seen_in_chunks = []
    chunks = []
    lines = text.split("\n")
    for line in lines:
        chunk = get_continuous_chunks(line)
        chunks.extend(chunk)
        for x in chunk:
            seen_in_chunks.extend(x.split(" "))
    
    words = [word for word in words if word not in seen_in_chunks]
    words.extend(chunks)

    # Remove punctuation
    # text = text.translate(None, string.punctuation)
    # Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) >= int(min_len)]

    # Remove numbers
    #words = [word for word in words if not word.isnumeric()]
    
    # Stemming words seems to make matters worse, disabled
    #stemmer = nltk.stem.snowball.SnowballStemmer('italian')
    #words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = [word for word in words if word.lower() not in default_stopwords]

    # Remove custom list of words
    words = [word for word in words if word.lower() not in common_articleswords]

    # Calculate frequency distribution
    fdist = nltk.FreqDist(words)

    # Output top 50 words
    frequencies = []
    for word, frequency in fdist.most_common(400):
        print('%s;%d' % (word, frequency))
        frequencies.append((word, frequency))
        # frequencies.append((word.encode(encoding), frequency))

    return frequencies 
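The helper above depends on project-specific pieces not shown here (get_continuous_chunks, common_articleswords). A trimmed-down, self-contained sketch of just the core frequency computation, assuming the NLTK tokenizer and stopwords data have been downloaded (e.g. nltk.download('punkt') and nltk.download('stopwords')):

import nltk

text = "Il gatto dorme sul divano e il cane dorme sul tappeto"
stopwords = set(nltk.corpus.stopwords.words('italian'))

# Keep tokens of at least three characters that are not stopwords.
words = [w for w in nltk.word_tokenize(text)
         if len(w) >= 3 and w.lower() not in stopwords]

fdist = nltk.FreqDist(words)
for word, frequency in fdist.most_common(50):
    print('%s;%d' % (word, frequency))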
Example 70
Project: Document-Models-with-Ext-Information   Author: shashiongithub   File: utils.py    BSD 2-Clause "Simplified" License 4 votes vote down vote up
def get_isf_idf_dict(vocab):
    isf_fn = os.path.join(PREPROC_DATA_DIR,"newsqa","isf_score_dict")
    idf_fn = os.path.join(PREPROC_DATA_DIR,"newsqa","idf_score_dict")
    if os.path.exists(isf_fn + ".pickle") and os.path.exists(idf_fn + ".pickle"):
        isf_dict = uploadObject(isf_fn)
        idf_dict = uploadObject(idf_fn)
        return isf_dict,idf_dict

    data_dict = pd.read_csv(os.path.join(DATA_CSV_DIR,'train.csv'),encoding='utf-8')

    sid_set = set()
    total_counts = nltk.FreqDist()
    total_counts_doc = nltk.FreqDist()
    nsents = 0
    ndocs = 0
    count = 0
    for sid,content in zip(data_dict["story_id"], data_dict["story_text"]):
        if sid in sid_set:
            continue
        sid_set.add(sid)
        sents = nltk.sent_tokenize(content)
        sents = [sent.split() for sent in sents] # sentences are already tokenized
        nsents += len(sents)
        ndocs += 1
        ref_sents = words_to_id(sents,vocab)
        doc_set = set()
        for sent in ref_sents:
            total_counts.update(set(sent))
            doc_set.update(sent)
        total_counts_doc.update(doc_set)
        if count%10000 == 0:
            print("-->isf_dict_count:",count)
        count +=1
    isf_dict = {}
    idf_dict = {}
    for wid,freq in total_counts.items():
        isf_dict[wid] = isf_score(nsents,freq)
    for wid,freq in total_counts_doc.items():
        idf_dict[wid] = isf_score(ndocs,freq)
    saveObject(isf_dict,isf_fn)
    saveObject(idf_dict,idf_fn)
    return isf_dict,idf_dict 
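Two FreqDist instances are used above: one counts how many sentences a word id appears in, the other how many documents, by updating with a set so each word is counted at most once per unit. A minimal sketch of the document-frequency half, with a standard log-scaled idf standing in for the project's isf_score helper (an assumption, since that function is not shown here):

import math
import nltk

docs = [["a", "b", "a"], ["b", "c"], ["a", "c", "c"]]

doc_freq = nltk.FreqDist()
for doc in docs:
    doc_freq.update(set(doc))   # each word counted at most once per document

ndocs = len(docs)
idf = {word: math.log(ndocs / df) for word, df in doc_freq.items()}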
Example 71
Project: chirp   Author: 9b   File: __init__.py    MIT License 4 votes vote down vote up
def get_article(item, source, reprocess=False):
    """Take the initial set of listings and enrich the content."""
    article = dict()
    encoded = item.get('link').encode('utf-8')
    article['feed_source'] = source.replace('www.google.com', 'google.com')
    article['uuid'] = hashlib.sha256(encoded).hexdigest()
    processed = is_found(article['uuid'])
    if processed and not reprocess:
        # logger.debug("Skipping %s", article['uuid'])
        return {'article': processed, 'from_store': True}
    article['title'] = item.get('title', None)
    href = item.get('link', None)
    article['href'] = strip_google(href)
    article['source'] = derive_source(article['href'])
    article['collected'] = now_time()
    article['published'] = item.get('published', None)
    article['summary'] = item.get('summary', None)

    page_content = get_page_content(article['href'])
    if not page_content:
        logger.debug("No content found: %s" % article['href'])
        return {'article': None, 'from_store': True}
    paragraphs = justext.justext(page_content,
                                 justext.get_stoplist("English"),
                                 no_headings=True,
                                 max_heading_distance=150,
                                 length_high=140,
                                 max_link_density=0.4,
                                 stopwords_low=0.2,
                                 stopwords_high=0.3)
    text_content = list()
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        text_content.append(paragraph.text)
    text_content = '\n'.join(text_content)
    tokens = get_tokens(text_content)

    article['word_count'] = len(tokens)
    article['read_time'] = round(float(article['word_count'])/250, 2)
    clean = cleaned_tokens(tokens)
    article['tokens'] = [{t[0]:t[1]}
                         for t in nltk.FreqDist(clean).most_common(100)]
    article['tags'] = [list(x.keys())[0] for x in article['tokens'][0:7]]
    article['sentiment'] = get_sentiment(text_content)
    articles = mongo.db[app.config['ARTICLES_COLLECTION']]
    if not reprocess or not processed:
        try:
            articles.insert(article)
        except Exception as e:
            pass
    if processed:
        print(processed)
        articles.update({'_id': ObjectId(processed['_id'])}, {'$set': article})
    monitor = get_monitor_obj(article['feed_source'])
    return {'article': article, 'monitor': monitor, 'from_store': False} 
Example 72
Project: graphAttack   Author: jgolebiowski   File: prepareTextData.py    MIT License 4 votes vote down vote up
def prepareWordTokenized(seedName):
    inputName = seedName + ".txt"
    
    doLowercase = True
    vocabSize = 10000
    unknown_token = "UNKNOWN_TOKEN"
    
    
    with open(inputName, "r") as fp:
        text = fp.read()
    
    tokensRaw = nltk.word_tokenize(text)
    if doLowercase:
        tokens = [w.lower() for w in tokensRaw]
    else:
        tokens = tokensRaw
    fdist = nltk.FreqDist(word for word in tokens)
    vocab = fdist.most_common(vocabSize - 1)
    
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    index_to_word = dict(enumerate(index_to_word))
    word_to_index = {v: k for k, v in index_to_word.items()}
    
    vocabSize = len(word_to_index)
    textLength = len(tokens)
    
    # Replace unknown words with unknown_tokens
    tokensReplaced = [w if w in word_to_index else unknown_token for w in tokens]
    
    hotText = np.zeros((textLength, vocabSize), dtype="float32")
    for index in range(textLength):
        hotText[index, word_to_index[tokensReplaced[index]]] = 1
    
    pickleFilename = seedName + ".pkl"
    with open(pickleFilename, "wb") as fp:
        pickle.dump((hotText, index_to_word, word_to_index), fp)
    # exampleLength = 24
    # nExamples = textLength - exampleLength
    
    # x = np.zeros((nExamples, exampleLength, vocabSize), dtype=int)
    # y = np.zeros((nExamples, exampleLength, vocabSize), dtype=int)
    # pointer = 0
    
    # for index in range(nExamples):
    #     x[index] = hotText[pointer: pointer + exampleLength]
    #     y[index] = hotText[pointer + 1: pointer + exampleLength + 1]
    #     pointer += 1
    
    # pickleFilename = seedName + "Matrix.pkl"
    # with open(pickleFilename, "wb") as fp:
    #     pickle.dump((x, y, index_to_word, word_to_index), fp) 
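The most_common vocabulary doubles as a lookup table for one-hot encoding. A condensed sketch of the same steps, with toy data in place of the pickled corpus:

import numpy as np
import nltk

tokens = ["to", "be", "or", "not", "to", "be"]
unknown_token = "UNKNOWN_TOKEN"

# Vocabulary = most frequent words plus the unknown marker.
vocab = [w for w, _ in nltk.FreqDist(tokens).most_common(3)] + [unknown_token]
word_to_index = {w: i for i, w in enumerate(vocab)}

# Map out-of-vocabulary words to the unknown marker, then one-hot encode.
replaced = [w if w in word_to_index else unknown_token for w in tokens]
hot_text = np.zeros((len(replaced), len(vocab)), dtype="float32")
hot_text[np.arange(len(replaced)), [word_to_index[w] for w in replaced]] = 1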
Example 73
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 4 votes vote down vote up
def train(self, corpus_fp, vocab=None, encoding=None):
        N = self.N
        H = self.hyperparameters
        models, counts = {}, {}
        grams = {n: [] for n in range(1, N + 1)}
        gg = {n: [] for n in range(1, N + 1)}
        filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

        n_words = 0
        tokens = set([])

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                words = tokenize_words(line, filter_punc, filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams
                for n in range(1, N + 1):
                    grams[n].append(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )

                    gg[n].extend(
                        list(
                            nltk.ngrams(
                                words,
                                n,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol="<bol>",
                                right_pad_symbol="<eol>",
                            )
                        )
                    )

        for n in range(1, N + 1):
            counts[n] = nltk.FreqDist(gg[n])
            models[n] = nltk.lm.MLE(order=n)
            models[n].fit(grams[n], tokens)

        self.counts = counts
        self.n_words = n_words
        self._models = models
        self.n_tokens = len(vocab) if vocab is not None else len(tokens) 
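nltk.ngrams returns a generator that can be consumed only once, which is why the code keeps both grams (handed to fit later) and gg (materialised into lists for counting). A minimal sketch of the counting side, reusing the same <bol>/<eol> padding symbols:

import nltk

words = ["the", "cat", "sat"]
bigrams = list(nltk.ngrams(words, 2, pad_left=True, pad_right=True,
                           left_pad_symbol="<bol>", right_pad_symbol="<eol>"))

# FreqDist happily counts tuples, so n-gram counts come for free.
counts = nltk.FreqDist(bigrams)
# counts[("<bol>", "the")] == 1, counts[("cat", "sat")] == 1, ...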
Example 74
Project: numpy-ml   Author: ddbourgin   File: tests.py    GNU General Public License v3.0 4 votes vote down vote up
def train(self, corpus_fp, vocab=None, encoding=None):
        N = self.N
        H = self.hyperparameters
        models, counts = {}, {}
        grams = {n: [] for n in range(1, N + 1)}
        gg = {n: [] for n in range(1, N + 1)}
        filter_punc, filter_stop = H["filter_punctuation"], H["filter_stopwords"]

        n_words = 0
        tokens = set()

        with open(corpus_fp, "r", encoding=encoding) as text:
            for line in text:
                words = tokenize_words(line, filter_punc, filter_stop)

                if vocab is not None:
                    words = vocab.filter(words, H["unk"])

                if len(words) == 0:
                    continue

                n_words += len(words)
                tokens.update(words)

                # calculate n, n-1, ... 1-grams
                for n in range(1, N + 1):
                    grams[n].append(
                        nltk.ngrams(
                            words,
                            n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol="<bol>",
                            right_pad_symbol="<eol>",
                        )
                    )

                    gg[n].extend(
                        list(
                            nltk.ngrams(
                                words,
                                n,
                                pad_left=True,
                                pad_right=True,
                                left_pad_symbol="<bol>",
                                right_pad_symbol="<eol>",
                            )
                        )
                    )

        for n in range(1, N + 1):
            counts[n] = nltk.FreqDist(gg[n])
            models[n] = nltk.lm.Lidstone(order=n, gamma=self.K)
            models[n].fit(grams[n], tokens)

        self.counts = counts
        self._models = models
        self.n_words = n_words
        self.n_tokens = len(vocab) if vocab is not None else len(tokens) 
Example 75
Project: naacl18-multitask_argument_mining   Author: UKPLab   File: preprocessing.py    Apache License 2.0 4 votes vote down vote up
def createMatrices(sentences, mappings, padOneTokenSentence=True):
    data = []
    numTokens = 0
    numUnknownTokens = 0    
    missingTokens = FreqDist()
    paddedSentences = 0

    for sentence in sentences:
        row = {name: [] for name in list(mappings.keys())+['raw_tokens']}
        
        for mapping, str2Idx in mappings.items():    
            if mapping not in sentence:
                continue
                    
            for entry in sentence[mapping]:                
                if mapping.lower() == 'tokens':
                    numTokens += 1
                    idx = str2Idx['UNKNOWN_TOKEN']
                    
                    if entry in str2Idx:
                        idx = str2Idx[entry]
                    elif entry.lower() in str2Idx:
                        idx = str2Idx[entry.lower()]
                    elif wordNormalize(entry) in str2Idx:
                        idx = str2Idx[wordNormalize(entry)]
                    else:
                        numUnknownTokens += 1    
                        missingTokens[wordNormalize(entry)] += 1
                        
                    row['raw_tokens'].append(entry)
                elif mapping.lower() == 'characters':  
                    idx = []
                    for c in entry:
                        if c in str2Idx:
                            idx.append(str2Idx[c])
                        else:
                            idx.append(str2Idx['UNKNOWN'])                           
                                      
                else:
                    idx = str2Idx[entry]
                                    
                row[mapping].append(idx)
                
        if len(row['tokens']) == 1 and padOneTokenSentence:
            paddedSentences += 1
            for mapping, str2Idx in mappings.items():
                if mapping.lower() == 'tokens':
                    row['tokens'].append(mappings['tokens']['PADDING_TOKEN'])
                    row['raw_tokens'].append('PADDING_TOKEN')
                elif mapping.lower() == 'characters':
                    row['characters'].append([0])
                else:
                    row[mapping].append(0)
            
        data.append(row)
    
    if numTokens > 0:           
        logging.info("Unknown-Tokens: %.2f%%" % (numUnknownTokens/float(numTokens)*100))
        
    return data 
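Here FreqDist is used purely as a tally that can be incremented with item assignment; missing entries default to zero, and the most frequent unmatched tokens can be inspected afterwards with most_common. A small sketch:

from nltk import FreqDist

missing_tokens = FreqDist()
for token in ["foo", "bar", "foo"]:
    missing_tokens[token] += 1         # unseen keys start at 0

print(missing_tokens.most_common(10))  # [('foo', 2), ('bar', 1)]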
Example 76
Project: ml-rnn-text-generation   Author: rmandyam   File: utils.py    MIT License 4 votes vote down vote up
def get_raw_input(fname, vocabulary_size):
    print("==> Loading text from %s" % fname)

    # list of all lines in input
    lines = []
    for i, line in enumerate(open(fname)):
        line = line.strip()
        line = line.replace('.', ' . ')
        #line = line.lower() # convert to all lower case
        lines.append(line)

    #print("No. of Lines in Input..%d " % len(lines))

    # tokenize words
    tokenized_lines = [nltk.word_tokenize(line) for line in lines]

    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_lines))
    print("Found %d unique word tokens." % len(word_freq))

    # Get the most common words and build index_to_word and word_to_index vectors
    most_common_vocab = word_freq.most_common(vocabulary_size-1)
    ivocab_1 = [x[0] for x in most_common_vocab]
    ivocab_1.append(unknown_token)
    vocab_1 = dict([(w,i) for i,w in enumerate(ivocab_1)])

    # Make sure sentence_end and sentence_start end up at indices 0 and 1, so that padding vectors with 0 still makes sense.
    ix_for_sent_end = vocab_1[sentence_end]
    curr_word_at_0  = ivocab_1[0]
    if not ix_for_sent_end == 0 :
       #swap places
       vocab_1[sentence_end]=0
       vocab_1[curr_word_at_0]=ix_for_sent_end
       ivocab_1[0]=sentence_end
       ivocab_1[ix_for_sent_end]=curr_word_at_0

    ix_for_sent_start = vocab_1[sentence_start]
    curr_word_at_1  = ivocab_1[1]
    if not ix_for_sent_start == 1 :
       #swap places
       vocab_1[sentence_start]=1
       vocab_1[curr_word_at_1]=ix_for_sent_start
       ivocab_1[1]=sentence_start
       ivocab_1[ix_for_sent_start]=curr_word_at_1

    return lines, vocab_1, ivocab_1 
Example 79
Project: codenn   Author: sriniiyer   File: SVM.py    MIT License 4 votes vote down vote up
def train(self, posTrainCorpus, negTrainCorpus):
        tokens = []

        fp = open(posTrainCorpus, 'r')
        for line in fp:
          tokens += SVM.tokenize(line)
        fp.close()

        fn = open(negTrainCorpus, 'r')
        for line in fn:
          tokens += SVM.tokenize(line)
        fn.close()

        #Create Frequency Distribution from both Positive and Negative Corpora
        trainFreq = nltk.FreqDist(tokens) 

        #No of Features
        self.noFeat = len(trainFreq)

        #Get Keys to maintain Order
        self.trainKeys = trainFreq.keys()

        #Create OrderedDict for features: Use this as sample for all files
        ordFeat = OrderedDict()
        for key in trainFreq.keys():
            ordFeat.update( {key: trainFreq.freq(key)} )

        posFeatList = self.featureList(posTrainCorpus)
        negFeatList = self.featureList(negTrainCorpus)
        featList = posFeatList + negFeatList

        noPos = len(posFeatList)
        noNeg = len(negFeatList)

        labels = []

        for j in range(noPos):
            labels.append(1)
        for k in range(noNeg):
            labels.append(0)

        #Create numpy Array for word frequencies : Feature Vector
        trainFreqArr = np.array(featList)
        trainLabels = np.array(labels)


        #Fit SVM
        # docClassifier = svm.SVC( C=1000)
        self.docClassifier = svm.LinearSVC()
        self.docClassifier.fit(trainFreqArr, trainLabels) 
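The feature values stored above are relative frequencies obtained with FreqDist.freq, not raw counts. A quick sketch of the difference:

import nltk

tokens = ["good", "bad", "good", "good"]
fd = nltk.FreqDist(tokens)

print(fd["good"])       # 3    -- raw count
print(fd.N())           # 4    -- total number of samples
print(fd.freq("good"))  # 0.75 -- relative frequency, count / N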
Example 80
Project: Semantic-Texual-Similarity-Toolkits   Author: rgtjf   File: features_embedding_online.py    MIT License 4 votes vote down vote up
def pooling(word_sa, emb_type, dim, pooling_types='avg', convey='idf'):
    idf_weight = dict_utils.DictLoader().load_dict('idf')
    embedding = Embedding()

    vdist = nltk.FreqDist(word_sa)
    length = float(len(word_sa))

    if pooling_types == 'avg':
        function = np.average
    elif pooling_types == 'min':
        function = np.amin
    elif pooling_types == 'max':
        function = np.amax
    else:
        print(pooling_types)
        raise NotImplementedError


    vec = []
    for word in word_sa:
        if emb_type == 'word2vec':
            st, w2v = embedding.get_word2vec(word)
        elif emb_type == 'glove':
            st, w2v = embedding.get_glove(word)
        elif emb_type == 'paragram':
            st, w2v = embedding.get_paragram(word)
        elif emb_type == 'glove300':
            st, w2v = embedding.get_glove300(word)
        else:
            raise NotImplementedError(emb_type)  # otherwise w2v would be referenced before assignment

        if convey == 'idf':
            w = idf_weight.get(word, 10.0)
        elif convey == 'tfidf':
            w = vdist[word] * idf_weight.get(word, 10.0)
        else:
            raise NotImplementedError

        w2v = w * np.array(w2v)
        vec.append(w2v)

    if len(vec) == 0:
        vec = np.zeros((dim,))
    else:
        vec = function(vec, axis=0)

    return vec
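The FreqDist here supplies the term-frequency part of a tf-idf weight for each word embedding. A stripped-down sketch of that weighting, with a hypothetical idf_weight dictionary standing in for the one loaded from DictLoader:

import nltk

word_sa = ["market", "prices", "market"]
idf_weight = {"market": 2.0, "prices": 3.5}   # hypothetical idf scores
default_idf = 10.0                            # fallback used above for unseen words

vdist = nltk.FreqDist(word_sa)
tfidf = {w: vdist[w] * idf_weight.get(w, default_idf) for w in vdist}
# {'market': 4.0, 'prices': 3.5}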