Python nltk.corpus Examples

The following are 30 code examples showing how to use the nltk.corpus module. They are extracted from open source projects; the project, author, source file, and license are noted above each example.

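For orientation, here is a minimal sketch of the pattern most of the examples below share: importing an individual corpus reader from nltk.corpus and reading from it. It assumes the referenced corpora have already been fetched with nltk.download().

# Minimal sketch (assumes nltk.download('brown') and nltk.download('wordnet') have been run).
from nltk.corpus import brown, wordnet as wn

print(brown.words()[:10])        # first ten tokens of the Brown corpus
print(brown.categories()[:5])    # a few of its document categories
print(wn.synsets('dog')[:3])     # WordNet synsets for 'dog'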

Example 1
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel
        relationship, breadth-first

            >>> from nltk.corpus import wordnet as wn
            >>> dog = wn.synset('dog.n.01')
            >>> hyp = lambda s:s.hypernyms()
            >>> list(dog.closure(hyp))
            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
            Synset('carnivore.n.01'), Synset('animal.n.01'),
            Synset('placental.n.01'), Synset('organism.n.01'),
            Synset('mammal.n.01'), Synset('living_thing.n.01'),
            Synset('vertebrate.n.01'), Synset('whole.n.02'),
            Synset('chordate.n.01'), Synset('object.n.01'),
            Synset('physical_entity.n.01'), Synset('entity.n.01')]

        """
        from nltk.util import breadth_first
        synset_offsets = []
        for synset in breadth_first(self, rel, depth):
            if synset._offset != self._offset:
                if synset._offset not in synset_offsets:
                    synset_offsets.append(synset._offset)
                    yield synset 
Example 2
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def res_similarity(self, other, ic, verbose=False):
        """
        Resnik Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node).

        :type  other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects.
            Synsets whose LCS is the root node of the taxonomy will have a
            score of 0 (e.g. N['dog'][0] and N['table'][0]).
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return lcs_ic 
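A brief usage sketch for res_similarity, assuming the wordnet and wordnet_ic data have been downloaded; the exact score depends on which information content file is loaded.

# Usage sketch (assumes nltk.download('wordnet') and nltk.download('wordnet_ic') have been run).
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')   # information content counts derived from the Brown corpus
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
print(dog.res_similarity(cat, brown_ic))   # roughly 7.9 with ic-brown.dat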
Example 3
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def lin_similarity(self, other, ic, verbose=False):
        """
        Lin Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects,
            in the range 0 to 1.
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return (2.0 * lcs_ic) / (ic1 + ic2) 
Example 4
Project: razzy-spinner   Author: rafasashi   File: glue.py    License: GNU General Public License v3.0
def get_pos_tagger(self):
        from nltk.corpus import brown
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        #Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger 
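To show what the cascade above produces, here is a standalone sketch that trains the same unigram/bigram/trigram backoff chain on the Brown news category and tags a sentence; the regexp patterns are abbreviated, and the corpus is assumed to be downloaded.

# Standalone sketch of the backoff cascade (assumes nltk.download('brown') has been run).
from nltk.corpus import brown
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger

regexp_tagger = RegexpTagger([(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*', 'NN')])
brown_train = brown.tagged_sents(categories='news')
unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

print(trigram_tagger.tag('Every dog chased a cat'.split()))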
Example 5
Project: razzy-spinner   Author: rafasashi   File: relextract.py    License: GNU General Public License v3.0
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
############################################# 
Example 6
Project: razzy-spinner   Author: rafasashi   File: relextract.py    License: GNU General Public License v3.0
def conllesp():
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
    for r in rels[:10]: print(clause(r, relsym='DE'))
    print() 
Example 7
Project: comparable-text-miner   Author: motazsaad   File: textpro.py    License: Apache License 2.0
def evaluate(trueValues, predicted, decimals, note):
    # accuracy_score, precision_score, recall_score and f1_score come from
    # sklearn.metrics (imported at module level in the source file).
    print(note)
    label = 1
    avg = 'weighted'
    a = accuracy_score(trueValues, predicted)
    p = precision_score(trueValues, predicted, pos_label=label, average=avg)
    r = recall_score(trueValues, predicted, pos_label=label, average=avg)
    avg_f1 = f1_score(trueValues, predicted, pos_label=label, average=avg)
    fclasses = f1_score(trueValues, predicted, average=None)
    f1c1 = fclasses[0]; f1c2 = fclasses[1]
    fw = (f1c1 + f1c2) / 2.0

    print('accuracy:\t', str(round(a, decimals)))
    print('precision:\t', str(round(p, decimals)))
    print('recall:\t', str(round(r, decimals)))
    print('avg f1:\t', str(round(avg_f1, decimals)))
    print('c1 f1:\t', str(round(f1c1, decimals)))
    print('c2 f1:\t', str(round(f1c2, decimals)))
    print('avg(c1,c2):\t', str(round(fw, decimals)))
    print('------------')

###################################################################################


# split a parallel or comparable corpus into two parts 
Example 8
Project: comparable-text-miner   Author: motazsaad   File: textpro.py    License: Apache License 2.0
def build_lsi_model(corpus_name, corpus_path, topics=300):
	logging.info( 'building lsi model for %s corpus', corpus_name )
	dictFile = corpus_path + corpus_name + '.dict'
	corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'
	
	logging.info( 'loading dictionary ...' )
	dictionary = corpora.Dictionary.load(dictFile)
	logging.info( 'loading tfidf corpus ...' )
	corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
	logging.info( 'building lsi model' )
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
	logging.info( 'saving lsi' )
	lsiFile = corpus_path + corpus_name + '.lsi'
	lsi.save(lsiFile)
	logging.info( 'lsi model is ready' )
################################################################################## 
Example 9
Project: comparable-text-miner   Author: motazsaad   File: textpro.py    License: Apache License 2.0
def aligning_doc_by_interlanguage_links(source_doc, target_corpus, source_language, target_language, output_path):
	
	source = None
	target = None
	
	source_title = get_title_from_interlanguage_links(source_doc, source_language)			
		
	for d in target_corpus:
		target_title = get_title_from_interlanguage_links(d, target_language)
		if source_title == target_title:
			source = source_doc
			target = d
			
	return source, target		
				
##################################################################################

# takes a wikipedia corpus (extracted by WikiExtractor.py) and splits the corpus into documents and clean them 
Example 10
Project: tmtoolkit   Author: WZBSocialScienceCenter   File: _common.py    License: Apache License 2.0
def pos_tag_convert_penn_to_wn(tag):
    """
    Convert POS tag from Penn tagset to WordNet tagset.

    :param tag: a tag from Penn tagset
    :return: a tag from WordNet tagset or None if no corresponding tag could be found
    """
    from nltk.corpus import wordnet as wn

    if tag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif tag in ['RB', 'RBR', 'RBS']:
        return wn.ADV
    elif tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return wn.NOUN
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return None 
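A hedged sketch of how a Penn-to-WordNet conversion like the one above is typically used: feed nltk.pos_tag output through the converter and into the WordNet lemmatizer. The sentence is made up; the NLTK data packages named in the comment are assumed to be installed.

# Illustrative sketch (assumes the 'punkt', 'averaged_perceptron_tagger' and 'wordnet'
# NLTK data packages are downloaded; pos_tag_convert_penn_to_wn is the function above).
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize('The striped bats are hanging on their feet')
for token, penn_tag in nltk.pos_tag(tokens):
    wn_tag = pos_tag_convert_penn_to_wn(penn_tag)
    if wn_tag is not None:
        print(token, lemmatizer.lemmatize(token, wn_tag))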
Example 11
Project: partisan-discourse   Author: DistrictDataLabs   File: learn.py    License: Apache License 2.0
def documents(self, fold=None, train=False, test=False):
        """
        A generator of documents being streamed from disk. Each document is
        a list of paragraphs, which are a list of sentences, which in turn is
        a list of tuples of (token, tag) pairs. All preprocessing is done by
        NLTK and the CorpusReader object this object wraps.

        If a fold is specified (should be an integer between 0 and folds),
        then the loader will return documents from that fold. Further, train
        or test must be specified to split the fold correctly. This method
        allows us to maintain the generator properties of document reads.
        """
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.tagged(fileids=fileid))


##########################################################################
## Normalize Transformer
########################################################################## 
Example 12
Project: Sentence-similarity-classifier-for-pyTorch   Author: demelin   File: sick_extender.py    License: MIT License
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
                 min_substitutions=2, num_candidates=5, concatenate_corpora=True):
        self.sick_path = sick_path
        self.target_directory = target_directory
        self.lm_path = lm_path
        self.wsd_algorithm = wsd_algorithm
        self.sampling_parameter = sampling_parameter
        self.min_substitutions = min_substitutions
        self.num_candidates = num_candidates
        self.concatenate_corpora = concatenate_corpora
        self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
        self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')
        # Filter the original SICK corpus to match the expected format, and create file for LM training
        if not os.path.exists(self.filtered_path) or not os.path.exists(self.noscore_path):
            self.filter_sick()
        if self.lm_path is None:
            raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
        else:
            self.language_model = kenlm.LanguageModel(self.lm_path) 
Example 13
Project: Sentence-similarity-classifier-for-pyTorch   Author: demelin   File: sick_extender.py    License: MIT License
def filter_sick(self):
        """ Processes the original S.I.C.K. corpus into a format where each line contains the two compared sentences
        followed by their relatedness score. """
        # Filter the SICK dataset for sentences and relatedness score only
        df_origin = pd.read_table(self.sick_path)
        df_classify = df_origin.loc[:, ['sentence_A', 'sentence_B', 'relatedness_score']]
        # Scale relatedness score to lie in [0, 1] for training of the classifier
        df_classify['relatedness_score'] = df_classify['relatedness_score'].apply(
            lambda x: "{:.4f}".format(float(x)/5.0))

        df_noscore = df_origin.loc[:, ['sentence_A', 'sentence_B']]
        df_noscore = df_noscore.stack()

        # Write the filtered set to a .csv file
        df_classify.to_csv(self.filtered_path, sep='\t', index=False, header=False)
        print('Filtered corpus saved to %s.' % self.filtered_path)

        # Write a score-free set to a .csv file to be used in the training of the KN language model
        df_noscore.to_csv(self.noscore_path, index=False, header=False)
        print('Filtered corpus saved to %s.' % self.noscore_path) 
Example 14
Project: Sentence-similarity-classifier-for-pyTorch   Author: demelin   File: sick_extender.py    License: MIT License
def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
        # Split line into sentences + score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)
        # Assign part of speech tags
        s1_penn_pos = nltk.pos_tag(s1_tokens)
        s2_penn_pos = nltk.pos_tag(s2_tokens)
        # Convert to WordNet POS tags and store word position in sentence for replacement
        # Each tuple contains (word, WordNet_POS_tag, position)
        s1_wn_pos = list()
        s2_wn_pos = list()
        for idx, item in enumerate(s1_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s1_penn_pos.index(item)))
        for idx, item in enumerate(s2_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s2_penn_pos.index(item)))

        # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score 
Example 15
Project: gobbli   Author: RTIInternational   File: wordnet.py    License: Apache License 2.0
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        try:
            from nltk.corpus import wordnet
            import nltk
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires nltk to be installed."
            )

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )

        if not skip_download_check:
            nltk.download("wordnet")

        self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
        Token.set_extension("replacement", default=None, force=True) 
Example 16
Project: coling2018_fake-news-challenge   Author: UKPLab   File: feature_engineering.py    License: Apache License 2.0
def latent_semantic_indexing_gensim_300_concat_holdout(headlines, bodies):
    """
    Takes all the data (holdout+test+train) and interprets the headlines and bodies as separate
    documents. Instead of combining them, they are appended. It then tokenizes these ~50k headline docs and ~50k body docs,
    builds a Tfidf matrix out of them and creates an LSI model from it. In the next step the headlines and
    bodies for the feature generation are also treated as separate documents and merely appended. They too are tokenized and
    turned into a Tfidf matrix. This matrix is passed to the learned LSI model, which returns a matrix
    in which each document is represented as a vector with length(topics) entries of (topic-id, distance of this doc to the topic).
    The probabilities are then taken as a feature vector for the document. The first half of the matrix represents the headline docs,
    the latter half the body docs. In the end, the feature vector of each headline is concatenated with its body feature vector.

    The differences to latent_semantic_indexing_gensim_300_concat_OLD are:
        - holdout data is also used
        - a Tfidf matrix is built and used both to create the LSI model and to retrieve the features, instead of just building the LSI model from a corpus and
            passing each headline and body separately into it to retrieve their features (does it make a difference, since the dictionary already takes
            tfidf into account?)
        - the vectors are taken fully, not just the cosine distance between them
    """
    return topic_models.latent_semantic_indexing_gensim_concat(headlines, bodies, n_topics=300, include_holdout=True,
                                                               include_unlbled_test=False) 
Example 17
Project: coling2018_fake-news-challenge   Author: UKPLab   File: feature_engineering.py    License: Apache License 2.0
def latent_semantic_indexing_gensim_300_concat_holdout_unlbled_test(headlines, bodies):
    """
    Takes all the data (holdout+test+train) and interprets the headlines and bodies as separate
    documents. Instead of combining them, they are appended. It then tokenizes these ~50k headline docs and ~50k body docs,
    builds a Tfidf matrix out of them and creates an LSI model from it. In the next step the headlines and
    bodies for the feature generation are also treated as separate documents and merely appended. They too are tokenized and
    turned into a Tfidf matrix. This matrix is passed to the learned LSI model, which returns a matrix
    in which each document is represented as a vector with length(topics) entries of (topic-id, distance of this doc to the topic).
    The probabilities are then taken as a feature vector for the document. The first half of the matrix represents the headline docs,
    the latter half the body docs. In the end, the feature vector of each headline is concatenated with its body feature vector.

    The differences to latent_semantic_indexing_gensim_300_concat_OLD are:
        - holdout data is also used
        - a Tfidf matrix is built and used both to create the LSI model and to retrieve the features, instead of just building the LSI model from a corpus and
            passing each headline and body separately into it to retrieve their features (does it make a difference, since the dictionary already takes
            tfidf into account?)
        - the vectors are taken fully, not just the cosine distance between them
    """
    return topic_models.latent_semantic_indexing_gensim_concat(headlines, bodies, n_topics=300, include_holdout=True,
                                                               include_unlbled_test=True) 
Example 18
Project: atap   Author: foxbook   File: transformer.py    License: Apache License 2.0
def __init__(self, nfeatures=100, tofull=False):
        """
        Pass in a directory that holds the lexicon in corpus.dict and the
        TFIDF model in tfidf.model (for now).

        Set tofull = True if the next thing is a Scikit-Learn estimator
        otherwise keep False if the next thing is a Gensim model.
        """
        self._lexicon_path = "lexigram.dict"
        self._tfidf_path = "tfidf.model"
        self.nfeatures = nfeatures
        self.lexicon = None
        self.tfidf = None
        self.tofull = tofull

        self.load() 
Example 19
Project: pliers   Author: tyarkoni   File: test_text_filters.py    License: BSD 3-Clause "New" or "Revised" License
def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work' 
Example 20
def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace
    smoothed frequency distribution of words, then uses this information
    to compute the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not word in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1)) 
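The function above relies on module-level state from its source file; the sketch below shows a minimal version of that setup (the variable names match the snippet, the rest is assumed) so that the lazy frequency build runs on the first call.

# Minimal sketch of the module-level state info_content() expects
# (assumes nltk.download('brown') has been run; the first call is slow because
# it walks the entire Brown corpus once).
import math
from nltk.corpus import brown

brown_freqs = dict()   # word -> raw count over the Brown corpus
N = 0                  # total token count; 0 triggers the lazy build above

print(info_content('dog'))       # frequent word -> lower information content
print(info_content('aardvark'))  # rare or unseen word -> value at or near 1.0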
Example 21
Project: razzy-spinner   Author: rafasashi   File: util.py    License: GNU General Public License v3.0
def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent))) # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1) # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1) # negative
        else:
            y.append(0) # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot == True:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']) 
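A one-line usage sketch for the demo above; it assumes the opinion_lexicon corpus has been fetched with nltk.download('opinion_lexicon').

# Usage sketch (assumes nltk.download('opinion_lexicon') has been run).
demo_liu_hu_lexicon('This movie was good and uplifting', plot=False)   # expected to print: Positive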
Example 22
Project: razzy-spinner   Author: rafasashi   File: __init__.py    License: GNU General Public License v3.0
def teardown_module(module=None):
    import nltk.corpus
    for name in dir(nltk.corpus):
        obj = getattr(nltk.corpus, name, None)
        if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'):
            obj._unload() 
Example 23
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def jcn_similarity(self, other, ic, verbose=False):
        """
        Jiang-Conrath Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

        :type  other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type  ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects.
        """

        if self == other:
            return _INF

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)

        # If either of the input synsets are the root synset, or have a
        # frequency of 0 (sparse data problem), return 0.
        if ic1 == 0 or ic2 == 0:
            return 0

        ic_difference = ic1 + ic2 - 2 * lcs_ic

        if ic_difference == 0:
            return _INF

        return 1 / ic_difference 
Example 24
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def morphy(self, form, pos=None):
        """
        Find a possible base form for the given form, with the given
        part of speech, by checking WordNet's list of exceptional
        forms, and by recursively stripping affixes for this part of
        speech until a form in WordNet is found.

        >>> from nltk.corpus import wordnet as wn
        >>> print(wn.morphy('dogs'))
        dog
        >>> print(wn.morphy('churches'))
        church
        >>> print(wn.morphy('aardwolves'))
        aardwolf
        >>> print(wn.morphy('abaci'))
        abacus
        >>> wn.morphy('hardrock', wn.ADV)
        >>> print(wn.morphy('book', wn.NOUN))
        book
        >>> wn.morphy('book', wn.ADJ)
        """

        if pos is None:
            morphy = self._morphy
            analyses = chain(a for p in POS_LIST for a in morphy(form, p))
        else:
            analyses = self._morphy(form, pos)

        # get the first one we find
        first = list(islice(analyses, 1))
        if len(first) == 1:
            return first[0]
        else:
            return None 
Example 25
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def ic(self, icfile):
        """
        Load an information content file from the wordnet_ic corpus
        and return a dictionary.  This dictionary has just two keys,
        NOUN and VERB, whose values are dictionaries that map from
        synsets to information content values.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
        :return: An information content dictionary
        """
        ic = {}
        ic[NOUN] = defaultdict(float)
        ic[VERB] = defaultdict(float)
        for num, line in enumerate(self.open(icfile)):
            if num == 0: # skip the header
                continue
            fields = line.split()
            offset = int(fields[0][:-1])
            value = float(fields[1])
            pos = _get_pos(fields[0])
            if len(fields) == 3 and fields[2] == "ROOT":
                # Store root count.
                ic[pos][0] += value
            if value != 0:
                ic[pos][offset] = value
        return ic


######################################################################
# Similarity metrics
######################################################################

# TODO: Add in the option to manually add a new root node; this will be
# useful for verb similarity as there exist multiple verb taxonomies.

# More information about the metrics is available at
# http://marimba.d.umn.edu/similarity/measures.html 
Example 26
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def _get_pos(field):
    if field[-1] == 'n':
        return NOUN
    elif field[-1] == 'v':
        return VERB
    else:
        msg = "Unidentified part of speech in WordNet Information Content file for field %s" % field
        raise ValueError(msg)


# unload corpus after tests 
Example 27
Project: razzy-spinner   Author: rafasashi   File: wordnet.py    License: GNU General Public License v3.0
def teardown_module(module=None):
    from nltk.corpus import wordnet
    wordnet._unload()


######################################################################
# Demo
###################################################################### 
Example 28
Project: razzy-spinner   Author: rafasashi   File: relextract.py    License: GNU General Public License v3.0
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also past
    wordt/V  # and present forms of worden ('become')
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)


    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
                lcon = rcon = True
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
            print(rtuple(rel, lcon=True, rcon=True))

#############################################
## Spanish CONLL2002: (PER, ORG)
############################################# 
Example 29
Project: Quora-Question-Pairs   Author: rupak-118   File: MaLSTM_train.py    License: MIT License
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only letters, digits and certain words (punctuation, question marks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs_list = []
        for word in row.split():
            word = word.lower()
            word = re.sub(r"[^a-zA-Z0-9^.']"," ",word)
            word = re.sub(r"what's", "what is ", word)
            word = re.sub(r"\'ve", " have ", word)
            word = re.sub(r"can't", "cannot ", word)
            word = re.sub(r"n't", " not ", word)
            word = re.sub(r"i'm", "i am ", word)
            word = re.sub(r"\'re", " are ", word)
            word = re.sub(r"\'d", " would ", word)
            word = re.sub(r"\'ll", " will ", word)
            # If the word contains numbers with decimals, this will preserve it
            if bool(re.search(r'\d', word) and re.search(r'\.', word)) and word not in keep_list:
                keep_list.append(word)
            # Preserves certain frequently occurring dot words
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                qs_list.append(p1)
            else : qs_list.append(word)
        
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs_list)))
    return cleaned_corpus 
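A small illustrative call for the routine above, with made-up questions. Note that text_clean uses pandas Series.append, which was removed in pandas 2.0, so the sketch assumes an older pandas.

# Illustrative call (made-up inputs; assumes pandas < 2.0 because text_clean()
# above relies on Series.append).
import re
import pandas as pd

questions = pd.Series(["What's the best way to learn NLP?",
                       "Can't I use version 3.5 of nltk for this?"])
print(text_clean(questions, keep_list=[]).tolist())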
Example 30
Project: Quora-Question-Pairs   Author: rupak-118   File: test.py    License: MIT License
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only letters, digits and certain words (punctuation, question marks, tabs etc. removed)
    
    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process
    
    Output : Returns the cleaned text corpus
    
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs_list = []
        for word in row.split():
            word = word.lower()
            word = re.sub(r"[^a-zA-Z0-9^.']"," ",word)
            word = re.sub(r"what's", "what is ", word)
            word = re.sub(r"\'ve", " have ", word)
            word = re.sub(r"can't", "cannot ", word)
            word = re.sub(r"n't", " not ", word)
            word = re.sub(r"i'm", "i am ", word)
            word = re.sub(r"\'re", " are ", word)
            word = re.sub(r"\'d", " would ", word)
            word = re.sub(r"\'ll", " will ", word)
            # If the word contains numbers with decimals, this will preserve it
            if bool(re.search(r'\d', word) and re.search(r'\.', word)) and word not in keep_list:
                keep_list.append(word)
            # Preserves certain frequently occurring dot words
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                qs_list.append(p1)
            else : qs_list.append(word)
        
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs_list)))
    
    return cleaned_corpus