Python nltk.corpus() Examples
The following are 30 code examples showing how to use nltk.corpus. They are extracted from open source projects; the project, author, file, and license are noted above each example. You may also want to check out all available functions and classes of the nltk module.
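Before diving into the examples, here is a minimal sketch of typical nltk.corpus usage (it assumes the Brown corpus is available; nltk.download fetches it on first use):

    import nltk
    from nltk.corpus import brown

    nltk.download('brown')                        # no-op if the corpus is already installed
    print(brown.categories()[:5])                 # corpus readers expose categories ...
    print(brown.words(categories='news')[:10])    # ... and lazily loaded words and sentences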
Example 1
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def closure(self, rel, depth=-1):
    """Return the transitive closure of source under the rel
    relationship, breadth-first

    >>> from nltk.corpus import wordnet as wn
    >>> dog = wn.synset('dog.n.01')
    >>> hyp = lambda s: s.hypernyms()
    >>> list(dog.closure(hyp))
    [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
    Synset('carnivore.n.01'), Synset('animal.n.01'),
    Synset('placental.n.01'), Synset('organism.n.01'),
    Synset('mammal.n.01'), Synset('living_thing.n.01'),
    Synset('vertebrate.n.01'), Synset('whole.n.02'),
    Synset('chordate.n.01'), Synset('object.n.01'),
    Synset('physical_entity.n.01'), Synset('entity.n.01')]
    """
    from nltk.util import breadth_first
    synset_offsets = []
    for synset in breadth_first(self, rel, depth):
        if synset._offset != self._offset:
            if synset._offset not in synset_offsets:
                synset_offsets.append(synset._offset)
                yield synset
Example 2
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects. Synsets whose LCS is the root node of the taxonomy will
        have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return lcs_ic
Example 3
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects, in the range 0 to 1.
    """
    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
    return (2.0 * lcs_ic) / (ic1 + ic2)
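A short usage sketch for the two similarity methods above, as they are exposed through nltk.corpus.wordnet; it assumes the wordnet and wordnet_ic corpora have been downloaded:

    import nltk
    from nltk.corpus import wordnet as wn
    from nltk.corpus import wordnet_ic

    nltk.download('wordnet')
    nltk.download('wordnet_ic')

    brown_ic = wordnet_ic.ic('ic-brown.dat')      # information content computed from the Brown corpus
    dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')

    print(dog.res_similarity(cat, brown_ic))      # Resnik: IC of the least common subsumer
    print(dog.lin_similarity(cat, brown_ic))      # Lin: 2 * IC(lcs) / (IC(s1) + IC(s2))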
Example 4
Project: razzy-spinner Author: rafasashi File: glue.py License: GNU General Public License v3.0

def get_pos_tagger(self):
    from nltk.corpus import brown
    regexp_tagger = RegexpTagger(
        [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),   # articles
         (r'.*able$', 'JJ'),                # adjectives
         (r'.*ness$', 'NN'),                # nouns formed from adjectives
         (r'.*ly$', 'RB'),                  # adverbs
         (r'.*s$', 'NNS'),                  # plural nouns
         (r'.*ing$', 'VBG'),                # gerunds
         (r'.*ed$', 'VBD'),                 # past tense verbs
         (r'.*', 'NN')                      # nouns (default)
        ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

    return main_tagger
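The snippet above relies on tagger classes imported elsewhere in glue.py. A self-contained sketch of the same regexp/unigram/bigram/trigram backoff idea, assuming the Brown corpus has been downloaded:

    import nltk
    from nltk.corpus import brown
    from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger

    nltk.download('brown')
    train = brown.tagged_sents(categories='news')

    fallback = RegexpTagger([(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')])
    unigram = UnigramTagger(train, backoff=fallback)
    bigram = BigramTagger(train, backoff=unigram)
    trigram = TrigramTagger(train, backoff=bigram)

    print(trigram.tag(['Every', 'dog', 'barks']))   # each tagger falls back to the next when unsure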
Example 5
Project: razzy-spinner Author: rafasashi File: relextract.py License: GNU General Public License v3.0

def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree

    print("IEER: First 20 Headlines")
    print("=" * 45)

    trees = [(doc.docno, doc.headline) for file in ieer.fileids()
             for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)


#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
Example 6
Project: razzy-spinner Author: rafasashi File: relextract.py License: GNU General Public License v3.0

def conllesp():
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)]
    for r in rels[:10]:
        print(clause(r, relsym='DE'))
    print()
Example 7
Project: comparable-text-miner Author: motazsaad File: textpro.py License: Apache License 2.0

def evaluate(trueValues, predicted, decimals, note):
    print note
    label = 1
    avg = 'weighted'
    a = accuracy_score(trueValues, predicted)
    p = precision_score(trueValues, predicted, pos_label=label, average=avg)
    r = recall_score(trueValues, predicted, pos_label=label, average=avg)
    avg_f1 = f1_score(trueValues, predicted, pos_label=label, average=avg)
    fclasses = f1_score(trueValues, predicted, average=None)
    f1c1 = fclasses[0]; f1c2 = fclasses[1]
    fw = (f1c1 + f1c2)/2.0

    print 'accuracy:\t', str(round(a, decimals))
    print 'precision:\t', str(round(p, decimals))
    print 'recall:\t', str(round(r, decimals))
    print 'avg f1:\t', str(round(avg_f1, decimals))
    print 'c1 f1:\t', str(round(f1c1, decimals))
    print 'c2 f1:\t', str(round(f1c2, decimals))
    print 'avg(c1,c2):\t', str(round(fw, decimals))
    print '------------'

###################################################################################
# split a parallel or comparable corpus into two parts
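Note that this snippet uses Python 2 print statements. A minimal Python 3 sketch of the same scikit-learn metric calls, with hypothetical toy labels:

    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

    y_true = [1, 0, 1, 1, 0]   # hypothetical toy labels
    y_pred = [1, 0, 0, 1, 1]

    print('accuracy:', round(accuracy_score(y_true, y_pred), 3))
    print('precision:', round(precision_score(y_true, y_pred, average='weighted'), 3))
    print('recall:', round(recall_score(y_true, y_pred, average='weighted'), 3))
    print('weighted f1:', round(f1_score(y_true, y_pred, average='weighted'), 3))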
Example 8
Project: comparable-text-miner Author: motazsaad File: textpro.py License: Apache License 2.0

def build_lsi_model(corpus_name, corpus_path, topics=300):
    logging.info('building lsi model for %s corpus', corpus_name)
    dictFile = corpus_path + corpus_name + '.dict'
    corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'

    logging.info('loading dictionary ...')
    dictionary = corpora.Dictionary.load(dictFile)
    logging.info('loading tfidf corpus ...')
    corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
    logging.info('building lsi model')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
    logging.info('saving lsi')
    lsiFile = corpus_path + corpus_name + '.lsi'
    lsi.save(lsiFile)
    logging.info('lsi model is ready')

##################################################################################
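A hypothetical call, assuming <corpus_path><corpus_name>.dict and <corpus_path><corpus_name>.tfidf.mm have already been written with gensim:

    # Hypothetical paths; the function expects ./data/mycorpus.dict and
    # ./data/mycorpus.tfidf.mm to exist already.
    build_lsi_model('mycorpus', './data/', topics=200)
    # The saved model can later be reloaded with models.LsiModel.load('./data/mycorpus.lsi')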
Example 9
Project: comparable-text-miner Author: motazsaad File: textpro.py License: Apache License 2.0

def aligning_doc_by_interlanguage_links(source_doc, target_corpus, source_language, target_language, output_path):
    source = None
    target = None
    source_title = get_title_from_interlanguage_links(source_doc, source_language)
    for d in target_corpus:
        target_title = get_title_from_interlanguage_links(d, target_language)
        if source_title == target_title:
            source = source_doc
            target = d
    return source, target

##################################################################################
# takes a wikipedia corpus (extracted by WikiExtractor.py) and splits the corpus into documents and cleans them
Example 10
Project: tmtoolkit Author: WZBSocialScienceCenter File: _common.py License: Apache License 2.0

def pos_tag_convert_penn_to_wn(tag):
    """
    Convert POS tag from Penn tagset to WordNet tagset.

    :param tag: a tag from Penn tagset
    :return: a tag from WordNet tagset or None if no corresponding tag could be found
    """
    from nltk.corpus import wordnet as wn

    if tag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif tag in ['RB', 'RBR', 'RBS']:
        return wn.ADV
    elif tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return wn.NOUN
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return None
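A small usage sketch for the converter above, assuming NLTK's punkt tokenizer and averaged_perceptron_tagger models have been downloaded:

    import nltk
    from nltk import pos_tag, word_tokenize

    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')

    tagged = pos_tag(word_tokenize('The quick brown fox jumps over the lazy dog'))
    print([(word, pos_tag_convert_penn_to_wn(tag)) for word, tag in tagged])
    # Determiners like 'The' map to None; nouns, verbs, adjectives and adverbs map to WordNet constants.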
Example 11
Project: partisan-discourse Author: DistrictDataLabs File: learn.py License: Apache License 2.0

def documents(self, fold=None, train=False, test=False):
    """
    A generator of documents being streamed from disk. Each document is a
    list of paragraphs, which are a list of sentences, which in turn is a
    list of tuples of (token, tag) pairs. All preprocessing is done by NLTK
    and the CorpusReader object this object wraps.

    If a fold is specified (should be an integer between 0 and folds), then
    the loader will return documents from that fold. Further, train or test
    must be specified to split the fold correctly. This method allows us to
    maintain the generator properties of document reads.
    """
    for fileid in self.fileids(fold, train, test):
        yield list(self.corpus.tagged(fileids=fileid))

##########################################################################
## Normalize Transformer
##########################################################################
Example 12
Project: Sentence-similarity-classifier-for-pyTorch Author: demelin File: sick_extender.py License: MIT License

def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
             min_substitutions=2, num_candidates=5, concatenate_corpora=True):
    self.sick_path = sick_path
    self.target_directory = target_directory
    self.lm_path = lm_path
    self.wsd_algorithm = wsd_algorithm
    self.sampling_parameter = sampling_parameter
    self.min_substitutions = min_substitutions
    self.num_candidates = num_candidates
    self.concatenate_corpora = concatenate_corpora
    self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
    self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')
    # Filter the original SICK corpus to match the expected format, and create file for LM training
    if not os.path.exists(self.filtered_path) or not os.path.exists(self.noscore_path):
        self.filter_sick()
    if self.lm_path is None:
        raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
    else:
        self.language_model = kenlm.LanguageModel(self.lm_path)
Example 13
Project: Sentence-similarity-classifier-for-pyTorch Author: demelin File: sick_extender.py License: MIT License

def filter_sick(self):
    """ Processes the original S.I.C.K. corpus into a format where each line contains the two compared
    sentences followed by their relatedness score. """
    # Filter the SICK dataset for sentences and relatedness score only
    df_origin = pd.read_table(self.sick_path)
    df_classify = df_origin.loc[:, ['sentence_A', 'sentence_B', 'relatedness_score']]
    # Scale relatedness score to lie in [0, 1] for training of the classifier
    df_classify['relatedness_score'] = df_classify['relatedness_score'].apply(
        lambda x: "{:.4f}".format(float(x) / 5.0))
    df_noscore = df_origin.loc[:, ['sentence_A', 'sentence_B']]
    df_noscore = df_noscore.stack()

    # Write the filtered set to a .csv file
    df_classify.to_csv(self.filtered_path, sep='\t', index=False, header=False)
    print('Filtered corpus saved to %s.' % self.filtered_path)
    # Write a score-free set to a .csv file to be used in the training of the KN language model
    df_noscore.to_csv(self.noscore_path, index=False, header=False)
    print('Filtered corpus saved to %s.' % self.noscore_path)
Example 14
Project: Sentence-similarity-classifier-for-pyTorch Author: demelin File: sick_extender.py License: MIT License

def line_prep(self, line):
    """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
    # Split line into sentences + score
    s1, s2, sim_score = line.split('\t')
    # Tokenize
    s1_tokens = word_tokenize(s1)
    s2_tokens = word_tokenize(s2)
    # Assign part of speech tags
    s1_penn_pos = nltk.pos_tag(s1_tokens)
    s2_penn_pos = nltk.pos_tag(s2_tokens)
    # Convert to WordNet POS tags and store word position in sentence for replacement
    # Each tuple contains (word, WordNet_POS_tag, position)
    s1_wn_pos = list()
    s2_wn_pos = list()
    for idx, item in enumerate(s1_penn_pos):
        if self.get_wordnet_pos(item[1]) != 'OTHER':
            s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s1_penn_pos.index(item)))
    for idx, item in enumerate(s2_penn_pos):
        if self.get_wordnet_pos(item[1]) != 'OTHER':
            s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), s2_penn_pos.index(item)))

    # Each tuple contains (word, WordNet_POS_tag, position); source sentence provided for use in disambiguation
    return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score
Example 15
Project: gobbli Author: RTIInternational File: wordnet.py License: Apache License 2.0

def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
    try:
        from nltk.corpus import wordnet
        import nltk
    except ImportError:
        raise ImportError(
            "WordNet-based data augmentation requires nltk to be installed."
        )

    self.wn = wordnet

    try:
        import spacy
        from spacy.tokens import Token
    except ImportError:
        raise ImportError(
            "WordNet-based data augmentation requires spaCy and a language "
            "model to be installed (for part of speech tagging)."
        )

    if not skip_download_check:
        nltk.download("wordnet")

    self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
    Token.set_extension("replacement", default=None, force=True)
Example 16
Project: coling2018_fake-news-challenge Author: UKPLab File: feature_engineering.py License: Apache License 2.0

def latent_semantic_indexing_gensim_300_concat_holdout(headlines, bodies):
    """
    Takes all the data (holdout + test + train) and interprets the headlines and bodies as different
    documents. Instead of combining them, they are appended. It then tokenizes these ~50k headline-docs
    and ~50k body-docs, builds a Tfidf matrix out of them and creates an LSI model from it.

    In the next step, the headlines and bodies for the feature generation are also treated as different
    documents and merely appended. They too are tokenized and a Tfidf matrix is built. This matrix is
    passed to the learned LSI model and a matrix is returned in which each document is represented as a
    vector of length(topics) with entries (topic-id, distance of this doc to the topic). These values are
    then taken as the feature vector for the document. The first half of the matrix represents the
    headline docs, the latter half the body docs. In the end, each headline feature vector is
    concatenated with its body feature vector.

    The differences to latent_semantic_indexing_gensim_300_concat_OLD are:
        - holdout data is also used
        - a Tfidf matrix is built and used both to create the LSI model and to retrieve the features,
          instead of just a corpus to build the LSI model and passing each headline and body separately
          into the LSI model to retrieve its features (does it make a difference, since the dictionary
          already takes tfidf into account?)
        - the vectors are taken fully and not just the cosine distance between them
    """
    return topic_models.latent_semantic_indexing_gensim_concat(headlines, bodies, n_topics=300,
                                                                include_holdout=True,
                                                                include_unlbled_test=False)
Example 17
Project: coling2018_fake-news-challenge Author: UKPLab File: feature_engineering.py License: Apache License 2.0

def latent_semantic_indexing_gensim_300_concat_holdout_unlbled_test(headlines, bodies):
    """
    Takes all the data (holdout + test + train) and interprets the headlines and bodies as different
    documents. Instead of combining them, they are appended. It then tokenizes these ~50k headline-docs
    and ~50k body-docs, builds a Tfidf matrix out of them and creates an LSI model from it.

    In the next step, the headlines and bodies for the feature generation are also treated as different
    documents and merely appended. They too are tokenized and a Tfidf matrix is built. This matrix is
    passed to the learned LSI model and a matrix is returned in which each document is represented as a
    vector of length(topics) with entries (topic-id, distance of this doc to the topic). These values are
    then taken as the feature vector for the document. The first half of the matrix represents the
    headline docs, the latter half the body docs. In the end, each headline feature vector is
    concatenated with its body feature vector.

    The differences to latent_semantic_indexing_gensim_300_concat_OLD are:
        - holdout data is also used
        - a Tfidf matrix is built and used both to create the LSI model and to retrieve the features,
          instead of just a corpus to build the LSI model and passing each headline and body separately
          into the LSI model to retrieve its features (does it make a difference, since the dictionary
          already takes tfidf into account?)
        - the vectors are taken fully and not just the cosine distance between them
    """
    return topic_models.latent_semantic_indexing_gensim_concat(headlines, bodies, n_topics=300,
                                                                include_holdout=True,
                                                                include_unlbled_test=True)
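The project-specific helper topic_models.latent_semantic_indexing_gensim_concat is not reproduced here. The following is a minimal, generic gensim sketch of the Tfidf-then-LSI pipeline the docstring describes, using hypothetical toy documents:

    from gensim import corpora, models

    docs = [['stock', 'market', 'falls'],        # hypothetical "headline" document
            ['team', 'wins', 'final', 'match']]  # hypothetical "body" document

    dictionary = corpora.Dictionary(docs)
    bow = [dictionary.doc2bow(d) for d in docs]
    tfidf = models.TfidfModel(bow)
    lsi = models.LsiModel(tfidf[bow], id2word=dictionary, num_topics=2)

    # Each document becomes a list of (topic_id, weight) pairs that can serve as a feature vector.
    features = [lsi[tfidf[v]] for v in bow]
    print(features)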
Example 18
Project: atap Author: foxbook File: transformer.py License: Apache License 2.0

def __init__(self, nfeatures=100, tofull=False):
    """
    Pass in a directory that holds the lexicon in corpus.dict and the
    TFIDF model in tfidf.model (for now).

    Set tofull = True if the next thing is a Scikit-Learn estimator,
    otherwise keep False if the next thing is a Gensim model.
    """
    self._lexicon_path = "lexigram.dict"
    self._tfidf_path = "tfidf.model"

    self.nfeatures = nfeatures
    self.lexicon = None
    self.tfidf = None
    self.tofull = tofull

    self.load()
Example 19
Project: pliers Author: tyarkoni File: test_text_filters.py License: BSD 3-Clause "New" or "Revised" License

def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work'
Example 20
Project: Semantic-Texual-Similarity-Toolkits Author: rgtjf File: short_sentence_similarity.py License: MIT License

def info_content(lookup_word):
    """
    Uses the Brown corpus available in NLTK to calculate a Laplace smoothed
    frequency distribution of words, then uses this information to compute
    the information content of the lookup_word.
    """
    global N
    if N == 0:
        # poor man's lazy evaluation
        for sent in brown.sents():
            for word in sent:
                word = word.lower()
                if not word in brown_freqs:
                    brown_freqs[word] = 0
                brown_freqs[word] = brown_freqs[word] + 1
                N = N + 1
    lookup_word = lookup_word.lower()
    n = 0 if not lookup_word in brown_freqs else brown_freqs[lookup_word]
    return 1.0 - (math.log(n + 1) / math.log(N + 1))
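A hedged usage sketch; it assumes the module-level state of the original file (N = 0, brown_freqs = {}, from nltk.corpus import brown) and that the Brown corpus has been downloaded. The first call walks the whole corpus to build the frequency table:

    print(info_content('dog'))   # relatively rare word  -> higher information content
    print(info_content('the'))   # very frequent word    -> information content close to 0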
Example 21
Project: razzy-spinner Author: rafasashi File: util.py License: GNU General Public License v3.0

def demo_liu_hu_lexicon(sentence, plot=False):
    """
    Basic example of sentiment classification using Liu and Hu opinion lexicon.
    This function simply counts the number of positive, negative and neutral words
    in the sentence and classifies it depending on which polarity is more represented.
    Words that do not appear in the lexicon are considered as neutral.

    :param sentence: a sentence whose polarity has to be classified.
    :param plot: if True, plot a visual representation of the sentence polarity.
    """
    from nltk.corpus import opinion_lexicon
    from nltk.tokenize import treebank

    tokenizer = treebank.TreebankWordTokenizer()
    pos_words = 0
    neg_words = 0
    tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)]

    x = list(range(len(tokenized_sent)))  # x axis for the plot
    y = []

    for word in tokenized_sent:
        if word in opinion_lexicon.positive():
            pos_words += 1
            y.append(1)   # positive
        elif word in opinion_lexicon.negative():
            neg_words += 1
            y.append(-1)  # negative
        else:
            y.append(0)   # neutral

    if pos_words > neg_words:
        print('Positive')
    elif pos_words < neg_words:
        print('Negative')
    elif pos_words == neg_words:
        print('Neutral')

    if plot == True:
        _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive'])
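Calling the demo only requires the opinion lexicon corpus; a short sketch (the expected outputs are noted in comments, assuming the listed words appear in the Liu and Hu lexicon):

    import nltk
    nltk.download('opinion_lexicon')

    demo_liu_hu_lexicon('This movie was surprisingly good and enjoyable', plot=False)   # should print 'Positive'
    demo_liu_hu_lexicon('The plot was dull and the acting was terrible', plot=False)    # should print 'Negative'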
Example 22
Project: razzy-spinner Author: rafasashi File: __init__.py License: GNU General Public License v3.0

def teardown_module(module=None):
    import nltk.corpus
    for name in dir(nltk.corpus):
        obj = getattr(nltk.corpus, name, None)
        if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'):
            obj._unload()
Example 23
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def jcn_similarity(self, other, ic, verbose=False):
    """
    Jiang-Conrath Similarity:
    Return a score denoting how similar two word senses are, based on the
    Information Content (IC) of the Least Common Subsumer (most specific
    ancestor node) and that of the two input Synsets. The relationship is
    given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :return: A float score denoting the similarity of the two ``Synset``
        objects.
    """
    if self == other:
        return _INF

    ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)

    # If either of the input synsets are the root synset, or have a
    # frequency of 0 (sparse data problem), return 0.
    if ic1 == 0 or ic2 == 0:
        return 0

    ic_difference = ic1 + ic2 - 2 * lcs_ic

    if ic_difference == 0:
        return _INF

    return 1 / ic_difference
Example 24
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def morphy(self, form, pos=None):
    """
    Find a possible base form for the given form, with the given part of
    speech, by checking WordNet's list of exceptional forms, and by
    recursively stripping affixes for this part of speech until a form in
    WordNet is found.

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.morphy('dogs'))
    dog
    >>> print(wn.morphy('churches'))
    church
    >>> print(wn.morphy('aardwolves'))
    aardwolf
    >>> print(wn.morphy('abaci'))
    abacus
    >>> wn.morphy('hardrock', wn.ADV)
    >>> print(wn.morphy('book', wn.NOUN))
    book
    >>> wn.morphy('book', wn.ADJ)
    """

    if pos is None:
        morphy = self._morphy
        analyses = chain(a for p in POS_LIST for a in morphy(form, p))
    else:
        analyses = self._morphy(form, pos)

    # get the first one we find
    first = list(islice(analyses, 1))
    if len(first) == 1:
        return first[0]
    else:
        return None
Example 25
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def ic(self, icfile):
    """
    Load an information content file from the wordnet_ic corpus and return
    a dictionary. This dictionary has just two keys, NOUN and VERB, whose
    values are dictionaries that map from synsets to information content
    values.

    :type icfile: str
    :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
    :return: An information content dictionary
    """
    ic = {}
    ic[NOUN] = defaultdict(float)
    ic[VERB] = defaultdict(float)
    for num, line in enumerate(self.open(icfile)):
        if num == 0:  # skip the header
            continue
        fields = line.split()
        offset = int(fields[0][:-1])
        value = float(fields[1])
        pos = _get_pos(fields[0])
        if len(fields) == 3 and fields[2] == "ROOT":
            # Store root count.
            ic[pos][0] += value
        if value != 0:
            ic[pos][offset] = value
    return ic


######################################################################
# Similarity metrics
######################################################################

# TODO: Add in the option to manually add a new root node; this will be
# useful for verb similarity as there exist multiple verb taxonomies.

# More information about the metrics is available at
# http://marimba.d.umn.edu/similarity/measures.html
Example 26
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def _get_pos(field):
    if field[-1] == 'n':
        return NOUN
    elif field[-1] == 'v':
        return VERB
    else:
        msg = "Unidentified part of speech in WordNet Information Content file for field %s" % field
        raise ValueError(msg)


# unload corpus after tests
Example 27
Project: razzy-spinner Author: rafasashi File: wordnet.py License: GNU General Public License v3.0

def teardown_module(module=None):
    from nltk.corpus import wordnet
    wordnet._unload()


######################################################################
# Demo
######################################################################
Example 28
Project: razzy-spinner Author: rafasashi File: relextract.py License: GNU General Public License v3.0

def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become')
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    for doc in conll2002.chunked_sents('ned.train'):
        lcon = rcon = False
        if trace:
            lcon = rcon = True
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002',
                                pattern=VAN, window=10):
            print(rtuple(rel, lcon=True, rcon=True))


#############################################
## Spanish CONLL2002: (PER, ORG)
#############################################
Example 29
Project: Quora-Question-Pairs Author: rupak-118 File: MaLSTM_train.py License: MIT License

def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)

    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which
            have to be retained even after the cleaning process

    Output : Returns the cleaned text corpus
    '''
    cleaned_corpus = pd.Series()
    for row in corpus:
        qs_list = []
        for word in row.split():
            word = word.lower()
            word = re.sub(r"[^a-zA-Z0-9^.']", " ", word)
            word = re.sub(r"what's", "what is ", word)
            word = re.sub(r"\'ve", " have ", word)
            word = re.sub(r"can't", "cannot ", word)
            word = re.sub(r"n't", " not ", word)
            word = re.sub(r"i'm", "i am ", word)
            word = re.sub(r"\'re", " are ", word)
            word = re.sub(r"\'d", " would ", word)
            word = re.sub(r"\'ll", " will ", word)
            # If the word contains numbers with decimals, this will preserve it
            if bool(re.search(r'\d', word) and re.search(r'\.', word)) and word not in keep_list:
                keep_list.append(word)
            # Preserves certain frequently occurring dot words
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word)
                qs_list.append(p1)
            else:
                qs_list.append(word)
        cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs_list)))
    return cleaned_corpus
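A hypothetical usage sketch; note that pd.Series.append, which this function relies on, was removed in pandas 2.0, so the snippet assumes an older pandas:

    import re
    import pandas as pd

    # Hypothetical toy input; keep_list starts empty and is extended in place for decimal numbers.
    sample = pd.Series(["What's the average speed, 3.5 km/h?", "I can't believe it!"])
    print(text_clean(sample, keep_list=[]))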
Example 30
Project: Quora-Question-Pairs Author: rupak-118 File: MaLSTM_train.py License: MIT License

def preprocess(corpus, keep_list, cleaning=True, stemming=False, stem_type=None, lemmatization=True,
               remove_stopwords=True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization,
              stopwords removal etc.)

    Input :
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether
                 a particular task should be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball (Porter2) stemmer. Default is "None", which
                  corresponds to Porter Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus
    '''
    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    ''' All stopwords except the 'wh-' words are removed '''
    if remove_stopwords == True:
        wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
        stop = set(stopwords.words('english'))
        for word in wh_words:
            stop.remove(word)
        corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    else:
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        lem = WordNetLemmatizer()
        corpus = [[lem.lemmatize(x, pos='v') for x in x] for x in corpus]

    if stemming == True:
        if stem_type == 'snowball':
            stemmer = SnowballStemmer(language='english')
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]
        else:
            stemmer = PorterStemmer()
            corpus = [[stemmer.stem(x) for x in x] for x in corpus]

    return corpus
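A hypothetical end-to-end call, assuming the module-level imports of the original file (pandas, re, stopwords, WordNetLemmatizer, the stemmers) and the required NLTK data:

    import nltk
    import pandas as pd

    nltk.download('stopwords')
    nltk.download('wordnet')

    questions = pd.Series(["What is the capital of France?", "How do planes fly?"])  # hypothetical input
    tokens = preprocess(questions, keep_list=[], cleaning=True, lemmatization=True, remove_stopwords=True)
    print(tokens)   # list of token lists, with stopwords (except wh-words) removed and verbs lemmatized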