Python nltk.data Examples
The following are 30 code examples showing how to use the nltk.data module. They are extracted from open source projects; each example lists the project, author, source file, and license it was taken from.
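Most of the examples below revolve around two calls: nltk.data.load(), which loads a packaged resource such as a Punkt tokenizer model or a grammar, and nltk.data.find(), which resolves a resource name to a path on disk. As a quick orientation, here is a minimal sketch of both (not taken from any of the projects below); it assumes the 'punkt' resource has already been installed, for example via nltk.download('punkt').

import nltk.data

# Load the pre-trained Punkt sentence tokenizer from the NLTK data path
# (assumes the 'punkt' resource is installed, e.g. via nltk.download('punkt')).
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_detector.tokenize("Dr. Smith went home. He was tired."))

# Resolve a resource name to its location on disk; raises LookupError
# if the resource cannot be found in any of the NLTK data directories.
path = nltk.data.find('tokenizers/punkt/english.pickle')
print(path)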
Example 1
Project: razzy-spinner Author: rafasashi File: chat80.py License: GNU General Public License v3.0 | 6 votes |
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type schema: str
    :param query: SQL query
    :type rel_name: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Example 2
Project: razzy-spinner Author: rafasashi File: chat80.py License: GNU General Public License v3.0 | 6 votes |
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: string
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')
    db_out.update(valuation)
    db_out.close()
Example 3
Project: clickbait Author: bhargaviparanjape File: experiments.py License: MIT License | 6 votes |
def add_full_stops_to_the_end(infile, outfile):
    # clean data of small titles and add full stops for NLTK to work
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with less than three words
            elif len(line.split()) <= 3:
                pass
            elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('!\n') or line.endswith('\'\n') or line.endswith('"\n'):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),

############################################
# Convert All except first word and quotes #
# to lower case                            #
############################################
Example 4
Project: SemevalAspectBasedSentimentAnalysis Author: pedrobalage File: semaphore.py License: GNU General Public License v2.0 | 6 votes |
def clean_raw_text(text, file_name=''):
    '''
    cleans all text input and places the cleaned text in the 'samples' folder,
    one line at the time (as required by semaphore).
    '''
    import re
    import nltk, nltk.data

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = text
    clean_file = file_name if file_name else 'clean_text.txt'
    text = re.sub(r'-+(\n)\s*', '', raw_text)
    text = re.sub(r'(\n)+', '', text)
    text = '\n'.join([' '.join(nltk.word_tokenize(sent)) for sent in sent_detector.tokenize(text.strip())])
    open(clean_file, 'w').write(text)
Example 5
Project: SemevalAspectBasedSentimentAnalysis Author: pedrobalage File: semaphore.py License: GNU General Public License v2.0 | 6 votes |
def semaphore(text='', files='', semaphore=release):
    '''
    This function takes a string or a list of file path names and outputs
    a python data structure containing semantic frames for each sentence
    (the text can be completely raw).
    '''
    os.chdir(semaphore)
    if text:
        sample = '../samples/cleaned.txt'
    if files:
        text = text + ' '.join([open(f, 'r').read() for f in files])
        # I just name the newly cleaned file by the name of the first file
        # in the file list + "_clean":
        sample = '../samples/' + files[0].split('/')[-1][:-4] + '_clean.txt'
    if text:
        clean_raw_text(text, file_name=sample)
    else:
        sample = '../samples/sample.txt'
    run_semaphore(release=semaphore, sample=sample)
    return import_semaphore()
Example 6
Project: SemevalAspectBasedSentimentAnalysis Author: pedrobalage File: semaphore.py License: GNU General Public License v2.0 | 6 votes |
def mysemaphore(text, path, semaphore=release):
    import shutil
    '''
    This function takes a string or a list of file path names and outputs
    a python data structure containing semantic frames for each sentence
    (the text can be completely raw).
    '''
    os.chdir(semaphore)
    sample = '../samples/cleaned.txt'
    open(sample, 'w').write(text)
    run_semaphore(release=semaphore, sample=sample)
    shutil.copy(semaphore + '/../samples/output.txt', path)
    return import_semaphore()
Example 7
Project: DNN-Sentiment Author: awjuliani File: data_helpers.py License: MIT License | 6 votes |
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
Example 8
Project: DNN-Sentiment Author: awjuliani File: data_helpers.py License: MIT License | 6 votes |
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
Example 9
Project: DNN-Sentiment Author: awjuliani File: data_helpers.py License: MIT License | 6 votes |
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
Example 10
Project: pyrouge Author: bheinzerling File: sentence_splitter.py License: MIT License | 6 votes |
def __init__(self, language="en", punkt_data_path=None):
    self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
    self.log = log.get_global_console_logger()
    try:
        import nltk.data
    except ImportError:
        self.log.error(
            "Cannot import NLTK data for the sentence splitter. Please "
            "check if the 'punkt' NLTK-package is installed correctly.")
    try:
        if not punkt_data_path:
            punkt_data_path = self.lang2datapath[language]
        self.sent_detector = nltk.data.load(punkt_data_path)
    except KeyError:
        self.log.error(
            "No sentence splitter data for language {}.".format(language))
    except:
        self.log.error(
            "Could not load sentence splitter data: {}".format(
                self.lang2datapath[language]))
Example 11
Project: V1EngineeringInc-Docs Author: V1EngineeringInc File: chat80.py License: Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type schema: str
    :param query: SQL query
    :type rel_name: str
    """
    import sqlite3

    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings

        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
Example 12
Project: razzy-spinner Author: rafasashi File: test_corpus_views.py License: GNU General Public License v3.0 | 5 votes |
def data(self):
    for name in self.names:
        f = nltk.data.find(name)
        with f.open() as fp:
            file_data = fp.read().decode('utf8')
        yield f, file_data
Example 13
Project: razzy-spinner Author: rafasashi File: test_corpus_views.py License: GNU General Public License v3.0 | 5 votes |
def test_correct_values(self):
    # Check that corpus views produce the correct sequence of values.
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(list(v), file_data.split())

        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(list(v), self.linetok.tokenize(file_data))
Example 14
Project: razzy-spinner Author: rafasashi File: test_corpus_views.py License: GNU General Public License v3.0 | 5 votes |
def test_correct_length(self):
    # Check that the corpus views report the correct lengths:
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(len(v), len(file_data.split()))

        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
Example 15
Project: razzy-spinner Author: rafasashi File: grammar.py License: GNU General Public License v3.0 | 5 votes |
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
Example 16
Project: razzy-spinner Author: rafasashi File: chat80.py License: GNU General Public License v3.0 | 5 votes |
def augment(self, data):
    """
    Add more data to the ``Concept``'s extension set.

    :param data: a new semantic value
    :type data: string or pair of strings
    :rtype: set
    """
    self._extension.add(data)
    self.extension = sorted(list(self._extension))
    return self._extension
Example 17
Project: razzy-spinner Author: rafasashi File: chat80.py License: GNU General Public License v3.0 | 5 votes |
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs
Example 18
Project: razzy-spinner Author: rafasashi File: chat80.py License: GNU General Public License v3.0 | 5 votes |
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: string
    """
    dbname = db + ".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
        # val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True
Example 19
Project: py-nltk-svo Author: klintan File: svo.py License: MIT License | 5 votes |
def __init__(self):
    """
    Initialize the SVO Methods
    """
    self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
    self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    self.adjective_types = ["JJ", "JJR", "JJS"]
    self.pred_verb_phrase_siblings = None
    self.parser = stanford.StanfordParser()
    self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
Example 20
Project: estnltk Author: estnltk File: text.py License: GNU General Public License v2.0 | 5 votes |
def spans(self, layer):
    """Retrieve (start, end) tuples denoting the spans of given layer elements.

    Returns
    -------
    list of (int, int)
        List of (start, end) tuples.
    """
    spans = []
    for data in self[layer]:
        spans.append((data[START], data[END]))
    return spans
Example 21
Project: estnltk Author: estnltk File: text.py License: GNU General Public License v2.0 | 5 votes |
def starts(self, layer):
    """Retrieve start positions of elements of given layer."""
    starts = []
    for data in self[layer]:
        starts.append(data[START])
    return starts
Example 22
Project: estnltk Author: estnltk File: text.py License: GNU General Public License v2.0 | 5 votes |
def ends(self, layer):
    """Retrieve end positions of elements of given layer."""
    ends = []
    for data in self[layer]:
        ends.append(data[END])
    return ends
Example 23
Project: estnltk Author: estnltk File: text.py License: GNU General Public License v2.0 | 5 votes |
def tag_timexes(self):
    """Create ``timexes`` layer.
    Depends on morphological analysis data in ``words`` layer
    and tags it automatically, if it is not present."""
    if not self.is_tagged(ANALYSIS):
        self.tag_analysis()
    if not self.is_tagged(TIMEXES):
        if self.__timex_tagger is None:
            self.__timex_tagger = load_default_timex_tagger()
        self.__timex_tagger.tag_document(self, **self.__kwargs)
    return self
Example 24
Project: estnltk Author: estnltk File: text.py License: GNU General Public License v2.0 | 5 votes |
def spelling(self):
    """Flag incorrectly spelled words.
    Returns a list of booleans, where element at each position denotes,
    if the word at the same position is spelled correctly.
    """
    if not self.is_tagged(WORDS):
        self.tokenize_words()
    return [data[SPELLING] for data in vabamorf.spellcheck(self.word_texts, suggestions=False)]
Example 25
Project: luscan-devel Author: blackye File: grammar.py License: GNU General Public License v2.0 | 5 votes |
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print g
    print
Example 26
Project: sumpy Author: kedz File: preprocessor.py License: Apache License 2.0 | 5 votes |
def build_sent_tokenizer(self):
    """Return a function that splits a string into a sequence of sentences."""
    if self._sentence_tokenizer is not None:
        tok = self._sentence_tokenizer
    else:
        tok = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    return tok
Example 27
Project: sumpy Author: kedz File: preprocessor.py License: Apache License 2.0 | 5 votes |
def _build_sent_tokenizer(self):
    """Return a function that splits a string into a sequence of sentences."""
    if self._sentence_tokenizer is not None:
        return self._sentence_tokenizer
    else:
        return nltk.data.load('tokenizers/punkt/english.pickle').tokenize
Example 28
Project: sumpy Author: kedz File: preprocessor.py License: Apache License 2.0 | 5 votes |
def build_stopwords(self):
    if self.remove_stopwords is True:
        if self._stopwords is None:
            path = pkg_resources.resource_filename(
                "sumpy",
                os.path.join("data", "smart_common_words.txt.gz"))
            with gzip.open(path, u"r") as f:
                self._stopwords = set(
                    [word.strip().decode(u"utf-8").lower()
                     for word in f.readlines()])
        return lambda word: word in self._stopwords
    else:
        return lambda word: False
Example 29
Project: semeval2017-scienceie Author: UKPLab File: extras.py License: Apache License 2.0 | 5 votes |
def __punkt_sentence_splitter(self):
    # print("initializing punkt sentence splitter")
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return lambda x: sent_tokenizer.tokenize(x)
Example 30
Project: semeval2017-scienceie Author: UKPLab File: extras.py License: Apache License 2.0 | 5 votes |
def __init__(self, src):
    firstLine = True
    header = []
    data = []
    with codecs.open(src, "r", "utf-8") as f:
        for line in f:
            line = line.strip().split("\t")[1:]
            if firstLine:
                firstLine = False
                header = line
            else:
                data += [line]
    header = [x.split("/")[1].split(".")[0] for x in header]
    self.domain_data = {x: y for (x, y) in zip(header, data[0])}
    # TODO add averaging/max vote