Python nltk.data Examples
The following are 30 code examples of the nltk.data module.
You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module nltk, or try the search function.
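Most of the examples below go through two entry points: nltk.data.load, which loads a pickled resource such as the Punkt sentence tokenizer, and nltk.data.find, which resolves a resource name to a path in the local NLTK data directory. A minimal sketch of both calls, assuming the 'punkt' resource has already been installed with nltk.download('punkt'):

import nltk
import nltk.data

# Load the pre-trained Punkt sentence tokenizer from the NLTK data directory.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_detector.tokenize("NLTK ships a sentence splitter. It is loaded via nltk.data."))

# Resolve a resource name to a path on disk; raises LookupError if the resource is missing.
path = nltk.data.find('tokenizers/punkt/english.pickle')
print(path)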
Example #1
Source File: data_helpers.py From DNN-Sentiment with MIT License | 6 votes |
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
Example #2
Source File: semaphore.py From SemevalAspectBasedSentimentAnalysis with GNU General Public License v2.0 | 6 votes |
def mysemaphore(text, path, semaphore=release):
    import shutil
    '''
    This function takes a string or a list of file path names and outputs a python data
    structure containing semantic frames for each sentence (the text can be completely raw).
    '''
    os.chdir(semaphore)
    sample = '../samples/cleaned.txt'
    open(sample, 'w').write(text)
    run_semaphore(release=semaphore, sample=sample)
    shutil.copy(semaphore + '/../samples/output.txt', path)
    return import_semaphore()
Example #3
Source File: data_helpers.py From DNN-Sentiment with MIT License | 6 votes |
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
Example #4
Source File: data_helpers.py From DNN-Sentiment with MIT License | 6 votes |
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
Example #5
Source File: chat80.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 6 votes |
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3

    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings

        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise
Example #6
Source File: semaphore.py From SemevalAspectBasedSentimentAnalysis with GNU General Public License v2.0 | 6 votes |
def semaphore(text='', files='', semaphore=release):
    '''
    This function takes a string or a list of file path names and outputs a python data
    structure containing semantic frames for each sentence (the text can be completely raw).
    '''
    os.chdir(semaphore)
    if text:
        sample = '../samples/cleaned.txt'
    if files:
        text = text + ' '.join([open(f, 'r').read() for f in files])
        # I just name the newly cleaned file by the name of the first file in the file list + "_clean":
        sample = '../samples/' + files[0].split('/')[-1][:-4] + '_clean.txt'
    if text:
        clean_raw_text(text, file_name=sample)
    else:
        sample = '../samples/sample.txt'
    run_semaphore(release=semaphore, sample=sample)
    return import_semaphore()
Example #7
Source File: chat80.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3

    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
Example #8
Source File: semaphore.py From SemevalAspectBasedSentimentAnalysis with GNU General Public License v2.0 | 6 votes |
def clean_raw_text(text, file_name=''):
    '''
    Cleans all text input and places the cleaned text in the 'samples' folder,
    one line at a time (as required by semaphore).
    '''
    import re
    import nltk, nltk.data

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = text
    clean_file = file_name if file_name else 'clean_text.txt'

    text = re.sub(r'-+(\n)\s*', '', raw_text)
    text = re.sub(r'(\n)+', '', text)
    text = '\n'.join([' '.join(nltk.word_tokenize(sent))
                      for sent in sent_detector.tokenize(text.strip())])
    open(clean_file, 'w').write(text)
Example #9
Source File: chat80.py From razzy-spinner with GNU General Public License v3.0 | 6 votes |
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: string
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')
    db_out.update(valuation)
    db_out.close()
Example #10
Source File: sentence_splitter.py From pyrouge with MIT License | 6 votes |
def __init__(self, language="en", punkt_data_path=None):
    self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
    self.log = log.get_global_console_logger()
    try:
        import nltk.data
    except ImportError:
        self.log.error(
            "Cannot import NLTK data for the sentence splitter. Please "
            "check if the 'punkt' NLTK-package is installed correctly.")
    try:
        if not punkt_data_path:
            punkt_data_path = self.lang2datapath[language]
        self.sent_detector = nltk.data.load(punkt_data_path)
    except KeyError:
        self.log.error(
            "No sentence splitter data for language {}.".format(language))
    except:
        self.log.error(
            "Could not load sentence splitter data: {}".format(
                self.lang2datapath[language]))
Example #11
Source File: experiments.py From clickbait with MIT License | 6 votes |
def add_full_stops_to_the_end(infile, outfile):
    # clean data of small titles and add full stops for NLTK to work
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with less than three words
            elif len(line.split()) <= 3:
                pass
            elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),

############################################
# Convert All except first word and quotes #
# to lower case                            #
############################################
Example #12
Source File: reformat_20ng.py From outlier-exposure with Apache License 2.0 | 5 votes |
def reformat(in_path, out_path):
    with open(in_path, 'r') as f:
        lines = f.readlines()

    new_lines = [line.split() for line in lines]
    labels = [line[0] for line in new_lines]                 # extract label from line
    labels = [class_names.index(label) for label in labels]  # switch to index
    data = [' '.join(line[1:]) for line in new_lines]        # extract text from line

    with open(out_path, 'w') as myfile:
        wr = csv.writer(myfile)
        for label, inp in zip(labels, data):
            wr.writerow([label, inp])
Example #13
Source File: open_ended_coders.py From mpeds with MIT License | 5 votes |
def _getCLIFF(self, text):
    '''
    Retrieve organizations and location via CLIFF.

    :param text: text from which locations should be extracted
    :type text: string

    :return: CLIFF location results
    :rtype: dictionary
    '''
    if text != text:
        return ([], {})

    obj = None
    data = self._urlencode_utf8({'q': text})
    while obj is None:
        url = 'http://%s/parse/text' % self.cliff_url
        req = urllib2.Request(url, data)
        res = urllib2.urlopen(req)
        obj = json.loads(res.read())

        if obj is not None:
            if obj['status'] == 'error':
                return ([], {})
            elif obj['status'] != 'ok':
                obj = None
                continue

    locs = obj['results']['places']
    return locs
Example #14
Source File: open_ended_coders.py From mpeds with MIT License | 5 votes |
def __init__(self):
    self.SENT_DETECTOR = nltk.data.load('tokenizers/punkt/english.pickle')

    self.NUM_MAP = None
    self.RE = None
    self.S_PREFIX = None
    self.P_SUBJ = None
    self.AGW = None
    self.SWS = None
Example #15
Source File: extras.py From semeval2017-scienceie with Apache License 2.0 | 5 votes |
def __init__(self, src):
    firstLine = True
    header = []
    data = []
    with codecs.open(src, "r", "utf-8") as f:
        for line in f:
            line = line.strip().split("\t")[1:]
            if firstLine:
                firstLine = False
                header = line
            else:
                data += [line]

    header = [x.split("/")[1].split(".")[0] for x in header]
    self.domain_data = {x: y for (x, y) in zip(header, data[0])}
    # TODO add averaging/max vote
Example #16
Source File: upper_bound_ilp.py From acl2017-interactive_summarizer with Apache License 2.0 | 5 votes |
def load_data(self, docs, models):
    '''
    Load the data into :doc_sent_dict and :sentences

    Parameters:
    docs: list of docs; each doc is represented by its filename and its sentences, e.g.
        [['filename1', ['sent1', 'sent2', 'sent3']], ['filename2', ['sent1', 'sent2', 'sent3']]]
    models: list of models; each model is represented by its filename and its sentences, e.g.
        [['filename1', ['sent1', 'sent2', 'sent3']], ['filename2', ['sent1', 'sent2', 'sent3']]]
    '''
    self.docs = docs
    self.models = models
    self.sentences = []
    self.doc_sent_dict = {}

    doc_id = 0
    for doc_id, doc in enumerate(docs):
        _, doc_sents = doc
        total = len(self.sentences)
        for sent_id, sentence in enumerate(doc_sents):
            token_sentence = word_tokenize(sentence, self.LANGUAGE)
            sentence_s = Sentence(token_sentence, doc_id, sent_id + 1)
            untokenized_form = untokenize(token_sentence)
            sentence_s.untokenized_form = untokenized_form
            sentence_s.length = len(untokenized_form.split(' '))
            self.doc_sent_dict[total + sent_id] = "%s_%s" % (str(doc_id), str(sent_id))
            self.sentences.append(sentence_s)
Example #17
Source File: data_helpers.py From DNN-Sentiment with MIT License | 5 votes |
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]
Example #18
Source File: preprocessor.py From sumpy with Apache License 2.0 | 5 votes |
def _build_sent_tokenizer(self):
    """Return a function that splits a string into a sequence of sentences."""
    if self._sentence_tokenizer is not None:
        return self._sentence_tokenizer
    else:
        return nltk.data.load('tokenizers/punkt/english.pickle').tokenize
Example #19
Source File: test_corpus_views.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def data(self):
    for name in self.names:
        f = nltk.data.find(name)
        with f.open() as fp:
            file_data = fp.read().decode('utf8')
        yield f, file_data
Example #20
Source File: test_corpus_views.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_correct_values(self):
    # Check that corpus views produce the correct sequence of values.
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(list(v), file_data.split())

        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(list(v), self.linetok.tokenize(file_data))
Example #21
Source File: test_corpus_views.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def test_correct_length(self):
    # Check that the corpus views report the correct lengths.
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(len(v), len(file_data.split()))

        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
Example #22
Source File: childes_fixt.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def setup_module(module):
    from nose import SkipTest
    import nltk.data

    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest(
            "The CHILDES corpus is not found. "
            "It should be manually downloaded and saved/unpacked "
            "to [NLTK_Data_Dir]/corpora/childes/"
        )
Example #23
Source File: grammar.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def fcfg_demo():
    import nltk.data

    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
Example #24
Source File: chat80.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def augment(self, data):
    """
    Add more data to the ``Concept``'s extension set.

    :param data: a new semantic value
    :type data: string or pair of strings
    :rtype: set
    """
    self._extension.add(data)
    self.extension = sorted(list(self._extension))
    return self._extension
Example #25
Source File: chat80.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel + r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs
Example #26
Source File: chat80.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if label in concepts:
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts
Example #27
Source File: chat80.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International | 5 votes |
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db + ".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
        # val.read(db_in.items())
        return val


# def alpha(str):
#     """
#     Utility to filter out non-alphabetic constants.
#
#     :param str: candidate constant
#     :type str: string
#     :rtype: bool
#     """
#     try:
#         int(str)
#         return False
#     except ValueError:
#         ## some unknown values in records are labeled '?'
#         if not str == '?':
#             return True
Example #28
Source File: bbn2conll.py From entity-recognition-datasets with MIT License | 5 votes |
def write_all_to_conll(DESC_DECISION='merge'):
    """
    Convert all the BBN data to CONLL-format and save in CONLLDIR.

    For a description of the DESC_DECISION argument, see the documentation
    for function fix_iobtag.
    """
    filenames = os.listdir(DATADIR)
    for filename in filenames:
        writeconll(filename)
Example #29
Source File: bbn2conll.py From entity-recognition-datasets with MIT License | 5 votes |
def parse_docs(filename):
    #with open('wsj00a.qa') as fd:
    with open(filename, 'r') as fd:
        data = fd.read()
    root = ET.fromstring(data)
    docs = root.getchildren()
    #c1 = docs[1]
    store = []
    for ii, doc in enumerate(docs):
        #print ii
        p = parse_doc(doc)
        store.extend(p)
    return store
Example #30
Source File: preprocessor.py From sumpy with Apache License 2.0 | 5 votes |
def build_stopwords(self):
    if self.remove_stopwords is True:
        if self._stopwords is None:
            path = pkg_resources.resource_filename(
                "sumpy", os.path.join("data", "smart_common_words.txt.gz"))
            with gzip.open(path, u"r") as f:
                self._stopwords = set(
                    [word.strip().decode(u"utf-8").lower()
                     for word in f.readlines()])
        return lambda word: word in self._stopwords
    else:
        return lambda word: False