Python nltk.data Examples

The following are 30 code examples showing how to use the nltk.data module. These examples are extracted from open source projects; you can go to the original project or source file by following the links above each example.


You may also want to check out all available functions/classes of the module nltk, or try the search function.
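
Before looking at the project code, here is a minimal, self-contained sketch of the two calls that recur throughout these examples, nltk.data.load() and nltk.data.find(). It assumes the 'punkt' tokenizer models have already been fetched with nltk.download('punkt').

import nltk
import nltk.data

# nltk.data.load() resolves a resource name against the NLTK data path
# and deserializes it; here it returns a Punkt sentence tokenizer.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_detector.tokenize("NLTK resolves data paths for you. This is the second sentence."))

# nltk.data.find() only locates the resource and returns its path,
# raising LookupError if it is not installed.
print(nltk.data.find('tokenizers/punkt/english.pickle'))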

Example 1
Project: razzy-spinner   Author: rafasashi   File: chat80.py    License: GNU General Public License v3.0
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise 
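
As a usage sketch, the call below mirrors the chat80 demo in NLTK; it assumes the city_database corpus has been downloaded (e.g. via nltk.download('city_database')), and the exact table and column names may differ across NLTK versions.

rows = sql_query("corpora/city_database/city.db",
                 "SELECT City, Country FROM city_table")
for city, country in rows:
    print(city, country)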
Example 2
Project: razzy-spinner   Author: rafasashi   File: chat80.py    License: GNU General Public License v3.0
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: string
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')

    db_out.update(valuation)

    db_out.close() 
Example 3
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    License: MIT License
def add_full_stops_to_the_end(infile, outfile):
	#clean data of small titles and add full stops for NLTK to work
	output_format = '{}.\n'.format
	with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
		for line in fin:
			if line[0] == ' ':
				pass
			#ignore headlines with less than three words
			elif len(line.split()) <= 3:
				pass
			elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
				print >> fout, line.decode('utf-8'),
			else:
				print >> fout, output_format(line.strip()).decode('utf-8'),



############################################
#   Convert All except first word and quotes
# 	to lower case 				           #
############################################ 
Example 4
Project: SemevalAspectBasedSentimentAnalysis   Author: pedrobalage   File: semaphore.py    License: GNU General Public License v2.0
def clean_raw_text(text, file_name=''):

    '''
    cleans all text input and places the cleaned text in the 'samples' folder, one line at a time (as required by semaphore).
    '''

    import re
    import nltk, nltk.data

    sent_detector=nltk.data.load('tokenizers/punkt/english.pickle')

    raw_text=text
    clean_file=file_name if file_name else 'clean_text.txt'

    text=re.sub(r'-+(\n)\s*', '', raw_text)
    text=re.sub(r'(\n)+', '', text)

    text= '\n'.join([' '.join(nltk.word_tokenize(sent)) for sent in sent_detector.tokenize(text.strip())])
    open(clean_file, 'w').write(text) 
Example 5
Project: SemevalAspectBasedSentimentAnalysis   Author: pedrobalage   File: semaphore.py    License: GNU General Public License v2.0
def semaphore(text='', files='', semaphore=release):
    '''
    This function takes a string or a list of file path names and outputs a python data structure containing semantic frames for each
    sentence (the text can be completely raw).
    '''
    os.chdir(semaphore)

    if text:
        sample='../samples/cleaned.txt'
    if files:
        text=text+' '.join([open(f, 'r').read() for f in files])
        #I just name the newly cleaned file by the name of the first file in the file list + "_clean":
        sample='../samples/' + files[0].split('/')[-1][:-4] + '_clean.txt'

    if text:
        clean_raw_text(text, file_name=sample)

    else:
        sample='../samples/sample.txt'

    run_semaphore(release=semaphore, sample=sample)

    return import_semaphore() 
Example 6
Project: SemevalAspectBasedSentimentAnalysis   Author: pedrobalage   File: semaphore.py    License: GNU General Public License v2.0
def mysemaphore(text, path, semaphore=release):
    '''
    This function takes a string or a list of file path names and outputs a python data structure containing semantic frames for each
    sentence (the text can be completely raw).
    '''
    import shutil
    os.chdir(semaphore)

    sample='../samples/cleaned.txt'

    open(sample, 'w').write(text)

    run_semaphore(release=semaphore, sample=sample)

    shutil.copy(semaphore + '/../samples/output.txt', path)

    return import_semaphore() 
Example 7
Project: DNN-Sentiment   Author: awjuliani   File: data_helpers.py    License: MIT License
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y] 
Example 8
Project: DNN-Sentiment   Author: awjuliani   File: data_helpers.py    License: MIT License
def load_data_for_books(path):
	text = ''.join(open(path).readlines()).decode('utf8')
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	book = tokenizer.tokenize(text)
	#book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
	#book = list(open(path, "r").readlines())
	book = [s.strip() for s in book]
	book = [clean_str(sent) for sent in book]
	book = [s.split(" ") for s in book]
	x_text = book
	y = np.vstack([np.zeros(len(book)),np.ones(len(book))]).T
	sentences, labels = x_text,y
	sentences_padded = pad_sentences(sentences)
	sentencesT, labelsT = load_data_and_labels()
	sentences_paddedT = pad_sentences(sentencesT)
	vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
	x, y = build_input_data(sentences_padded, labels, vocabulary)
	return [x, y, vocabulary, vocabulary_inv, sentencesT] 
Example 9
Project: DNN-Sentiment   Author: awjuliani   File: data_helpers.py    License: MIT License
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index] 
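
As a brief usage sketch (the toy data below is hypothetical), the generator above can be consumed directly in a training loop:

import numpy as np

# Hypothetical toy dataset of ten (feature, label) pairs.
data = list(zip(range(10), range(10)))
for batch in batch_iter(data, batch_size=4, num_epochs=2, shuffle=True):
    print(batch.shape)  # (4, 2), (4, 2), then (2, 2) within each epoch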
Example 10
Project: pyrouge   Author: bheinzerling   File: sentence_splitter.py    License: MIT License
def __init__(self, language="en", punkt_data_path=None):
        self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
        self.log = log.get_global_console_logger()
        try:
            import nltk.data
        except ImportError:
            self.log.error(
                "Cannot import NLTK data for the sentence splitter. Please "
                "check if the 'punkt' NLTK-package is installed correctly.")
        try:
            if not punkt_data_path:
                punkt_data_path = self.lang2datapath[language]
            self.sent_detector = nltk.data.load(punkt_data_path)
        except KeyError:
            self.log.error(
                "No sentence splitter data for language {}.".format(language))
        except:
            self.log.error(
                "Could not load sentence splitter data: {}".format(
                    self.lang2datapath[language])) 
Example 11
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3

    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings

        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise 
Example 12
Project: razzy-spinner   Author: rafasashi   File: test_corpus_views.py    License: GNU General Public License v3.0
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example 13
Project: razzy-spinner   Author: rafasashi   File: test_corpus_views.py    License: GNU General Public License v3.0
def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(list(v), file_data.split())

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(list(v), self.linetok.tokenize(file_data)) 
Example 14
Project: razzy-spinner   Author: rafasashi   File: test_corpus_views.py    License: GNU General Public License v3.0
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
Example 15
Project: razzy-spinner   Author: rafasashi   File: grammar.py    License: GNU General Public License v3.0
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example 16
Project: razzy-spinner   Author: rafasashi   File: chat80.py    License: GNU General Public License v3.0
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example 17
Project: razzy-spinner   Author: rafasashi   File: chat80.py    License: GNU General Public License v3.0
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example 18
Project: razzy-spinner   Author: rafasashi   File: chat80.py    License: GNU General Public License v3.0
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: string
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True 
Example 19
Project: py-nltk-svo   Author: klintan   File: svo.py    License: MIT License
def __init__(self):
        """
        Initialize the SVO Methods
        """
        self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
        self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
        self.adjective_types = ["JJ", "JJR", "JJS"]
        self.pred_verb_phrase_siblings = None
        self.parser = stanford.StanfordParser()
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 
Example 20
Project: estnltk   Author: estnltk   File: text.py    License: GNU General Public License v2.0
def spans(self, layer):
        """Retrieve (start, end) tuples denoting the spans of given layer elements.

        Returns
        -------
        list of (int, int)
            List of (start, end) tuples.
        """
        spans = []
        for data in self[layer]:
            spans.append((data[START], data[END]))
        return spans 
Example 21
Project: estnltk   Author: estnltk   File: text.py    License: GNU General Public License v2.0
def starts(self, layer):
        """Retrieve start positions of elements if given layer."""
        starts = []
        for data in self[layer]:
            starts.append(data[START])
        return starts 
Example 22
Project: estnltk   Author: estnltk   File: text.py    License: GNU General Public License v2.0
def ends(self, layer):
        """Retrieve end positions of elements if given layer."""
        ends = []
        for data in self[layer]:
            ends.append(data[END])
        return ends 
Example 23
Project: estnltk   Author: estnltk   File: text.py    License: GNU General Public License v2.0
def tag_timexes(self):
        """Create ``timexes`` layer.
        Depends on morphological analysis data in the ``words`` layer,
        which is tagged automatically if it is not present."""
        if not self.is_tagged(ANALYSIS):
            self.tag_analysis()
        if not self.is_tagged(TIMEXES):
            if self.__timex_tagger is None:
                self.__timex_tagger = load_default_timex_tagger()
            self.__timex_tagger.tag_document(self, **self.__kwargs)
        return self 
Example 24
Project: estnltk   Author: estnltk   File: text.py    License: GNU General Public License v2.0
def spelling(self):
        """Flag incorrectly spelled words.
        Returns a list of booleans, where the element at each position denotes whether
        the word at the same position is spelled correctly.
        """
        if not self.is_tagged(WORDS):
            self.tokenize_words()
        return [data[SPELLING] for data in vabamorf.spellcheck(self.word_texts, suggestions=False)] 
Example 25
Project: luscan-devel   Author: blackye   File: grammar.py    License: GNU General Public License v2.0
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print g
    print 
Example 26
Project: sumpy   Author: kedz   File: preprocessor.py    License: Apache License 2.0
def build_sent_tokenizer(self):
        """Return a function that splits a string into a sequence of 
        sentences."""
        if self._sentence_tokenizer is not None:
            tok = self._sentence_tokenizer
        else:
            tok = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
        return tok 
Example 27
Project: sumpy   Author: kedz   File: preprocessor.py    License: Apache License 2.0
def _build_sent_tokenizer(self):
        """Return a function that splits a string into a sequence of 
        sentences."""
        if self._sentence_tokenizer is not None:
            return self._sentence_tokenizer
        else:
            return nltk.data.load('tokenizers/punkt/english.pickle').tokenize 
Example 28
Project: sumpy   Author: kedz   File: preprocessor.py    License: Apache License 2.0
def build_stopwords(self):
        if self.remove_stopwords is True:
            if self._stopwords is None:             
                path = pkg_resources.resource_filename(
                    "sumpy", 
                    os.path.join("data", "smart_common_words.txt.gz"))
                with gzip.open(path, u"r") as f:
                    self._stopwords = set(
                        [word.strip().decode(u"utf-8").lower()
                         for word in f.readlines()])
            return lambda word: word in self._stopwords
        else:
            return lambda word: False 
Example 29
Project: semeval2017-scienceie   Author: UKPLab   File: extras.py    License: Apache License 2.0
def __punkt_sentence_splitter(self):
#        print("initializing punkt sentence splitter")
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        return lambda x: sent_tokenizer.tokenize(x) 
Example 30
Project: semeval2017-scienceie   Author: UKPLab   File: extras.py    License: Apache License 2.0
def __init__(self, src):
        firstLine = True
        header = []
        data = []
        with codecs.open(src, "r", "utf-8") as f:
            for line in f:
                line = line.strip().split("\t")[1:]
                if firstLine:
                    firstLine = False
                    header = line
                else:
                    data += [line]
        header = [x.split("/")[1].split(".")[0] for x in header]
        self.domain_data = {x: y for (x,y) in zip(header, data[0])}  # TODO add averaging/max vote