Python nltk.data Examples

The following are 30 code examples of the Python module nltk.data, drawn from open-source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all available functions and classes of the nltk module.
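Most of the examples below revolve around two entry points: nltk.data.load(), which loads a packaged resource such as the Punkt sentence tokenizer, and nltk.data.find(), which resolves a resource name to a path on disk. A minimal sketch of both, assuming the punkt tokenizer data has already been downloaded via nltk.download('punkt'):

import nltk.data

# Load the pre-trained Punkt sentence tokenizer (assumes nltk.download('punkt') has been run).
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_detector.tokenize("Dr. Smith arrived. He was late."))

# Resolve a resource name to a local path; raises LookupError if the resource is not installed.
path = nltk.data.find('tokenizers/punkt/english.pickle')
print(path)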
Example #1
Source File: data_helpers.py    From DNN-Sentiment with MIT License
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y] 
Example #2
Source File: semaphore.py    From SemevalAspectBasedSentimentAnalysis with GNU General Public License v2.0
def mysemaphore(text, path, semaphore=release):
    '''
    Takes a raw text string, runs Semaphore over it, copies the frame output
    to `path`, and returns a Python data structure containing the semantic
    frames for each sentence.
    '''
    import shutil

    os.chdir(semaphore)

    sample='../samples/cleaned.txt'

    open(sample, 'w').write(text)

    run_semaphore(release=semaphore, sample=sample)

    shutil.copy(semaphore + '/../samples/output.txt', path)

    return import_semaphore() 
Example #3
Source File: data_helpers.py    From DNN-Sentiment with MIT License
def load_data_for_books(path):
	text = ''.join(open(path).readlines()).decode('utf8')
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	book = tokenizer.tokenize(text)
	#book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
	#book = list(open(path, "r").readlines())
	book = [s.strip() for s in book]
	book = [clean_str(sent) for sent in book]
	book = [s.split(" ") for s in book]
	x_text = book
	y = np.vstack([np.zeros(len(book)),np.ones(len(book))]).T
	sentences, labels = x_text,y
	sentences_padded = pad_sentences(sentences)
	sentencesT, labelsT = load_data_and_labels()
	sentences_paddedT = pad_sentences(sentencesT)
	vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
	x, y = build_input_data(sentences_padded, labels, vocabulary)
	return [x, y, vocabulary, vocabulary_inv, sentencesT] 
Example #4
Source File: data_helpers.py    From DNN-Sentiment with MIT License
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data)/batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index] 
Example #5
Source File: chat80.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3

    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings

        warnings.warn(
            "Make sure the database file %s is installed and uncompressed." % dbname
        )
        raise 
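For context, here is a hedged usage sketch of the sql_query() function above. The database path follows NLTK's chat80 demo, but the table and column names are assumptions and the query is purely illustrative:

# Illustrative only: assumes the chat80 city database is installed under the NLTK data directory.
rows = sql_query('corpora/city_database/city.db',
                 "SELECT City FROM city_table")
for row in rows:
    print(row)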
Example #6
Source File: semaphore.py    From SemevalAspectBasedSentimentAnalysis with GNU General Public License v2.0
def semaphore(text='', files='', semaphore=release):
    '''
    This function takes a string or a list of file path names and outputs a python data structure containing semantic frames for each
    sentence (the text can be completely raw).
    '''
    os.chdir(semaphore)

    if text:
        sample='../samples/cleaned.txt'
    if files:
        text=text+' '.join([open(f, 'r').read() for f in files])
        #I just name the newly cleaned file by the name of the first file in the file list + "_clean":
        sample='../samples/' + files[0].split('/')[-1][:-4] + '_clean.txt'

    if text:
        clean_raw_text(text, file_name=sample)

    else:
        sample='../samples/sample.txt'

    run_semaphore(release=semaphore, sample=sample)

    return import_semaphore() 
Example #7
Source File: chat80.py    From razzy-spinner with GNU General Public License v3.0
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise 
Example #8
Source File: semaphore.py    From SemevalAspectBasedSentimentAnalysis with GNU General Public License v2.0
def clean_raw_text(text, file_name=''):
    '''
    Cleans the raw input text and writes it to the 'samples' folder, one
    sentence per line (as required by Semaphore).
    '''
    import re
    import nltk, nltk.data

    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    raw_text = text
    clean_file = file_name if file_name else 'clean_text.txt'

    # strip hyphenation at line breaks, then remove the remaining newlines
    text = re.sub(r'-+(\n)\s*', '', raw_text)
    text = re.sub(r'(\n)+', '', text)

    # write one tokenized sentence per line
    text = '\n'.join([' '.join(nltk.word_tokenize(sent)) for sent in sent_detector.tokenize(text.strip())])
    open(clean_file, 'w').write(text)
Example #9
Source File: chat80.py    From razzy-spinner with GNU General Public License v3.0
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: string
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')

    db_out.update(valuation)

    db_out.close() 
Example #10
Source File: sentence_splitter.py    From pyrouge with MIT License
def __init__(self, language="en", punkt_data_path=None):
        self.lang2datapath = {"en": "tokenizers/punkt/english.pickle"}
        self.log = log.get_global_console_logger()
        try:
            import nltk.data
        except ImportError:
            self.log.error(
                "Cannot import NLTK data for the sentence splitter. Please "
                "check if the 'punkt' NLTK-package is installed correctly.")
        try:
            if not punkt_data_path:
                punkt_data_path = self.lang2datapath[language]
            self.sent_detector = nltk.data.load(punkt_data_path)
        except KeyError:
            self.log.error(
                "No sentence splitter data for language {}.".format(language))
        except:
            self.log.error(
                "Could not load sentence splitter data: {}".format(
                    self.lang2datapath[language])) 
Example #11
Source File: experiments.py    From clickbait with MIT License
def add_full_stops_to_the_end(infile, outfile):
	# clean data of short titles and add full stops so NLTK sentence detection works
	output_format = '{}.\n'.format
	with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
		for line in fin:
			if line[0] == ' ':
				pass
			#ignore headlines with less than three words
			elif len(line.split()) <= 3:
				pass
			elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
				print >> fout, line.decode('utf-8'),
			else:
				print >> fout, output_format(line.strip()).decode('utf-8'),



############################################
# Convert all except first word and quotes
# to lower case
############################################
Example #12
Source File: reformat_20ng.py    From outlier-exposure with Apache License 2.0
def reformat(in_path, out_path):
	with open(in_path, 'r') as f:
		lines = f.readlines()

	new_lines = [line.split() for line in lines]
	labels = [line[0] for line in new_lines]  # extract label from line
	labels = [class_names.index(label) for label in labels]  # switch to index
	data = [' '.join(line[1:]) for line in new_lines]  # extract text from line

	with open(out_path, 'w') as myfile:
		wr = csv.writer(myfile)
		for label, inp in zip(labels, data):
			wr.writerow([label, inp]) 
Example #13
Source File: open_ended_coders.py    From mpeds with MIT License
def _getCLIFF(self, text):
        '''
        Retrieve organizations and location via CLIFF.

        :param text: text from which locations should be extracted
        :type text: string

        :return: CLIFF location results
        :rtype: dictionary
        '''

        if text != text:
            # text != text is True only when text is NaN (i.e. missing), so skip it
            return ([], {})

        obj = None
        data = self._urlencode_utf8({ 'q': text })

        while obj is None:
            url = 'http://%s/parse/text' % self.cliff_url
            req = urllib2.Request(url, data)
            res = urllib2.urlopen(req)
            obj = json.loads(res.read())

            if obj is not None:
                if obj['status'] == 'error':
                    return ([], {})
                elif obj['status'] != 'ok':
                    obj = None
                    continue

            locs = obj['results']['places']

        return locs 
Example #14
Source File: open_ended_coders.py    From mpeds with MIT License
def __init__(self):

        self.SENT_DETECTOR = nltk.data.load('tokenizers/punkt/english.pickle')

        self.NUM_MAP = None
        self.RE = None

        self.S_PREFIX = None
        self.P_SUBJ = None
        self.AGW = None
        self.SWS = None 
Example #15
Source File: extras.py    From semeval2017-scienceie with Apache License 2.0
def __init__(self, src):
        firstLine = True
        header = []
        data = []
        with codecs.open(src, "r", "utf-8") as f:
            for line in f:
                line = line.strip().split("\t")[1:]
                if firstLine:
                    firstLine = False
                    header = line
                else:
                    data += [line]
        header = [x.split("/")[1].split(".")[0] for x in header]
        self.domain_data = {x: y for (x,y) in zip(header, data[0])}  # TODO add averaging/max vote 
Example #16
Source File: upper_bound_ilp.py    From acl2017-interactive_summarizer with Apache License 2.0
def load_data(self, docs, models):
        '''
        Load the data into
            :doc_sent_dict
            :sentences

        Parameters:
        docs: list of documents, each represented by its filename and its sentences:
            [['filename1', ['sent1', 'sent2', 'sent3']], ['filename2', ['sent1', 'sent2', 'sent3']]]
        models: list of model summaries, in the same format:
            [['filename1', ['sent1', 'sent2', 'sent3']], ['filename2', ['sent1', 'sent2', 'sent3']]]
        '''
        self.docs = docs
        self.models = models
        self.sentences = []
        self.doc_sent_dict = {}

        doc_id = 0
        for doc_id, doc in enumerate(docs):
            _, doc_sents = doc
            total = len(self.sentences)
            for sent_id, sentence in enumerate(doc_sents):
                token_sentence = word_tokenize(sentence, self.LANGUAGE)
                sentence_s = Sentence(token_sentence, doc_id, sent_id+1)

                untokenized_form = untokenize(token_sentence)
                sentence_s.untokenized_form = untokenized_form
                sentence_s.length = len(untokenized_form.split(' '))
                self.doc_sent_dict[total+sent_id] = "%s_%s" % (str(doc_id), str(sent_id))
                self.sentences.append(sentence_s) 
Example #17
Source File: data_helpers.py    From DNN-Sentiment with MIT License
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv] 
Example #18
Source File: preprocessor.py    From sumpy with Apache License 2.0
def _build_sent_tokenizer(self):
        """Return a function that splits a string into a sequence of 
        sentences."""
        if self._sentence_tokenizer is not None:
            return self._sentence_tokenizer
        else:
            return nltk.data.load('tokenizers/punkt/english.pickle').tokenize 
Example #19
Source File: test_corpus_views.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example #20
Source File: test_corpus_views.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(list(v), file_data.split())

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(list(v), self.linetok.tokenize(file_data)) 
Example #21
Source File: test_corpus_views.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
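Both tests above exercise NLTK's StreamBackedCorpusView, a lazy, list-like view over a corpus file that reads one block at a time. A minimal sketch of using it directly, assuming the plain-text 'abc' corpus has been downloaded (the file id below is illustrative):

import nltk.data
from nltk.corpus.reader.util import StreamBackedCorpusView, read_whitespace_block

# Resolve a plain-text resource to a path; assumes nltk.download('abc') has been run.
fileid = nltk.data.find('corpora/abc/rural.txt')

# Lazy view over whitespace-separated tokens; blocks are read from disk on demand.
view = StreamBackedCorpusView(fileid, read_whitespace_block)
print(view[:10])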
Example #22
Source File: childes_fixt.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def setup_module(module):
    from nose import SkipTest
    import nltk.data

    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest(
            "The CHILDES corpus is not found. "
            "It should be manually downloaded and saved/unpacked "
            "to [NLTK_Data_Dir]/corpora/childes/"
        ) 
Example #23
Source File: grammar.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def fcfg_demo():
    import nltk.data

    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example #24
Source File: chat80.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example #25
Source File: chat80.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel + r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example #26
Source File: chat80.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if label in concepts:
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts 
Example #27
Source File: chat80.py    From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db + ".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation

        val = Valuation(db_in)
        #        val.read(db_in.items())
        return val


# def alpha(str):
#     """
#     Utility to filter out non-alphabetic constants.
#
#     :param str: candidate constant
#     :type str: string
#     :rtype: bool
#     """
#     try:
#         int(str)
#         return False
#     except ValueError:
#         # some unknown values in records are labeled '?'
#         if not str == '?':
#             return True
Example #28
Source File: bbn2conll.py    From entity-recognition-datasets with MIT License
def write_all_to_conll(DESC_DECISION='merge'):
    """ Convert all the BBN data to CONLL-format and save in CONLLDIR.

    For a description of the DESC_DECISION argument, see the documentation
    for function fix_iobtag.

    """
    filenames = os.listdir(DATADIR)
    for filename in filenames:
        writeconll(filename) 
Example #29
Source File: bbn2conll.py    From entity-recognition-datasets with MIT License
def parse_docs(filename):

    #with open('wsj00a.qa') as fd:
    with open(filename,'r') as fd:
        data = fd.read()

    root = ET.fromstring(data)
    docs = root.getchildren()
    #c1 = docs[1]
    store = []
    for ii,doc in enumerate(docs):
        #print ii
        p = parse_doc(doc)
        store.extend(p)
    return store 
Example #30
Source File: preprocessor.py    From sumpy with Apache License 2.0
def build_stopwords(self):
        if self.remove_stopwords is True:
            if self._stopwords is None:             
                path = pkg_resources.resource_filename(
                    "sumpy", 
                    os.path.join("data", "smart_common_words.txt.gz"))
                with gzip.open(path, u"r") as f:
                    self._stopwords = set(
                        [word.strip().decode(u"utf-8").lower()
                         for word in f.readlines()])
            return lambda word: word in self._stopwords
        else:
            return lambda word: False