Python nltk.data Examples

The following code examples show how to use the nltk.data module. They are drawn from open source Python projects.
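
A minimal usage sketch, for orientation, of the two nltk.data entry points that recur in the examples below: nltk.data.find() resolves a resource name against the directories in nltk.data.path, and nltk.data.load() resolves and deserializes it. The punkt tokenizer used here is assumed to be available via nltk.download().

import nltk
import nltk.data

# find() raises LookupError if the resource is not installed yet.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# load() locates the resource and deserializes it (pickled tokenizers, grammars, plain text, ...).
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("NLTK locates data files for you. This text is split into sentences."))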

Example 1
Project: razzy-spinner   Author: rafasashi   File: chat80.py    GNU General Public License v3.0
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: string
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')

    db_out.update(valuation)

    db_out.close() 
Example 2
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: str
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')

    db_out.update(valuation)

    db_out.close() 
Example 3
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: str
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')

    db_out.update(valuation)

    db_out.close() 
Example 4
Project: Health-Checker   Author: KriAga   File: chat80.py    MIT License
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise 
Example 5
Project: Health-Checker   Author: KriAga   File: chat80.py    MIT License
def val_dump(rels, db):
    """
    Make a ``Valuation`` from a list of relation metadata bundles and dump to
    persistent database.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :param db: name of file to which data is written.
               The suffix '.db' will be automatically appended.
    :type db: str
    """
    concepts = process_bundle(rels).values()
    valuation = make_valuation(concepts, read=True)
    db_out = shelve.open(db, 'n')

    db_out.update(valuation)

    db_out.close() 
Example 6
Project: question_answering   Author: joswinkj   File: textProcessing.py    Apache License 2.0
def load_ref_text(self,text_file):
        if self.indexed_data == 1:
            loc=open(text_file,'r')
            sentences1,chk2 ,self.vectorizer = pickle.load(loc)
            loc.close()
            return sentences1,chk2
        textfile = open(text_file,'r')
        lines=textfile.readlines()
        textfile.close()
        lines = ' '.join(lines)
        lines = lines.decode('utf8')
        lines = changeToMatrix.removeProbTexts([lines])
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = [ sent_tokenizer.tokenize(lines.strip()) ]
        sentences1 = [item.strip().strip('.') for sublist in sentences for item in sublist]
        #sentences2 = changeToMatrix.get_tagged_sentences(sentences1)
        sentences2 = sentences1
        sentences2 = changeToMatrix.removeStopWords(sentences2)
        sentences2 =[changeToMatrix.getSynonyms(sent,useNouns=1) for sent in sentences2]
        # pdb.set_trace()
        chk2=pd.DataFrame(self.vectorizer.fit_transform(sentences2).toarray(),columns=self.vectorizer.get_feature_names()).to_sparse(fill_value=0)
        loc = open(text_file+'.pickle','w')
        pickle.dump([sentences1,[chk2],self.vectorizer],loc)
        loc.close()
        return sentences1,[chk2] 
Example 7
Project: regex4dummies   Author: DarkmatterVale   File: test_dependencies.py    MIT License
def test_for_nltk(self):
        """
        Downloading all required NLTK dependencies.
        """

        from nltk.data import find
        from nltk import download

        # Download data if needed
        try:
            find('stopwords.zip')
        except LookupError:
            download('stopwords')

        try:
            find('maxent_ne_chunker')
        except LookupError:
            download('maxent_ne_chunker')

        try:
            find('words')
        except LookupError:
            download('words') 
Example 8
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    MIT License
def add_full_stops_to_the_end(infile, outfile):
	#clean data of small titles and add full stops for NLTK to work
	output_format = '{}.\n'.format
	with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
		for line in fin:
			if line[0] == ' ':
				pass
			#ignore headlines with less than three words
			elif len(line.split()) <= 3:
				pass
			elif line.endswith(('.\n', '!\n', '?\n', '\'\n', '"\n')):
				print >> fout, line.decode('utf-8'),
			else:
				print >> fout, output_format(line.strip()).decode('utf-8'),



############################################
#   Convert All except first word and quotes
# 	to lower case 				           #
############################################ 
Example 9
Project: FancyWord   Author: EastonLee   File: chat80.py    GNU General Public License v3.0
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise 
Example 10
Project: razzy-spinner   Author: rafasashi   File: test_corpus_views.py    GNU General Public License v3.0
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example 11
Project: razzy-spinner   Author: rafasashi   File: test_corpus_views.py    GNU General Public License v3.0
def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(list(v), file_data.split())

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(list(v), self.linetok.tokenize(file_data)) 
Example 12
Project: razzy-spinner   Author: rafasashi   File: test_corpus_views.py    GNU General Public License v3.0
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
Example 13
Project: razzy-spinner   Author: rafasashi   File: grammar.py    GNU General Public License v3.0
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example 14
Project: razzy-spinner   Author: rafasashi   File: chat80.py    GNU General Public License v3.0
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example 15
Project: razzy-spinner   Author: rafasashi   File: chat80.py    GNU General Public License v3.0
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise 
Example 16
Project: razzy-spinner   Author: rafasashi   File: chat80.py    GNU General Public License v3.0
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example 17
Project: razzy-spinner   Author: rafasashi   File: chat80.py    GNU General Public License v3.0
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: string
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True 
Example 18
Project: OpenBottle   Author: xiaozhuchacha   File: test_corpus_views.py    MIT License
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example 19
Project: OpenBottle   Author: xiaozhuchacha   File: test_corpus_views.py    MIT License
def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(list(v), file_data.split())

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(list(v), self.linetok.tokenize(file_data)) 
Example 20
Project: OpenBottle   Author: xiaozhuchacha   File: test_corpus_views.py    MIT License
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
Example 21
Project: OpenBottle   Author: xiaozhuchacha   File: grammar.py    MIT License
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example 22
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example 23
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.
    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection =  sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise 
Example 24
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example 25
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True 
Example 26
Project: OpenBottle   Author: xiaozhuchacha   File: test_corpus_views.py    MIT License
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example 27
Project: OpenBottle   Author: xiaozhuchacha   File: test_corpus_views.py    MIT License
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
Example 28
Project: OpenBottle   Author: xiaozhuchacha   File: childes_fixt.py    MIT License
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/") 
Example 29
Project: OpenBottle   Author: xiaozhuchacha   File: grammar.py    MIT License
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example 30
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example 31
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example 32
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept 
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if (label in concepts):
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts 
Example 33
Project: OpenBottle   Author: xiaozhuchacha   File: chat80.py    MIT License
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True 
Example 34
Project: Health-Checker   Author: KriAga   File: test_corpus_views.py    MIT License
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example 35
Project: Health-Checker   Author: KriAga   File: test_corpus_views.py    MIT License
def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(list(v), file_data.split())

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(list(v), self.linetok.tokenize(file_data)) 
Example 36
Project: Health-Checker   Author: KriAga   File: test_corpus_views.py    MIT License
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
Example 37
Project: Health-Checker   Author: KriAga   File: grammar.py    MIT License
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example 38
Project: Health-Checker   Author: KriAga   File: chat80.py    MIT License
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example 39
Project: Health-Checker   Author: KriAga   File: chat80.py    MIT License
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example 40
Project: Health-Checker   Author: KriAga   File: chat80.py    MIT License
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True 
Example 41
Project: steam_game_generator   Author: applepinegames   File: download.py    MIT License
def get_app_data(app_id):
  url = 'http://store.steampowered.com/api/appdetails?appids=' + str(app_id)
  response = urllib.urlopen(url)
  try:
    data = json.loads(response.read())
    if not data[str(app_id)]['success'] or data[str(app_id)]['data']['type'] != 'game':
      return None
    return data[str(app_id)]
  except:
    return None 
Example 42
Project: steam_game_generator   Author: applepinegames   File: download.py    MIT License
def get_apps():
  url = 'http://api.steampowered.com/ISteamApps/GetAppList/v2/'
  response = urllib.urlopen(url)
  try:
    data = json.loads(response.read())
    apps = data['applist']['apps']
    return apps
  except:
    return None 
Example 43
Project: steam_game_generator   Author: applepinegames   File: download.py    MIT License
def get_description_from_app_data(app_data):
  description = clean_string(app_data['data']['detailed_description'])
  sentences = SENTENCE_DETECTOR.tokenize(description.strip())
  if len(sentences) > 0:
    sentences = sentences[0:(min(3, len(sentences)))]
    sentences = [x for x in sentences if len(x.split(' ')) > 5 and not x.split(' ')[0].isupper() and x.find('\r') == -1]
    combined_sentence = ' '.join(sentences)
    if len(combined_sentence) == 0 or not combined_sentence[0].isalpha() or len(combined_sentence.split(' ')) < 5:
      return None
    return combined_sentence
  return None 
Example 44
Project: steam_game_generator   Author: applepinegames   File: download.py    MIT License
def get_title_from_app_data(app_data):
  return clean_string(app_data['data']['name']) 
Example 45
Project: py-nltk-svo   Author: klintan   File: svo.py    MIT License
def __init__(self):
        """
        Initialize the SVO Methods
        """
        self.noun_types = ["NN", "NNP", "NNPS", "NNS", "PRP"]
        self.verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
        self.adjective_types = ["JJ", "JJR", "JJS"]
        self.pred_verb_phrase_siblings = None
        self.parser = stanford.StanfordParser()
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') 
Example 46
Project: blabbr   Author: bfontaine   File: cli.py    MIT License
def setup_nltk(self, **kw):
        import nltk
        from nltk.data import find

        tagger = "averaged_perceptron_tagger"

        try:
            find("taggers/%s" % tagger)
        except LookupError:
            click.echo("Downloading NTLK data (~2MB)...")
            nltk.download(tagger)
            return True

        return False 
Example 47
Project: regex4dummies   Author: DarkmatterVale   File: test_dependencies.py    MIT License
def test_for_textblob(self):
        """
        Install textblob dependencies. It automatically
        checks to see if dependencies are already
        installed, so I do not need to do that.
        """

        # Installing data
        os.system("python -m textblob.download_corpora") 
Example 48
Project: text-summarizer   Author: mishless   File: batch_process.py    MIT License
def print_stuff(sentences, sentences_features):

    data = sentences_features

    for i in range(0, len(data)):
        print("******************************")

        print("Sentence: ", end="")        
        print(sentences[i].original)

        print_sentence_info(data[i])

        print("Rules: ")
        rules.print_rules_results(data[i]) 
Example 49
Project: text-summarizer   Author: mishless   File: text-summarizer.py    MIT License
def print_stuff(sentences, sentences_features):

    data = sentences_features

    for i in range(0, len(data)):
        print("******************************")

        print("Sentence: ", end="")        
        print(sentences[i].original)

        print_sentence_info(data[i])

        print("Rules: ")
        rules.print_rules_results(data[i]) 
Example 50
Project: TREC_WebTrack   Author: JoaoLages   File: utils.py    GNU General Public License v3.0
def split_in_sentences(texts):
    global sentence_tokenizer
    if sentence_tokenizer is None:
        sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return list(map(sentence_tokenizer.tokenize, texts)) 
Example 51
Project: FancyWord   Author: EastonLee   File: test_corpus_views.py    GNU General Public License v3.0
def data(self):
        for name in self.names:
            f = nltk.data.find(name)
            with f.open() as fp:
                file_data = fp.read().decode('utf8')
            yield f, file_data 
Example 52
Project: FancyWord   Author: EastonLee   File: test_corpus_views.py    GNU General Public License v3.0
def test_correct_values(self):
        # Check that corpus views produce the correct sequence of values.

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(list(v), file_data.split())

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(list(v), self.linetok.tokenize(file_data)) 
Example 53
Project: FancyWord   Author: EastonLee   File: test_corpus_views.py    GNU General Public License v3.0
def test_correct_length(self):
        # Check that the corpus views report the correct lengths:

        for f, file_data in self.data():
            v = StreamBackedCorpusView(f, read_whitespace_block)
            self.assertEqual(len(v), len(file_data.split()))

            v = StreamBackedCorpusView(f, read_line_block)
            self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) 
Example 54
Project: FancyWord   Author: EastonLee   File: childes_fixt.py    GNU General Public License v3.0
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/") 
Example 55
Project: FancyWord   Author: EastonLee   File: grammar.py    GNU General Public License v3.0
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print() 
Example 56
Project: FancyWord   Author: EastonLee   File: chat80.py    GNU General Public License v3.0
def augment(self, data):
        """
        Add more data to the ``Concept``'s extension set.

        :param data: a new semantic value
        :type data: string or pair of strings
        :rtype: set

        """
        self._extension.add(data)
        self.extension = sorted(list(self._extension))
        return self._extension 
Example 57
Project: FancyWord   Author: EastonLee   File: chat80.py    GNU General Public License v3.0
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs 
Example 58
Project: FancyWord   Author: EastonLee   File: chat80.py    GNU General Public License v3.0
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list of dict
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if (label in concepts):
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts 
Example 59
Project: FancyWord   Author: EastonLee   File: chat80.py    GNU General Public License v3.0
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: string
    """
    dbname = db+".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
#        val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True 
Example 60
Project: Peerion   Author: hyperionxtech   File: __main__.py    Apache License 2.0
def get_nltk_data_directories():
    import nltk.data

    data_directories = []

    # Find each data directory in the NLTK path that has content
    for path in nltk.data.path:
        if os.path.exists(path):
            if os.listdir(path):
                data_directories.append(path)

    return os.linesep.join(data_directories) 
Example 61
Project: transfer-learning-ner   Author: ciads-ut   File: bbn2conll.py    MIT License
def write_all_to_conll(DESC_DECISION='merge'):
    """ Convert all the BBN data to CONLL-format and save in CONLLDIR.

    For a description of the DESC_DECISION argument, see the documentation
    for function fix_iobtag.

    """
    filenames = os.listdir(DATADIR)
    for filename in filenames:
        writeconll(filename) 
Example 62
Project: transfer-learning-ner   Author: ciads-ut   File: bbn2conll.py    MIT License
def parse_docs(filename):

    #with open('wsj00a.qa') as fd:
    with open(filename,'r') as fd:
        data = fd.read()

    root = ET.fromstring(data)
    docs = root.getchildren()
    #c1 = docs[1]
    store = []
    for ii,doc in enumerate(docs):
        #print ii
        p = parse_doc(doc)
        store.extend(p)
    return store 
Example 63
Project: razzy-spinner   Author: rafasashi   File: discourse.py    GNU General Public License v3.0
def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    #dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                          reading_command)
    dt.readings(filter=True)
    import nltk.data
    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
    background = nltk.data.load(background_file)
    
    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models() 
Example 64
Project: razzy-spinner   Author: rafasashi   File: util.py    GNU General Public License v3.0
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from sentiment_analyzer import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer == True:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
Example 65
Project: razzy-spinner   Author: rafasashi   File: grammar.py    GNU General Public License v3.0
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use .replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n'+' '*26))
    print()

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    item = treebank._fileids[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)
        tree.chomsky_normal_form(horzMarkov = 2)

        productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse) 
Example 66
Project: OpenBottle   Author: xiaozhuchacha   File: discourse.py    MIT License
def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    #dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                          reading_command)
    dt.readings(filter=True)
    import nltk.data
    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
    background = nltk.data.load(background_file)
    
    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models() 
Example 67
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer == True:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
Example 68
Project: OpenBottle   Author: xiaozhuchacha   File: grammar.py    MIT License
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use .replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n'+' '*26))
    print()

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    item = treebank._fileids[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)
        tree.chomsky_normal_form(horzMarkov = 2)

        productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse) 
Example 69
Project: OpenBottle   Author: xiaozhuchacha   File: discourse.py    MIT License
def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    #dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                          reading_command)
    dt.readings(filter=True)
    import nltk.data
    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
    background = nltk.data.load(background_file)
    
    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models() 
Example 70
Project: OpenBottle   Author: xiaozhuchacha   File: util.py    MIT License
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer == True:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
Example 71
Project: OpenBottle   Author: xiaozhuchacha   File: grammar.py    MIT License
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use .replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n'+' '*26))
    print()

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    item = treebank._fileids[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)
        tree.chomsky_normal_form(horzMarkov = 2)

        productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    #sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse) 
Example 72
Project: Health-Checker   Author: KriAga   File: discourse.py    MIT License
def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    # dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                         reading_command)
    dt.readings(filter=True)
    import nltk.data
    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
    background = nltk.data.load(background_file)

    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models() 
Example 73
Project: Health-Checker   Author: KriAga   File: util.py    MIT License
def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None):
    """
    Train and test a classifier on instances of the Subjective Dataset by Pang and
    Lee. The dataset is made of 5000 subjective and 5000 objective sentences.
    All tokens (words and punctuation marks) are separated by a whitespace, so
    we use the basic WhitespaceTokenizer to parse the data.

    :param trainer: `train` method of a classifier.
    :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file.
    :param n_instances: the number of total sentences that have to be used for
        training and testing. Sentences will be equally split between positive
        and negative.
    :param output: the output file where results have to be reported.
    """
    from nltk.sentiment import SentimentAnalyzer
    from nltk.corpus import subjectivity

    if n_instances is not None:
        n_instances = int(n_instances/2)

    subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
    obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]

    # We separately split subjective and objective instances to keep a balanced
    # uniform class distribution in both train and test sets.
    train_subj_docs, test_subj_docs = split_train_test(subj_docs)
    train_obj_docs, test_obj_docs = split_train_test(obj_docs)

    training_docs = train_subj_docs+train_obj_docs
    testing_docs = test_subj_docs+test_obj_docs

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

    # Add simple unigram word features handling negation
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

    # Apply features to obtain a feature-value representation of our datasets
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    classifier = sentim_analyzer.train(trainer, training_set)
    try:
        classifier.show_most_informative_features()
    except AttributeError:
        print('Your classifier does not provide a show_most_informative_features() method.')
    results = sentim_analyzer.evaluate(test_set)

    if save_analyzer:
        save_file(sentim_analyzer, 'sa_subjectivity.pickle')

    if output:
        extr = [f.__name__ for f in sentim_analyzer.feat_extractors]
        output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__,
                        Tokenizer='WhitespaceTokenizer', Feats=extr,
                        Instances=n_instances, Results=results)

    return sentim_analyzer 
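A hedged usage sketch for the helper above, training a Naive Bayes classifier on a small sample; it assumes the subjectivity corpus has been fetched with nltk.download('subjectivity') and that the helper is imported from nltk.sentiment.util as in current NLTK releases:

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment.util import demo_subjectivity

# Train and evaluate on 200 sentences (100 subjective, 100 objective), keeping the analyzer.
analyzer = demo_subjectivity(NaiveBayesClassifier.train, n_instances=200)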
Example 74
Project: Health-Checker   Author: KriAga   File: grammar.py    MIT License 4 votes
def pcfg_demo():
    """
    A demonstration showing how a ``PCFG`` can be created and used.
    """

    from nltk.corpus import treebank
    from nltk import treetransforms
    from nltk import induce_pcfg
    from nltk.parse import pchart

    pcfg_prods = toy_pcfg1.productions()

    pcfg_prod = pcfg_prods[2]
    print('A PCFG production:', repr(pcfg_prod))
    print('    pcfg_prod.lhs()  =>', repr(pcfg_prod.lhs()))
    print('    pcfg_prod.rhs()  =>', repr(pcfg_prod.rhs()))
    print('    pcfg_prod.prob() =>', repr(pcfg_prod.prob()))
    print()

    grammar = toy_pcfg2
    print('A PCFG grammar:', repr(grammar))
    print('    grammar.start()       =>', repr(grammar.start()))
    print('    grammar.productions() =>', end=' ')
    # Use .replace(...) to line-wrap the output.
    print(repr(grammar.productions()).replace(',', ',\n' + ' ' * 26))
    print()

    # extract productions from three trees and induce the PCFG
    print("Induce PCFG grammar from treebank data:")

    productions = []
    item = treebank._fileids[0]
    for tree in treebank.parsed_sents(item)[:3]:
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS=False)
        tree.chomsky_normal_form(horzMarkov=2)

        productions += tree.productions()

    S = Nonterminal('S')
    grammar = induce_pcfg(S, productions)
    print(grammar)
    print()

    print("Parse sentence using induced grammar:")

    parser = pchart.InsideChartParser(grammar)
    parser.trace(3)

    # doesn't work as tokens are different:
    # sent = treebank.tokenized('wsj_0001.mrg')[0]

    sent = treebank.parsed_sents(item)[0].leaves()
    print(sent)
    for parse in parser.parse(sent):
        print(parse) 
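The toy grammars referenced above are defined alongside this demo in grammar.py; an equivalent small grammar can also be written inline. A hedged sketch using the NLTK 3.x names PCFG.fromstring and ViterbiParser (older releases used different constructors):

from nltk import PCFG
from nltk.parse import ViterbiParser

# Production probabilities for each left-hand side must sum to 1.0.
toy = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.6] | 'John' [0.4]
    VP -> V NP [1.0]
    Det -> 'the' [1.0]
    N -> 'dog' [0.5] | 'ball' [0.5]
    V -> 'saw' [1.0]
""")

parser = ViterbiParser(toy)
for tree in parser.parse(['John', 'saw', 'the', 'dog']):
    print(tree)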
Example 75
Project: movie2parallelDB   Author: alpoktem   File: movie2parallelDB.py    GNU General Public License v3.0 4 votes
def main(options):
	process_list_eng = fill_task_list_from_file(options.list_of_files_eng, options.output_dir)
	process_list_spa = fill_task_list_from_file(options.list_of_files_spa, options.output_dir)

	print(process_list_spa)
	print(process_list_eng)

	assert len(process_list_eng) == len(process_list_spa), "Process lists are not the same length"

	for task_index, (proscript_eng, proscript_spa) in enumerate(zip(process_tasks(process_list_eng, options.input_audio_format, skip_mfa=options.skip_mfa), process_tasks(process_list_spa, options.input_audio_format, skip_mfa=options.skip_mfa))):
		proscript_mapping = map_segments(proscript_spa, proscript_eng)

		aligned_proscript_spa, aligned_proscript_eng = get_aligned_proscripts(proscript_mapping, proscript_spa, proscript_eng)

		aligned_proscript_spa.get_speaker_means()
		aligned_proscript_eng.get_speaker_means()
		utils.assign_acoustic_means(aligned_proscript_spa)
		utils.assign_acoustic_means(aligned_proscript_eng)

		#Determine paths for parallel data
		task_output_path = process_list_eng[task_index]['output_dir']
		parallel_output_path = os.path.join(task_output_path, '..', 'spa-eng')
		checkArgument(parallel_output_path, createDir = True)

		#write mapping to file
		mapping_file_path = os.path.join(parallel_output_path, '%s_mapping.txt'%aligned_proscript_eng.id)
		mapping_to_file(proscript_mapping, mapping_file_path, proscript_spa, proscript_eng)
		mapping_tmx_file_path = os.path.join(parallel_output_path, '%s.tmx'%aligned_proscript_eng.id)
		mapping_as_tmx(proscript_mapping, mapping_tmx_file_path, proscript_spa, proscript_eng)
		print("Mapping extracted to %s"%mapping_file_path)

		print("Spanish audio: %s"%aligned_proscript_spa.audio_file)
		print("English audio: %s"%aligned_proscript_eng.audio_file)

		#generate textgrid files
		utils.proscript_to_textgrid(aligned_proscript_spa, parallel_output_path)
		utils.proscript_to_textgrid(aligned_proscript_eng, parallel_output_path)
		print("Spanish Textgrid: %s"%aligned_proscript_spa.textgrid_file)
		print("English Textgrid: %s"%aligned_proscript_eng.textgrid_file)

		#store aligned proscript data to disk
		extract_proscript_data_to_disk(aligned_proscript_spa, parallel_output_path, 'spa', cut_audio_portions = True, extract_segments_as_proscript = True, output_audio_format = 'wav', segments_subdir='segments_spa')
		extract_proscript_data_to_disk(aligned_proscript_eng, parallel_output_path, 'eng', cut_audio_portions = True, extract_segments_as_proscript = True, output_audio_format = 'wav', segments_subdir='segments_eng')
	
	#merge segments merged in the mapping. output 
Example 76
Project: regex4dummies   Author: DarkmatterVale   File: test_dependencies.py    MIT License 4 votes
def test_for_nlpnet(self):
        """
        Attempting to use nlpnet. This will cause an
        error if the required dependencies are not
        downloaded.
        """

        try:
            # Creating a new compare object
            compare_nlpnet = Compare()

            # Comparing using the nltk parser
            compare_nlpnet.compare_strings(text=["what time is it here?", "This is the cat's hat"], pattern_detection=False, parser="nlpnet")

            # If that was successful, get the pattern information
            sentence_information = compare_nlpnet.get_pattern_information()
            for sentence in sentence_information:
                my_pattern = "[ Pattern ]          : " + sentence.pattern
                my_subject = "[ Subject ]          : " + sentence.subject
                my_verb = "[ Verb ]             : " + sentence.verb
                my_object = "[ Object ]           : " + sentence.object[0]
                my_preps = "[ Prep Phrases ]     : " + str(sentence.prepositional_phrases)
                my_reliability_score = "[ Reliability Score ]: " + str(sentence.reliability_score)
        except:
            # Get the nltk data path by asking the interpreter
            running = Popen(['python -c "import nltk;print nltk.data.path"'], stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
            # communicate() returns (stdout, stderr); name the values accordingly
            stdout_data, stderr_data = running.communicate()

            # Set the path that the nlpnet dependency will be downloaded to
            path = re.sub(r"\'", "", re.sub(r"\[", '', str(stdout_data.split('\n')[0].split(',')[0])))
            path = path.split(r"/")
            path = '/'.join(path[0 : len(path) - 1]) + '/nlpnet_dependency/'

            # Download the dependencies & extract
            current_directory = os.getcwd()

            os.mkdir(path)
            os.chdir(path)

            os.system("wget http://nilc.icmc.usp.br/nlpnet/data/dependency-en.tgz")
            tar = tarfile.open(path + 'dependency-en.tgz', 'r:gz')
            tar.extractall(path)
            os.remove(path + 'dependency-en.tgz')

            os.chdir(current_directory) 
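The except branch above shells out to a separate Python process just to read nltk.data.path; the same information is available in-process. A hedged sketch of the equivalent lookup, mirroring the original layout in which the nlpnet data sits next to the first entry of nltk.data.path:

import os
import nltk.data

# nltk.data.path is an ordinary list of search directories; no subprocess needed.
print(nltk.data.path)

# Mirror the original layout: a sibling 'nlpnet_dependency' directory of the first entry.
nlpnet_dir = os.path.join(os.path.dirname(nltk.data.path[0]), 'nlpnet_dependency')
print(nlpnet_dir)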
Example 77
Project: text-summarizer   Author: mishless   File: batch_process.py    MIT License 4 votes
def pre_process_text(text):

    while text[0] == "\n":
        text = text[1:]

    text = text.split('\n', 1)

    title = tc.Title(text[0], [])
    text = text[1].replace(u"\u2018", '\'').replace(u"\u2019", '\'').replace(u"\u201c",'"').replace(u"\u201d", '"')
    words = dict()
    sentences = []
    
    sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    detected_sentences = sentence_detector.tokenize(text.strip())
    stopwords_list = nltk.corpus.stopwords.words('english')
    stemmer = nltk.stem.porter.PorterStemmer()
    
    #Pre-process title
    tokens = nltk.word_tokenize(title.original)
    tokens = [token for token in tokens if token not in stopwords_list]
    part_of_speech = nltk.pos_tag(tokens)

    for (token, word_pos) in zip(tokens, part_of_speech):
        token = token.lower()
        if (token not in words) and (token not in list(string.punctuation) and (token not in stopwords_list)):
                words[token] = tc.Word(stemmer.stem(token), word_pos, [(lemma, stemmer.stem(lemma)) for synset in nltk.corpus.wordnet.synsets(token) for lemma in synset.lemma_names()])
        title.bag_of_words.append(token)

    #Pre-process text
    for detected_sentence in detected_sentences:
        
        tokens = nltk.word_tokenize(detected_sentence)
        tokens = [token for token in tokens if token not in stopwords_list]
        if tokens:
            part_of_speech = nltk.pos_tag(tokens)
            bag_of_words = []
            stemmed_bag_of_words = []
            for (token, word_pos) in zip(tokens, part_of_speech):
                token = token.lower()
                if (token not in list(string.punctuation) and (token not in stopwords_list)):
                    if (token not in words):
                        words[token] = tc.Word(stemmer.stem(token), word_pos, [(lemma, stemmer.stem(lemma)) for synset in nltk.corpus.wordnet.synsets(token) for lemma in synset.lemma_names()])
                    elif token in words:
                        words[token].increment_abs_frequency()
                    bag_of_words.append(token)
                    stemmed_bag_of_words.append(stemmer.stem(token))
            if (len(bag_of_words) != 0 or len(stemmed_bag_of_words) != 0):
                sentences.append(tc.Sentence(detected_sentence, len(sentences) + 1, [], [], None))
                sentences[-1].bag_of_words = list(bag_of_words)
                sentences[-1].stemmed_bag_of_words = list(stemmed_bag_of_words)        
    return [title, sentences, words] 
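The function above depends on a project-specific tc module (Title, Sentence, Word containers); the NLTK portions, however, are standard. A hedged, self-contained sketch of just those pieces, assuming the punkt and stopwords packages have been downloaded (the full example additionally needs the POS tagger model and WordNet):

import string
import nltk

text = "The quick brown fox jumps over the lazy dog. It was not amused."

# Sentence splitting with the pre-trained Punkt model, then per-sentence cleanup.
sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
stopwords_list = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.porter.PorterStemmer()

for sentence in sentence_detector.tokenize(text.strip()):
    tokens = [t.lower() for t in nltk.word_tokenize(sentence)
              if t.lower() not in stopwords_list and t not in string.punctuation]
    print(tokens, [stemmer.stem(t) for t in tokens])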
Example 78
Project: text-summarizer   Author: mishless   File: text-summarizer.py    MIT License 4 votes
def pre_process_text(text):

    while text[0] == "\n":
        text = text[1:]
        
    text = text.split('\n', 1)
    title = tc.Title(text[0], [])
    text = text[1].replace(u"\u2018", '\'').replace(u"\u2019", '\'').replace(u"\u201c",'"').replace(u"\u201d", '"')
    words = dict()
    sentences = []
    
    sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    detected_sentences = sentence_detector.tokenize(text.strip())
    stopwords_list = nltk.corpus.stopwords.words('english')
    stemmer = nltk.stem.porter.PorterStemmer()
    
    #Pre-process title
    tokens = nltk.word_tokenize(title.original)
    tokens = [token for token in tokens if token not in stopwords_list]
    part_of_speech = nltk.pos_tag(tokens)
    for (token, word_pos) in zip(tokens, part_of_speech):
        token = token.lower()
        if (token not in words) and (token not in list(string.punctuation) and (token not in stopwords_list)):
                words[token] = tc.Word(stemmer.stem(token), word_pos, [(lemma, stemmer.stem(lemma)) for synset in nltk.corpus.wordnet.synsets(token) for lemma in synset.lemma_names()])
        title.bag_of_words.append(token)

    #Pre-process text
    for detected_sentence in detected_sentences:
        
        tokens = nltk.word_tokenize(detected_sentence)
        tokens = [token for token in tokens if token not in stopwords_list]
        if tokens:
            part_of_speech = nltk.pos_tag(tokens)
            bag_of_words = []
            stemmed_bag_of_words = []
            for (token, word_pos) in zip(tokens, part_of_speech):
                token = token.lower()
                if (token not in list(string.punctuation) and (token not in stopwords_list)):
                    if (token not in words):
                        words[token] = tc.Word(stemmer.stem(token), word_pos, [(lemma, stemmer.stem(lemma)) for synset in nltk.corpus.wordnet.synsets(token) for lemma in synset.lemma_names()])
                    elif token in words:
                        words[token].increment_abs_frequency()
                    bag_of_words.append(token)
                    stemmed_bag_of_words.append(stemmer.stem(token))
            if (len(bag_of_words) != 0 or len(stemmed_bag_of_words) != 0):
                sentences.append(tc.Sentence(detected_sentence, len(sentences) + 1, [], [], None))
                sentences[-1].bag_of_words = list(bag_of_words)
                sentences[-1].stemmed_bag_of_words = list(stemmed_bag_of_words)        
    return [title, sentences, words] 
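Examples 77 and 78 are near-identical pre-processing routines from the same project; both assume several NLTK data packages are already present. A hedged one-time setup, using the package names of current NLTK releases:

import nltk

# punkt: sentence tokenizer; stopwords: English stopword list;
# wordnet: synsets for lemma expansion; averaged_perceptron_tagger: model used by nltk.pos_tag.
for package in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(package)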
Example 79
Project: FancyWord   Author: EastonLee   File: discourse.py    GNU General Public License v3.0 4 votes
def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    #dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                          reading_command)
    dt.readings(filter=True)
    import nltk.data
    background_file = os.path.join('grammars', 'book_grammars', 'background.fol')
    background = nltk.data.load(background_file)
    
    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models() 
Example 80
Project: nltk-on-gae   Author: sivu22   File: discourse.py    Apache License 2.0 4 votes
def discourse_demo(reading_command=None):
    """
    Illustrate the various methods of ``DiscourseTester``
    """
    dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'],
                         reading_command)
    dt.models()
    print()
    #dt.grammar()
    print()
    dt.sentences()
    print()
    dt.readings()
    print()
    dt.readings(threaded=True)
    print()
    dt.models('d1')
    dt.add_sentence('John is a boxer')
    print()
    dt.sentences()
    print()
    dt.readings(threaded=True)
    print()
    dt = DiscourseTester(['A student dances', 'Every student is a person'],
                         reading_command)
    print()
    dt.add_sentence('No person dances', consistchk=True)
    print()
    dt.readings()
    print()
    dt.retract_sentence('No person dances', verbose=True)
    print()
    dt.models()
    print()
    dt.readings('A person dances')
    print()
    dt.add_sentence('A person dances', informchk=True)
    dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer',
                          'Vincent is married', 'Fido barks'],
                          reading_command)
    dt.readings(filter=True)
    import nltk.data
    background = nltk.data.load('/grammars/book_grammars/background.fol')
    print()
    dt.add_background(background, verbose=False)
    dt.background()
    print()
    dt.readings(filter=True)
    print()
    dt.models()
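In both discourse examples the background axioms are resolved against nltk.data.path ('grammars/book_grammars/background.fol' in Example 79, a leading-slash variant in Example 80). A hedged check that the required data package is actually installed:

import nltk

# nltk.data.find() raises LookupError if the resource is missing from nltk.data.path.
try:
    path = nltk.data.find('grammars/book_grammars/background.fol')
    print('background.fol found at', path)
except LookupError:
    nltk.download('book_grammars')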