Python nltk.Text() Examples

The following are code examples showing how to use nltk.Text(). They are drawn from open source Python projects.
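
As a quick orientation, here is a minimal, self-contained sketch of the workflow the examples below all follow: tokenize a string with nltk.word_tokenize(), wrap the tokens in nltk.Text(), and then query the resulting object (concordance, frequency distribution, and so on). The sample sentence is illustrative only and does not come from any of the projects listed.

import nltk
# nltk.download('punkt')  # the tokenizer models may need to be downloaded once

sample = "NLTK makes it easy to explore text. A Text object wraps a list of tokens."
tokens = nltk.word_tokenize(sample)    # 1) tokenize the raw string
text = nltk.Text(tokens)               # 2) wrap the tokens in a Text object

text.concordance("Text")               # print each occurrence of a word in context
fdist = nltk.FreqDist(text)            # frequency distribution over the tokens
print(fdist.most_common(5))            # five most common tokens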

Example 1
Project: StabiHacks   Author: elektrobohemian   File: fulltext_statistics.py   Apache License 2.0   6 votes
def creatStatisticFiles(statFilePath, resultTxt):
    statFile = open(statFilePath, "w")
    # standard NLP workflow
    # 1) tokenize the text
    tokens = nltk.word_tokenize(resultTxt)
    nltkText = nltk.Text(tokens)
    # 2) normalize tokens
    words = [w.lower() for w in tokens]
    # 3) create vocabulary
    vocab = sorted(set(words))

    # calculate token frequencies
    fdist = nltk.FreqDist(nltkText)
    fTxt=""
    for (word,freq) in fdist.most_common(100):
        fTxt+=str(word)+"\t"+str(freq)+"\n"
    statFile.write(fTxt)
    statFile.close() 
Example 2
Project: HerokuCondaFlaskCeleryRedis   Author: mmadsen   File: app.py   Apache License 2.0   6 votes
def count_words_from_html(page):
	"""
	Given a returned page from the requests library, this method
	extracts the raw text using BeautifulSoup, tokenizes it, removes
	punctuation, tabulates both the raw result and the result with
	common English stop words removed, and returns a tuple of the two counts.
	"""
	raw = BeautifulSoup(page.text, 'html.parser').get_text()
	nltk.data.path.append('./nltk_data') # set path for precompiled tokenizers
	tokens = nltk.word_tokenize(raw)
	text = nltk.Text(tokens)

	# remove punctuation
	nonPunct = re.compile('.*[A-Za-z].*')
	raw_words = [w for w in text if nonPunct.match(w)]
	raw_word_counts = Counter(raw_words)

	# remove English stop words
	stops = stop_words.get_stop_words('english')
	no_stop_words = [w for w in raw_words if w.lower() not in stops]
	no_stop_counts = Counter(no_stop_words)

	return raw_word_counts, no_stop_counts 
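
A hypothetical call to count_words_from_html() might look like the sketch below; the URL is a placeholder, and the imports used inside the function (requests, BeautifulSoup, nltk, re, Counter, stop_words) are assumed to be available at module level as in the original project.

page = requests.get("https://example.com")                 # placeholder URL, not from the project
raw_counts, no_stop_counts = count_words_from_html(page)   # two Counter objects
print(no_stop_counts.most_common(10))                      # ten most frequent non-stop words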
Example 3
Project: EL-Python   Author: EngLang   File: dict_parser.py   MIT License   6 votes
def word_links():
    # to_analyze = nltk.data.load(, format='auto', cache=true,
    # verbose=False, logic_parser=None, fstruct_reader=None, encoding=None)
    
    # Download dict.json word resource file
    pl.plogger.okblue("Downloading word list...")
    response = urlopen(
        "https://raw.githubusercontent.com/adambom/dictionary/master/dictionary.json")
    raw = response.read().decode('utf8')
    pl.plogger.okgreen("Finished downloading word list")
    
    pl.plogger.okblue("Creating entities...")
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)
    tagged = nltk.pos_tag(text)
    entities = nltk.chunk.ne_chunk(tagged)
    pl.plogger.okgreen("Finished creating entities...")
    return entities 
Example 4
Project: AutomaticEssayGrading   Author: SahilC   File: models.py   MIT License   5 votes
def train_nn(training_data):
    feature_vector = []
    scores = []
    fo = codecs.open(training_data, encoding='utf-8')
    lines = fo.readlines()
    fo.close()
    vocab = set()
    # for l in lines:
    #     words = nltk.word_tokenize(l)
    #     tokens = nltk.Text(words)
    #     for j in tokens:
    #         if j.isalpha():
    #             vocab.add(j.lower())
    # print len(vocab)
    # print vocab
    line = 0
    for each_line in lines:
        row = each_line.split('\n')[0].split('\t')
        vector = []
        # Ignore the heading line
        if line < 1:
            line += 1
            continue
        if line % 50 == 0:
            print('Training sample: '+str(line))
        e = Essay(row, store_score = True)
        f = e.features
        for i in sorted(f.__dict__.keys()):
            vector.append(f.__dict__[i])
        scores.append(e.score)
        feature_vector.append(vector)
        line += 1
    #f = improved_features.Features(lines)


# Trains the SVR classifier on training data and returns the classifier 
Example 5
Project: werika   Author: pywirrarika   File: wixanlp.py   GNU General Public License v3.0   5 votes
def token(data):
    tokens = nltk.word_tokenize(data)
    text = nltk.Text(tokens)
    return text 
Example 6
Project: flask-celery-boilerplate   Author: JhossePaul   File: count_words.py   GNU Affero General Public License v3.0   5 votes
def count_words(url):
    errors = []
    result = Words.query.filter_by(url=url).first()
    if result:
        return result.id

    try:
        r = requests.get(url)
    except:
        errors.append("Unable to get URL")
        return {"error": errors}

    raw = BeautifulSoup(r.text, "html.parser").get_text()
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)

    nonPunct = re.compile('.*[A-Za-z].*')
    raw_words = [w for w in text if nonPunct.match(w)]
    raw_word_count = Counter(raw_words)

    no_stop_words = [
        w
        for w
        in raw_words
        if w.lower() not in nltk.corpus.stopwords.words("spanish")
    ]
    no_stop_words_count = Counter(no_stop_words)

    try:
        result = Words(
            url=url,
            result_all=raw_word_count,
            result_no_stop_words=no_stop_words_count
        )
        db.session.add(result)
        db.session.commit()
        return result.id
    except:
        errors.append("Unable to add item to database")
        return {"error": errors} 
Example 7
Project: frankenstein   Author: hunterowens   File: app.py   Apache License 2.0   5 votes
def tokenize_nltk(text):
    """
    Note: This function imports a list of custom stopwords from the user.
    If the user does not modify custom stopwords (default=[]),
    the stopword set is left effectively unchanged.
    """
    tokens = word_tokenize(text)
    text = nltk.Text(tokens)
    stop_words = set(stopwords.words('english'))
    # stop_words.update(custom_stopwords)
    words = [w.lower() for w in text if w.isalpha() and w.lower() not in stop_words]
    return words 
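
The commented-out stop_words.update(custom_stopwords) line above marks where project-specific stopwords would be merged in. A hypothetical sketch (custom_stopwords is an assumed, illustrative list) might look like this:

custom_stopwords = ["rt", "via"]                 # hypothetical extra stopwords supplied by the user
stop_words = set(stopwords.words('english'))     # base NLTK English stopword set
stop_words.update(custom_stopwords)              # no substantive change if custom_stopwords is empty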
Example 8
Project: Python-Forensics   Author: mpedzi03   File: _classNLTKQuery.py   GNU General Public License v3.0   5 votes
def textCorpusInit(self, thePath):
        # Validate the path is a directory
        if not os.path.isdir(thePath):
            return "Path is not a Directory"
        # Validate the path is readable
        if not os.access(thePath, os.R_OK):
            return "Directory is not Readable"
        # Attempt to Create a corpus with all .txt files found in Directory
        try:
            self.Corpus = PlaintextCorpusReader(thePath, '.*')
            print "Processing Files:"
            print self.Corpus.fileids()
            print "Please wait...."
            self.rawText = self.Corpus.raw()
            self.tokens = nltk.word_tokenize(self.rawText)
            upperstop = [word.upper() for word in stopwords.words('english')]
            self.tokens_nostop = [t for t in self.tokens if t not in upperstop]
            
                                  
            self.TextCorpus = nltk.Text(self.tokens)
            
            self.TextCorpusNoStop = nltk.Text(self.tokens_nostop)

            self.stemmer = PorterStemmer()
            self.stemmedTokens = [self.stemmer.stem(t.lower()) for t in self.tokens_nostop]
            self.stemmedText = nltk.Text(self.stemmedTokens)
            
            self.PosTaggedCorpus = nltk.pos_tag(self.tokens)
            
        except:
            return "Corpus Creation Failed"

        self.ActiveTextCorpus = True
        return "Success" 
Example 9
Project: Python-Forensics   Author: mpedzi03   File: _classNLTKQuery.py   GNU General Public License v3.0   5 votes
def printCorpusLength(self):
        print
        print "Corpus Text Length: ",
        print len(self.rawText) 
Example 10
Project: llp   Author: quadrismegistus   File: text.py   MIT License   5 votes
def nltk(self):
		import nltk
		tokens = nltk.word_tokenize(self.text_plain())
		return nltk.Text(tokens) 
Example 11
Project: llp   Author: quadrismegistus   File: text.py   MIT License   5 votes
def meta(self):
		if not hasattr(self,'_meta'):
			self._meta=md={'corpus':self.corpus.name}
			for t in reversed(self.texts):
				for k,v in list(t.meta.items()):
					#if k in md and md[k]:
					#	k=k+'_'+t.__class__.__name__.replace('Text','').lower()
					md[k]=v
		return self._meta 
Example 12
Project: Distrpy   Author: j0e1in   File: analyze.py   MIT License   5 votes
def analyze(content, url, title):
	tokenizer = RegexpTokenizer(r'\w+')
	en_stop = get_stop_words('en')
	p_stemmer = LancasterStemmer()

	stop_token = ['The', 'can', 's', 'I', 't', 'am', 'are']
	texts = []
	content_tokens = word_tokenize(content)
	title_tokens = word_tokenize(title)
	content_text = nltk.Text(content_tokens)

	tokens = tokenizer.tokenize(content)

	tokens = [i for i in tokens if not i.isdigit()]  # Remove all numbers
	stopped_tokens = [i for i in tokens if i not in en_stop]  # Remove English stop words
	stopped_tokens = [i for i in stopped_tokens if i not in stop_token]  # Remove custom stop tokens
	stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]  # Stem tokens
	texts.append(stemmed_tokens)

	dictionary = corpora.Dictionary(texts)
	corpus = [dictionary.doc2bow(text) for text in texts]
	ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=1,\
	 id2word = dictionary, passes=20)
	topics = ldamodel.show_topic(0, 3)
	#topics = ldamodel.print_topics(num_topics=1, num_words=3)[0]
	Rtopic = []

	for topicTuple in topics:
		topic, rate = topicTuple
		Rtopic.append(topic)

	if len(Rtopic) == 0:
		Rtopic.append("Not English")
		Rtopic.append("Maybe Chinese?")

	return (Rtopic, url, title) 
Example 13
Project: PDF-Scraper   Author: ian-nai   File: pos_grapher.py   GNU General Public License v3.0   5 votes
def grapher(): 

    with open("scrape.txt", 'r') as file1:
        text1 = file1.read()
    
    tokenizer = nltk.word_tokenize
    tokens = tokenizer(text1)
    text = nltk.Text(tokens)
    tags = nltk.pos_tag(text)


    counts = Counter(tag for word, tag in tags)
    print counts

    graphtags = [tag for word, tag in tags]
    graphfreqs = nltk.FreqDist(graphtags)
    graphfreqs.tabulate(25)
    graphfreqs.plot(20)
    
    pdf_again = raw_input("Would you like to scrape another PDF?")
    if pdf_again == "y":
        os.system("python pdf_scraper.py")
        exit(0)
    if pdf_again == "n":
        print "Bye!"
        exit(0)
    else:
        print "Please enter a valid input." 
Example 14
Project: trolling_detection   Author: rafaharo   File: feature_extraction.py   Apache License 2.0   5 votes
def corpus_stats(collection):
    import nltk
    import pprint
    words = tokenize_collection(collection, lowercase=True, stopwords='english', min_length=3)
    text = nltk.Text(word.lower() for word in words)
    print "Number of Words: " + str(len(text))
    print "Number of unique words: " + str(len(set(text)))
    dist = nltk.FreqDist(text)
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(dist.most_common(20))
    print dist['stupid'] 
Example 15
Project: trolling_detection   Author: rafaharo   File: feature_extraction.py   Apache License 2.0   5 votes
def similar_words(collection, word, num=10):
    import nltk
    words = tokenize_collection(collection, stopwords='english')
    text = nltk.Text(word.lower() for word in words)
    text.similar(word, num) 
Example 16
Project: SentimentTwitter   Author: Amanda-Clark   File: sentiment.py   MIT License   5 votes
def load_tweets(tweet_file):
    """
    Loads tweets into a form suitable for analysis. Probably could be combined with get_prob.
    See http://www.nltk.org/book/ch01.html for more reference.
    """
    user_tweets = open(tweet_file, 'r')
    text = user_tweets.read()
    text = text.split()
    twts = nltk.Text(text)
    get_prob(twts)