Python nltk.corpus.gutenberg.words() Examples

The following are 10 code examples of nltk.corpus.gutenberg.words(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module nltk.corpus.gutenberg , or try the search function

Example #1

Source File: wordfreq_app.py From razzy-spinner with GNU General Public License v3.0

5 votes

def app():
    t1 = Text(gutenberg.words('melville-moby_dick.txt'))
    plot_word_freq_dist(t1)

Example #2

Source File: dispersion.py From razzy-spinner with GNU General Public License v3.0

5 votes

def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    """

    try:
        from matplotlib import pylab
    except ImportError:
        raise ValueError('The plot function requires matplotlib to be installed.'
                     'See http://matplotlib.org/')

    text = list(text)
    words.reverse()

    if ignore_case:
        words_to_comp = list(map(str.lower, words))
        text_to_comp = list(map(str.lower, text))
    else:
        words_to_comp = words
        text_to_comp = text

    points = [(x,y) for x in range(len(text_to_comp))
                    for y in range(len(words_to_comp))
                    if text_to_comp[x] == words_to_comp[y]]
    if points:
        x, y = list(zip(*points))
    else:
        x = y = ()
    pylab.plot(x, y, "b|", scalex=.1)
    pylab.yticks(list(range(len(words))), words, color="b")
    pylab.ylim(-1, len(words))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()

Example #3

Source File: nltk_utils.py From combine-FEVER-NSMN with MIT License

5 votes

def get_nltk_freq_words():
    """Use Brown corpus frequent words
    More corpora: https://www.nltk.org/book/ch02.html
    """
    freq_dict = nltk.FreqDist(brown.words())

    for fileid in gutenberg.fileids():
        freq_dict.update(nltk.FreqDist(gutenberg.words(fileid)))

    freq_words = [k for k, v in freq_dict.items() if v > 10]
    return freq_words, freq_dict

Example #4

Source File: wordfreq_app.py From luscan-devel with GNU General Public License v2.0

5 votes

def app():
    t1 = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    plot_word_freq_dist(t1)

Example #5

Source File: dispersion.py From luscan-devel with GNU General Public License v2.0

5 votes

def dispersion_plot(text, words, ignore_case=False):
    """
    Generate a lexical dispersion plot.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    """

    try:
        import pylab
    except ImportError:
        raise ValueError('The plot function requires the matplotlib package (aka pylab).'
                     'See http://matplotlib.sourceforge.net/')

    text = list(text)
    words.reverse()

    if ignore_case:
        words_to_comp = map(str.lower, words)
        text_to_comp = map(str.lower, text)
    else:
        words_to_comp = words
        text_to_comp = text

    points = [(x,y) for x in range(len(text_to_comp))
                    for y in range(len(words_to_comp))
                    if text_to_comp[x] == words_to_comp[y]]
    if points:
        x, y = zip(*points)
    else:
        x = y = ()
    pylab.plot(x, y, "b|", scalex=.1)
    pylab.yticks(range(len(words)), words, color="b")
    pylab.ylim(-1, len(words))
    pylab.title("Lexical Dispersion Plot")
    pylab.xlabel("Word Offset")
    pylab.show()

Example #6

Source File: wordfreq_app.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

5 votes

def app():
    t1 = Text(gutenberg.words('melville-moby_dick.txt'))
    plot_word_freq_dist(t1)

Example #7

Source File: test_concordance.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

5 votes

def setup_class(cls):
        cls.corpus = gutenberg.words('melville-moby_dick.txt')

Example #8

Source File: 3_corpus.py From ml_code with Apache License 2.0

5 votes

def unusual_words(text):
    text_vocab = set(w.lower() for w in text if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab.difference(english_vocab)
    return sorted(unusual)

Example #9

Source File: 3_corpus.py From ml_code with Apache License 2.0

5 votes

def content_fraction(text):
    stopwords = nltk.corpus.stopwords.words('english')
    content = [w for w in text if w.lower() not in stopwords]
    return len(content) / len(text)

Example #10

Source File: dispersion.py From V1EngineeringInc-Docs with Creative Commons Attribution Share Alike 4.0 International

4 votes

def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"):
    """
    Generate a lexical dispersion plot.

    :param text: The source text
    :type text: list(str) or enum(str)
    :param words: The target words
    :type words: list of str
    :param ignore_case: flag to set if case should be ignored when searching text
    :type ignore_case: bool
    """

    try:
        from matplotlib import pylab
    except ImportError:
        raise ValueError(
            'The plot function requires matplotlib to be installed.'
            'See http://matplotlib.org/'
        )

    text = list(text)
    words.reverse()

    if ignore_case:
        words_to_comp = list(map(str.lower, words))
        text_to_comp = list(map(str.lower, text))
    else:
        words_to_comp = words
        text_to_comp = text

    points = [
        (x, y)
        for x in range(len(text_to_comp))
        for y in range(len(words_to_comp))
        if text_to_comp[x] == words_to_comp[y]
    ]
    if points:
        x, y = list(zip(*points))
    else:
        x = y = ()
    pylab.plot(x, y, "b|", scalex=0.1)
    pylab.yticks(list(range(len(words))), words, color="b")
    pylab.ylim(-1, len(words))
    pylab.title(title)
    pylab.xlabel("Word Offset")
    pylab.show()