Python nltk.download() Examples

The following are 30 code examples of nltk.download(), collected from open-source projects. The source file, originating project, and license are listed above each example. You may also want to check out the other available functions and classes of the nltk module.
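
As a quick orientation, the sketch below shows the call patterns that recur throughout these examples: fetching a single resource by identifier, silencing console output with quiet=True, and installing into a specific location with download_dir. The resource names and the /tmp/nltk_data path are placeholders rather than recommendations.

import nltk

nltk.download('punkt')                                   # fetch one resource by id
nltk.download('stopwords', quiet=True)                   # suppress download chatter
nltk.download('wordnet', download_dir='/tmp/nltk_data')  # install to a custom path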
Example #1
Source File: treebank_encoder.py    From PyTorch-NLP with BSD 3-Clause "New" or "Revised" License
def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

        if 'detokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
            raise

        super().__init__(
            *args,
            tokenize=TreebankWordTokenizer().tokenize,
            detokenize=TreebankWordDetokenizer().detokenize,
            **kwargs) 
Example #2
Source File: ngram_featurizer.py    From metal with Apache License 2.0
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example #3
Source File: res_sen2vec.py    From resilient-community-apps with MIT License
def __init__(self, w2v, sif, log=None):
        # A NLPWord2Vec to get the vec for a word
        self.word2vec = w2v
        # A ResSIF used to get word count
        self.sif = sif
        # util to pre-process data
        self.utils = WordSentenceUtils()
        self.log = log if log else logging.getLogger(__name__)
        self.sentence_vectors = []
        self.feature_size = 0
        # download nltk resource if necessary
        nltk.download('words', quiet=True)
        self.setofwords = set(nltk_words.words())

        # pca vector
        self.pca_u = [] 
Example #4
Source File: __init__.py    From senpy with Apache License 2.0
def install_deps(*plugins):
    installed = False
    nltk_resources = set()
    requirements = []
    for info in plugins:
        requirements = info.get('requirements', [])
        if requirements:
            requirements += missing_requirements(requirements)
        nltk_resources |= set(info.get('nltk_resources', []))
    if requirements:
        logger.info('Installing requirements: ' + str(requirements))
        pip_args = [sys.executable, '-m', 'pip', 'install']
        for req in requirements:
            pip_args.append(req)
        process = subprocess.Popen(
            pip_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _log_subprocess_output(process)
        exitcode = process.wait()
        installed = True
        if exitcode != 0:
            raise models.Error(
                "Dependencies not properly installed: {}".format(pip_args))
    installed |= download(list(nltk_resources))
    return installed 
Example #5
Source File: install.py    From metadoc with MIT License
def install_nltk_sets():
    DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
    REQUIRED_CORPORA = [
        'brown', # Required for FastNPExtractor
        'punkt', # Required for WordTokenizer
        'wordnet', # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]

    for each in REQUIRED_CORPORA:
        print(('[+] Downloading corpus:  "{0}"'.format(each)))
        nltk.download(each, download_dir=DATA_DIR)

    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
    remove_zips(DATA_DIR)
    return 
Example #6
Source File: tokenizer.py    From prenlp with Apache License 2.0
def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception:
            import nltk
            # These resources are required by the Moses tokenizer.
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            # Retry the import so MosesTokenizer is bound before the assignment below.
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer() 
Example #7
Source File: utils.py    From essaysense with MIT License
def __init__(self, hyperparameters, lookup_table):
        """Constructor for initializing ASAP-AES datasets.

        Args:
            - hyperparameters: hyperparameters of the experiments.
            - lookup_table: word embedding lookup table, which should be a dict
                            mapping words into their NumPy vector repre-
                            sentation.
        """
        # This constructor tries to detect or download NLTK's tokenizer
        # automatically.
        try:
            self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except LookupError:
            nltk.download("punkt")
            self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Also load hyperparameters and lookup table.
        self.lookup_table = lookup_table
        self.hp = hyperparameters 
Example #8
Source File: utils.py    From Fox-V3 with GNU Affero General Public License v3.0
def remove_stopwords(tokens, language):
    """
    Takes a language (i.e. 'english'), and a set of word tokens.
    Returns the tokenized text with any stopwords removed.
    Stop words are words like "is, the, a, ..."

    Be sure to download the required NLTK corpus before calling this function:
    - from chatterbot.utils import nltk_download_corpus
    - nltk_download_corpus('corpora/stopwords')
    """
    from nltk.corpus import stopwords

    # Get the stopwords for the specified language
    stop_words = stopwords.words(language)

    # Remove the stop words from the set of word tokens
    tokens = set(tokens) - set(stop_words)

    return tokens 
Example #9
Source File: test_corpus.py    From cltk with MIT License
def setUpClass(self):
        try:
            corpus_importer = CorpusImporter("latin")
            corpus_importer.import_corpus("latin_models_cltk")
            corpus_importer.import_corpus("latin_text_latin_library")
        except:
            raise Exception("Failure to download test corpus")
        self.reader = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader._fileids = ["pervig.txt"]
        # Need an additional instance because tests below change internals  # TODO: fix
        self.reader_2 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader_3 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader_4 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        ) 
Example #10
Source File: NewsAutosummarize.py    From Python-Scripts-Repo-on-Data-Science with GNU General Public License v3.0
def get_only_text_washingtonpost_url(url):
    # this func will take the URL as an argument and return only
    # the raw text of the url.
    # this function works specifically for the washPost articles
    # because we know the structure of the pages
    page = urllib.urlopen(url).read().decode('utf8')
    # we download the URL
    soup = BeautifulSoup(page)
    # initialize a beautifulsoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way e.g. <article> stuff</article>
    # the above format is specific to the washington post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

#######################################################################

# TEST
###################################################################### 
Example #11
Source File: word.py    From flambe with MIT License
def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """ Initialize the NGramsTokenizer

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            [description], by default 1
        exclude_stopwords: bool
            [description], by default False
        stop_words: Optional[List]
            [description], by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True) 
Example #12
Source File: language_util.py    From talk-generator with MIT License
def print_corpus_download_warning():
    corpus_warning = """
    Hmm...
    ---------------------

    We had some trouble downloading the NLTK corpuses.. 
    Try running the following from a command line. This should 
    download the needed packages.. but it might also tell you if 
    there is another issue.

    $ python3 -m nltk.downloader punkt averaged_perceptron_tagger
    """
    logger.warning(corpus_warning)


# Helpers 
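
The message above leans on the command-line downloader; the same resources can normally be fetched from inside Python as well, since nltk.download() also accepts a list of identifiers. A sketch under that assumption, using the two package names from the warning text:

import nltk

# Programmatic counterpart of: python3 -m nltk.downloader punkt averaged_perceptron_tagger
nltk.download(['punkt', 'averaged_perceptron_tagger'])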
Example #13
Source File: agents.py    From ParlAI with MIT License
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer.
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example #14
Source File: wordnet.py    From gobbli with Apache License 2.0
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        try:
            from nltk.corpus import wordnet
            import nltk
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires nltk to be installed."
            )

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )

        if not skip_download_check:
            nltk.download("wordnet")

        self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
        Token.set_extension("replacement", default=None, force=True) 
Example #15
Source File: download.py    From textkit with MIT License
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''

    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1]) 
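
check_packages_exist is a textkit helper, but the check-before-download idea it implements can be sketched with nltk alone: nltk.data.find() raises LookupError when a resource is not on nltk.data.path, which is the same probe Examples #20 and #24 use. The helper name ensure_nltk_package below is hypothetical, introduced only for illustration:

import nltk

def ensure_nltk_package(category, package):
    # Probe nltk.data.path first; download only on a miss.
    try:
        nltk.data.find('{}/{}'.format(category, package))
    except LookupError:
        nltk.download(package)

ensure_nltk_package('taggers', 'averaged_perceptron_tagger')
ensure_nltk_package('corpora', 'wordnet')
ensure_nltk_package('tokenizers', 'punkt')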
Example #16
Source File: nlp.py    From Quora with MIT License
def lemmatize(tokens):
    """
    lemmatize tokens
    """
    try:
        wnl = nltk.WordNetLemmatizer()
    except LookupError:
        nltk.download('wordnet')
        wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens] 
Example #17
Source File: agents.py    From neural_chat with MIT License
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example #18
Source File: nlp.py    From twip with MIT License
def nltk_download(name, ignore_errors=True):
    r"""Like nltk.download, but be quiet about it, and get a room (separate python process)

    Does some simple whitespace normalization on `name`, but doesn't yet do fuzzy matching
    Caches the normalized names of packages already attempted, so they aren't re-tried

    >>> nltk_download('nonexistent dataset name', ignore_errors=True)
    False
    >>> nltk_download('WordNet', ignore_errors=True)
    True
    >>> nltk_download('wordnet', ignore_errors=True)
    True
    """
    name = re.sub(r"[-\s=+']+", '_', name.lower())
    if name in nltk_download.done:
        return nltk_download.done[name]
    proc = subprocess.Popen(["python", "-c", "import nltk; nltk.download('{}')".format(name)], stdout=subprocess.PIPE)
    msgs = [s.decode() if isinstance(s, bytes) else s for s in proc.communicate() if s is not None]
    if any(re.match(r'^\[nltk_data\]\s+Error', msg, flags=re.IGNORECASE) for msg in msgs):
        nltk_download.done[name] = False
        if ignore_errors:
            return nltk_download.done[name]
        raise ValueError('Unable to download the requested NLTK dataset: {}'.format('\n'.join(msgs)))
    nltk_download.done[name] = True
    return nltk_download.done[name]


# Cache of normalized dataset names already attempted (name -> success flag).
# It must exist before the first call, since the function reads this attribute.
nltk_download.done = {} 
Example #19
Source File: transformation_functions.py    From sparklingml with Apache License 2.0
def get(self, lang):
        if lang not in self._spacys:
            import spacy
            # Hack to dynamically download languages on cluster machines,
            # you can remove if you have the models installed and just do:
            # cls._spacys[lang] = spacy.load(lang)
            try:
                old_exit = sys.exit
                sys.exit = None
                try:
                    self._spacys[lang] = spacy.load(lang)
                except Exception:
                    spacy.cli.download(lang)
                    self._spacys[lang] = spacy.load(lang)
            except Exception as e:
                raise Exception(
                    "Failed to find or download language {0}: {1}"
                    .format(lang, e))
            finally:
                sys.exit = old_exit

        return self._spacys[lang] 
Example #20
Source File: test_text_filters.py    From pliers with BSD 3-Clause "New" or "Revised" License
def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work' 
Example #21
Source File: nlp.py    From Quora with MIT License
def remove_stopwords(tokens):
    """
    remove stopwords from tokens
    """
    try:
        stopwords = nltk.corpus.stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        stopwords = nltk.corpus.stopwords.words('english')
    return [t for t in tokens if t.lower() not in stopwords] 
Example #22
Source File: bidaf_model_runtime.py    From botbuilder-python with MIT License
def init_bidaf(bidaf_model_dir: str, download_ntlk_punkt: bool = False) -> bool:
        if os.path.isdir(bidaf_model_dir):
            print("bidaf model directory already present..", file=sys.stderr)
        else:
            print("Creating bidaf model directory..", file=sys.stderr)
            os.makedirs(bidaf_model_dir, exist_ok=True)

        # Download Punkt Sentence Tokenizer
        if download_ntlk_punkt:
            nltk.download("punkt", download_dir=bidaf_model_dir)
            nltk.download("punkt")

        # Download bidaf onnx model
        onnx_model_file = os.path.abspath(os.path.join(bidaf_model_dir, "bidaf.onnx"))

        print(f"Checking file {onnx_model_file}..", file=sys.stderr)
        if os.path.isfile(onnx_model_file):
            print("bidaf.onnx downloaded already!", file=sys.stderr)
        else:
            print("Downloading bidaf.onnx...", file=sys.stderr)
            response = requests.get(
                "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx",
                stream=True,
            )
            with open(onnx_model_file, "wb") as f:
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, f)
        return True 
Example #23
Source File: test_text_extractors.py    From pliers with BSD 3-Clause "New" or "Revised" License
def test_part_of_speech_extractor():
    import nltk
    nltk.download('tagsets')
    stim = ComplexTextStim(join(TEXT_DIR, 'complex_stim_with_header.txt'))
    result = merge_results(PartOfSpeechExtractor().transform(stim),
                           format='wide', extractor_names=False)
    assert result.shape == (4, 54)
    assert result['NN'].sum() == 1
    result = result.sort_values('onset')
    assert result['VBD'].iloc[3] == 1 
Example #24
Source File: text.py    From pliers with BSD 3-Clause "New" or "Revised" License
def __init__(self, tokens=None, language='english'):
        self.language = language
        if tokens:
            self.tokens = set(tokens)
        else:
            try:
                nltk.data.find('corpora/stopwords')
            except LookupError:
                nltk.download('stopwords')
            from nltk.corpus import stopwords
            self.tokens = set(stopwords.words(self.language))
        super().__init__() 
Example #25
Source File: word_sentence_utils.py    From resilient-community-apps with MIT License
def __init__(self):
        nltk.download("wordnet", quiet=True)
        nltk.download("stopwords", quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        self.remove_list = ", . ; ? ~ ! * ) ( { } $ # @ < > ] [".split()
        self.lem = WordNetLemmatizer() 
Example #26
Source File: hq_main.py    From HackQ-Trivia with MIT License
def download_nltk_resources():
        for resource in {"stopwords", "averaged_perceptron_tagger", "punkt"}:
            nltk.download(resource, quiet=True) 
Example #27
Source File: sum.py    From vidsum with GNU General Public License v3.0
def download_video_srt(subs):
    """ Downloads specified Youtube video's subtitles as a vtt/srt file.

    Args:
        subs(str): Full url of Youtube video

    Returns:
        tuple: the downloaded video's filename and its subtitle filename


    The video will be downloaded as 1.mp4 and its subtitles as 1.(lang).srt
    Both, the video and its subtitles, will be downloaded to the same location
    as that of this script (sum.py)

    """
    ydl_opts = {
        'format': 'best',
        'outtmpl': '1.%(ext)s',
        'subtitlesformat': 'srt',
        'writeautomaticsub': True,
        # 'allsubtitles': True # Get all subtitles
    }

    movie_filename = ""
    subtitle_filename = ""
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # ydl.download([subs])
        result = ydl.extract_info("{}".format(subs), download=True)
        movie_filename = ydl.prepare_filename(result)
        subtitle_info = result.get("requested_subtitles")
        subtitle_language = list(subtitle_info.keys())[0]
        subtitle_ext = subtitle_info.get(subtitle_language).get("ext")
        subtitle_filename = movie_filename.replace(".mp4", ".%s.%s" %
                                                   (subtitle_language,
                                                    subtitle_ext))
    return movie_filename, subtitle_filename 
Example #28
Source File: piglatin.py    From CloudBot with GNU General Public License v3.0
def load_nltk():
    nltk.download('cmudict')

    global pronunciations
    pronunciations = nltk.corpus.cmudict.dict() 
Example #29
Source File: setup.py    From rake-nltk with MIT License
def _post_install():
    """Post installation nltk corpus downloads."""
    import nltk

    nltk.download("punkt")
    nltk.download("stopwords") 
Example #30
Source File: downloadcorpus.py    From chicago-justice with GNU General Public License v3.0
def handle(self, *args, **options):
        LOG.info('Downloading NLTK data')

        if options['download_dir']:
            dest = options['download_dir']
            nltk.download('punkt', download_dir=dest)
            nltk.download('wordnet', download_dir=dest)
        else:
            nltk.download('punkt')
            nltk.download('wordnet')
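
A closing note on the download_dir branch above: NLTK only searches the directories on nltk.data.path (plus anything named in the NLTK_DATA environment variable), so data installed to a non-standard directory usually has to be registered there before it can be loaded. A minimal sketch, with /opt/nltk_data standing in for whatever directory is passed on the command line:

import nltk

custom_dir = '/opt/nltk_data'                    # placeholder path
nltk.download('punkt', download_dir=custom_dir)
nltk.data.path.append(custom_dir)                # or: export NLTK_DATA=/opt/nltk_data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')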