Python nltk.download() Examples

The following are 30 code examples showing how to use nltk.download(). The examples are extracted from open source projects; you can go to the original project or source file by following the link above each example.


You may also want to check out all available functions and classes of the nltk module.
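
In its simplest form, nltk.download() takes the identifier of the resource to fetch and installs it into one of the directories on nltk.data.path; it returns True on success and is safe to call repeatedly. A minimal sketch (resource names chosen for illustration):

import nltk

# Fetch a single resource into the default nltk_data location.
nltk.download('punkt')

# Common variations used in the examples below: suppress console output,
# or install into a specific directory.
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', download_dir='/tmp/nltk_data')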

Example 1
Project: metal   Author: HazyResearch   File: ngram_featurizer.py   License: Apache License 2.0
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example 2
Project: senpy   Author: gsi-upm   File: __init__.py   License: Apache License 2.0
def install_deps(*plugins):
    installed = False
    nltk_resources = set()
    requirements = []
    for info in plugins:
        requirements = info.get('requirements', [])
        if requirements:
            requirements += missing_requirements(requirements)
        nltk_resources |= set(info.get('nltk_resources', []))
    if requirements:
        logger.info('Installing requirements: ' + str(requirements))
        pip_args = [sys.executable, '-m', 'pip', 'install']
        for req in requirements:
            pip_args.append(req)
        process = subprocess.Popen(
            pip_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        _log_subprocess_output(process)
        exitcode = process.wait()
        installed = True
        if exitcode != 0:
            raise models.Error(
                "Dependencies not properly installed: {}".format(pip_args))
    installed |= download(list(nltk_resources))
    return installed 
Example 3
Project: cltk   Author: cltk   File: test_corpus.py   License: MIT License
def setUpClass(self):
        try:
            corpus_importer = CorpusImporter("latin")
            corpus_importer.import_corpus("latin_models_cltk")
            corpus_importer.import_corpus("latin_text_latin_library")
        except:
            raise Exception("Failure to download test corpus")
        self.reader = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader._fileids = ["pervig.txt"]
        # Need an additional instance because tests below change internals  # TODO: fix
        self.reader_2 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader_3 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        )
        self.reader_4 = get_corpus_reader(
            language="latin", corpus_name="latin_text_latin_library"
        ) 
Example 4
Project: PyTorch-NLP   Author: PetrochukM   File: treebank_encoder.py   License: BSD 3-Clause "New" or "Revised" License
def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

        if 'detokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
            raise

        super().__init__(
            *args,
            tokenize=TreebankWordTokenizer().tokenize,
            detokenize=TreebankWordDetokenizer().detokenize,
            **kwargs) 
Example 5
Project: essaysense   Author: zlliang   File: utils.py   License: MIT License
def __init__(self, hyperparameters, lookup_table):
        """Constructor for initializing ASAP-AES datasets.

        Args:
            - hyperparameters: hyperparameters of the experiments.
            - lookup_table: word embedding lookup table, which should be a dict
                            mapping words into their NumPy vector repre-
                            sentation.
        """
        # This constructor tries to detect or download NLTK's tokenizer
        # automatically.
        try:
            self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        except LookupError:
            nltk.download("punkt")
            self.s_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        # Also load hyperparameters and lookup table.
        self.lookup_table = lookup_table
        self.hp = hyperparameters 
Example 6
Project: talk-generator   Author: korymath   File: language_util.py   License: MIT License
def print_corpus_download_warning():
    corpus_warning = """
    Hmm...
    ---------------------

    We had some trouble downloading the NLTK corpuses.. 
    Try running the following from a command line. This should 
    download the needed packages.. but it might also tell you if 
    there is another issue.

    $ python3 -m nltk.downloader punkt averaged_perceptron_tagger
    """
    logger.warning(corpus_warning)


# Helpers 
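
The warning above points users at the command-line downloader; the programmatic equivalent (a minimal sketch, not part of talk-generator itself) is simply:

import nltk

# Same packages as the suggested "python3 -m nltk.downloader ..." command.
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)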
Example 7
Project: ParlAI   Author: facebookresearch   File: agents.py   License: MIT License
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer.
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example 8
Project: gobbli   Author: RTIInternational   File: wordnet.py   License: Apache License 2.0
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        try:
            from nltk.corpus import wordnet
            import nltk
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires nltk to be installed."
            )

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )

        if not skip_download_check:
            nltk.download("wordnet")

        self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
        Token.set_extension("replacement", default=None, force=True) 
Example 9
Project: textkit   Author: learntextvis   File: download.py   License: MIT License
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''

    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1]) 
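
check_packages_exist() is defined elsewhere in the project. A hypothetical sketch of a helper with the same contract, using nltk.data.find() to return the (category, package) pairs that are not yet installed:

import nltk

def check_packages_exist(extensions):
    # Illustrative re-implementation only, not textkit's actual code.
    missing = []
    for category, package in extensions:
        try:
            nltk.data.find('{}/{}'.format(category, package))
        except LookupError:
            missing.append((category, package))
    return missing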
Example 10
Project: neural_chat   Author: natashamjaques   File: agents.py   License: MIT License
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example 11
Project: sparklingml   Author: sparklingpandas   File: transformation_functions.py   License: Apache License 2.0
def get(self, lang):
        if lang not in self._spacys:
            import spacy
            # Hack to dynamically download languages on cluster machines,
            # you can remove if you have the models installed and just do:
            # cls._spacys[lang] = spacy.load(lang)
            try:
                old_exit = sys.exit
                sys.exit = None
                try:
                    self._spacys[lang] = spacy.load(lang)
                except Exception:
                    spacy.cli.download(lang)
                    self._spacys[lang] = spacy.load(lang)
            except Exception as e:
                raise Exception(
                    "Failed to find or download language {0}: {1}"
                    .format(lang, e))
            finally:
                sys.exit = old_exit

        return self._spacys[lang] 
Example 12
Project: pliers   Author: tyarkoni   File: test_text_filters.py   License: BSD 3-Clause "New" or "Revised" License
def test_token_removal_filter():
    stim = TextStim(text='this is not a very long sentence')
    filt = TokenRemovalFilter()
    assert filt.transform(stim).text == 'long sentence'

    filt2 = TokenRemovalFilter(tokens=['a', 'the', 'is'])
    assert filt2.transform(stim).text == 'this not very long sentence'

    stim2 = TextStim(text='More. is Real, sentence that\'ll work')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    from nltk.corpus import stopwords
    tokens = set(stopwords.words('english')) | set(string.punctuation)
    filt3 = TokenRemovalFilter(tokens=tokens)
    assert filt3.transform(stim2).text == 'More Real sentence \'ll work' 
Example 13
Project: twip   Author: totalgood   File: nlp.py   License: MIT License
def nltk_download(name, ignore_errors=True):
    r"""Like nltk.download, but be quiet about it, and get a room (separate python process)

    Does some simple whitespace normalization on `name`, but doesn't yet do fuzzy matching
    Caches the normalized names of packages already attempted, so they aren't re-tried

    >>> nltk_download('nonexistent dataset name', ignore_errors=True)
    False
    >>> nltk_download('WordNet', ignore_errors=True)
    True
    >>> nltk_download('wordnet', ignore_errors=True)
    True
    """
    name = re.sub(r"[-\s=+']+", '_', name.lower())
    if name in nltk_download.done:
        return nltk_download.done[name]
    proc = subprocess.Popen(["python", "-c", "import nltk; nltk.download('{}')".format(name)], stdout=subprocess.PIPE)
    msgs = [s for s in proc.communicate() if s is not None]
    if any(re.match(r'^\[nltk_data\]\s+Error', msg, flags=re.IGNORECASE) for msg in msgs):
        nltk_download.done[name] = False
        if ignore_errors:
            return nltk_download.done[name]
        raise ValueError('Unable to download the requested NLTK dataset: {}'.format('\n'.join(msgs)))
    nltk_download.done[name] = True
    return nltk_download.done[name] 
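
The function caches results on an nltk_download.done attribute that is not created inside the function itself; presumably the source module initializes it right after the definition, along the lines of:

# Not shown above: the cache must exist before the first call.
nltk_download.done = {}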
Example 14
Project: Quora   Author: KevinLiao159   File: nlp.py   License: MIT License
def lemmatize(tokens):
    """
    lemmatize tokens
    """
    try:
        wnl = nltk.WordNetLemmatizer()
    except LookupError:
        nltk.download('wordnet')
        wnl = nltk.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens] 
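
Note that WordNetLemmatizer loads the WordNet corpus lazily, so the LookupError may only surface on the first lemmatize() call rather than when the lemmatizer is constructed. A slightly more defensive sketch (illustrative, not the project's code):

import nltk

def lemmatize_safe(tokens):
    wnl = nltk.WordNetLemmatizer()
    try:
        wnl.lemmatize('test')  # force the corpus lookup up front
    except LookupError:
        nltk.download('wordnet')
    return [wnl.lemmatize(t) for t in tokens]

# e.g. lemmatize_safe(['geese', 'rocks']) -> ['goose', 'rock']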
Example 15
Project: flambe   Author: asappresearch   File: word.py   License: MIT License
def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List] = None) -> None:
        """ Initialize the NGramsTokenizer

        Parameters
        ----------
        ngrams : Union[int, List[int]], optional
            [description], by default 1
        exclude_stopwords: bool
            [description], by default False
        stop_words: Optional[List]
            [description], by default None

        """
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords

        if self.exclude_stopwords:
            self.stop_words = stop_words
            if self.stop_words is None:
                nltk.download('stopwords', quiet=True)
                self.stop_words = stopwords.words('english')

        nltk.download('punkt', quiet=True) 
Example 16
def get_only_text_washingtonpost_url(url):
    # this func will take the URL as an argument and return only
    # the raw text of the url.
    # this function works specifically for the washPost articles
    # because we know the structure of the pages
    page = urllib.urlopen(url).read().decode('utf8')
    # we download the URL
    soup = BeautifulSoup(page)
    # initialize a beautifulsoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way e.g. <article> stuff</article>
    # the above format is specific to the washington post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

#######################################################################

# TEST
###################################################################### 
Example 17
Project: Fox-V3   Author: bobloy   File: utils.py   License: GNU Affero General Public License v3.0
def remove_stopwords(tokens, language):
    """
    Takes a language (i.e. 'english'), and a set of word tokens.
    Returns the tokenized text with any stopwords removed.
    Stop words are words like "is, the, a, ..."

    Be sure to download the required NLTK corpus before calling this function:
    - from chatterbot.utils import nltk_download_corpus
    - nltk_download_corpus('corpora/stopwords')
    """
    from nltk.corpus import stopwords

    # Get the stopwords for the specified language
    stop_words = stopwords.words(language)

    # Remove the stop words from the set of word tokens
    tokens = set(tokens) - set(stop_words)

    return tokens 
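
A short usage sketch (illustrative only): make sure the stopwords corpus is available, then pass the tokens and a language name.

import nltk
nltk.download('stopwords')

tokens = ['the', 'quick', 'brown', 'fox', 'is', 'fast']
print(remove_stopwords(tokens, 'english'))
# A set along the lines of {'quick', 'brown', 'fox', 'fast'}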
Example 18
Project: metadoc   Author: fanmatics   File: install.py   License: MIT License
def install_nltk_sets():
    DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
    REQUIRED_CORPORA = [
        'brown', # Required for FastNPExtractor
        'punkt', # Required for WordTokenizer
        'wordnet', # Required for lemmatization and Wordnet
        'maxent_ne_chunker',
        'stopwords',
        'words'
    ]

    for each in REQUIRED_CORPORA:
        print(('[+] Downloading corpus:  "{0}"'.format(each)))
        nltk.download(each, download_dir=DATA_DIR)

    from metadoc.extract.pos import do_train
    print('[+] Training tagger now.')
    do_train()
    remove_zips(DATA_DIR)
    return 
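
One thing to keep in mind with a custom download_dir: NLTK only finds resources in directories listed on nltk.data.path (or named in the NLTK_DATA environment variable), so the project presumably registers DATA_DIR elsewhere, roughly:

import os
import nltk

# Assuming the same DATA_DIR as above; without this, later loads such as
# nltk.corpus.stopwords.words() would not see the downloaded packages.
DATA_DIR = os.path.join(os.path.dirname(__file__), "extract/data")
nltk.data.path.append(DATA_DIR)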
Example 19
Project: resilient-community-apps   Author: ibmresilient   File: res_sen2vec.py   License: MIT License
def __init__(self, w2v, sif, log=None):
        # A NLPWord2Vec to get the vec for a word
        self.word2vec = w2v
        # A ResSIF used to get word count
        self.sif = sif
        # util to pre-process data
        self.utils = WordSentenceUtils()
        self.log = log if log else logging.getLogger(__name__)
        self.sentence_vectors = []
        self.feature_size = 0
        # download nltk resource if necessary
        nltk.download('words', quiet=True)
        self.setofwords = set(nltk_words.words())

        # pca vector
        self.pca_u = [] 
Example 20
Project: botbuilder-python   Author: microsoft   File: bidaf_model_runtime.py   License: MIT License
def init_bidaf(bidaf_model_dir: str, download_ntlk_punkt: bool = False) -> bool:
        if os.path.isdir(bidaf_model_dir):
            print("bidaf model directory already present..", file=sys.stderr)
        else:
            print("Creating bidaf model directory..", file=sys.stderr)
            os.makedirs(bidaf_model_dir, exist_ok=True)

        # Download Punkt Sentence Tokenizer
        if download_ntlk_punkt:
            nltk.download("punkt", download_dir=bidaf_model_dir)
            nltk.download("punkt")

        # Download bidaf onnx model
        onnx_model_file = os.path.abspath(os.path.join(bidaf_model_dir, "bidaf.onnx"))

        print(f"Checking file {onnx_model_file}..", file=sys.stderr)
        if os.path.isfile(onnx_model_file):
            print("bidaf.onnx downloaded already!", file=sys.stderr)
        else:
            print("Downloading bidaf.onnx...", file=sys.stderr)
            response = requests.get(
                "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx",
                stream=True,
            )
            with open(onnx_model_file, "wb") as f:
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, f)
        return True 
Example 21
Project: prenlp   Author: lyeoni   File: tokenizer.py   License: Apache License 2.0
def __init__(self):
        try:
            from nltk.tokenize.moses import MosesTokenizer
        except Exception as ex:
            import nltk
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')
            # Re-import so that MosesTokenizer is bound in this scope even
            # when the first import attempt failed.
            from nltk.tokenize.moses import MosesTokenizer
        self.tokenizer = MosesTokenizer()
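
Note that nltk.tokenize.moses has been removed from recent NLTK releases; the equivalent tokenizer now ships in the separate sacremoses package. A present-day sketch, assuming sacremoses is installed:

from sacremoses import MosesTokenizer

tokenizer = MosesTokenizer(lang='en')
tokens = tokenizer.tokenize("Hello, world!")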
Example 22
Project: qb   Author: Pinafore   File: setup.py   License: MIT License
def run(self):
        import nltk
        nltk.download('stopwords')
        nltk.download('punkt')
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')
        path = 'data/external/nltk_download_SUCCESS'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as f:
            f.write('Downloaded nltk: stopwords, punkt, wordnet') 
Example 23
Project: jingwei   Author: li-xirong   File: check_availability.py   License: MIT License
def check_robustpca(trainCollection, testCollection, feature):
    ready = True
    
    # check matlab    
    if not check_matlab():
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'Matlab is not available or incorrectly configured.')
        ready = False
    
    # check if knn is available
    if not check_knn(trainCollection, testCollection, feature):
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'KNN is not available.')        
        ready = False

    # check data files
    datafiles = [ os.path.join(ROOT_PATH, trainCollection, 'TextData', 'id.userid.lemmtags.txt'),
                  os.path.join(ROOT_PATH, trainCollection, 'FeatureData', feature)]
    res = find_missing_files(datafiles)
    if res:
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'the following files or folders are missing:\n%s' % res)
        return False    
              
    # check external dependencies  
    try:
        import h5py
        import numpy
        import scipy.io
        import scipy.sparse
        from nltk.corpus import wordnet as wn
        from nltk.corpus import wordnet_ic
        brown_ic = wordnet_ic.ic('ic-brown.dat')
        wn.morphy('cat')
        wn.synsets('cat', pos=wn.NOUN)
    except Exception as e:
        try:
            import nltk
            nltk.download('brown')
            nltk.download('wordnet')
            nltk.download('wordnet_ic')
        except Exception as e:
            print(e)
            ready = False 
Example 24
Project: lisc   Author: lisc-tools   File: conftest.py   License: Apache License 2.0
def download_data():

    # Download required nltk data for tokenizing
    nltk.download('punkt')
    nltk.download('stopwords') 
Example 25
Project: text-to-image   Author: paarthneekhara   File: download_datasets.py   License: MIT License
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)


# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python 
Example 26
Project: text-to-image   Author: paarthneekhara   File: download_datasets.py   License: MIT License
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)


# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python 
Example 27
Project: dl4ir-webnav   Author: nyu-dl   File: op_sentence.py   License: BSD 3-Clause "New" or "Revised" License
def __init__(self, wiki, vocab, n_consec):
        self.wiki = wiki
        self.vocab = vocab
        self.n_consec = n_consec # number of consecutive sections that are used to form a query
        nltk.download('punkt')
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 
Example 28
Project: senpy   Author: gsi-upm   File: __init__.py   License: Apache License 2.0
def evaluate(plugins, datasets, **kwargs):
    for plug in plugins:
        if not hasattr(plug, 'as_pipe'):
            raise models.Error('Plugin {} cannot be evaluated'.format(plug.name))

    if not isinstance(datasets, dict):
        datasets = gsitk_compat.prepare(datasets, download=True)

    tuples = list(product(plugins, datasets))
    missing = []
    for (p, d) in tuples:
        if (p.id, d) not in cached_evs:
            pipe = p.as_pipe()
            missing.append(gsitk_compat.EvalPipeline(pipe, d))
    if missing:
        ev = gsitk_compat.Eval(tuples=missing, datasets=datasets)
        ev.evaluate()
        results = ev.results
        new_ev = evaluations_to_JSONLD(results, **kwargs)
        for ev in new_ev:
            dataset = ev.evaluatesOn
            model = ev.evaluates
            cached_evs[(model, dataset)] = ev
    evaluations = []
    logger.debug('%s. Cached evs: %s', tuples, cached_evs)
    for (p, d) in tuples:
        logger.debug('Adding %s, %s', d, p)
        evaluations.append(cached_evs[(p.id, d)])
    return evaluations 
Example 29
Project: Pointer-Generator   Author: Sohone-Guo   File: tokenizers.py   License: MIT License
def _get_sentence_tokenizer(self, language):
        if language in self.SPECIAL_SENTENCE_TOKENIZERS:
            return self.SPECIAL_SENTENCE_TOKENIZERS[language]
        try:
            path = to_string("tokenizers/punkt/%s.pickle") % to_string(language)
            return nltk.data.load(path)
        except (LookupError, zipfile.BadZipfile):
            raise LookupError(
                "NLTK tokenizers are missing. Download them by following command: "
                '''python -c "import nltk; nltk.download('punkt')"'''
            ) 
Example 30
Project: tokenquery   Author: ramtinms   File: tokenizer.py   License: GNU General Public License v3.0
def __init__(self, tokenizer_type="PTBTokenizer"):

        # Sanity checks
        if tokenizer_type in ['SpaceTokenizer', 'NLTKWhiteSpaceTokenizer', 'PTBTokenizer']:
            self.tokenizer_type = tokenizer_type
        else:
            print ("Unrecognized tokenizer type : setting back to default (PTBTokenizer)")
            self.tokenizer_type = "PTBTokenizer"
        try:
            nltk.data.find('punkt.zip')
        except LookupError:
            nltk.download('punkt')
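
As written, nltk.data.find('punkt.zip') will usually raise LookupError even when punkt is already installed, because the archive lives under the tokenizers/ subdirectory of each data path, so the download step re-runs on every init. The more conventional check (a minimal sketch, not the project's code) is:

import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')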