Python nltk.download() Examples

The following are code examples showing how to use nltk.download(). They are extracted from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.
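Most of the examples below follow the same idiom: look up a resource with nltk.data.find() and call nltk.download() only when the lookup raises LookupError. Here is a minimal sketch of that pattern; the helper name ensure_nltk_resource and the specific resource names are illustrative, not taken from any of the projects below:

import nltk

def ensure_nltk_resource(resource_path, package_name):
    # resource_path (e.g. 'tokenizers/punkt') is the path nltk.data.find() looks up;
    # package_name (e.g. 'punkt') is the identifier nltk.download() expects.
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # quiet=True suppresses the interactive download output
        nltk.download(package_name, quiet=True)

ensure_nltk_resource('tokenizers/punkt', 'punkt')
ensure_nltk_resource('corpora/stopwords', 'stopwords')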

Example 1
Project: ns3-arg-extract   Author: project-em   File: app.py    MIT License 8 votes vote down vote up
def main():
	nltk.download('averaged_perceptron_tagger')
	nltk.download('punkt')

	topic_files = make_topic_map("./arg-extract/data/selected_topics.txt", "./arg-extract/data/")
	num_topics = len(topic_files)

	print "loading files"
	sentence_map = {topic: read_data_file(topic_file) for topic, topic_file in topic_files.items()}

	train_sentences = []
	train_topics = []
	train_y = []
	for topic in sentence_map:
		sentences, y = sentence_map[topic]
		topics = list(itertools.repeat(topic, len(sentences)))
		train_sentences.extend(sentences)
		train_topics.extend(topics)
		train_y.extend(y)

	arg_model.fit(train_topics, train_sentences, train_y)
	app.run(host="0.0.0.0", port=5000) 
Example 2
Project: metal   Author: HazyResearch   File: ngram_featurizer.py    Apache License 2.0 7 votes vote down vote up
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example 3
Project: mfnbc   Author: wharton   File: __init__.py    MIT License 7 votes vote down vote up
def __init__(self, likelihoods_input_file, unlabeled_data_file,
                 verbose, output_filename='out.csv'):
        """
            Args:
                likelihoods_input_file - Location of likelihood table (str)
                unlabeled_data_file - Location of unlabeled data file (str)
                verbose - Turn on or off verbose output,
                    Default is False (boolean)
            Returns:
                None
        """
        self.likelihoods_input_file = likelihoods_input_file
        self.unlabeled_data_file = unlabeled_data_file
        self.outfile = output_filename
        self.verbose = verbose
        self.probs = {}
        self.posteriors = {}
        self.results = []
        self.features = []
        self.fieldnames = None
        print("Checking if you have NLTK libs...")
        nltk.download('punkt') 
Example 4
Project: gender_analysis   Author: dhmit   File: common.py    BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def download_nltk_package_if_not_present(package_name):
    """
    Checks to see whether the user already has a given nltk package,
    and if not, prompts the user whether to download it.

    We download all necessary packages at install time, but this is just
    in case the user has deleted them.

    :param package_name: name of the nltk package
    :return:
    """

    # nltk.data.find uses the folder + name, but download uses just the name...
    package_download_name = package_name.split('/')[1]

    try:
        nltk.data.find(package_name)
    except LookupError:
        user_key = input(f'This function requires the NLTK package {package_name}, which you do not have installed.\n'
                         + 'Press ENTER to download and install this package, or n then enter to cancel and exit.\n')
        if user_key.strip() == 'n':
            exit()

        nltk.download(package_download_name)
        print('\n') 
Example 5
Project: ML-fomo   Author: tahaHichri   File: main.py    GNU General Public License v3.0 6 votes vote down vote up
def __init__(self):
		self._checkLang =  language_check.LanguageTool('en-US')

		print(f'\nDownloading/fetching stopwords ..')
		nltk.download('stopwords')
		print(f'Crunching data ..\n')

		# TODO insert your Twitter API keys here
		# Create a developer account and request access
		# @link{ https://developer.twitter.com/en/apply-for-access.html} 
		consumer_key        = '<consumer_key>'
		consumer_secret     = '<consumer_secret>'
		access_token        = '<access_token>'
		access_token_secret = '<access_token_secret>'

		try: 
			self.auth = OAuthHandler(consumer_key, consumer_secret) 
			self.auth.set_access_token(access_token, access_token_secret) 
			self.api = tweepy.API(self.auth)  
			# print(self.api.auth._get_request_token.value)

		except: 
			print("Error: Authentication Failed") 
Example 6
Project: NUIG-suggestion   Author: MixedEmotions   File: SuggestionMiningDL.py    Apache License 2.0 6 votes vote down vote up
def activate(self, *args, **kwargs):
                
        np.random.seed(1337)  # for reproducibility
        
        st = datetime.now()
        self._classifierModel = load_model(self.savedModelPath)       
        logger.info("{} {}".format(datetime.now() - st, "loaded _classifierModel"))
        
        st = datetime.now()
        self._tokenizer = self.get_tokenizer()
        logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer"))
        
        #st = datetime.now()
        #nltk.download()
        #self._tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')
        #logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer_nltk"))
        
        logger.info("SuggestionMiningDL plugin is ready to go!") 
Example 7
Project: PyTorch-NLP   Author: PetrochukM   File: treebank_encoder.py    BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def __init__(self, *args, **kwargs):
        if 'tokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``tokenize``.')

        if 'detokenize' in kwargs:
            raise TypeError('``TreebankEncoder`` does not take keyword argument ``detokenize``.')

        try:
            import nltk

            # Required for moses
            nltk.download('perluniprops')
            nltk.download('nonbreaking_prefixes')

            from nltk.tokenize.treebank import TreebankWordTokenizer
            from nltk.tokenize.treebank import TreebankWordDetokenizer
        except ImportError:
            print("Please install NLTK. " "See the docs at http://nltk.org for more information.")
            raise

        super().__init__(
            *args,
            tokenize=TreebankWordTokenizer().tokenize,
            detokenize=TreebankWordDetokenizer().detokenize,
            **kwargs) 
Example 8
Project: webapp-bench   Author: edgedb   File: textgen.py    Apache License 2.0 6 votes vote down vote up
def init_nltk(self):
        # get the relevant corpus, etc.
        try:
            nltk.data.find('corpora/gutenberg')
        except LookupError:
            nltk.download('gutenberg')
        nltk.corpus.gutenberg.ensure_loaded()

        try:
            nltk.data.find('taggers/universal_tagset')
        except LookupError:
            nltk.download('universal_tagset')

        try:
            nltk.data.find('taggers/averaged_perceptron_tagger')
        except LookupError:
            nltk.download('averaged_perceptron_tagger')

        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt') 
Example 9
Project: Peerion   Author: hyperionxtech   File: utils.py    Apache License 2.0 6 votes vote down vote up
def remove_stopwords(tokens, language):
    """
    Takes a language (e.g. 'english') and a set of word tokens.
    Returns the tokenized text with any stopwords removed.
    Stop words are words like "is, the, a, ..."

    Be sure to download the required NLTK corpus before calling this function:
    - from chatterbot.utils import nltk_download_corpus
    - nltk_download_corpus('corpora/stopwords')
    """
    from nltk.corpus import stopwords

    # Get the stopwords for the specified language
    stop_words = stopwords.words(language)

    # Remove the stop words from the set of word tokens
    tokens = set(tokens) - set(stop_words)

    return tokens 
Example 10
Project: talk-generator   Author: korymath   File: language_util.py    MIT License 6 votes vote down vote up
def print_corpus_download_warning():
    corpus_warning = """
    Hmm...
    ---------------------

    We had some trouble downloading the NLTK corpora.
    Try running the following from a command line. This should
    download the needed packages, but it might also tell you if
    there is another issue.

    $ python3 -m nltk.downloader punkt averaged_perceptron_tagger
    """
    logger.warning(corpus_warning)


# Helpers 
Example 11
Project: ParlAI   Author: facebookresearch   File: agents.py    MIT License 6 votes vote down vote up
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer.
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example 12
Project: textkit   Author: learntextvis   File: download.py    MIT License 6 votes vote down vote up
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your
    user directory.
    '''

    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1]) 
Example 13
Project: neural_chat   Author: natashamjaques   File: agents.py    MIT License 6 votes vote down vote up
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example 14
Project: dl4dial-mt-beam   Author: nyu-dl   File: agents.py    BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def get_sentence_tokenizer():
    """
    Loads the nltk sentence tokenizer
    """
    try:
        import nltk
    except ImportError:
        raise ImportError('Please install nltk (e.g. pip install nltk).')
    # nltk-specific setup
    st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
    try:
        sent_tok = nltk.data.load(st_path)
    except LookupError:
        nltk.download('punkt')
        sent_tok = nltk.data.load(st_path)
    return sent_tok 
Example 15
Project: botbuilder-python   Author: microsoft   File: bidaf_model_runtime.py    MIT License 5 votes vote down vote up
def init_bidaf(bidaf_model_dir: str, download_ntlk_punkt: bool = False) -> bool:
        if os.path.isdir(bidaf_model_dir):
            print("bidaf model directory already present..", file=sys.stderr)
        else:
            print("Creating bidaf model directory..", file=sys.stderr)
            os.makedirs(bidaf_model_dir, exist_ok=True)

        # Download Punkt Sentence Tokenizer
        if download_ntlk_punkt:
            nltk.download("punkt", download_dir=bidaf_model_dir)
            nltk.download("punkt")

        # Download bidaf onnx model
        onnx_model_file = os.path.abspath(os.path.join(bidaf_model_dir, "bidaf.onnx"))

        print(f"Checking file {onnx_model_file}..", file=sys.stderr)
        if os.path.isfile(onnx_model_file):
            print("bidaf.onnx downloaded already!", file=sys.stderr)
        else:
            print("Downloading bidaf.onnx...", file=sys.stderr)
            response = requests.get(
                "https://onnxzoo.blob.core.windows.net/models/opset_9/bidaf/bidaf.onnx",
                stream=True,
            )
            with open(onnx_model_file, "wb") as f:
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, f)
        return True 
Example 16
Project: airbnbbot   Author: shirosaidev   File: airbnb_bot.py    Apache License 2.0 5 votes vote down vote up
def read_corpus():
    """open corpus file and create word and sentence tokens
    corpus file is the base brain for Tobot which contains words/sentences
    used by nltk and sklearn to help Tobot respond to questions"""
    f = open('tobot_corpus.txt', 'r', errors='ignore')
    raw = f.read()
    f.close()
    raw = raw.lower()
    #nltk.download('punkt')
    #nltk.download('wordnet')
    #nltk.download('stopwords')
    sent_tokens = nltk.sent_tokenize(raw)
    word_tokens = nltk.word_tokenize(raw)

    return sent_tokens, word_tokens 
Example 17
Project: chattR   Author: patrickstocklin   File: download_corpora.py    GNU General Public License v2.0 5 votes vote down vote up
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each) 
Example 18
Project: chattR   Author: patrickstocklin   File: download_corpora.py    GNU General Public License v2.0 5 votes vote down vote up
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each) 
Example 19
Project: qb   Author: Pinafore   File: setup.py    MIT License 5 votes vote down vote up
def run(self):
        import nltk
        nltk.download('stopwords')
        nltk.download('punkt')
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')
        path = 'data/external/nltk_download_SUCCESS'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w') as f:
            f.write('Downloaded nltk: stopwords, punkt, wordnet') 
Example 20
Project: tweets-preprocessor   Author: vasisouv   File: requirements_installer.py    GNU General Public License v3.0 5 votes vote down vote up
def download_nltk_dependencies(self):
        nltk.download('stopwords')
        nltk.download('punkt')
        return self 
Example 21
Project: tweets-preprocessor   Author: vasisouv   File: requirements_installer.py    GNU General Public License v3.0 5 votes vote down vote up
def download_spacy_dependencies(self):
        os.system('python -m spacy download en')
        return self


# Run this script to ensure that specific package requirements are installed 
Example 22
Project: PyMoliere   Author: JSybrandt   File: setup.py    GNU General Public License v3.0 5 votes vote down vote up
def run(self):
    _install.do_egg_install(self)
    import nltk
    # Needed for split-sentences
    nltk.download("punkt") 
Example 23
Project: jingwei   Author: li-xirong   File: check_availability.py    MIT License 5 votes vote down vote up
def check_robustpca(trainCollection, testCollection, feature):
    ready = True
    
    # check matlab    
    if not check_matlab():
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'Matlab is not available or incorrectly configured.')
        ready = False
    
    # check if knn is available
    if not check_knn(trainCollection, testCollection, feature):
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'KNN is not available.')        
        ready = False

    # check data files
    datafiles = [ os.path.join(ROOT_PATH, trainCollection, 'TextData', 'id.userid.lemmtags.txt'),
                  os.path.join(ROOT_PATH, trainCollection, 'FeatureData', feature)]
    res = find_missing_files(datafiles)
    if res:
        print_msg('RobustPCA (%s, %s, %s)' % (trainCollection, testCollection, feature), 'the following files or folders are missing:\n%s' % res)
        return False    
              
    # check external dependencies  
    try:
        import h5py
        import numpy
        import scipy.io
        import scipy.sparse
        from nltk.corpus import wordnet as wn
        from nltk.corpus import wordnet_ic
        brown_ic = wordnet_ic.ic('ic-brown.dat')
        wn.morphy('cat')
        wn.synsets('cat', pos=wn.NOUN)
    except Exception, e:
        try:
            import nltk
            nltk.download('brown')
            nltk.download('wordnet')
            nltk.download('wordnet_ic')
        except Exception, e:
            print e
            ready = False 
Example 24
Project: lightwood   Author: mindsdb   File: infersent.py    MIT License 5 votes vote down vote up
def _download_model_file(self):
        pkl_dir = "pkl_objects/"
        pkl_url = "https://dl.fbaipublicfiles.com/infersent/infersent2.pkl"
        if not os.path.exists(pkl_dir):
            os.makedirs(pkl_dir)
        if not os.path.exists(MODEL_PATH):
            logging.info('This is the first time you are using this text encoder, so we will download a pretrained model.')
            sys.stderr.write('Downloading: "{}" to {}\n'.format(pkl_url, MODEL_PATH))
            self._download_url_to_file(pkl_url, MODEL_PATH, progress=True) 
Example 25
Project: lightwood   Author: mindsdb   File: infersent.py    MIT License 5 votes vote down vote up
def _download_embeddings_file(self):
        emdeddings_dir = "datasets/fastText/"
        embeddings_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip"
        if not os.path.exists(emdeddings_dir):
            os.makedirs(emdeddings_dir)
        parts = urlparse(embeddings_url)
        filename = os.path.basename(parts.path)
        cached_zip_file = os.path.join(emdeddings_dir, filename)
        if not os.path.exists(W2V_PATH):
            logging.info('We will download word embeddings; this will take about 20 minutes.')
            sys.stderr.write('Downloading: "{}" to {}\n'.format(embeddings_url, cached_zip_file))
            self._download_url_to_file(embeddings_url, cached_zip_file, progress=True)
            self._unzip_file(cached_zip_file, emdeddings_dir)
            os.remove(cached_zip_file)
            os.remove(cached_zip_file.replace("zip", "bin")) 
Example 26
Project: lisc   Author: lisc-tools   File: conftest.py    Apache License 2.0 5 votes vote down vote up
def download_data():

    # Download required nltk data for tokenizing
    nltk.download('punkt')
    nltk.download('stopwords') 
Example 27
Project: text-to-image   Author: paarthneekhara   File: download_datasets.py    MIT License 5 votes vote down vote up
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)


# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python 
Example 28
Project: text-to-image   Author: paarthneekhara   File: download_datasets.py    MIT License 5 votes vote down vote up
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)


# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python 
Example 29
Project: PTBot   Author: whocares-openscene   File: piglatin.py    GNU General Public License v3.0 5 votes vote down vote up
def load_nltk():
    nltk.download('cmudict')

    global pronunciations
    pronunciations = nltk.corpus.cmudict.dict() 
Example 30
Project: r-botreborn   Author: colethedj   File: processors.py    MIT License 5 votes vote down vote up
def sumy_url(url):

    #logging.debug("Summarizing URL " + str(url))
    loop = asyncio.get_event_loop()
    try:
        def do_stuff():
            summary_final = ""
            parser = HtmlParser.from_url(url, Tokenizer(Config.sumy_lang))
            stemmer = Stemmer(Config.sumy_lang)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(Config.sumy_lang)
            for sentence in summarizer(parser.document, Config.sumy_num_sentences):
                summary_final = summary_final + " " + str(sentence)
            return summary_final

        future = loop.run_in_executor(None, do_stuff)
        summary = await future

        if len(summary) > 1850:
            summary = summary[:1850] + '... [go to link to read more]'
    except LookupError:
        # automatically download the requirements for sumy
        print("DOWNLOADING SUMY REQUIREMENTS")
        import nltk
        nltk.download('punkt')
        summary = await sumy_url(url)

    # except requests.exceptions.HTTPError: # sometimes sumy fails
    #     summary = "" #TODO: get this working (test it)
    return summary 
Example 31
Project: bianalyzer   Author: luntos   File: setup.py    MIT License 5 votes vote down vote up
def post_install():
    import nltk
    for resource in nltk_dependencies:
        if not nltk.download(resource):
            sys.stderr.write('ERROR: Could not download required NLTK resource:'
                             ' {}\n'.format(resource))
            sys.stderr.flush() 
Example 32
Project: goose   Author: sexxis   File: setup.py    GNU General Public License v3.0 5 votes vote down vote up
def main():
    nltk_deps = ['punkt', 'averaged_perceptron_tagger']
    print 'Checking nltk deps...'
    map(nltk.download, nltk_deps)
    print 'nltk deps done' 
Example 33
Project: blabbr   Author: bfontaine   File: cli.py    MIT License 5 votes vote down vote up
def setup_nltk(self, **kw):
        import nltk
        from nltk.data import find

        tagger = "averaged_perceptron_tagger"

        try:
            find("taggers/%s" % tagger)
        except LookupError:
            click.echo("Downloading NTLK data (~2MB)...")
            nltk.download(tagger)
            return True

        return False 
Example 34
Project: democraciv-discord-bot   Author: jonasbohmann   File: law_helper.py    MIT License 5 votes vote down vote up
def __init__(self, bot):
        self.bot = bot
        nltk.download('punkt')
        nltk.download('averaged_perceptron_tagger') 
Example 35
Project: dl4ir-webnav   Author: nyu-dl   File: op_sentence.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, wiki, vocab, n_consec):
        self.wiki = wiki
        self.vocab = vocab
        self.n_consec = n_consec # number of consecutive sections that are used to form a query
        nltk.download('punkt')
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 
Example 36
Project: core   Author: verifiqueme   File: __init__.py    Mozilla Public License 2.0 5 votes vote down vote up
def filter_stopwords(texto: str) -> str:
    """
    Filters stopwords, removing them from the text
    :param texto: text to be filtered
    :return: filtered text
    """
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords')
    if len(texto) == 0:
        return ""
    p_stopwords = set(stopwords.words('portuguese'))
    filtered = (w for w in texto.split() if w.lower() not in p_stopwords)
    return " ".join(filtered) 
Example 37
Project: Pointer-Generator   Author: Sohone-Guo   File: tokenizers.py    MIT License 5 votes vote down vote up
def _get_sentence_tokenizer(self, language):
        if language in self.SPECIAL_SENTENCE_TOKENIZERS:
            return self.SPECIAL_SENTENCE_TOKENIZERS[language]
        try:
            path = to_string("tokenizers/punkt/%s.pickle") % to_string(language)
            return nltk.data.load(path)
        except (LookupError, zipfile.BadZipfile):
            raise LookupError(
                "NLTK tokenizers are missing. Download them by following command: "
                '''python -c "import nltk; nltk.download('punkt')"'''
            ) 
Example 38
Project: gender_analysis   Author: dhmit   File: setup.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def download_nltk_packages():
    import nltk
    nltk.download('punkt')
    nltk.download('averaged_perceptron_tagger')
    nltk.download('stopwords') 
Example 39
Project: jgtextrank   Author: jerrygaoLondon   File: setup.py    MIT License 5 votes vote down vote up
def run(self):
        _install.do_egg_install(self)
        import nltk

        nltk.download("punkt")
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet') 
Example 40
Project: CloudBot   Author: paris-ci   File: piglatin.py    GNU General Public License v3.0 5 votes vote down vote up
def load_nltk():
    nltk.download('cmudict')

    global pronunciations
    pronunciations = nltk.corpus.cmudict.dict() 
Example 41
Project: TAGGS   Author: jensdebruijn   File: sanitize.py    MIT License 5 votes vote down vote up
def tokenize(text, stopwords=False, remove_punctuation=False):
    tokens = tknzr.tokenize(text)
    if stopwords:
        if isinstance(stopwords, str):
            while True:
                try:
                    stopwords = nltk.corpus.stopwords.words(stopwords)
                    break
                except LookupError:
                    nltk.download("stopwords")
        tokens = [token for token in tokens if token.lower() not in stopwords]
    if remove_punctuation:
        tokens = [token for token in tokens if token not in string.punctuation]
    return tokens 
Example 42
Project: tokenquery   Author: ramtinms   File: tokenizer.py    GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, tokenizer_type="PTBTokenizer"):

        # Sanity checks
        if tokenizer_type in ['SpaceTokenizer', 'NLTKWhiteSpaceTokenizer', 'PTBTokenizer']:
            self.tokenizer_type = tokenizer_type
        else:
            print ("Unrecognized tokenizer type : setting back to default (PTBTokenizer)")
            self.tokenizer_type = "PTBTokenizer"
        try:
            nltk.data.find('punkt.zip')
        except LookupError:
            nltk.download('punkt') 
Example 43
Project: tokenquery   Author: ramtinms   File: pos_tagger.py    GNU General Public License v3.0 5 votes vote down vote up
def __init__(self):
        try:
            nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
        except LookupError:
            nltk.download('averaged_perceptron_tagger') 
Example 44
Project: modmod   Author: Remesh   File: example.py    Apache License 2.0 5 votes vote down vote up
def create(pool, config):
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append('')
    stopwords.remove('not')
    stopwords.remove('no')
    return RemoveStopwords(stopwords) 
Example 45
Project: ldt   Author: annargrs   File: load_config.py    Apache License 2.0 5 votes vote down vote up
def nltk_download():
    """Downloading the necessary NLTK resources if they are missing."""
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError:
        nltk.download('stopwords') 
Example 46
Project: vn   Author: togatoga   File: setup.py    MIT License 5 votes vote down vote up
def run(self):
        install.run(self)
        import nltk
        nltk.download('wordnet') 
Example 47
Project: cornerwise   Author: codeforboston   File: keywords.py    MIT License 5 votes vote down vote up
def setup():
    "Install required NLTK corpora"
    return nltk.download("punkt") and \
        nltk.download("averaged_perceptron_tagger") 
Example 48
Project: python-arpa   Author: sfischer13   File: download.py    MIT License 5 votes vote down vote up
def main():
    for corpus in ['punkt', 'udhr2', 'words']:
        nltk.download(corpus)
    return 0 
Example 49
Project: easse   Author: feralvam   File: text.py    GNU General Public License v3.0 5 votes vote down vote up
def to_sentences(text, language='english'):
    try:
        tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
    except LookupError:
        nltk.download('punkt')
        tokenizer = nltk.data.load(f'tokenizers/punkt/{language}.pickle')
    return tokenizer.tokenize(text) 
Example 50
Project: Peerion   Author: hyperionxtech   File: utils.py    Apache License 2.0 5 votes vote down vote up
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split, sep
    from zipfile import BadZipfile

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    # From http://www.nltk.org/api/nltk.html
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    #
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith(sep):
        resource_path = resource_path + sep

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.'
        )

    return downloaded 
Example 51
Project: sia-cog   Author: tech-quantum   File: nltkmgr.py    MIT License 5 votes vote down vote up
def download():
    nltk.download() 
Example 52
Project: talk-generator   Author: korymath   File: language_util.py    MIT License 5 votes vote down vote up
def _check_and_download_corpus(corpus_fullname, corpus_shortname):
    try:
        nltk.data.find(corpus_fullname)
    except LookupError as le:
        nltk.download(corpus_shortname) 
Example 53
Project: ParlAI   Author: facebookresearch   File: interactive_retrieval.py    MIT License 5 votes vote down vote up
def _set_up_sent_tok(self):
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 54
Project: ParlAI   Author: facebookresearch   File: interactive_end2end.py    MIT License 5 votes vote down vote up
def _set_up_sent_tok(self):
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 55
Project: ParlAI   Author: facebookresearch   File: agents.py    MIT License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 56
Project: ParlAI   Author: facebookresearch   File: agents.py    MIT License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 57
Project: zodiacy   Author: wilzbach   File: setup.py    MIT License 5 votes vote down vote up
def check_nltk(entries):
    """ checks whether required nltk parts are available """
    import nltk
    for entry in entries:
        try:
            nltk.data.find(entry[0])
        except LookupError:
            print("downloading %s", entry[1])
            nltk.download(entry[1]) 
Example 58
Project: deepmatcher   Author: anhaidgroup   File: process.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def _maybe_download_nltk_data():
    import nltk
    nltk.download('perluniprops', quiet=True)
    nltk.download('nonbreaking_prefixes', quiet=True)
    nltk.download('punkt', quiet=True) 
Example 59
Project: kpi2017   Author: deepmipt   File: embeddings_dict.py    Apache License 2.0 5 votes vote down vote up
def __init__(self, opt, embedding_dim):
        """Initialize the class according to given parameters."""

        self.tok2emb = {}
        self.embedding_dim = embedding_dim
        self.opt = copy.deepcopy(opt)
        self.load_items()

        nltk.download('punkt')

        if not self.opt.get('fasttext_model'):
            raise RuntimeError('No pretrained fasttext model provided')
        self.fasttext_model_file = self.opt.get('fasttext_model')
        if not os.path.isfile(self.fasttext_model_file):
            emb_path = os.environ.get('EMBEDDINGS_URL')
            if not emb_path:
                raise RuntimeError('No pretrained fasttext model provided')
            fname = os.path.basename(self.fasttext_model_file)
            try:
                print('Trying to download a pretrained fasttext model from the repository')
                url = urllib.parse.urljoin(emb_path, fname)
                urllib.request.urlretrieve(url, self.fasttext_model_file)
                print('Downloaded a fasttext model')
            except Exception as e:
                raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)

        self.fasttext_model = fasttext.load_model(self.fasttext_model_file) 
Example 60
Project: neural_chat   Author: natashamjaques   File: interactive_retrieval.py    MIT License 5 votes vote down vote up
def _set_up_sent_tok(self):
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 61
Project: neural_chat   Author: natashamjaques   File: interactive_end2end.py    MIT License 5 votes vote down vote up
def _set_up_sent_tok(self):
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 62
Project: neural_chat   Author: natashamjaques   File: agents.py    MIT License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 63
Project: neural_chat   Author: natashamjaques   File: agents.py    MIT License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 64
Project: youtube-sentiment-helper   Author: dillonmabry   File: setup.py    MIT License 5 votes vote down vote up
def post_install():
      """Post installation nltk corpus downloads."""
      import nltk
      nltk.download("punkt")
      nltk.download('words')
      nltk.download('maxent_ne_chunker')
      nltk.download('averaged_perceptron_tagger')
      nltk.download("stopwords") 
Example 65
Project: fine-grained-sentiment-app   Author: prrao87   File: lime_explainer.py    MIT License 5 votes vote down vote up
def __init__(self, model_file: str = None) -> None:
        try:
            from nltk.sentiment.vader import SentimentIntensityAnalyzer
        except:
            import nltk
            nltk.download('vader_lexicon')
        self.vader = SentimentIntensityAnalyzer()
        self.classes = np.array([1, 2, 3, 4, 5]) 
Example 66
Project: dl4dial-mt-beam   Author: nyu-dl   File: agents.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        # get number of examples
        self.num_exs = 0
        for ep in range(self.num_episodes()):
            d = self.data[ep]
            for entry in d['dialog']:
                if (entry.get('checked_sentence', None) is not None and
                        entry.get('checked_sentence') != {} and
                        TOKEN_NOCHOSEN not in
                        entry.get('checked_sentence')):
                    self.num_exs += 1
        self.stop_words = ['i', 'a', 'an', 'am', 'are', 'about', 'as', 'at',
                           'be', 'by', 'for', 'from', 'how', 'in', 'is', 'it',
                           'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was',
                           'what', 'when', 'where', '--', '?', '.', "''", "''",
                           "``", ',', 'do', 'see', 'want', 'people', 'and',
                           "n't", "me", 'too', 'own', 'their', '*', "'s", 'not',
                           'than', 'other', 'you', 'your', 'know', 'just',
                           'but', 'does', 'really', 'have', 'into', 'more',
                           'also', 'has', 'any', 'why', 'will', 'with', 'well',
                           'still', 'he', 'she', 'we', 'may', 'these', 'his',
                           'hers', 'which', 'such', 'they', 'its', 'were', 'my',
                           'there', ';', '-', ':', '|', '&', ')', '(']

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path)

        self.teacher_type = opt.get('teacher_type') 
Example 67
Project: dl4dial-mt-beam   Author: nyu-dl   File: agents.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 68
Project: dl4dial-mt-beam   Author: nyu-dl   File: agents.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def __init__(self, opt, shared=None):
        super().__init__(opt, shared)

        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format('english')
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 69
Project: dl4dial-mt-beam   Author: nyu-dl   File: worlds.py    BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def setup_tokenizer(self, opt):
        try:
            import nltk
        except ImportError:
            raise ImportError('Please install nltk (e.g. pip install nltk).')
        # nltk-specific setup
        st_path = 'tokenizers/punkt/{0}.pickle'.format(opt['dict_language'])
        try:
            self.sent_tok = nltk.data.load(st_path)
        except LookupError:
            nltk.download('punkt')
            self.sent_tok = nltk.data.load(st_path) 
Example 70
Project: sunbird-ml-workbench   Author: project-sunbird   File: taggingUtils.py    MIT License 5 votes vote down vote up
def download_file_to_folder(url_to_download, path_to_folder, file_name): #download_from_downloadUrl
    download_dir = os.path.join(path_to_folder, 'temp' + file_name)
    status = downloadZipFile(url_to_download, download_dir)
    try:
        if status:
            unzip_files(download_dir)
            ecar_unzip(
                download_dir, os.path.join(
                    path_to_folder, file_name))
            path_to_file = os.path.join(path_to_folder, file_name)
            return path_to_file
    except BaseException:
        print("Unavailable for download") 
Example 71
Project: sunbird-ml-workbench   Author: project-sunbird   File: taggingUtils.py    MIT License 5 votes vote down vote up
def ecar_unzip(download_location, copy_location): #ekstep_ecar_unzip
    """
    This function unzips an ecar file (ekstep file format)
    and parses all of its subfolders.
    All the files are copied into one of the ``'assets'``, ``'data'`` or ``'items'`` folders
    (the same name as in the downloaded folder is maintained),
    based on their location in the downloaded folder.
    :param download_location(str): A location on disk where the ekstep ecar resource file is downloaded
    :param copy_location(str): A disk location where the ecar is unwrapped
    """
    assert isinstance(download_location, str)
    assert isinstance(copy_location, str)
    if not os.path.exists(copy_location):
        os.makedirs(copy_location)
    #To make the new sub-directories in which the files will be eventually stored
    location=[os.path.join(copy_location,folder) for folder in ['assets','data','items']]
    for loc in location:
        if not os.path.exists(loc):
            os.makedirs(loc)
    ecar_extensions = ['png', 'gif', 'jpg', 'mp4', 'webm', 'pdf', 'mp3', 'ecml']
    files_found = findFiles(download_location, ecar_extensions)
    if files_found:
        for file in files_found:
            if file[-4:] in "ecml":
                shutil.copy(file, copy_location)
            else:
                shutil.copy(file, os.path.join(copy_location, "assets"))
    else:
        print("No files to copy!")
    # Delete the messy download directory
    if os.path.exists(download_location):
        shutil.rmtree(download_location) 
Example 72
Project: sunbird-ml-workbench   Author: project-sunbird   File: taggingUtils.py    MIT License 5 votes vote down vote up
def url_to_audio_extraction(url, path):
    """
    Download audio in .mp3 format from a youtube URL and save it in a disk location.
    :param url(str): A youtube URL
    :returns: Path to the downloaded audio
    """
    logging.info("UTAE_YOUTUBE_URL_START: {0}".format(url))
    if not os.path.exists(path):
        os.makedirs(path)
    cid = os.path.split(os.path.split(path)[0])[1]
    path_to_audio = os.path.join(path, cid + ".mp3")
    print(path_to_audio)
    logging.info("UTAE_YOUTUBE_URL_START: {0}".format(url))
    if not os.path.isfile(path_to_audio):
        os.chdir(path)
        url = embed_youtube_url_validation(url)
        ydl_opts = {
            'format': 'bestaudio[asr=44100]/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '256'
            }]
        }

        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        os.rename(
            list(
                filter(
                    (lambda x: '.mp3' in x),
                    os.listdir(path)))[0],
            path_to_audio)
        logging.info("UTAE_AUDIO_DOWNLOAD_COMPLETE")
        return path_to_audio
    else:
        return path_to_audio
    logging.info("UTAE_YOUTUBE_URL_STOP") 
Example 73
Project: sunbird-ml-workbench   Author: project-sunbird   File: taggingUtils.py    MIT License 5 votes vote down vote up
def download_content(method, url_to_download, path_to_save, id_name): #download_to_local
    logging.info("DTL_START_FOR_URL: {0}".format(url_to_download))
    path_to_id = ""
    if method == "ecml":
        logging.info("DTL_ECAR_URL: {0}".format(url_to_download))
        try:
            path_to_id = download_file_to_folder(
                url_to_download, path_to_save, id_name)
        except RuntimeError:
            logging.info("Skipped url: {0}".format(url_to_download))

    if method == "youtube":
        try:
            logging.info("DTL_YOUTUBE_URL: {0}".format(url_to_download))
            path_to_id = os.path.join(path_to_save, id_name)
            location = [os.path.join(path_to_id, folder)
                        for folder in ['assets', 'data', 'items']]

            path_to_audio_download = os.path.join(path_to_id, "assets")
            for loc in location:
                if not os.path.exists(loc):
                    os.makedirs(loc)
            path_to_audio = url_to_audio_extraction(
                url_to_download, path_to_audio_download)
            logging.info("Path to audio file is {0}".format(path_to_audio))
        except BaseException:
            logging.info("Could not download the youtube url")

    if method == "pdf":
        logging.info("DTL_PDF_URL: {0}".format(url_to_download))
        try:
            path_to_id = download_file_to_folder(
                url_to_download, path_to_save, id_name)
        except BaseException:
            logging.info("Skipped url: {0}".format(url_to_download))

    else:
        logging.info(
            "Download not required for url: {0}".format(url_to_download))
    logging.info("DTL_STOP_FOR_URL: {0}".format(url_to_download))
    return path_to_id 
Example 74
Project: text-to-image   Author: paarthneekhara   File: download_datasets.py    MIT License 4 votes vote down vote up
def download_dataset(data_name):
    if data_name == 'flowers':
        print('== Flowers dataset ==')
        flowers_dir = os.path.join(DATA_DIR, 'flowers')
        flowers_jpg_tgz = os.path.join(flowers_dir, '102flowers.tgz')
        make_sure_path_exists(flowers_dir)

        # the original google drive link at https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
        # from https://github.com/reedscot/icml2016 is problematic to download automatically, so included
        # the text_c10 directory from that archive as a bzipped file in the repo
        captions_tbz = os.path.join(DATA_DIR, 'flowers_text_c10.tar.bz2')
        print('Extracting ' + captions_tbz)
        captions_tar = tarfile.open(captions_tbz, 'r:bz2')
        captions_tar.extractall(flowers_dir)

        flowers_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
        print('Downloading ' + flowers_jpg_tgz + ' from ' + flowers_url)
        urlretrieve(flowers_url, flowers_jpg_tgz,
                    reporthook=dl_progress_hook)
        print('Extracting ' + flowers_jpg_tgz)
        flowers_jpg_tar = tarfile.open(flowers_jpg_tgz, 'r:gz')
        flowers_jpg_tar.extractall(flowers_dir)  # archive contains jpg/ folder

    elif data_name == 'skipthoughts':
        print('== Skipthoughts models ==')
        SKIPTHOUGHTS_DIR = os.path.join(DATA_DIR, 'skipthoughts')
        SKIPTHOUGHTS_BASE_URL = 'http://www.cs.toronto.edu/~rkiros/models/'
        make_sure_path_exists(SKIPTHOUGHTS_DIR)

        # following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy', 'uni_skip.npz', 'uni_skip.npz.pkl', 'bi_skip.npz',
            'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print('Downloading ' + src_url)
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print('Downloading ' + src_url)
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name) 
Example 75
Project: text-to-image   Author: paarthneekhara   File: download_datasets.py    MIT License 4 votes vote down vote up
def download_dataset(data_name):
    if data_name == 'flowers':
        print('== Flowers dataset ==')
        flowers_dir = os.path.join(DATA_DIR, 'flowers')
        flowers_jpg_tgz = os.path.join(flowers_dir, '102flowers.tgz')
        make_sure_path_exists(flowers_dir)

        # the original google drive link at https://drive.google.com/file/d/0B0ywwgffWnLLcms2WWJQRFNSWXM/view
        # from https://github.com/reedscot/icml2016 is problematic to download automatically, so included
        # the text_c10 directory from that archive as a bzipped file in the repo
        captions_tbz = os.path.join(DATA_DIR, 'flowers_text_c10.tar.bz2')
        print(('Extracting ' + captions_tbz))
        captions_tar = tarfile.open(captions_tbz, 'r:bz2')
        captions_tar.extractall(flowers_dir)

        flowers_url = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
        print(('Downloading ' + flowers_jpg_tgz + ' from ' + flowers_url))
        urlretrieve(flowers_url, flowers_jpg_tgz,
                    reporthook=dl_progress_hook)
        print(('Extracting ' + flowers_jpg_tgz))
        flowers_jpg_tar = tarfile.open(flowers_jpg_tgz, 'r:gz')
        flowers_jpg_tar.extractall(flowers_dir)  # archive contains jpg/ folder

    elif data_name == 'skipthoughts':
        print('== Skipthoughts models ==')
        SKIPTHOUGHTS_DIR = os.path.join(DATA_DIR, 'skipthoughts')
        SKIPTHOUGHTS_BASE_URL = 'http://www.cs.toronto.edu/~rkiros/models/'
        make_sure_path_exists(SKIPTHOUGHTS_DIR)

        # following https://github.com/ryankiros/skip-thoughts#getting-started
        skipthoughts_files = [
            'dictionary.txt', 'utable.npy', 'btable.npy', 'uni_skip.npz', 'uni_skip.npz.pkl', 'bi_skip.npz',
            'bi_skip.npz.pkl',
        ]
        for filename in skipthoughts_files:
            src_url = SKIPTHOUGHTS_BASE_URL + filename
            print(('Downloading ' + src_url))
            urlretrieve(src_url, os.path.join(SKIPTHOUGHTS_DIR, filename),
                        reporthook=dl_progress_hook)

    elif data_name == 'nltk_punkt':
        import nltk
        print('== NLTK pre-trained Punkt tokenizer for English ==')
        nltk.download('punkt')

    elif data_name == 'pretrained_model':
        print('== Pretrained model ==')
        MODEL_DIR = os.path.join(DATA_DIR, 'Models')
        pretrained_model_filename = 'latest_model_flowers_temp.ckpt'
        src_url = 'https://bitbucket.org/paarth_neekhara/texttomimagemodel/raw/74a4bbaeee26fe31e148a54c4f495694680e2c31/' + pretrained_model_filename
        print(('Downloading ' + src_url))
        urlretrieve(
            src_url,
            os.path.join(MODEL_DIR, pretrained_model_filename),
            reporthook=dl_progress_hook,
        )

    else:
        raise ValueError('Unknown dataset name: ' + data_name) 
Example 76
Project: dl4ir-webnav   Author: nyu-dl   File: convert2idx.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def compute_idx(pages_path_in, pages_path_out, vocab):


    f = h5py.File(pages_path_in, 'r')

    if prm.att_doc and prm.att_segment_type == 'sentence':
        nltk.download('punkt')
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    os.remove(pages_path_out) if os.path.exists(pages_path_out) else None

    # Save to HDF5
    fout = h5py.File(pages_path_out,'a')

    if prm.att_doc:
        shape = (f['text'].shape[0],prm.max_segs_doc,prm.max_words)
    else:
        shape=(f['text'].shape[0],prm.max_words)

    idxs = fout.create_dataset('idx', shape=shape, dtype=np.int32)
    mask = fout.create_dataset('mask', shape=(f['text'].shape[0],), dtype=np.float32)

    i = 0
    for text in f['text']:
        st = time.time()

        if prm.att_doc:
            if prm.att_segment_type.lower() == 'section' or prm.att_segment_type.lower() == 'subsection':
                segs = ['']
                for line in text.split('\n'):
                    if prm.att_segment_type == 'section':
                        line = line.replace('===', '')
                    if line.strip().startswith('==') and line.strip().endswith('=='):
                        segs.append('')
                    segs[-1] += line.lower() + '\n'
            elif prm.att_segment_type.lower() == 'sentence':
                segs = tokenizer.tokenize(text.lower().decode('ascii', 'ignore'))
            elif prm.att_segment_type.lower() == 'word':
                segs = wordpunct_tokenize(text.decode('ascii', 'ignore'))
            else:
                raise ValueError('Not a valid value for the attention segment type (att_segment_type) parameter. Valid options are "section", "subsection", "sentence", or "word".')
            
            segs = segs[:prm.max_segs_doc]
            idxs_, _ = utils.text2idx2(segs, vocab, prm.max_words)
            idxs[i,:len(idxs_),:] = idxs_
            mask[i] = len(idxs_)
        else:
            idx, _ = utils.text2idx2([text.lower()], vocab, prm.max_words)
            idxs[i,:] = idx[0]
        i += 1

        #if i > 3000:
        #    break

        print 'processing article', i, 'time', time.time()-st

    f.close()
    fout.close() 
Example 77
Project: baby-ai-game   Author: simon555   File: sentenceEmbedder.py    BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def __init__(self,
                 glove_path=directory+"/InferSent/dataset/GloVe/glove.840B.300d.txt",
                 useCuda=False,
                 Nwords=10000,
                 pathToInferSentModel=directory+'/InferSent/infersent.allnli.pickle',
                 modelDirectory=directory+"/InferSent"):
        print ("Loading Glove Model")
        
        
        #adding directory to the InferSent module
        if (not modelDirectory in sys.path):
            print("adding local directory to load the model")
            sys.path.append(modelDirectory)
        else:
            print("directory already in the sys.path")
            
        
        nltk.download('punkt')        
        
        #loading model
        if (useCuda):
            print("you are on GPU (encoding ~1000 sentences/s, default)")
            self.infersent = torch.load(pathToInferSentModel)
        else: 
            print("you are on CPU (~40 sentences/s)")
            self.infersent = torch.load(pathToInferSentModel, map_location=lambda storage, loc: storage)
        
        
        
        self.infersent.set_glove_path(glove_path)
        
        print("loading the {} most common words".format(Nwords))
        try: 
            self.infersent.build_vocab_k_words(K=Nwords)
            print("vocab trained")
        except Exception as e:
            print("ERROR")    
            print(e)
            print("\nPOSSIBLE SOLUTION")
            print("if you have an encoding error, specify encoder='utf8' in the models.py file line 111 " )
        
        print("done") 
Example 78
Project: KarazinNews-telegram-bot   Author: maxkrivich   File: rssbot.py    MIT License 4 votes vote down vote up
def public_posts(self):
        # Fetch the 30 most recent entries from the RSS feed and the DB posts whose message_id is 0
        current_time = datetime.utcnow().replace(tzinfo=pytz.UTC)
        posts_from_db = self.db.get_post_without_message_id()
        self.src.refresh()
        line = []
        for i in self.src.news:
            if (current_time - i.date).days < 2:
                line.append(i)

        # Take the intersection of these two lists
        for_publishing = list(set(line) & set(posts_from_db))
        for_publishing = sorted(for_publishing, key=lambda news: news.date)
        # Post each message
        flag = False
        for post in for_publishing:
            flag = True
            try:
                article = Article(post.link, language='uk')
                article.download()
                article.parse()
                article.nlp()
                img = upload_image(article.top_image)
                parsed_uri = urlparse(post.link)
                text = ''.join(map(lambda s: '<p>{}</p>'.format(s),  article.text.split('\n')))
                tel_text = messages.TELEGRAPH_TML.format(img=img,
                                                        text=text,
                                                        slink=parsed_uri.netloc,
                                                        link=self.url_shortener.short_link(post.link))
                url = upload_to_telegraph(title=post.title, author='Max Krivich', text=tel_text, author_url='https://t.me/maxkrivich')['url']

                text = messages.POST_MESSAGE.format(title=post.title,
                                                    link=url)
                a = self.bot.sendMessage(chat_id=self.chat_id,
                                        text=text,
                                        parse_mode=telegram.ParseMode.HTML,
                                        disable_notification=True)
                message_id = a.message_id
                chat_id = a['chat']['id']
                self.db.update(post.link, chat_id, message_id)
                logger.info('Public: %s;%s;' % (post, message_id))
                time.sleep(self.delay_between_messages)
            except Exception as e:
                logger.exception(e)
        if flag:
            self.db.session.close()
        return flag 
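public_posts relies on newspaper's Article.nlp(), which in turn needs the NLTK Punkt tokenizer. A minimal sketch of that part of the pipeline in isolation (the URL is a placeholder):

import nltk
from newspaper import Article

nltk.download('punkt')
article = Article('https://example.com/some-news-item', language='uk')
article.download()
article.parse()
article.nlp()  # requires punkt; populates article.keywords and article.summary
print(article.title, article.top_image)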
Example 79
Project: ancile   Author: ancile-project   File: rdl.py    GNU Affero General Public License v3.0 4 votes vote down vote up
def rdl_usage_data_recurrence(data):
    """
    Returns top 10 words in frequency from google searches within date ranges
    :param data:
    :return:
    """
    from datetime import  timedelta
    from dateutil.parser import parse as parse_date
    import nltk
    nltk.download('wordnet')
    nltk.download('stopwords')
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import stopwords
    stops = set(stopwords.words("english"))
    filtered_data = data['data']['usage']
    lemmatizer = WordNetLemmatizer()

    word_week_dict = {}  # Words to weeks
    for search_term in filtered_data:
        dt = parse_date(search_term[1]).date()
        week_dt = dt - timedelta(days=dt.weekday())  # Get Sunday
        word_list = search_term[0].split(' ')
        for word in word_list:
            lem_word = lemmatizer.lemmatize(word)
            if lem_word in stops:
                continue
            if lem_word in word_week_dict:
                word_week_dict[lem_word].add(week_dt)
            else:
                word_week_dict[lem_word] = {week_dt}

    # Get Minimum and Maximum weeks
    total_weeks = 1
    if data['data']['extents']['min'] is not None and data['data']['extents']['max'] is not None:
        dt = parse_date(data['data']['extents']['min'])
        dt = dt - timedelta(days=dt.weekday())  # Get Sunday
        dt = dt.date()  # Truncate to date
        end = parse_date(data['data']['extents']['max'])
        end = end - timedelta(days=end.weekday())
        end = end.date()
        total_weeks = (end - dt).days / 7
        if total_weeks == 0:
            total_weeks = 1

    word_recurrences = []
    for word in word_week_dict:
        word_recurrences.append({'lemma': word, 'recurrence': len(word_week_dict[word]) / total_weeks})
    word_recurrences.sort(reverse=True, key=lambda x: x['recurrence'])
    top_ten = word_recurrences[:10]
    data['output'].append('RDL Top Ten Recurrence Words Transform.')
    data['items'] = top_ten
    # Delete raw data
    del data['data']
    return data 
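The core of the transform is the lemmatize-and-filter step applied to each search term. Isolated, with a made-up sample query, it works roughly like this:

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')
stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
words = 'the best running shoes'.split(' ')
print([lemmatizer.lemmatize(w) for w in words if lemmatizer.lemmatize(w) not in stops])
# e.g. ['best', 'running', 'shoe']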
Example 80
Project: lemontree   Author: khshim   File: gutenberg.py    MIT License 4 votes vote down vote up
def __init__(self, base_datapath, mode='alice_in_wonderland', eos=True, sos=False, lower=True):
        """
        This function initializes the class.
        Currently, only one book per class is implemented.
        Reading multiple books at one time is for future implementation.
        For initialization, split text into sentences.

        Parameters
        ----------
        base_datapath: string
            a string path where textbook is saved.
        mode: string, default: 'alice_in_wonderland'
            a string which will be target book.
            i.e., 'alice_in_wonderland.txt'
        eos: bool, default: True.
            a bool value to determine whether to put <eos> at the end of sentences.
        sos: bool, default: False.
            a bool value to determine whether to put <sos> at the front of sentences.
        lower: bool, default: True.
            a bool value to determine whether to lowercase all text (for English).

        Returns
        -------
        None.
        """
        # check asserts
        assert isinstance(base_datapath, str), '"base_datapath" should be a string path.'
        assert isinstance(mode, str), '"mode" should be a string name for textbook.'
        assert isinstance(eos, bool), '"eos" should be a bool value to determine <eos> insert or not.'
        assert isinstance(sos, bool), '"sos" should be a bool value to determine <sos> insert or not.'

        # load
        book_file =  base_datapath + 'gutenberg/' + mode + '.txt'
        print('Gutenberg load book:', book_file)
        start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
        import nltk

        with open(book_file, 'r') as f:
            if lower:
                corpus = f.read().lower().replace('\n', ' ')
            else:
                corpus = f.read().replace('\n', ' ')
            # nltk.download()  # download model -> punkt if you get an error
            self.sentences = nltk.tokenize.sent_tokenize(corpus)  # a list of sentences, each sentence is string
            for i in range(len(self.sentences)):
                words_from_string = nltk.tokenize.word_tokenize(self.sentences[i])
                if eos:
                    words_from_string = words_from_string + ['<EOS>']
                if sos:
                    words_from_string = ['<SOS>'] + words_from_string
                self.sentences[i] = words_from_string  # string to word, now sentence is list of list of words
        print('Gutenberg number of sentences:', len(self.sentences))
        end_time = time.perf_counter()
        print('Gutenberg load time:', end_time - start_time)
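The __init__ above belongs to the Gutenberg reader class in gutenberg.py; the class name is not shown in this excerpt. Assuming a class name of GutenbergSentence and the directory layout the constructor expects, usage might look like:

# Hypothetical usage sketch; the class name and data path are assumptions.
# Expects <base_datapath>/gutenberg/alice_in_wonderland.txt and the NLTK punkt model.
reader = GutenbergSentence('/data/', mode='alice_in_wonderland', eos=True, sos=False, lower=True)
print(reader.sentences[0])  # a list of lowercased word tokens ending with '<EOS>'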