Python Code Examples for create dictionary

60 Python code examples related to "create dictionary" are shown below. They are extracted from open source projects. You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
Example 1
Project: p2ptv-pi   Author: alesnav   File: buddycast.py    License: MIT License 6 votes
def createCollectedDictionaryList(self, buddycast_data, selversion):
        collecteds = buddycast_data.get('collected torrents', [])
        if len(collecteds) == 0:
            return []
        d = []
        try:
            d = [ dict({'infohash': coll[0],
             'num_seeders': coll[1],
             'num_leechers': coll[2],
             'calc_age': coll[3],
             'num_sources_seen': coll[4]}) for coll in collecteds ]
            return d
        except Exception as msg:
            print_exc()
            raise Exception(msg)
Example 2
Project: UniqueBible   Author: eliranwong   File: ThirdParty.py    License: GNU General Public License v3.0 6 votes
def createDictionaryModule(self, module, content):
        filename = os.path.join("thirdParty", "dictionaries", "{0}.dic.bbp".format(module))
        if os.path.isfile(filename):
            os.remove(filename)
        with sqlite3.connect(filename) as connection:
            cursor = connection.cursor()
            # create table "Dictionary"
            create = "CREATE TABLE Dictionary (Topic NVARCHAR(100), Definition TEXT)"
            cursor.execute(create)
            connection.commit()
            # insert data to table "Dictionary"
            insert = "INSERT INTO Dictionary (Topic, Definition) VALUES (?, ?)"
            cursor.executemany(insert, content)
            connection.commit()

    # create UniqueBible.app lexicon modules 
Example 3
Project: attract-repel   Author: nmrksic   File: attract-repel.py    License: Apache License 2.0 6 votes
def create_vector_dictionary(self):
        """
        Extracts the current word vectors from TensorFlow embeddings and (if print_simlex=True) prints their SimLex scores. 
        """
        log_time = time.time()

        [current_vectors] = self.sess.run([self.W_dynamic])
        self.word_vectors = {}
        for idx in range(0, self.vocabulary_size):
            self.word_vectors[self.inverted_index[idx]] = normalise_vector(current_vectors[idx, :])

        if self.log_scores_over_time or self.print_simlex:
            (score_simlex, score_wordsim) = simlex_scores(self.word_vectors, self.print_simlex)
            return (score_simlex, score_wordsim)

        return (1.0, 1.0) 
Example 4
Project: OpenSeq2Seq   Author: NVIDIA   File: lmutils.py    License: Apache License 2.0 6 votes
def create_dictionary(self, proc_path, filename):
    '''
    Add words to the dictionary only if they appear in the train file
    '''
    self.dictionary.add_word(self.dictionary.UNK)
    with open(filename, 'r') as f:
      f.readline()
      for line in f:
        words = line.split() + [self.dictionary.EOS]
        for word in words:
          self.dictionary.add_word(word)

    with open(os.path.join(proc_path, self.vocab_link), 'w') as f:
      f.write('\t'.join(['0', self.dictionary.UNK, '0']) + '\n')
      idx = 1
      for token_id, count in self.dictionary.counter.most_common():
        if count < self.limit:
          f.write(str(idx) + '\n')
          return
        f.write('\t'.join([str(idx), 
              self.dictionary.idx2word[token_id], 
              str(count)]) + '\n')
        idx += 1 
Example 5
Project: garcon   Author: xethorn   File: utils.py    License: MIT License 6 votes
def create_dictionary_key(dictionary):
    """Create a key that represents the content of the dictionary.

    Args:
        dictionary (dict): the dictionary to use.
    Return:
        str: the key that represents the content of the dictionary.
    """

    if not isinstance(dictionary, dict):
        raise TypeError('The value passed should be a dictionary.')

    if not dictionary:
        raise ValueError('The dictionary cannot be empty.')

    sorted_dict = sorted(dictionary.items())

    key_parts = ''.join([
        "'{key}':'{val}';".format(key=key, val=val)
        for (key, val) in sorted_dict])

    return hashlib.sha1(key_parts.encode('utf-8')).hexdigest() 
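
A minimal usage sketch (the input values are hypothetical): because the items are sorted before hashing, two dictionaries with the same content produce the same key regardless of insertion order.

key_a = create_dictionary_key({'b': 2, 'a': 1})
key_b = create_dictionary_key({'a': 1, 'b': 2})
assert key_a == key_b  # identical content -> identical SHA-1 hex digest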
Example 6
Project: yap   Author: Novartis   File: yap_tools.py    License: Apache License 2.0 6 votes
def create_dictionary(config_file):
    """ configuration file reader /
    creates dictionary of parameters """
    try:
        result = numpy.loadtxt(config_file, dtype = "a100",comments = '#',delimiter = ':=')
        workflow_config_dict = {}
        for i in range(0,len(result)):
            k = re.search('(.*\")(.*)(\".*)',result[i][0])
            v = re.search('(.*\")(.*)(\".*)',result[i][1])
            key = k.group(2).strip(" ")
            val = v.group(2).strip(" ")
            key = key.strip("\t")
            val = val.strip("\t")
            workflow_config_dict.setdefault(key,val)
    except IOError as (errno, strerror):
        print "Error:  while Opening the file : ", config_file
        print "        I/O error({0}): {1}".format(errno, strerror)
        exit() 
Example 7
Project: pan-genome-analysis   Author: neherlab   File: sf_gain_loss.py    License: GNU General Public License v3.0 6 votes
def create_ignoring_pattern_dictionary(tree,p = 0):
    """
    create a dictionary of patterns that correspond to extended core genes and extended unique genes
    these patterns will be ignored in the inference of gene gain/loss rates
    """
    #create a pattern dictionary
    #unpatterndict = {pattern_tuple: [first position in pseudoalignment with pattern, number of genes with this pattern]}
    #initialize dictionaries
    import itertools
    tree.tree.unpatterndict = {}
    numstrains = len(tree.tree.get_terminals())
    if p == 0:
        p = int(numstrains/10)
    corepattern = ('1',)*numstrains
    nullpattern = ('0',)*numstrains

    #all sets of indices for p or less of numstrains individuals
    myindices = iter(())
    for i in range(p):
        myindices = itertools.chain(myindices, itertools.combinations(range(numstrains),i+1))

    for indices in myindices:
        tree.tree.unpatterndict[index2pattern(indices,numstrains)] = [-1,0,0]
        tree.tree.unpatterndict[index2pattern_reverse(indices,numstrains)] = [-1,0,0] 
Example 8
Project: medaCy   Author: NLPatVCU   File: vectorizer.py    License: GNU General Public License v3.0 6 votes
def create_feature_dictionary(self, feature_name, sentences):
        """Get dictionary that maps all possible values of a specific feature to ids.

        :param feature_name: Name of feature.
        :param sentences: Sentences to get feature for.
        :return: Dictionary for given feature.
        """
        feature_to_index = {}
        feature_name = '0:' + feature_name

        for sentence in sentences:
            for token in sentence:
                feature = token[feature_name]
                if feature not in feature_to_index:
                    feature_to_index[feature] = len(feature_to_index)

        return feature_to_index 
Example 9
Project: transferable_sent2vec   Author: wasiahmad   File: utils.py    License: MIT License 6 votes
def create_dictionary(sentences):
    words = {}
    for s in sentences:
        for word in s:
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2
    # words['<UNK>'] = 1e9 + 1
    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id 
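
A minimal sketch of calling this helper (the sentences are hypothetical): the reserved symbols get the lowest ids, followed by corpus words in order of decreasing frequency.

sentences = [['the', 'cat', 'sat'], ['the', 'dog']]
id2word, word2id = create_dictionary(sentences)
# word2id['<s>'] == 0; 'the' receives the lowest id among corpus words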
Example 10
Project: RAFCON   Author: DLR-RM   File: state.py    License: Eclipse Public License 1.0 6 votes
def create_output_dictionary_for_state(state):
        """Return empty output dictionary for a state

        :param state: the state of which the output data is determined
        :return: the output data of the target state
        """
        from rafcon.core.states.library_state import LibraryState
        result_dict = {}
        for key, data_port in state.output_data_ports.items():
            if isinstance(state, LibraryState) and state.use_runtime_value_output_data_ports[key]:
                result_dict[data_port.name] = copy.copy(state.output_data_port_runtime_values[key])
            else:
                result_dict[data_port.name] = copy.copy(data_port.default_value)
        return result_dict
    # ---------------------------------------------------------------------------------------------
    # ----------------------------------- data port functions -------------------------------------
    # --------------------------------------------------------------------------------------------- 
Example 11
Project: cs-ranking   Author: kiudee   File: letor_listwise_dataset_reader.py    License: Apache License 2.0 6 votes
def create_dataset_dictionary(self, files):
        self.logger.info("Files {}".format(files))
        dataset_dictionaries = dict()
        for file in files:
            dataset = dict()
            key = os.path.basename(file).split(".txt")[0]
            self.logger.info("File name {}".format(key))
            for line in open(file):
                information = line.split("#")[0].split(" qid:")
                rel_deg = int(information[0])
                qid = information[1].split(" ")[0]
                x = np.array(
                    [float(l.split(":")[1]) for l in information[1].split(" ")[1:-1]]
                )
                x = np.insert(x, len(x), rel_deg)
                if qid not in dataset:
                    dataset[qid] = [x]
                else:
                    dataset[qid].append(x)
            array = np.array([len(i) for i in dataset.values()])
            dataset_dictionaries[key] = dataset
            self.logger.info("Maximum length of ranking: {}".format(np.max(array)))
        return dataset_dictionaries 
Example 12
Project: pigaios   Author: joxeankoret   File: sourcexp_ida.py    License: GNU General Public License v3.0 6 votes
def create_function_dictionary(self, args):
    d = {}
    d["ea"] = args[0]
    d["name"] = args[1]
    d["prototype"] = args[2]
    d["prototype2"] = args[3]
    d["conditions"] = args[4]
    d["constants"] = args[5]
    d["constants_json"] = args[6]
    d["loops"] = args[7]
    d["switchs"] = args[8]
    d["switchs_json"] = args[9]
    d["calls"] = args[10]
    d["externals"] = args[11]
    d["recursive"] = args[12]
    d["indirects"] = args[13]
    d["globals"] = args[14]
    d["callees_json"] = args[15]
    return d 
Example 13
Project: e-SNLI   Author: OanaMariaCamburu   File: utils.py    License: MIT License 6 votes
def create_dictionary(sentences):
    words = {}
    for s in sentences:
        for word in s:
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2
    # words['<UNK>'] = 1e9 + 1
    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id 
Example 14
Project: popgen-stats   Author: tatumdmortimer   File: subsampleVCF.py    License: MIT License 6 votes
def create_vcf_dictionary(vcf):
    vcfDict = {}
    for line in vcf:
        if line[0] == "#":
            continue
        line = line.strip().split()
        pos = line[1]
        ALT = line[4]
        if len(ALT) > 1:    #skip positions that aren't biallelic
            continue
        INFO = line[7]
        outgroup = line[9]
        if outgroup != ".":
            print "VCF in incorrect format."
            print "Outgroup should be reference & first strain in alleles"
            sys.exit()
        alleles = line[10:]
        if "SILENT" in INFO:
            vcfDict[pos] = ["S", alleles]
        if "MISSENSE" in INFO:
            vcfDict[pos] = ["NS", alleles]
    vcf.close()
    return vcfDict 
Example 15
Project: phuzzer   Author: angr   File: __init__.py    License: BSD 2-Clause "Simplified" License 6 votes
def create_dictionary(self):
        l.warning("creating a dictionary of string references within target \"%s\"", self.target)

        b = angr.Project(self.target, load_options={'auto_load_libs': False})
        cfg = b.analyses.CFG(resolve_indirect_jumps=True, collect_data_references=True)
        state = b.factory.blank_state()

        string_references = []
        for v in cfg._memory_data.values():
            if v.sort == "string" and v.size > 1:
                st = state.solver.eval(state.memory.load(v.address, v.size), cast_to=bytes)
                string_references.append((v.address, st))

        strings = [] if len(string_references) == 0 else list(list(zip(*string_references))[1])
        return strings


    #
    # Subclasses should override this.
    # 
Example 16
Project: nlp-recipes   Author: microsoft   File: utils.py    License: MIT License 6 votes
def create_dictionary(sentences):
    words = {}
    for s in sentences:
        for word in s:
            if word in words:
                words[word] += 1
            else:
                words[word] = 1
    words['<s>'] = 1e9 + 4
    words['</s>'] = 1e9 + 3
    words['<p>'] = 1e9 + 2
    # words['<UNK>'] = 1e9 + 1
    sorted_words = sorted(words.items(), key=lambda x: -x[1])  # inverse sort
    id2word = []
    word2id = {}
    for i, (w, _) in enumerate(sorted_words):
        id2word.append(w)
        word2id[w] = i

    return id2word, word2id 
Example 17
def create_target_dictionary(development_matches):
    sgd_target_pattern = r'[0-9]+\.[0-9]+' # pattern to match target format
    target_matches = {}
    for development_match in development_matches:
        development_match.replace(np.nan, '', regex=True, inplace = True)
        target = None

        for row in development_match.itertuples():
            match = re.search(sgd_target_pattern, str(row[1]), flags=0)
            if match: # If we found a UNDP target
                target = match.group()
                if target in target_matches: # Add sentence to the set for that target's key
                    target_matches[target].add(row[1][len(target):])
                else:
                    target_matches[target] = set({row[1][len(target):]})
            # Continue adding to the current target's key if there is text in the data frame
            if target != None and row[2] != '':
                target_matches[target].add(row[2])
        
    return target_matches 
Example 18
Project: LEAR   Author: nmrksic   File: lear.py    License: Apache License 2.0 6 votes
def create_vector_dictionary(self):
        """
        Extracts the current word vectors from TensorFlow embeddings and (if print_simlex=True) prints their SimLex scores. 
        """
        log_time = time.time()

        [current_vectors] = self.sess.run([self.W_dynamic])
        self.word_vectors = {}
        for idx in range(0, self.vocabulary_size):
            #self.word_vectors[self.inverted_index[idx]] = normalise_vector(current_vectors[idx, :])
            self.word_vectors[self.inverted_index[idx]] = current_vectors[idx, :]

        if self.log_scores_over_time or self.print_simlex:
            (score_simlex, score_wordsim) = simlex_scores(self.word_vectors, self.distance_metric, self.order, self.print_simlex)
            return (score_simlex, score_wordsim)

        return (1.0, 1.0) 
Example 19
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    License: MIT License 6 votes
def create_histrogram_given_dictionary(d, wonky, title):
	fig = plt.figure() 
	if wonky:
		d_ = dict()
		for key , value in d.iteritems():
			d_[key[0]] = value
		f = {x:70*i for i,x in enumerate(set(d_.keys()))}
		new_d = dict()
		for key, value in d_.iteritems():
			new_d[f[key]] = value
	else:
		f = {x:70*i for i,x in enumerate(set(d.keys()))}
		new_d = dict()
		for key, value in d.iteritems():
			new_d[f[key]] = value
	c = list(new_d.items())
	X,Y = zip(*c)
	plt.barh(X,Y,align='center')
	c = list(f.items())
	ticks,pos = zip(*c)
	pylab.yticks(pos,ticks)
	matplotlib.rc('ytick', labelsize=8)
	fig.suptitle(title)
	plt.show() 
Example 20
Project: clickbait   Author: bhargaviparanjape   File: experiments.py    License: MIT License 6 votes
def create_tag_sentence_dictionary(infile, outfile):
	tagged_data = open(infile, "r+")
	f2 = codecs.open(outfile, "a+", "utf-8")
	for line in tagged_data:
		sentence = ast.literal_eval(line)
		d = dict()
		pos = []
		sen= []
		for word in sentence["tokens"]:
			pos.append(word['pos'])
		for word in sentence["tokens"]:
			sen.append(word['word'])
		if len(sen) >= 4:
			d['sentence'] = " ".join(sen)
			d['pos_sentence'] = " ".join(pos)
			print >> f2, d 
Example 21
Project: aca   Author: geekinglcq   File: lsi_model.py    License: MIT License 6 votes
def create_dictionary(interest_paper):

    print ("create dictionary ...")

    interest_seq = []
    paper_seq = []
    for interest,paper in interest_paper.items():
        interest_seq.append(interest)
        text = ' '.join(paper)
        text = clean_data(text)
        paper_seq.append(text)
    #paper_seq = remove_once_appearance(paper_seq,2)

    dictionary = corpora.Dictionary(paper_seq)
    corpus = [dictionary.doc2bow(text) for text in paper_seq]
    return (interest_seq,dictionary,corpus) 
Example 22
Project: CapsGNN   Author: benedekrozemberczki   File: capsgnn.py    License: GNU General Public License v3.0 5 votes
def create_data_dictionary(self, target, edges, features):
        """
        Creating a data dictionary.
        :param target: Target vector.
        :param edges: Edge list tensor.
        :param features: Feature tensor.
        """
        to_pass_forward = dict()
        to_pass_forward["target"] = target
        to_pass_forward["edges"] = edges
        to_pass_forward["features"] = features
        return to_pass_forward 
Example 23
Project: p2ptv-pi   Author: alesnav   File: buddycast.py    License: MIT License 5 votes
def createPreferenceDictionaryList(self, buddycast_data):
        prefs = buddycast_data.get('preferences', [])
        if len(prefs) == 0:
            return []
        d = []
        try:
            if not type(prefs[0]) == list:
                d = [ dict({'infohash': pref}) for pref in prefs ]
                if buddycast_data['oversion'] >= OLPROTO_VER_EIGHTH:
                    if DEBUG:
                        print >> sys.stderr, 'buddycast: received OLPROTO_VER_EIGHTH buddycast data containing old style preferences. only ok if talking to an earlier non-release version'
                return d
            if buddycast_data['oversion'] >= OLPROTO_VER_ELEVENTH:
                d = [ dict({'infohash': pref[0],
                 'search_terms': pref[1],
                 'position': pref[2],
                 'reranking_strategy': pref[3],
                 'num_seeders': pref[4],
                 'num_leechers': pref[5],
                 'calc_age': pref[6],
                 'num_sources_seen': pref[7]}) for pref in prefs ]
            elif buddycast_data['oversion'] >= OLPROTO_VER_EIGHTH:
                d = [ dict({'infohash': pref[0],
                 'search_terms': pref[1],
                 'position': pref[2],
                 'reranking_strategy': pref[3]}) for pref in prefs ]
            else:
                raise RuntimeError('buddycast: unknown preference protocol, pref entries are lists but oversion= %s:\n%s' % (buddycast_data['oversion'], prefs))
            return d
        except Exception as msg:
            print_exc()
            raise Exception(msg)
Example 24
Project: pilot   Author: PanDAWMS   File: pUtil.py    License: Apache License 2.0 5 votes
def createESFileDictionary(writeToFile):
    """ Create the event range file dictionary from the writeToFile info """

    # writeToFile = 'fileNameForTrf_1:LFN_1,LFN_2^fileNameForTrf_2:LFN_3,LFN_4'
    # -> esFileDictionary = {'fileNameForTrf_1': 'LFN_1,LFN_2', 'fileNameForTrf_2': 'LFN_3,LFN_4'}
    # Also, keep track of the dictionary keys (e.g. 'fileNameForTrf_1') ordered since we have to use them to update the jobParameters
    # once we know the full path to them (i.e. '@fileNameForTrf_1:..' will be replaced by '@/path/filename:..')
    # (the dictionary is otherwise not ordered so we cannot simply use the dictionary keys later)
    # fileInfo = ['fileNameForTrf_1:LFN_1,LFN_2', 'fileNameForTrf_2:LFN_3,LFN_4']

    fileInfo = writeToFile.split("^")
    esFileDictionary = {}
    orderedFnameList = []
    for i in range(len(fileInfo)):
        # Extract the file name
        if ":" in fileInfo[i]:
            finfo = fileInfo[i].split(":")

            # add cwd before the lfns
            #finfo[1] = "`pwd`/" + finfo[1]
            #finfo[1] = finfo[1].replace(',',',`pwd`/')

            # fix the issue that some athena 20 releases have _000 at the end of the filename
            if finfo[0].endswith("_000"):
                tolog("replace %s with %s" % (finfo[0], finfo[0][:-4]))
                finfo[0] = finfo[0][:-4]
            esFileDictionary[finfo[0]] = finfo[1]
            orderedFnameList.append(finfo[0])
        else:
            tolog("!!WARNING!!4444!! File info does not have the correct format, expected a separator \':\': %s" % (fileInfo[i]))
            esFileDictionary = {}
            break

    return esFileDictionary, orderedFnameList 
Example 25
Project: neural_ime   Author: yohokuno   File: decode_ngram.py    License: MIT License 5 votes
def create_dictionary(ngrams):
    dictionary = defaultdict(list)
    for ngram in ngrams.keys():
        if len(ngram) == 1:
            target, source = ngram[0]
            dictionary[source].append(target)
    return dictionary 
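
A minimal sketch with hypothetical n-gram keys of the form ((target, source), ...), matching the unpacking above; only unigram keys contribute to the source-to-candidates mapping.

ngrams = {(('今日', 'きょう'),): 5,
          (('京', 'きょう'),): 2,
          (('今日', 'きょう'), ('は', 'は')): 3}
lookup = create_dictionary(ngrams)
# lookup['きょう'] -> ['今日', '京'] (order follows dict iteration); the bigram key is skipped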
Example 26
Project: medaCy   Author: NLPatVCU   File: vectorizer.py    License: GNU General Public License v3.0 5 votes
def create_tag_dictionary(self, tags):
        """Setup self.tag_to_index

        :param tags: List of list of tag names. Usually all true labels for a dataset.
        """
        tag_to_index = {}

        for sequence in tags:
            for tag in sequence:
                if tag not in tag_to_index:
                    tag_to_index[tag] = len(tag_to_index)

        self.tag_to_index = tag_to_index 
Example 27
Project: SindyAutoencoders   Author: kpchamp   File: training.py    License: MIT License 5 votes
def create_feed_dictionary(data, params, idxs=None):
    """
    Create the feed dictionary for passing into tensorflow.

    Arguments:
        data - Dictionary object containing the data to be passed in. Must contain input data x,
        along with the first (and possibly second) order time derivatives dx (ddx).
        params - Dictionary object containing model and training parameters. The relevant
        parameters are model_order (which determines whether the SINDy model predicts first or
        second order time derivatives), sequential_thresholding (which indicates whether or not
        coefficient thresholding is performed), coefficient_mask (optional if sequential
        thresholding is performed; 0/1 mask that selects the relevant coefficients in the SINDy
        model), and learning rate (float that determines the learning rate).
        idxs - Optional array of indices that selects which examples from the dataset are passed
        in to tensorflow. If None, all examples are used.

    Returns:
        feed_dict - Dictionary object containing the relevant data to pass to tensorflow.
    """
    if idxs is None:
        idxs = np.arange(data['x'].shape[0])
    feed_dict = {}
    feed_dict['x:0'] = data['x'][idxs]
    feed_dict['dx:0'] = data['dx'][idxs]
    if params['model_order'] == 2:
        feed_dict['ddx:0'] = data['ddx'][idxs]
    if params['sequential_thresholding']:
        feed_dict['coefficient_mask:0'] = params['coefficient_mask']
    feed_dict['learning_rate:0'] = params['learning_rate']
    return feed_dict 
Example 28
Project: cheetah-gui   Author: shmilylty   File: cheetah_dictionary.py    License: GNU General Public License v3.0 5 votes
def create_cheetah_dictionary_setting(root):
    """Starting point when module is imported by another program."""
    global w, w_win, rt
    rt = root
    w = Toplevel(root)
    cheetah_dictionary_support.set_tk_var()
    top = CheetahDictionarySetting(w)
    cheetah_dictionary_support.init(w, top)
    return w, top 
Example 29
Project: symspellpy   Author: mammothb   File: symspellpy.py    License: MIT License 5 votes
def create_dictionary(self, corpus, encoding=None):
        """Load multiple dictionary words from a file containing plain
        text.

        **NOTE**: Merges with any dictionary data already loaded.

        Parameters
        ----------
        corpus : str
            The path+filename of the file.
        encoding : str, optional
            Text encoding of the corpus file.

        Returns
        -------
        bool
            True if file loaded, or False if file not found.
        """
        if isinstance(corpus, str):
            if not os.path.exists(corpus):
                return False
            with open(corpus, "r", encoding=encoding) as infile:
                for line in infile:
                    for key in self._parse_words(line):
                        self.create_dictionary_entry(key, 1)
        else:
            for line in corpus:
                for key in self._parse_words(line):
                    self.create_dictionary_entry(key, 1)
        return True 
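
A minimal usage sketch, assuming the symspellpy package is installed and that a plain-text corpus.txt exists (both hypothetical here):

from symspellpy import SymSpell

sym_spell = SymSpell()  # default settings
loaded = sym_spell.create_dictionary("corpus.txt")
print("dictionary loaded:", loaded)  # False if corpus.txt is missing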
Example 30
Project: topical_word_embeddings   Author: thunlp   File: ucicorpus.py    License: MIT License 5 votes
def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary 
Example 31
Project: ComicStreamer   Author: beville   File: comicbookinfo.py    License: Apache License 2.0 5 votes
def createJSONDictionary( self, metadata ):
		
		# Create the dictionary that we will convert to JSON text
		cbi = dict()
		cbi_container = {'appID' : 'ComicTagger/' + '1.0.0', #ctversion.version, 
		                 'lastModified' : str(datetime.now()), 
		                 'ComicBookInfo/1.0' : cbi }
		
		#helper func
		def assign( cbi_entry, md_entry):
			if md_entry is not None:
				cbi[cbi_entry] = md_entry
			
		#helper func
		def toInt(s):
			i = None
			if type(s) in [ str, unicode, int ]:
				try:
					i = int(s)
				except ValueError:
					pass
			return i
				
		assign( 'series', metadata.series )
		assign( 'title', metadata.title )
		assign( 'issue', metadata.issue )
		assign( 'publisher', metadata.publisher )
		assign( 'publicationMonth', toInt(metadata.month) )
		assign( 'publicationYear', toInt(metadata.year) )
		assign( 'numberOfIssues', toInt(metadata.issueCount) )
		assign( 'comments', metadata.comments )
		assign( 'genre', metadata.genre )
		assign( 'volume', toInt(metadata.volume) )
		assign( 'numberOfVolumes', toInt(metadata.volumeCount) )
		assign( 'language', utils.getLanguageFromISO(metadata.language) )
		assign( 'country', metadata.country )
		assign( 'rating', metadata.criticalRating )
		assign( 'credits', metadata.credits )
		assign( 'tags', metadata.tags )
		
		return cbi_container 
Example 32
Project: char-rnn-text-generation   Author: yxtay   File: utils.py    License: MIT License 5 votes
def create_dictionary():
    """
    create char2id, id2char and vocab_size
    from printable ascii characters.
    """
    chars = sorted(ch for ch in string.printable if ch not in ("\x0b", "\x0c", "\r"))
    char2id = dict((ch, i + 1) for i, ch in enumerate(chars))
    char2id.update({"": 0})
    id2char = dict((char2id[ch], ch) for ch in char2id)
    vocab_size = len(char2id)
    return char2id, id2char, vocab_size 
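
A minimal sketch of using the returned mappings (purely illustrative): ids start at 1, with 0 reserved for the empty string.

char2id, id2char, vocab_size = create_dictionary()
encoded = [char2id[ch] for ch in "abc"]
decoded = "".join(id2char[i] for i in encoded)
assert decoded == "abc"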
Example 33
def create_dictionary_from_log_line(log_line):
    log_dict = {"metric": log_line[0],
                "points": [(log_line[1], log_line[2])],
                "tags": [log_line[5]]
            }

    logging.debug(log_dict)
    return log_dict 
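
A minimal sketch with a hypothetical parsed log line (the function reads positions 0, 1, 2 and 5):

log_line = ["cpu.load", 1594919520, 0.73, "host-1", "dc-1", "env:prod"]
create_dictionary_from_log_line(log_line)
# -> {'metric': 'cpu.load', 'points': [(1594919520, 0.73)], 'tags': ['env:prod']}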
Example 34
Project: smappPy   Author: SMAPPNYU   File: dictionary_util.py    License: GNU General Public License v2.0 5 votes
def create_dictionary(doc_iterator, dict_file, as_text=False):
    """
    Creates a gensim.corpora.Dictionary object from given document iterator 
    and serializes it to given dict_file (filename) in a memory efficient way.
    @Params:
      as_text   - flag: dictionary saved as text (default: binary)
    """    
    d = Dictionary(doc.strip().lower().split() for doc in doc_iterator)
    if as_text:
        d.save_as_text(dict_file)
    else:
        d.save(dict_file) 
Example 35
Project: cs-ranking   Author: kiudee   File: likelihoods.py    License: Apache License 2.0 5 votes
def create_weight_dictionary(model_args, shapes):
    weights_dict = dict()
    for key, value in model_args.items():
        prior, params = copy.deepcopy(value)
        for k in params.keys():
            if isinstance(params[k], tuple):
                params[k][1]["name"] = "{}_{}".format(key, k)
                params[k] = params[k][0](**params[k][1])
        params["name"] = key
        params["shape"] = shapes[key]
        weights_dict[key] = prior(**params)
    return weights_dict 
Example 36
Project: English-to-IPA   Author: mphilli   File: ipa_to_sql.py    License: MIT License 5 votes
def create_dictionary_table():
    try:
        c.execute("""CREATE TABLE eng_ipa
                    (id INTEGER PRIMARY KEY,
                    word text NOT NULL,
                    phonemes text NOT NULL,
                    ipa text NOT NULL
                    )""")
        conn.commit()
    except sqlite3.OperationalError:
        c.execute("DROP TABLE eng_ipa;")
        conn.commit()
        create_dictionary_table() 
Example 37
Project: TextClassification   Author: erfannoury   File: util.py    License: MIT License 5 votes
def createDictionary(classes, tokens_pool):
    """
    this method will create a dictionary out of the tokens_pool it has been provided.

    Parameters
    ----------
    classes: list
             list of the names of the classes of documents
    tokens_pool: dictionary
                 dictionary of tokens. Each value of the dictionary is a list of lists,
                 each list belonging to a document in the corresponding class that has a list of tokens


    Returns
    -------
    token_dict: dictionary
                *Note that the tokens in the dictionary are not sorted, since in the vector space model
                that we are going to use, all words are treated equal.
                We practically believe in justice. Words in dictionary are tired of
                all this injustice they have been forced to take for such a long time.
                Now is the time to rise and earn the justice that belongs to them.
    """

    token_dict = {}
    idx = 0 #a unique index for words in dictionary
    for cl in classes:
        for tokens_list in tokens_pool[cl]:
            for token in tokens_list:
                if token in token_dict:             #if token has been added to the dictionary before
                    if cl in token_dict[token]:
                        token_dict[token][cl] += 1
                    else:
                        token_dict[token][cl] = 1
                else:
                    token_dict[token] = {}
                    token_dict[token][idx_lbl] = idx
                    idx += 1
                    token_dict[token][cl] = 1
    return token_dict 
Example 38
Project: content   Author: demisto   File: PwnedV2.py    License: MIT License 5 votes
def create_dbot_score_dictionary(indicator_value, indicator_type, dbot_score):
    return {
        'Indicator': indicator_value,
        'Type': indicator_type,
        'Vendor': VENDOR,
        'Score': dbot_score
    } 
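
A minimal illustrative call (the indicator value and score are hypothetical; VENDOR is the module-level constant):

create_dbot_score_dictionary("user@example.com", "email", 3)
# -> {'Indicator': 'user@example.com', 'Type': 'email', 'Vendor': VENDOR, 'Score': 3}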
Example 39
Project: bottom-up-attention-vqa   Author: hengyuan-hu   File: create_dictionary.py    License: GNU General Public License v3.0 5 votes
def create_dictionary(dataroot):
    dictionary = Dictionary()
    questions = []
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary 
Example 40
Project: spampy   Author: abdullahselek   File: email_processor.py    License: MIT License 5 votes
def create_enron_dictionary(root_dir: str = 'spampy/datasets/enron') -> Dict:
    """
    A function which creates a dictionary from the enron dataset.
    Uses multiple processes.
    
    Args:
      root_dir (str):
        Root folders for enron dataset.
    """

    manager = mp.Manager()
    return_dict = manager.dict()
    jobs = []
    emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
    for emails_dir in emails_dirs:
        p = mp.Process(target=enron_processor, args=(emails_dir, return_dict))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()

    dictionary = return_dict['all_words']
    list_to_remove = return_dict['list_to_remove']

    for item in list_to_remove:
        if item.isalpha() == False: 
            del dictionary[item]
        elif len(item) == 1:
            del dictionary[item]
    dictionary = dictionary.most_common(3000)
    np.save('dict_enron.npy', dictionary)
    return dictionary 
Example 41
Project: Attention-on-Attention-for-VQA   Author: SinghJasdeep   File: create_dictionary.py    License: MIT License 5 votes
def create_dictionary(dataroot):
    dictionary = Dictionary()
    questions = []
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary 
Example 42
Project: wagtail-personalisation   Author: wagtail   File: utils.py    License: MIT License 5 votes
def create_segment_dictionary(segment):
    """Creates a dictionary with all the required segment information.

    :param segment: Segment object
    :type segment: wagtail_personalisation.models.Segment
    :return: Dictionary with name, id, timestamp and persistent state.
    :rtype: dict

    """
    return {
        "encoded_name": segment.encoded_name(),
        "id": segment.pk,
        "timestamp": int(time.time()),
        "persistent": segment.persistent
    } 
Example 43
Project: qgis-earthengine-examples   Author: giswqs   File: 3_water_class_transition.py    License: MIT License 5 votes
def createPieChartSliceDictionary(fc):
  return ee.List(fc.aggregate_array("transition_class_palette")) \
    .map(lambda p: {'color': p}).getInfo()


###############################
# Calculations
###############################

# Create a dictionary for looking up names of transition classes. 
Example 44
Project: topical_word_embeddings   Author: largelymfs   File: ucicorpus.py    License: MIT License 5 votes
def create_dictionary(self):
        """
        Utility method to generate gensim-style Dictionary directly from
        the corpus and vocabulary data.
        """
        dictionary = Dictionary()

        # replace dfs with defaultdict to avoid downstream KeyErrors
        # uci vocabularies may contain terms that are not used in the document data
        dictionary.dfs = defaultdict(int)

        dictionary.id2token = self.id2word
        dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

        dictionary.num_docs = self.num_docs
        dictionary.num_nnz = self.num_nnz

        for docno, doc in enumerate(self):
            if docno % 10000 == 0:
                logger.info('PROGRESS: processing document %i of %i' % (docno, self.num_docs))

            for word, count in doc:
                dictionary.dfs[word] += 1
                dictionary.num_pos += count

        return dictionary 
Example 45
Project: VQA_ReGAT   Author: linjieli222   File: create_dictionary.py    License: MIT License 5 votes
def create_dictionary(dataroot):
    dictionary = Dictionary()
    questions = []
    files = [
        'Questions/v2_OpenEnded_mscoco_train2014_questions.json',
        'Questions/v2_OpenEnded_mscoco_val2014_questions.json',
        'Questions/v2_OpenEnded_mscoco_test2015_questions.json',
        'Questions/v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary 
Example 46
Project: zentral   Author: zentralopensource   File: 0033_auto_20200305_2322.py    License: Apache License 2.0 5 votes
def create_body_from_dictionary(apps, schema_editor):
    DeviceCommand = apps.get_model("mdm", "DeviceCommand")
    for dc in DeviceCommand.objects.all():
        dc.dictionary["RequestType"] = dc.request_type
        body = {"Command": dc.dictionary,
                "CommandUUID": str(dc.uuid)}
        dc.body = plistlib.dumps(body).decode("utf-8")
        dc.save() 
Example 47
Project: bottom-up-attention-tf   Author: LeeDoYup   File: create_dictionary.py    License: MIT License 5 votes
def create_dictionary(dataroot):
    dictionary = Dictionary()
    questions = []
    files = [
        'v2_OpenEnded_mscoco_train2014_questions.json',
        'v2_OpenEnded_mscoco_val2014_questions.json',
        'v2_OpenEnded_mscoco_test2015_questions.json',
        'v2_OpenEnded_mscoco_test-dev2015_questions.json'
    ]
    for path in files:
        question_path = os.path.join(dataroot, path)
        qs = json.load(open(question_path))['questions']
        for q in qs:
            dictionary.tokenize(q['question'], True)
    return dictionary 
Example 48
Project: fairness-indicators   Author: tensorflow   File: weight_utils.py    License: Apache License 2.0 5 votes
def create_percentage_difference_dictionary(
    eval_result: tfma.EvalResult,
    baseline_name: Text, metric_name: Text) -> Dict[Text, Any]:
  """Creates dictionary of a % difference between a baseline and other slices.

  Args:
    eval_result: Loaded eval result from running TensorFlow Model Analysis.
    baseline_name: Name of the baseline slice, 'Overall' or a specified tuple.
    metric_name: Name of the metric on which to perform comparisons.

  Returns:
    Dictionary mapping slices to percentage difference from the baseline slice.
  """
  baseline_value = get_baseline_value(eval_result, baseline_name, metric_name)
  difference = {}
  for metrics_tuple in eval_result.slicing_metrics:
    slice_key = metrics_tuple[0]
    metrics = metrics_tuple[1]
    # Concatenate feature name/values for intersectional features.
    column = '-'.join([elem[0] for elem in slice_key])
    feature_val = '-'.join([elem[1] for elem in slice_key])
    if column not in difference:
      difference[column] = {}
    difference[column][feature_val] = (_get_metric_value(metrics, metric_name)
                                       - baseline_value) / baseline_value
  return difference 
Example 49
Project: ascii-combat   Author: aelmosalamy   File: combat.py    License: MIT License 5 votes
def create_dictionary(self):
        dict = {}
        counter = 1
        for enemy in self.enemies:
            if enemy.alive:
                dict[str(counter)] = enemy
                counter += 1
        return dict

    # Returns a string of alive enemy names 
Example 50
Project: Greynir   Author: mideind   File: builder.py    License: GNU General Public License v3.0 5 votes
def create_dictionary(self):
        """ Iterate through the article database
            and create a fresh Gensim dictionary """
        ci = CorpusIterator()
        dic = ReynirDictionary(ci)
        # Drop words that occur only once or twice in the entire set
        dic.filter_extremes(no_below=3, keep_n=None)
        dic.save(self._DICTIONARY_FILE)
        self._dictionary = dic 
Example 51
Project: text2image   Author: mansimov   File: create-captions.py    License: MIT License 5 votes
def create_reverse_dictionary(dictionary):
    dictionary_reverse = {}

    for word in dictionary:
        index = dictionary[word]
        dictionary_reverse[index] = word
    return dictionary_reverse 
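
A minimal illustrative call: the word-to-index mapping is inverted into an index-to-word mapping.

create_reverse_dictionary({'hello': 0, 'world': 1})
# -> {0: 'hello', 1: 'world'}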
Example 52
Project: ban-vqa   Author: jnhwkim   File: create_dictionary.py    License: MIT License 5 votes
def create_dictionary(dataroot, task='vqa'):
    dictionary = Dictionary()
    if task == 'vqa':
        files = [
            'v2_OpenEnded_mscoco_train2014_questions.json',
            'v2_OpenEnded_mscoco_val2014_questions.json',
            'v2_OpenEnded_mscoco_test2015_questions.json',
            'v2_OpenEnded_mscoco_test-dev2015_questions.json'
        ]
        for path in files:
            question_path = os.path.join(dataroot, path)
            qs = json.load(open(question_path))['questions']
            for q in qs:
                dictionary.tokenize(q['question'], True)

    elif task == 'flickr':
        files = [
            'train_ids.pkl',
            'val_ids.pkl',
            'test_ids.pkl',
        ]
        sentence_dir = os.path.join(dataroot, 'Flickr30kEntities/Sentences')

        for path in files:
            ids_file = os.path.join(dataroot, path)

            with open(ids_file, 'rb') as f:
                imgids = cPickle.load(f)

            for image_id in imgids:
                question_path = os.path.join(sentence_dir, '%d.txt' % image_id)
                phrases = get_sent_data(question_path)
                for phrase in phrases:
                    dictionary.tokenize(phrase, True)
    return dictionary 
Example 53
Project: pan-genome-analysis   Author: neherlab   File: sf_gain_loss.py    License: GNU General Public License v3.0 4 votes
def create_visible_pattern_dictionary(tree):
    """
    create a sequence in all leaves such that each presence absence pattern occurs only once
    """
    #create a pattern dictionary
    #patterndict = {pattern_tuple: [first position in pseudoalignment with pattern, number of genes with this pattern,indicator to include this pattern in the estimation]}
    #clusterdict = {first position with pattern: [number of genes with pattern,indicator to include gene in gtr inference]}
    #initialize dictionaries
    tree.tree.patterndict = {}
    numstrains = len(tree.tree.get_terminals())
    corepattern = ('1',)*numstrains
    nullpattern = ('0',)*numstrains
    tree.tree.clusterdict = {}
    #create dictionaries
    numgenes = tree.tree.get_terminals()[0].genepresence.shape[0]
    for genenumber in range(numgenes):
        pattern=()
        for leaf in tree.tree.get_terminals():
            pattern = pattern + (leaf.genepresence[genenumber],)
        if pattern == nullpattern:
            print("Warning: There seems to be a nullpattern in the data! Check your presence absence pseudoalignment at pos", genenumber+1)
        if pattern in tree.tree.patterndict:
            tree.tree.patterndict[pattern][1] = tree.tree.patterndict[pattern][1]+1
            tree.tree.clusterdict[tree.tree.patterndict[pattern][0]] = [tree.tree.patterndict[pattern][1],1]
        else:
            tree.tree.patterndict[pattern] = [genenumber,1,1]
            tree.tree.clusterdict[tree.tree.patterndict[pattern][0]] = [tree.tree.patterndict[pattern][1],1]

    #thin sequence to unique pattern and save result to node.patternseq
    for node in tree.tree.find_clades():
        if hasattr(node, 'sequence'):
            if len(node.sequence) != numgenes:
                print ("Warning: Nonmatching number of genes in sequence")
            node.patternseq = node.sequence[sorted(tree.tree.clusterdict.keys())]
            # add the all zero pattern at the end of all pattern
            node.patternseq = np.append(node.patternseq,['0',])

    # add an artificial pattern of all zero (nullpattern)
    tree.tree.patterndict[nullpattern] = [numgenes,0,0]
    tree.tree.clusterdict[tree.tree.patterndict[nullpattern][0]] = [tree.tree.patterndict[nullpattern][1],0]
    #create lists for abundance of pattern and inclusion_flag, resp..
    tree.tree.pattern_abundance = [tree.tree.clusterdict[key][0] for key in sorted(tree.tree.clusterdict.keys())]
    tree.tree.pattern_include = [tree.tree.clusterdict[key][1] for key in sorted(tree.tree.clusterdict.keys())]
    #save the index of the first core pattern
    # check whether there is a corepattern (there should always be a corepattern, unless you are using single cell sequencing data.)
    if corepattern in tree.tree.patterndict:
        tree.tree.corepattern_index = sorted(tree.tree.clusterdict.keys()).index(tree.tree.patterndict[corepattern][0]) 
Example 54
Project: Learning-Python-for-Forensics-Second-Edition   Author: PacktPublishing   File: userassist_parser.py    License: MIT License 4 votes
def create_dictionary(registry):
	"""
	The create_dictionary function creates a list of dictionaries
	where keys are the ROT-13 decoded app names and values are
	the raw hex data of said app.
	:param registry: Registry Hive to process
	:return: apps_list, A list containing dictionaries for
	each app
	"""
	try:
		# Open the registry file to be parsed
		registry_file = open(registry, "rb")
		reg = Registry.RegistryHive(registry_file)
	except (IOError, UnicodeDecodeError) as e:
		msg = 'Invalid NTUSER.DAT path or Registry ID.'
		print('[-]', msg)
		logging.error(msg)
		sys.exit(2)

	# Navigate to the UserAssist key
	ua_key = reg.find_key(
	('SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Explorer'
	'\\UserAssist'))
	if ua_key is None:
		msg = 'UserAssist Key not found in Registry file.'
		print('[-]', msg)
		logging.error(msg)
		sys.exit(3)
	apps_list = []
	# Loop through each subkey in the UserAssist key
	for ua_subkey in ua_key.subkeys():
		# For each subkey in the UserAssist key, detect a subkey
		# called Count that has more than 0 values to parse.
		if(ua_subkey.subkey('Count') and
		ua_subkey.subkey('Count').values_count() > 0):
			apps = {}
			for v in ua_subkey.subkey('Count').values():
				if sys.version_info[0] == 2:
					apps[v.name().encode('utf-8').decode(
					'rot-13')] = v.data_raw()
				elif sys.version_info[0] == 3:
					import codecs
					enc = codecs.getencoder('rot-13')
					apps[enc(str(v.name()))[0]] = v.data_raw()

			apps_list.append(apps)
	return apps_list 
Example 55
Project: symspellpy   Author: mammothb   File: symspellpy.py    License: MIT License 4 votes
def create_dictionary_entry(self, key, count):
        """Create/Update an entry in the dictionary. For every word
        there are deletes with an edit distance of 1..max_edit_distance
        created and added to the dictionary. Every delete entry has a
        suggestions list, which points to the original term(s) it was
        created from. The dictionary may be dynamically updated (word
        frequency and new words) at any time by calling
        create_dictionary_entry

        Parameters
        ----------
        key : str
            The word to add to dictionary.
        count : int
            The frequency count for word.

        Returns
        -------
        bool
            True if the word was added as a new correctly spelled
            word, or False if the word is added as a below threshold
            word, or updates an existing correctly spelled word.
        """
        if count <= 0:
            # no point doing anything if count is zero, as it can't
            # change anything
            if self._count_threshold > 0:
                return False
            count = 0

        # look first in below threshold words, update count, and allow
        # promotion to correct spelling word if count reaches threshold
        # threshold must be >1 for there to be the possibility of low
        # threshold words
        if self._count_threshold > 1 and key in self._below_threshold_words:
            count_previous = self._below_threshold_words[key]
            # calculate new count for below threshold word
            count = (count_previous + count
                     if sys.maxsize - count_previous > count
                     else sys.maxsize)
            # has reached threshold - remove from below threshold
            # collection (it will be added to correct words below)
            if count >= self._count_threshold:
                self._below_threshold_words.pop(key)
            else:
                self._below_threshold_words[key] = count
                return False
        elif key in self._words:
            count_previous = self._words[key]
            # just update count if it's an already added above
            # threshold word
            count = (count_previous + count
                     if sys.maxsize - count_previous > count
                     else sys.maxsize)
            self._words[key] = count
            return False
        elif count < self._count_threshold:
            # new or existing below threshold word
            self._below_threshold_words[key] = count
            return False

        # what we have at this point is a new, above threshold word
        self._words[key] = count

        # edits/suggestions are created only once, no matter how often
        # word occurs. edits/suggestions are created as soon as the
        # word occurs in the corpus, even if the same term existed
        # before in the dictionary as an edit from another word
        if len(key) > self._max_length:
            self._max_length = len(key)

        # create deletes
        edits = self._edits_prefix(key)
        for delete in edits:
            self._deletes[delete].append(key)
        return True 
Example 56
Project: seq2seq   Author: kenkov   File: util.py    License: MIT License 4 votes
def create_dictionary(
    corpuses: List[str],
    min_freq: int=1,
    with_symbol=True
) -> corpora.Dictionary:
    """辞書を作成する。

    Args:
        corpuses ([str]): コーパスファイル名。一行が

                今日 は 疲れ た 。

            のように、単語毎にスペースで分割された文がはいっている必要がある。
        save_file (str): 保存するファイル名
        with_symbol (bool): START_SYMBOL, END_SYMBOL を追加するかどうか
    Returns:
        corpora.Dictionary: 辞書
    """
    # make and load dictionary
    dic = corpora.Dictionary()
    print("creating dictionary".format(len(dic.values())))
    if with_symbol:
        dic.add_documents([[config.START_SYMBOL]])
        dic.add_documents([[config.END_SYMBOL]])
        dic.add_documents([[config.UNK_SYMBOL]])
    print("add start and end symbols in dictionary".format(
        len(dic.values())
    ))
    for corpus in corpuses:
        print("adding words from {} to dictionary".format(corpus))
        dic.add_documents(
            line.split() for line in open(corpus)
        )

    # filter words
    ones_ids = {
        tokenid for tokenid, docfreq in dic.dfs.items() if docfreq <= min_freq
    }
    # do not include the start and end symbols
    dic.filter_tokens(ones_ids - {dic.token2id[config.START_SYMBOL],
                                  dic.token2id[config.END_SYMBOL],
                                  dic.token2id[config.UNK_SYMBOL],
                                  })
    dic.compactify()
    if with_symbol:
        if config.START_SYMBOL in dic.token2id and \
                config.END_SYMBOL in dic.token2id and \
                config.UNK_SYMBOL in dic.token2id:
            pass
        else:
            raise Exception("START/END/UNK symbol are not in dictionary")

    return dic 
Example 57
Project: diaphora   Author: joxeankoret   File: diaphora_ida.py    License: GNU Affero General Public License v3.0 4 votes
def create_function_dictionary(self, l):
    (name, nodes, edges, indegree, outdegree, size, instructions, mnems, names,
    proto, cc, prime, f, comment, true_name, bytes_hash, pseudo, pseudo_lines,
    pseudo_hash1, pseudocode_primes, function_flags, asm, proto2,
    pseudo_hash2, pseudo_hash3, strongly_connected_size, loops, rva, bb_topological,
    strongly_connected_spp, clean_assembly, clean_pseudo, mnemonics_spp, switches,
    function_hash, bytes_sum, md_index, constants, constants_size, seg_rva,
    assembly_addrs, kgh_hash, userdata, callers, callees, basic_blocks_data,
    bb_relations) = l
    d = dict(
          name = name,
          nodes = nodes,
          edges = edges,
          indegree = indegree,
          outdegree = outdegree,
          size = size,
          instructions = instructions,
          mnems = mnems,
          names = names,
          proto = proto,
          cc = cc,
          prime = prime,
          f = f,
          comment = comment,
          true_name = true_name,
          bytes_hash = bytes_hash,
          pseudo = pseudo,
          pseudo_lines = pseudo_lines,
          pseudo_hash1 = pseudo_hash1,
          pseudocode_primes = pseudocode_primes,
          function_flags = function_flags,
          asm = asm,
          proto2 = proto2,
          pseudo_hash2 = pseudo_hash2,
          pseudo_hash3 = pseudo_hash3,
          strongly_connected_size = strongly_connected_size,
          loops = loops,
          rva = rva,
          bb_topological = bb_topological,
          strongly_connected_spp = strongly_connected_spp,
          clean_assembly = clean_assembly,
          clean_pseudo = clean_pseudo,
          mnemonics_spp = mnemonics_spp,
          switches = switches,
          function_hash = function_hash,
          bytes_sum = bytes_sum,
          md_index = md_index,
          constants = constants,
          constants_size = constants_size,
          seg_rva = seg_rva,
          assembly_addrs = assembly_addrs,
          kgh_hash = kgh_hash,
          callers = callers,
          callees = callees,
          basic_blocks_data = basic_blocks_data,
          bb_relations = bb_relations,
          userdata = userdata)
    return d 
Example 58
Project: OpenDeep   Author: vitruvianscience   File: config.py    License: Apache License 2.0 4 votes
def create_dictionary_like(input):
    """
    This takes in either an object or filename and parses it into a dictionary. Mostly useful for parsing JSON or YAML
    config files, and returning the dictionary representation.

    Parameters
    ----------
    input : collections.Mapping or str
        Dictionary-like object (implements collections.Mapping), JSON filename, or YAML filename.

    Returns
    -------
    collections.Mapping
        The parsed dictionary-like object, or None if it could not be parsed.

    .. note::

        YAML is parsed by the pyyaml library, which is an optional dependency.
        Install with 'pip install pyyaml' if you want YAML-parsing capabilities.

    """
    if input is None:
        log.debug('Input to create_dictionary_like was None.')
        return None
    # check if it is a dictionary-like object (implements collections.Mapping)
    elif isinstance(input, collections.Mapping):
        return input
    # otherwise, check if it is a filename to a .json or .yaml
    elif os.path.isfile(input):
        _, extension = os.path.splitext(input)
        # if ends in .json
        if extension.lower() == '.json':
            with open(input, 'r') as json_data:
                return json.load(json_data)
        # if ends in .yaml
        elif extension.lower() in ('.yaml', '.yml') and has_pyyaml:
            with open(input, 'r') as yaml_data:
                # safe_load avoids constructing arbitrary Python objects from the file
                return yaml.safe_load(yaml_data)
        else:
            log.critical('Configuration file %s with extension %s not supported', str(input), extension)
            if not has_pyyaml:
                log.critical('Please install pyyaml with "pip install pyyaml" to parse yaml files.')
            return None
    # otherwise not recognized/supported:
    else:
        log.critical('Could not find config. Either was not collections.Mapping object or not found in filesystem.')
        return None 
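A short usage sketch, assuming create_dictionary_like is in scope (imported from the project's config module) and that the parameter names and values are purely illustrative:

import json
import tempfile

# A Mapping is returned unchanged.
params = create_dictionary_like({"learning_rate": 0.01, "epochs": 10})

# A path to a .json file is parsed and returned as a dict.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"learning_rate": 0.01, "epochs": 10}, f)
params_from_file = create_dictionary_like(f.name)

# Anything else (None, a missing path, an unsupported extension) yields None.
assert create_dictionary_like(None) is None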
Example 59
Project: NMT-Coverage   Author: tuzhaopeng   File: preprocess.py    License: BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
def create_dictionary():
    # Part I: Counting the words
    counters = []
    sentence_counts = []
    global_counter = Counter()

    for input_file, base_filename in zip(args.input, base_filenames):
        count_filename = base_filename + '.count.pkl'
        input_filename = os.path.basename(input_file.name)
        if os.path.isfile(count_filename) and not args.overwrite:
            logger.info("Loading word counts for %s from %s"
                        % (input_filename, count_filename))
            with open(count_filename, 'rb') as f:
                counter = cPickle.load(f)
            sentence_count = sum([1 for line in input_file])
        else:
            logger.info("Counting words in %s" % input_filename)
            counter = Counter()
            sentence_count = 0
            for line in input_file:
                if args.lowercase:
                    line = line.lower()
                words = None
                if args.char:
                    words = list(line.strip().decode('utf-8'))
                else:
                    words = line.strip().split(' ')
                counter.update(words)
                global_counter.update(words)
                sentence_count += 1
        counters.append(counter)
        sentence_counts.append(sentence_count)
        logger.info("%d unique words in %d sentences with a total of %d words."
                    % (len(counter), sentence_count, sum(counter.values())))
        if args.each and args.count:
            safe_pickle(counter, count_filename)
        input_file.seek(0)

    # Part II: Combining the counts
    combined_counter = global_counter
    logger.info("Total: %d unique words in %d sentences with a total "
                "of %d words."
                % (len(combined_counter), sum(sentence_counts),
                   sum(combined_counter.values())))
    if args.count:
        safe_pickle(combined_counter, 'combined.count.pkl')

    # Part III: Creating the dictionary
    if args.vocab is not None:
        if args.vocab <= 2:
            logger.info('Building a dictionary with all unique words')
            args.vocab = len(combined_counter) + 2
        vocab_count = combined_counter.most_common(args.vocab - 2)
        logger.info("Creating dictionary of %s most common words, covering "
                    "%2.1f%% of the text."
                    % (args.vocab,
                       100.0 * sum([count for word, count in vocab_count]) /
                       sum(combined_counter.values())))
    else:
        logger.info("Creating dictionary of all words")
        vocab_count = combined_counter.most_common()  # use the combined counts, not the last file's counter
    vocab = {'UNK': 1, '<s>': 0, '</s>': 0}
    for i, (word, count) in enumerate(vocab_count):
        vocab[word] = i + 2
    safe_pickle(vocab, args.dictionary)
    return combined_counter, sentence_counts, counters, vocab 
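A small self-contained illustration of the id assignment at the end of the function above; the toy counts are made up. Ids 0 and 1 are reserved for the boundary and UNK symbols, and the most common words receive ids starting at 2:

from collections import Counter

combined_counter = Counter({"the": 5, "cat": 3, "sat": 1})
vocab_count = combined_counter.most_common(2)   # keep the 2 most common words
vocab = {'UNK': 1, '<s>': 0, '</s>': 0}
for i, (word, count) in enumerate(vocab_count):
    vocab[word] = i + 2                         # real words start at id 2
# vocab == {'UNK': 1, '<s>': 0, '</s>': 0, 'the': 2, 'cat': 3}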
Example 60
Project: Ciw   Author: CiwPython   File: import_params.py    License: MIT License 4 votes vote down vote up
def create_network_from_dictionary(params_input):
    """
    Creates a Network object from a parameters dictionary.
    """
    params = fill_out_dictionary(params_input)
    validify_dictionary(params)
    # Then make the Network object
    arrivals = [params['arrival_distributions']['Class ' + str(clss)]
        for clss in range(len(params['arrival_distributions']))]
    services = [params['service_distributions']['Class ' + str(clss)]
        for clss in range(len(params['service_distributions']))]
    if all(isinstance(f, types.FunctionType) for f in params['routing']):
        routing = params['routing']
    else:
        routing = [params['routing']['Class ' + str(clss)]
            for clss in range(len(params['routing']))]
    priorities = [params['priority_classes']['Class ' + str(clss)]
        for clss in range(len(params['priority_classes']))]
    baulking_functions = [params['baulking_functions']['Class ' + str(clss)]
        for clss in range(len(params['baulking_functions']))]
    batches = [params['batching_distributions']['Class ' + str(clss)]
        for clss in range(len(params['batching_distributions']))]
    number_of_classes = params['number_of_classes']
    number_of_nodes = params['number_of_nodes']
    queueing_capacities = [float(i) if i == "Inf" else i for i in params['queue_capacities']]
    class_change_matrices = params.get('class_change_matrices',
        {'Node ' + str(nd + 1): None for nd in range(number_of_nodes)})
    number_of_servers, schedules, nodes, classes, preempts = [], [], [], [], []
    for c in params['number_of_servers']:
        if isinstance(c, (tuple, list)):
            if isinstance(c, tuple):
                s = c[0]
                p = c[1]
            if isinstance(c, list):
                s = c
                p = False
            number_of_servers.append('schedule')
            schedules.append(s)
            preempts.append(p)
        elif c == 'Inf':
            number_of_servers.append(float(c))
            schedules.append(None)  
            preempts.append(False)
        else:
            number_of_servers.append(c)
            schedules.append(None) 
            preempts.append(False)   
    for nd in range(number_of_nodes):
        nodes.append(ServiceCentre(
            number_of_servers[nd],
            queueing_capacities[nd],
            class_change_matrices['Node ' + str(nd + 1)],
            schedules[nd],
            preempts[nd]))
    for clss in range(number_of_classes):
        if all(isinstance(f, types.FunctionType) for f in params['routing']):
            classes.append(CustomerClass(
                arrivals[clss],
                services[clss],
                routing,
                priorities[clss],
                baulking_functions[clss],
                batches[clss]))
        else:
            classes.append(CustomerClass(
                arrivals[clss],
                services[clss],
                routing[clss],
                priorities[clss],
                baulking_functions[clss],
                batches[clss]))
    n = Network(nodes, classes)
    if all(isinstance(f, types.FunctionType) for f in params['routing']):
        n.process_based = True
    else:
        n.process_based = False
    return n