Python collections.Counter() Examples

The following are code examples showing how to use collections.Counter(), extracted from open source Python projects.
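
Before the project examples, here is a minimal refresher on the core Counter API the snippets below rely on: construction from any iterable, most_common(), update(), and the + and & operators (all standard-library behavior).

from collections import Counter

# Build from any iterable of hashable elements.
c = Counter("abracadabra")
print(c.most_common(2))                  # [('a', 5), ('b', 2)]

# update() adds counts in place; + and & are multiset sum and intersection.
c.update(["a", "z"])
print(c["a"], c["z"])                    # 6 1
print(Counter("aab") + Counter("ab"))    # Counter({'a': 3, 'b': 2})
print(Counter("aab") & Counter("ab"))    # Counter({'a': 1, 'b': 1})

# Missing keys return 0 instead of raising KeyError.
print(c["missing"])                      # 0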

Example 1
Project: redberry   Author: michaelcho   File: post.py    (Apache License 2.0)
def keywords(self, num=5):

        words_only = self.strip_tags(self.content, strip_punctuation=True)
        words = words_only.split()

        counter = collections.Counter(words)
        common = counter.most_common()

        keywords = []

        INSIGNIFICANT_WORDS = ('should', 'which', 'therefore')

        for word in common:
            lower_word = word[0].lower()
            if len(lower_word) > 4 and lower_word not in INSIGNIFICANT_WORDS:
                keywords.append(lower_word)

            if len(keywords) >= num:
                break

        return ", ".join(keywords) 
Example 2
Project: STA141C   Author: clarkfitzg   File: problem2.py    (MIT License)
def overlap_score(q1, q2):
    """
    q1, q2 are preprocessed sentences (strings)

    >>> overlap_score("a b", "a")
    0.6666666666666666

    """

    c1 = Counter(q1.split())
    c2 = Counter(q2.split())
    c1c2 = c1 + c2

    both = set(c1.keys())
    both = both.intersection(c2.keys())

    bothscore = float(sum(c1c2[x] for x in both))
    mplusn = float(sum(c1c2.values()))

    score = bothscore / mplusn

    return score 
Example 3
Project: STA141C   Author: clarkfitzg   File: problem2_backup.py    (MIT License)
def overlap_score(q1, q2):
    """
    >>> overlap_score("fun", "real fun")
    0.6666666666666666
    >>> overlap_score("  ", "   ")
    0
    """

    q1count = Counter(q1.split())
    q2count = Counter(q2.split())

    both = set(q1count.keys())
    both = both.intersection(q2count.keys())
    combined = q1count + q2count

    mplusn = float(sum(combined.values()))
    overlap = float(sum(combined[x] for x in both))

    try:
        return overlap / mplusn
    except ZeroDivisionError:
        return 0 
Example 4
Project: Modeling_Preparation   Author: Yangruipis   File: decision_tree.py    (license)
def vote(df, columns_name, value):
        label_data = df.loc[df[columns_name] == value, 'label'].values
        return Counter(label_data).most_common()[0][0] 
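
A standalone sanity check of the majority-vote idiom used above, independent of the DataFrame plumbing; note that in Python 3.7+ ties in most_common() keep first-insertion order.

from collections import Counter

labels = ['spam', 'ham', 'spam', 'spam', 'ham']
print(Counter(labels).most_common()[0][0])   # 'spam'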
Example 5
Project: NeoVintageous   Author: NeoVintageous   File: state.py    (license)
def update_xpos(self, force=False):
        if self.must_update_xpos or force:
            try:
                # TODO: we should check the current mode instead. ============
                sel = self.view.sel()[0]
                pos = sel.b
                if not sel.empty():
                    if sel.a < sel.b:
                        pos -= 1
                # ============================================================
                r = sublime.Region(self.view.line(pos).a, pos)
                counter = Counter(self.view.substr(r))
                tab_size = self.view.settings().get('tab_size')
                xpos = (self.view.rowcol(pos)[1] +
                        ((counter['\t'] * tab_size) - counter['\t']))
            except Exception as e:
                nvim.console_message(e)
                _logger.exception('error setting xpos; default to 0')
                self.xpos = 0
                return
            else:
                self.xpos = xpos 
Example 6
Project: IgDiscover   Author: NBISweden   File: multidiscover.py    (MIT License)
def main(args):
	if args.minimum_frequency is None:
		minimum_frequency = max((len(args.tables) + 1) // 2, 2)
	else:
		minimum_frequency = args.minimum_frequency
	logger.info('Minimum frequency set to %s', minimum_frequency)

	# Read in tables
	tables = []
	for path in args.tables:
		table = pd.read_csv(path, sep='\t')
		table = table[table.database_diff >= args.minimum_db_diff]
		table = table.dropna()
		tables.append(table)
		if len(table) == 0:
			logger.warn('Table read from %r is empty after filtering out sequences with database diff below %s.', path, args.minimum_db_diff)

	# Count V sequence occurrences
	counter = Counter()
	for table in tables:
		counter.update(set(table.consensus))

	# Find most frequent occurrences and print result
	print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
	for sequence, frequency in counter.most_common():
		if frequency < minimum_frequency:
			break
		names = []
		gene = None
		for table in tables:
			matching_rows = table[table.consensus == sequence]
			if matching_rows.empty:
				continue
			names.extend(matching_rows.name)
			if gene is None:
				row = matching_rows.iloc[0]
				gene = row.gene
				database_diff = row.database_diff
				#shm = row['V_SHM']
		print(frequency, gene, database_diff, sequence, *names, sep='\t') 
Example 7
Project: IgDiscover   Author: NBISweden   File: commonv.py    (MIT License)
def main(args):
	if args.minimum_frequency is None:
		# args.table is a list of file names
		minimum_frequency = max((len(args.table) + 1) // 2, 2)
	else:
		minimum_frequency = args.minimum_frequency
	logger.info('Minimum frequency set to %s', minimum_frequency)

	# Read in tables
	tables = []
	for path in args.table:
		table = read_table(path)
		table = table.loc[:,['V_gene', 'V_SHM', 'V_nt', 'name']]
		tables.append(table)

	# Count V sequence occurrences
	counter = Counter()
	for table in tables:
		counter.update(set(table.V_nt))

	# Find most frequent occurrences and print result
	print('Frequency', 'Gene', '%SHM', 'Sequence', sep='\t')
	for sequence, frequency in counter.most_common():
		if frequency < minimum_frequency:
			break
		names = []
		gene = None
		for table in tables:
			matching_rows = table[table.V_nt == sequence]
			if matching_rows.empty:
				continue
			names.extend(matching_rows.name)
			if gene is None:
				row = matching_rows.iloc[0]
				gene = row['V_gene']
				shm = row['V_SHM']
		print(frequency, gene, shm, sequence, *names, sep='\t') 
Example 8
Project: xpandas   Author: alan-turing-institute   File: bag_of_features_transformer.py    (license)
def __init__(self, dictionary=None, **kwargs):
        '''
        :param dictionary: custom dictionary to count against. if None, calculate dictionary from dataset
        '''
        self.dictionary = dictionary

        accepted_types = [
            # np.ndarray is the array type; np.array is only a factory function
            pd.Series, list, np.ndarray, tuple
        ]

        def bag_of_words_transform_function(corpus):
            counter = Counter(corpus)
            for el in self.dictionary:
                if counter.get(el) is None:
                    counter[el] = 0
            return counter

        super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
                                                    columns=None,
                                                    transform_function=bag_of_words_transform_function) 
Example 9
Project: zipline-chinese   Author: zhanghan1990   File: test_term.py    (Apache License 2.0)
def assertDifferentObjects(self, *objs):
        id_counts = Counter(map(id, objs))
        ((most_common_id, count),) = id_counts.most_common(1)
        if count > 1:
            dupe = [o for o in objs if id(o) == most_common_id][0]
            self.fail("%s appeared %d times in %s" % (dupe, count, objs)) 
Example 10
Project: trf   Author: aistairc   File: analyser.py    (license)
def calc_n_types(self) -> int:
        """Calculate the number of types of input text
        Returns:
            int: the number of types of input text
        """
        surfaces = []
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            surfaces += [mrph.midasi for mrph in juman_result.mrph_list()]
        word_type_counter = Counter(surfaces)
        return len(word_type_counter) 
Example 11
Project: trf   Author: aistairc   File: analyser.py    (license)
def calc_rs_modality(self) -> Dict[str, float]:

        modality_counter = Counter()
        for i, s in enumerate(self.sentences):
            chunks = []
            for bnst in self.knp.parse(s).bnst_list():
                chunk = Chunk(chunk_id=bnst.bnst_id,
                              link=bnst.parent,
                              description=bnst.fstring)
                chunks.append(chunk)

            s = "".join([chunk.description for chunk in chunks])
            ms = set(re.findall("<?????-(.+?)>", s))
            modality_counter += Counter(ms)

            n = len(self.sentences)

        return dict([(k, float(c) / n)
                     for k, c in modality_counter.items()]) 
Example 12
Project: PlasoScaffolder   Author: ClaudiaSaxer   File: sqlite_type_helper.py    (license)
def GetDuplicateColumnNames(
      self, columns: sql_query_column_model.SQLColumnModel) -> [str]:
    """Find out if the query has duplicate column names and if a alias is
        needed.

    Args:
      columns (sql_query_column_model.SQLColumnModel): all columns parsed
          from the cursor
    Returns:
      [str]: a list of all duplicate column names; if it is empty, the
          column names are distinct
    """
    single_column_name_list = [column.sql_column for column in columns]
    duplicate_list = [column for column, count in
                      collections.Counter(single_column_name_list).items() if
                      count > 1]
    return sorted(duplicate_list) 
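
The same Counter-based duplicate detection works on any list of names; a minimal standalone version:

from collections import Counter

names = ['id', 'name', 'id', 'ts', 'name']
duplicates = sorted(col for col, n in Counter(names).items() if n > 1)
print(duplicates)   # ['id', 'name']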
Example 13
Project: tf_rnnlm   Author: Ubiqus   File: dataset.py    (Apache License 2.0)
def _build_vocab(self, filename):
    counts = Counter()
    with tf.gfile.GFile(filename, "r") as f:
      #for line in f:
      #  words = line.replace("\n"," ").split()
      #  counts += Counter(words)
      while True:
        chunk = f.read(int(500000000/2))
        if not chunk: 
          break
        counts += Counter(chunk.replace("\n", " ").split())

    sorted_pairs = sorted(counts.items(), key=lambda x: (-x[1], x[0]))
    self.word_to_id = {e[0]: (i+3) for (i, e) in enumerate(sorted_pairs)}
    self.word_to_id[EOS] = IEOS
    self.word_to_id[BOS] = IBOS
    self.word_to_id[PAD] = IPAD 
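
One detail worth noting: the intermediate Counter in counts += Counter(...) is unnecessary, because update() accepts any iterable and counts it directly (and, depending on the Python version, += either allocates a new Counter or does an in-place add that also discards non-positive entries). A sketch of the leaner accumulation loop:

from collections import Counter

counts = Counter()
for chunk in ["a b a\nb", "b c"]:            # stand-ins for file chunks
    counts.update(chunk.replace("\n", " ").split())
print(counts)                                # Counter({'b': 3, 'a': 2, 'c': 1})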
Example 14
Project: dl4mt-multi   Author: nyu-dl   File: models.py    (BSD 3-Clause "New" or "Revised" License)
def print_params(self, cgs):
        """
        cgs : list of computational graph names
        """
        for name, cg in cgs.iteritems():
            shapes = [param.get_value().shape for param in cg.parameters]
            logger.info(
                "Parameter shapes for computation graph[{}]".format(name))
            for shape, count in Counter(shapes).most_common():
                logger.info('    {:15}: {}'.format(shape, count))
            logger.info(
                "Total number of parameters for computation graph[{}]: {}"
                .format(name, len(shapes)))

            logger.info(
                "Parameter names for computation graph[{}]: ".format(name))
            for item in cg.parameters:
                logger.info(
                    "    {:15}: {}".format(item.get_value().shape, item.name))
            logger.info(
                "Total number of parameters for computation graph[{}]: {}"
                .format(name, len(cg.parameters))) 
Example 15
Project: manubot   Author: greenelab   File: manuscript.py    (license)
def get_manuscript_stats(text, citation_df):
    """
    Compute manuscript statistics.
    """
    stats = collections.OrderedDict()

    # Number of distinct references by type
    ref_counts = (
        citation_df
        .standard_citation
        .drop_duplicates()
        .map(lambda x: x.split(':')[0])
        .pipe(collections.Counter)
    )
    ref_counts['total'] = sum(ref_counts.values())
    stats['reference_counts'] = ref_counts
    stats['word_count'] = len(text.split())
    logging.info(f"Generated manscript stats:\n{json.dumps(stats, indent=2)}")
    return stats 
Example 16
Project: otRebuilder   Author: Pal3love   File: __init__.py    (MIT License)
def subset_glyphs(self, s):
    table = self.table.Baseline
    if table.Format in (1, 3):
        baselines = {glyph: table.BaselineValues.get(glyph, table.DefaultBaseline)
                     for glyph in s.glyphs}
        if len(baselines) > 0:
            mostCommon, _cnt = Counter(baselines.values()).most_common(1)[0]
            table.DefaultBaseline = mostCommon
            baselines = {glyph: b for glyph, b in baselines.items()
                         if b != mostCommon}
        if len(baselines) > 0:
            table.BaselineValues = baselines
        else:
            table.Format = {1: 0, 3: 2}[table.Format]
            del table.BaselineValues
    return True 
Example 17
Project: otRebuilder   Author: Pal3love   File: __init__.py    (MIT License)
def subset_glyphs(self, s):
    prop = self.table.GlyphProperties
    if prop.Format == 0:
        return prop.DefaultProperties != 0
    elif prop.Format == 1:
        prop.Properties = {g: prop.Properties.get(g, prop.DefaultProperties)
                           for g in s.glyphs}
        mostCommon, _cnt = Counter(prop.Properties.values()).most_common(1)[0]
        prop.DefaultProperties = mostCommon
        prop.Properties = {g: prop for g, prop in prop.Properties.items()
                           if prop != mostCommon}
        if len(prop.Properties) == 0:
            del prop.Properties
            prop.Format = 0
            return prop.DefaultProperties != 0
        return True
    else:
        assert False, "unknown 'prop' format %s" % prop.Format 
Example 18
Project: Deep-Learning-with-Keras   Author: PacktPublishing   File: mem-network.py    (MIT License)
def build_vocab(train_data, test_data):
    counter = collections.Counter()
    for stories, questions, answers in [train_data, test_data]:
        for story in stories:
            for sent in story:
                for word in nltk.word_tokenize(sent):
                    counter[word.lower()] += 1
        for question in questions:
            for word in nltk.word_tokenize(question):
                counter[word.lower()] += 1
        for answer in answers:
            for word in nltk.word_tokenize(answer):
                counter[word.lower()] += 1
    # no OOV here because there are not too many words in dataset
    word2idx = {w:(i+1) for i, (w, _) in enumerate(counter.most_common())}
    word2idx["PAD"] = 0
    idx2word = {v:k for k, v in word2idx.items()}
    return word2idx, idx2word 
Example 19
Project: mbin   Author: fanglab   File: create_kmer_freq_vectors.py    (license)
def kmer_freq ( ref_str, k ):
	"""
	Walk through sequence and return k-mer counts plus
	a pseudocount of 1.
	"""
	ref_str = ref_str.upper()
	kmers = []
	for seq in product("ATGC",repeat=k):
		kmers.append( "".join(seq) )

	kmer_counts = Counter()
	for j in range( len(ref_str)-(k-1) ):
		motif    = ref_str[j:j+k]
		kmer_counts[motif] += 1

	# Combine forward and reverse complement motifs into one count
	combined_kmer = Counter()
	for kmer in kmers:
		kmer_rc = rev_comp_motif(kmer)
		if not combined_kmer.get(kmer_rc):
			combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

	return combined_kmer 
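
rev_comp_motif() is a project helper; a hypothetical stand-in that makes the snippet runnable, assuming plain DNA k-mers:

from collections import Counter

def rev_comp_motif(kmer):
    # Hypothetical stand-in: reverse complement of a DNA k-mer.
    return kmer.translate(str.maketrans("ATGC", "TACG"))[::-1]

print(rev_comp_motif("AAG"))                 # 'CTT'
# With the pseudocount of 1, "AAG" and "CTT" collapse into a single entry:
kmer_counts = Counter({"AAG": 4, "CTT": 2})
print(kmer_counts["AAG"] + kmer_counts["CTT"] + 1)   # 7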
Example 20
Project: mbin   Author: fanglab   File: read_scanner.py    (license)
def kmer_freq ( mode, ref_str, strand, opts ):
	ref_str = ref_str.upper()
	if strand==1:
		ref_str = ref_str[::-1]
	k = opts.comp_kmer
	kmers = []
	for seq in product("ATGC",repeat=k):
		kmers.append( "".join(seq) )

	kmer_counts = Counter()
	for j in range( len(ref_str)-(k-1) ):
		motif    = ref_str[j:j+k]
		kmer_counts[motif] += 1

	# Combine forward and reverse complement motifs into one count
	combined_kmer = Counter()
	for kmer in kmers:
		kmer_rc = motif_tools.rev_comp_motif(kmer)
		if not combined_kmer.get(kmer_rc):
			combined_kmer[kmer] = kmer_counts[kmer] + kmer_counts[kmer_rc] + 1

	return combined_kmer 
Example 21
Project: keras-utilities   Author: cbaziotis   File: data_preparation.py    (MIT License)
def get_class_weights2(y, smooth_factor=0):
    """
    Returns the normalized weights for each class based on the frequencies of the samples
    :param smooth_factor: factor that smooths extremely uneven weights
    :param y: list of true labels (the labels must be hashable)
    :return: dictionary with the weight for each class
    """
    counter = Counter(y)

    if smooth_factor > 0:
        p = max(counter.values()) * smooth_factor
        for k in counter.keys():
            counter[k] += p

    majority = max(counter.values())

    return {cls: float(majority / count) for cls, count in counter.items()} 
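
For instance, with smooth_factor=0 the majority class gets weight 1.0 and rarer classes are weighted up proportionally:

from collections import Counter

y = [0, 0, 0, 1, 1, 2]
counter = Counter(y)                          # Counter({0: 3, 1: 2, 2: 1})
majority = max(counter.values())
print({cls: majority / count for cls, count in counter.items()})
# {0: 1.0, 1: 1.5, 2: 3.0}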
Example 22
Project: DeepPath   Author: xwhan   File: utils.py    (license)
def path_clean(path):
	rel_ents = path.split(' -> ')
	relations = []
	entities = []
	for idx, item in enumerate(rel_ents):
		if idx%2 == 0:
			relations.append(item)
		else:
			entities.append(item)
	entity_stats = Counter(entities).items()
	duplicate_ents = [item for item in entity_stats if item[1]!=1]
	duplicate_ents.sort(key = lambda x:x[1], reverse=True)
	for item in duplicate_ents:
		ent = item[0]
		ent_idx = [i for i, x in enumerate(rel_ents) if x == ent]
		if len(ent_idx)!=0:
			min_idx = min(ent_idx)
			max_idx = max(ent_idx)
			if min_idx!=max_idx:
				rel_ents = rel_ents[:min_idx] + rel_ents[max_idx:]
	return ' -> '.join(rel_ents) 
Example 23
Project: dactyl   Author: ripple   File: dactyl_style_checker.py    (MIT License)
def main(cli_args):
    if len(config["targets"]) == 0:
        exit("No target found; maybe you need to specify a Dactyl config file?")

    issues = check_all_pages(target=cli_args.target)
    if issues:
        num_issues = sum(len(p[1]) for p in issues)
        print("Found %d issues:" % num_issues)
        for pagename,issuelist in issues:
            print("Page: %s" % pagename)
            c = collections.Counter(issuelist)
            for i, count_i in c.items():
                if i[0]=="Unplain Phrase":
                    print("   Discouraged phrase: %s (%d instances); suggest '%s' instead." %
                                    ( i[1], count_i, config["disallowed_phrases"][i[1].lower()] ))
                elif i[0]=="Unplain Word":
                    print("   Discouraged word: %s (%d instances); suggest '%s' instead." %
                                    ( i[1], count_i, config["disallowed_words"][i[1].lower()] ))
                else:
                    print("   %s: %s (%d instances)" % (i[0], i[1], count_i))
        exit(1)
    else:
        print("Style check passed with flying colors!")
        exit(0) 
Example 24
Project: evaluation_tools   Author: JSALT-Rosetta   File: sampling.py    (license)
def get_nb_caption_per_img(n, selected_captions): 
    """
    Get image id from audio caption file names that were selected by their speakers
    Choose images that have at least n captions per image
    ----------
    n : int, 
        desired number of caption per image
    selected_captions : list of string, 
        list of caption file names selected by their speakers
    """
    
    counter_nb_caption = Counter()

    for cap in selected_captions:
        # get image id
        ImgID = cap.split('_')[0]
        # add a count
        counter_nb_caption[ImgID] += 1

    # choose image ids that have a count of exactly n
    d = dict((k, v) for k, v in counter_nb_caption.items() if v == n)

    ImgID_selected = d.keys()

    return ImgID_selected
Example 25
Project: deeppavlov   Author: deepmipt   File: utils.py    (license)
def _f1_score(pred, answers):
    """Compute the F1 score."""

    def _score(g_tokens, a_tokens):
        common = Counter(g_tokens) & Counter(a_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0
        precision = 1. * num_same / len(g_tokens)
        recall = 1. * num_same / len(a_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
        return f1

    if pred is None or answers is None:
        return 0
    g_tokens = _normalize_answer(pred).split()
    scores = [_score(g_tokens, _normalize_answer(a).split()) for a in answers]
    return max(scores) 
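
The & operator above is a multiset intersection (element-wise minimum of counts), which is exactly what token-overlap F1 needs. A standalone check of the inner _score logic:

from collections import Counter

g_tokens = "the cat sat".split()
a_tokens = "the cat".split()
common = Counter(g_tokens) & Counter(a_tokens)
num_same = sum(common.values())               # 2
precision = num_same / len(g_tokens)          # 2/3
recall = num_same / len(a_tokens)             # 1.0
print(2 * precision * recall / (precision + recall))   # 0.8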
Example 26
Project: dsb3   Author: EliasVansteenkiste   File: test_data.py    (license)
def test2():
    patient_data_paths = utils_lung.get_patient_data_paths(pathfinder.DATA_PATH)
    print len(patient_data_paths)
    pixel_spacings_xy = []
    n_slices = []

    for k, p in enumerate(patient_data_paths):
        pid = utils_lung.extract_pid_dir(p)
        sid2data, sid2metadata = utils_lung.get_patient_data(p)
        mtd = sid2metadata.itervalues().next()

        assert mtd['PixelSpacing'][0] == mtd['PixelSpacing'][1]
        pixel_spacings_xy.append(mtd['PixelSpacing'][0])
        n_slices.append(len(sid2metadata))
        print pid, pixel_spacings_xy[-1], n_slices[-1]

    print 'nslices', np.max(n_slices), np.min(n_slices), np.mean(n_slices)
    counts = collections.Counter(pixel_spacings_xy)
    new_list = sorted(pixel_spacings_xy, key=counts.get, reverse=True)
    print 'spacing', new_list 
Example 27
Project: KATE   Author: hugochan   File: retrieval.py    (BSD 3-Clause "New" or "Revised" License)
def retrieval_perlabel(X_train, Y_train, X_test, Y_test, fractions=[0.01, 0.5, 1.0]):
    X_train = unitmatrix(X_train) # normalize
    X_test = unitmatrix(X_test)
    score = X_test.dot(X_train.T)
    precisions = defaultdict(dict)
    label_counter = Counter(Y_test.tolist())

    for idx in range(len(X_test)):
        retrieval_idx = score[idx].argsort()[::-1]
        for fr in fractions:
            ntop = int(fr * len(X_train))
            pr = float(len([i for i in retrieval_idx[:ntop] if Y_train[i] == Y_test[idx]])) / ntop
            try:
                precisions[fr][Y_test[idx]] += pr
            except:
                precisions[fr][Y_test[idx]] = pr
    new_pr = {}
    for fr, val in precisions.iteritems():
        avg_pr = 0.
        for label, pr in val.iteritems():
            avg_pr += pr / label_counter[label]
        new_pr[fr] = avg_pr / len(label_counter)

    return sorted(new_pr.items(), key=lambda d:d[0]) 
Example 28
Project: EventStoryLine   Author: tommasoc80   File: baseline_PPMI1.py    (license)
def cross_sentence(event_lemma_dict):
    """
    function to create all possible pairs between event mentions in a file
    :param event_lemma_dict: dictionary of event lemmas in file
    :return: counter dictionary of event pairs in a file
    """

    full_event_file = []
    pairs_circumstantial_corpus = Counter([])

    for k, v in event_lemma_dict.items():
        full_event_file.append(k)

    event_pairs_full = list(product(full_event_file, repeat=2))

    for i in event_pairs_full:
        pairs_circumstantial_corpus.update([i])

    return pairs_circumstantial_corpus 
Example 29
Project: sentrycli   Author: operasoftware   File: group.py    (MIT License)
def print_grouping(attributes, grouping, top):
    """
    Print computed groups.

    :param attributes: list of grouped attributes
    :type: list(str)
    :param grouping: counter for each combination of attributes' values
    :type: Counter
    :type top: int
    """
    total = sum(grouping.values())

    table = Table(attributes + ['count', '%'])
    table.add_rows(total, grouping.most_common(top))

    print '\n' + table.by_count()
    print 'Total:', total 
Example 30
Project: gmlan_gw   Author: tmkdev   File: gmlan_gw.py    (license)
def __init__(self):
        self.handlers = {
            0x001: self._power,
            0x186: self._text,
            0x185: self._textparam,
            0x061: self._exttemp,
            0x005: self._tpms,
            #0x18e: self._textparam,
            0x026: self._fuel,
            0x053: self._gpsdate,
            0x055: self._gps,
        }

        self.counter = Counter() 
        self.locations = []
        self.fuel = [0,0] 
Example 31
Project: Eskapade   Author: KaveIO   File: value_counter.py    (license)
def fill_histogram(self, idf, columns):
        """Fill input histogram with column(s) of input dataframe

        :param idf: input data frame used for filling histogram
        :param list columns: histogram column(s)
        """

        name = ':'.join(columns)
        if name not in self._counts:
            # create an (empty) value counts dict
            self._counts[name] = Counter()
        # value_counts() is faster than groupby().size(), but only works for series (1d).
        # else use groupby() for multi-dimensions
        g = idf.groupby(by=columns).size() if len(columns) > 1 else idf[columns[0]].value_counts()
        counts = Counter(g.to_dict())
        # remove specific keys from histogram before merging, if so requested
        counts = self.drop_requested_keys(name, counts)
        self._counts[name].update(counts) 
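
Note that Counter(g.to_dict()) builds a counter straight from a value-to-count mapping, and update() then merges counts across successive fills:

from collections import Counter

hist = Counter()
hist.update(Counter({"a": 3, "b": 1}))        # first fill
hist.update(Counter({"a": 2, "c": 5}))        # second fill merges counts
print(hist)                                   # Counter({'a': 5, 'c': 5, 'b': 1})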
Example 32
Project: Eskapade   Author: KaveIO   File: test_histogram.py    (license)
def test_bin_edges(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        # uniform
        bin_edges = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
        self.assertListEqual(h.get_uniform_bin_edges(), bin_edges)

        # truncated uniform bin edges
        truncated_bin_edges = [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]
        self.assertListEqual(h.truncated_bin_edges([5.5,12.5]), truncated_bin_edges)

        h_bin_edges = h.bin_edges()
        self.assertIsInstance(h_bin_edges, np.ndarray)
        self.assertListEqual(h_bin_edges.tolist(), bin_edges) 
Example 33
Project: Eskapade   Author: KaveIO   File: test_histogram.py    (license)
def test_bin_centers(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        bin_centers = [0.5, 2.5, 4.5, 6.5, 8.5, 10.5, 12.5, 14.5, 16.5, 18.5]
        h_bin_centers = h.bin_centers()
        self.assertIsInstance(h_bin_centers, np.ndarray)
        self.assertListEqual(h_bin_centers.tolist(), bin_centers) 
Example 34
Project: Eskapade   Author: KaveIO   File: test_histogram.py    (license)
def test_bin_entries(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        bin_entries = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
        h_bin_entries = h.bin_entries()
        self.assertIsInstance(h_bin_entries, np.ndarray)
        self.assertListEqual(h_bin_entries.tolist(), bin_entries) 
Example 35
Project: Eskapade   Author: KaveIO   File: test_histogram.py    (license)
def test_bin_labels(self):

        # constructor
        cnt = Counter()
        for i in range(10):
            cnt[i*2] = i

        vc = ValueCounts(key='x', counts=cnt)
        bin_specs = { 'bin_width': 1, 'bin_offset': 0 }

        h = Histogram(vc, variable='x', bin_specs = bin_specs)

        bin_labels = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
        h_bin_labels = h.bin_labels()
        self.assertIsInstance(h_bin_labels, np.ndarray)
        self.assertListEqual(h_bin_labels.tolist(), bin_labels) 
Example 36
Project: identifiera-sarkasm   Author: risnejunior   File: preprocess_data.py    (license)
def build_vocabulary( words, max_size ):
	vocab_instances = 0
	unique_counts = Counter(words)
	d = dict(unique_counts.most_common(cfg.vocabulary_size-2) )
	vocabulary = OrderedDict( sorted(d.items(), key=lambda t: t[1],  reverse=True) )

	# start at 2 to leave room for padding & unknown
	pb = Progress_bar(len(d) - 1) 
	for i, (key, value) in enumerate(vocabulary.items(), start=2):		
		vocab_instances += value
		vocabulary[key] = i
		pb.tick()

	vocabulary[cfg.padding_char] = 0
	vocabulary[cfg.placeholder_char] = 1
	#reverse the vocabulary (for reverse lookup)
	rev_vocabulary = {v: k for k, v in vocabulary.items()}	
	vocab = (len(unique_counts), vocab_instances, vocabulary, rev_vocabulary)

	return vocab 
Example 37
Project: histwords   Author: williamleif   File: counts2vocab.py    (license)
def main():
    args = docopt("""
    Usage:
        counts2pmi.py <counts>
    """)
    
    counts_path = args['<counts>']

    words = Counter()
    contexts = Counter()
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            count = int(count)
            words[word] += count
            contexts[context] += count

    words = sorted(words.items(), key=lambda (x, y): y, reverse=True)
    contexts = sorted(contexts.items(), key=lambda (x, y): y, reverse=True)

    save_count_vocabulary(counts_path + '.words.vocab', words)
    save_count_vocabulary(counts_path + '.contexts.vocab', contexts) 
Example 38
Project: MetaphoricChange   Author: Garrafao   File: dsm_module.py    (license)
def build_frequency_file(dtatcfdir, freq_file, MIN_FREQ, join_sign):
    """
    Builds file with all lemma + POS pairs above certain frequency threshold. 
    :param dtatcfdir: path to directory with dta tcf files
    :param freq_file: path to frequency file
    :param MIN_FREQ: frequency threshold
    :param join_sign: sign to join lemma + first char of POS
    """
    
    # build frequency file from lemmas
    outputpath = freq_file
    print 'Building frequency file to ' + outputpath + "..."
    lemma_count = Counter(build_lemma_list(dtatcfdir, join_sign))
    frequent_lemmas = filter(lambda x: lemma_count[x] >= MIN_FREQ, lemma_count)
    with open(outputpath, 'w') as f_out:
        for lemma in frequent_lemmas:
            print >> f_out, lemma.encode('utf-8') 
Example 39
Project: mordecai   Author: openeventdata   File: geoparse.py    (license)
def _feature_most_common(self, results):
        """
        Find the most common country name in ES/Geonames results

        Parameters
        ----------
        results: dict
            output of `query_geonames`

        Returns
        -------
        most_common: str
            ISO code of most common country, or empty string if none
        """
        try:
            country_count = Counter([i['country_code3'] for i in results['hits']['hits']])
            most_common = country_count.most_common()[0][0]
            return most_common
        except IndexError:
            return ""
        except TypeError:
            return "" 
Example 40
Project: atma   Author: AtmaHou   File: bleu.py    (license)
def MP(candidate, references, n):
    """
    calculate modified precision
    """
    counts = Counter(ngrams(candidate, n))
    if not counts:
        return 0

    max_counts = {}
    for reference in references:
        reference_counts = Counter(ngrams(reference, n))
        for ngram in counts:
            max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram])

    clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items())

    return sum(clipped_counts.values()) / sum(counts.values()) 
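
A small worked example of the clipping step, using a throwaway ngrams() helper in place of whatever the project imports (presumably NLTK's):

from collections import Counter

def ngrams(tokens, n):
    # Hypothetical minimal stand-in: sliding windows of length n.
    return zip(*(tokens[i:] for i in range(n)))

candidate = "the the the".split()
reference = "the cat".split()
counts = Counter(ngrams(candidate, 1))        # Counter({('the',): 3})
ref_counts = Counter(ngrams(reference, 1))    # ('the',) and ('cat',) once each
clipped = {ng: min(c, ref_counts[ng]) for ng, c in counts.items()}
print(sum(clipped.values()) / sum(counts.values()))    # 0.3333...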
Example 41
Project: STA141C   Author: clarkfitzg   File: problem2c.py    (MIT License)
def overlap_score(q1, q2):
    """
    >>> overlap_score("a b c", "a b")
    0.8

    >>> overlap_score("   ", " ")
    0
    """

    c1 = Counter(q1.split())
    c2 = Counter(q2.split())

    numerator = 0
    for word in c1:
        if word in c2:
            numerator += c1[word]
    for word in c2:
        if word in c1:
            numerator += c2[word]

    m = sum(c1.values())
    n = sum(c2.values())

    try:
        score = numerator / (m + n)
    except ZeroDivisionError:
        score = 0
    return score 
Example 42
Project: ThreatPrep   Author: ThreatResponse   File: checker.py    (MIT License)
def get_category_stats(self):
        """Get a count of CheckState results for each category of checks.
        Ignore collection counts to avoid duplications"""
        flat_results = self.get_flattened_results()
        categories = list(set([x.category for x in flat_results]))
        metrics = {}
        for category in categories:
            metrics[category] = collections.Counter([
                x.status for x in filter(
                    lambda y: len(y.subchecks) == 0 and y.category==category,
                    flat_results
                )
            ])
        return metrics 
Example 43
Project: monasca-transform   Author: openstack   File: test_data_driven_specs.py    (Apache License 2.0)
def check_list_field_for_row(
            self, row=None, field_name=None, expected_list=None):
        found_list = getattr(row, field_name)
        self.assertEqual(Counter(expected_list), Counter(found_list)) 
Example 44
Project: python-driver   Author: bblfsh   File: issue62_b.py    (GNU General Public License v3.0)
def convert_uasts(self, file_uast_generator):
        for file_uast in file_uast_generator:
            print("-" * 20 + " " + str(file_uast.filepath))
            id_cnt = Counter()
            self.collect_id_cnt(file_uast.response.uast, id_cnt)
            print(id_cnt) 
Example 45
Project: companycase   Author: duedil-ltd   File: companycase.py    (MIT License)
def fetch_all_transitions(self, language, ngram_length):
        """ Generate a dict of counts for transitions for all n-grams in the language word list """
        wordlist = os.path.join(os.path.dirname(__file__), "wordlists/{0}.txt".format(language))
        if not os.path.exists(wordlist):
            raise SystemError("Language '{0}' does not exist".format(language))

        all_grams = []
        with codecs.open(wordlist, 'r', encoding='utf-8') as f:
            for line in f:
                words = line.strip('\n').lower().split()
                ngrams = reduce(lambda x, y: x + y, map(lambda word: self.find_ngrams(word, ngram_length), words))
                all_grams += ngrams
        return dict(Counter(all_grams)) 
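
find_ngrams() is a method of the class; a hypothetical equivalent for character n-grams, to show what actually lands in the Counter:

from collections import Counter

def find_ngrams(word, n):
    # Hypothetical stand-in: all character n-grams of a word.
    return [word[i:i + n] for i in range(len(word) - n + 1)]

print(Counter(find_ngrams("hello", 2)))
# Counter({'he': 1, 'el': 1, 'll': 1, 'lo': 1})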
Example 46
Project: variational-text-tensorflow   Author: carpedm20   File: reader.py    (MIT License)
def _build_vocab(self, file_path, vocab_path):
    counter = Counter(self._read_text(file_path).split())

    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    self.vocab = dict(zip(words, range(len(words))))

    save_pkl(vocab_path, self.vocab) 
Example 47
Project: treecat   Author: posterior   File: util.py    (Apache License 2.0)
def log_profiling_stats():
    logger.info('-----------------------------------------------------------')
    logger.info('Series:')
    for name, series in sorted(SERIES.items()):
        logger.info('  {}: {}'.format(name, ' '.join(map(str, series))))

    logger.info('-----------------------------------------------------------')
    logger.info('Histograms:')
    for name, histogram in sorted(HISTOGRAMS.items()):
        logger.info('{: >10s} {}'.format('Count', name))
        for value, count in sorted(histogram.items()):
            logger.info('{: >10d} {}'.format(count, value))

    logger.info('-----------------------------------------------------------')
    logger.info('Counters:')
    logger.info('{: >10s} {}'.format('Count', 'Counter'))
    for name, count in sorted(COUNTERS.items()):
        logger.info('{: >10d} {}'.format(count, name))

    logger.info('-----------------------------------------------------------')
    logger.info('Timers:')
    times = [(t.elapsed, t.count, f) for (f, t) in TIMERS.items()]
    times.sort(reverse=True, key=lambda x: x[0])
    logger.info('{: >10} {: >10} {}'.format('Seconds', 'Calls', 'Function'))
    for time, count, name in times:
        logger.info('{: >10.3f} {: >10} {}'.format(time, count, name)) 
Example 48
Project: IgDiscover   Author: NBISweden   File: discover.py    (MIT License)
def _guess_cdr3_start(group):
		"""
		Return a guess for the CDR3 start within sequences in the given group
		"""
		return Counter(group.V_CDR3_start).most_common()[0][0] 
Example 49
Project: trf   Author: aistairc   File: analyser.py    (license)
def calc_rs_pos(self) -> Dict[str, float]:
        """Calculate the ratio of each pos of words in input text
        Returns:
            float: the ratio of each pos of words in input text
        """
        pos = []
        # TODO: It may take a long time when the number of sentences is large
        for sentence in self.sentences:
            juman_result = self.juman.analysis(sentence)
            pos += [mrph.hinsi for mrph in juman_result.mrph_list()]
        pos_counter = Counter(pos)
        total = sum(pos_counter.values())
        return {name: float(num) / total for name, num in pos_counter.items()} 
Example 50
Project: cellranger   Author: 10XGenomics   File: report.py    (license)
def __init__(self, **kwargs):
        Metric.__init__(self, **kwargs)
        self.d = collections.Counter()