Python collections.Counter() Examples

The following are 30 code examples showing how to use collections.Counter(). These examples are extracted from open source projects; you can go to the original project or source file by following the links above each example.

You may also want to check out all available functions/classes of the module collections, or try the search function.
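
Before diving in, here is a minimal standalone sketch (not from any of the projects below) of the core Counter API that these examples rely on: counting an iterable, most_common(), zero-default lookup for missing keys, and update()/subtract().

from collections import Counter

# Count hashable items from any iterable.
counts = Counter("abracadabra")
print(counts.most_common(2))   # [('a', 5), ('b', 2)]

# Missing keys return 0 instead of raising KeyError.
print(counts["z"])             # 0

# update() adds counts; subtract() can drive them to zero or below.
counts.update("aa")            # 'a' -> 7
counts.subtract("abc")         # 'a' -> 6, 'b' -> 1, 'c' -> 0
print(counts["a"])             # 6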

Example 1
Project: decisiontrees   Author: jayelm   File: dtree.py    License: MIT License
def value_counts(self, subset, attr, value, base=False):
        """
        Get the number of occurrences per value of the dependent variable when
        the given attribute is equal to the given value.

        FIXME: Can attr/value be eliminated??

        Args:
            subset: the subset to act upon.
            attr: the attribute of the value.
            value: the value with which to track counts.
            base: whether or not to calculate values based on the dependent
                value (default False).
        Returns:
            A Counter instance detailing the number of occurrences per
            dependent variable.

        """
        counts = Counter()
        for row in subset:
            if row[attr] == value or base:
                counts[row[self.dependent]] += 1
        return counts 
Example 2
Project: decisiontrees   Author: jayelm   File: dtree.py    License: MIT License
def attr_counts(self, subset, attr):
        """
        Get the number of occurrences per value of the given attribute

        Args:
            subset: the subset to act upon.
            attr: the selected attribute.
        Returns:
            A Counter instance detailing the number of occurrences per
            attribute value.

        """
        counts = Counter()
        for row in subset:
            counts[row[attr]] += 1
        return counts 
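
Outside the class, the pattern in Examples 1 and 2 boils down to tallying one field across dict-like rows. A quick illustration with made-up rows (the column names are assumptions for demonstration only):

from collections import Counter

# Hypothetical rows standing in for `subset`.
subset = [
    {"outlook": "sunny", "play": "no"},
    {"outlook": "rain",  "play": "yes"},
    {"outlook": "sunny", "play": "yes"},
]

counts = Counter()
for row in subset:
    counts[row["outlook"]] += 1
print(counts)   # Counter({'sunny': 2, 'rain': 1})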
Example 3
Project: gated-graph-transformer-network   Author: hexahedria   File: ggtnn_graph_parse.py    License: MIT License
def get_buckets(stories, max_ignore_unbatched=100, max_pad_amount=25):
    sentencecounts = [len(sents_graphs) for (sents_graphs, query, answer) in stories]
    countpairs = sorted(collections.Counter(sentencecounts).items())

    buckets = []
    smallest_left_val = 0
    num_unbatched = max_ignore_unbatched
    for val,ct in countpairs:
        num_unbatched += ct
        if val - smallest_left_val > max_pad_amount or num_unbatched > max_ignore_unbatched:
            buckets.append(val)
            smallest_left_val = val
            num_unbatched = 0
    if buckets[-1] != countpairs[-1][0]:
        buckets.append(countpairs[-1][0])

    return buckets 
Example 4
Project: CAMISIM   Author: CAMI-challenge   File: strainsimulationwrapper.py    License: Apache License 2.0
def _get_genome_amounts(self, probability, max_genome_amount):
		"""
		Get amounts of genomes by original genome

		@param probability: Proportion of simulated original genomes
		@type probability: int | long | float
		@param max_genome_amount: Total number of genomes
		@type max_genome_amount: int | long

		@return: List of integers representing amount of strains
		@rtype: list[int]
		"""
		assert isinstance(probability, (int, long, float))
		assert 0 <= probability <= 1
		assert isinstance(max_genome_amount, (int, long))

		genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
		diverence = Counter(genome_amounts)[1] / float(len(genome_amounts))
		if max_genome_amount >= 10:
			while abs(diverence - probability) > 0.05:
				# print "need: {}, gotten: {}".format(probability, diverence)
				genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
				diverence = Counter(genome_amounts)[1] / float(len(genome_amounts))
		return genome_amounts 
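
The Counter(genome_amounts)[1] lookup above counts how many entries equal 1 without an explicit loop, because indexing a Counter returns 0 for absent keys instead of raising. A tiny illustration with an invented list:

from collections import Counter

genome_amounts = [1, 1, 2, 3, 1, 5]   # hypothetical strain counts
fraction_of_ones = Counter(genome_amounts)[1] / float(len(genome_amounts))
print(fraction_of_ones)               # 0.5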
Example 5
Project: deep-learning-note   Author: wdxtub   File: w2v_utils.py    License: MIT License
def build_vocab(words, vocab_size, visual_fld):
    """ Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv
    """
    utils.safe_mkdir(visual_fld)
    file = open(os.path.join(visual_fld, 'vocab.tsv'), 'w')

    dictionary = dict()
    count = [('UNK', -1)]
    index = 0
    count.extend(Counter(words).most_common(vocab_size - 1))

    for word, _ in count:
        dictionary[word] = index
        index += 1
        file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    file.close()
    return dictionary, index_dictionary 
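
The heart of this function is Counter.most_common(), which returns (word, count) pairs in descending frequency order. A small sketch of the count list it builds (the sentence is made up):

from collections import Counter

words = "the cat sat on the mat the cat".split()
count = [("UNK", -1)]
count.extend(Counter(words).most_common(3))   # keep the 3 most frequent words
print(count)   # [('UNK', -1), ('the', 3), ('cat', 2), ('sat', 1)]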
Example 6
def generate_bi_graphemes_dictionary(label_list):
    freqs = Counter()
    for label in label_list:
        label = label.split(' ')
        for i in label:
            for pair in split_every(2, i):
                if len(pair) == 2:
                    freqs[pair] += 1


    with open('resources/unicodemap_en_baidu_bi_graphemes.csv', 'w') as bigram_label:
        bigramwriter = csv.writer(bigram_label, delimiter = ',')
        baidu_labels = list('\' abcdefghijklmnopqrstuvwxyz')
        for index, key in enumerate(baidu_labels):
            bigramwriter.writerow((key, index+1))
        for index, key in enumerate(freqs.keys()):
            bigramwriter.writerow((key, index+len(baidu_labels)+1)) 
Example 7
def test_tokens_to_indices():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1, unknown_token='<unk>',
                                  reserved_tokens=None)

    i1 = vocab.to_indices('c')
    assert i1 == 1

    i2 = vocab.to_indices(['c'])
    assert i2 == [1]

    i3 = vocab.to_indices(['<unk>', 'non-exist'])
    assert i3 == [0, 0]

    i4 = vocab.to_indices(['a', 'non-exist', 'a', 'b'])
    assert i4 == [3, 0, 3, 2] 
Example 8
def test_indices_to_tokens():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unknown>', reserved_tokens=None)
    i1 = vocab.to_tokens(1)
    assert i1 == 'c'

    i2 = vocab.to_tokens([1])
    assert i2 == ['c']

    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']

    i4 = vocab.to_tokens([3, 0, 3, 2])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    assertRaises(ValueError, vocab.to_tokens, 100) 
Example 9
Project: DOTA_models   Author: ringringyi   File: errorcounter.py    License: Apache License 2.0
def CountErrors(ocr_text, truth_text):
  """Counts the drops and adds between 2 bags of iterables.

  Simple bag of objects count returns the number of dropped and added
  elements, regardless of order, from anything that is iterable, e.g.
  a pair of strings gives character errors, and a pair of word lists gives
  word errors.
  Args:
    ocr_text:    OCR text iterable (e.g. string for chars, word list for words).
    truth_text:  Truth text iterable.

  Returns:
    ErrorCounts named tuple.
  """
  counts = collections.Counter(truth_text)
  counts.subtract(ocr_text)
  drops = sum(c for c in counts.values() if c > 0)
  adds = sum(-c for c in counts.values() if c < 0)
  return ErrorCounts(drops, adds, len(truth_text), len(ocr_text)) 
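
A quick check of the drop/add arithmetic with two plain strings (the inputs are made up; any iterables work the same way):

import collections

truth_text = "kitten"
ocr_text = "sitting"

counts = collections.Counter(truth_text)
counts.subtract(ocr_text)
drops = sum(c for c in counts.values() if c > 0)    # in truth but missing from OCR
adds = sum(-c for c in counts.values() if c < 0)    # in OCR but not in truth
print(drops, adds)   # 2 3  ('k' and 'e' dropped; 's', an extra 'i', and 'g' added)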
Example 10
Project: fine-lm   Author: akzaidi   File: bleu_hook.py    License: MIT License
def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        method.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i + order])
      ngram_counts[ngram] += 1
  return ngram_counts 
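
On a word-level segment the returned Counter mixes unigrams and bigrams as tuples; for instance (the function body is repeated here so the snippet runs on its own):

import collections

def _get_ngrams(segment, max_order):
    # Same logic as the example above.
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram_counts[tuple(segment[i:i + order])] += 1
    return ngram_counts

print(_get_ngrams("to be or not to be".split(), 2))
# Counter({('to',): 2, ('be',): 2, ('to', 'be'): 2, ('or',): 1, ('not',): 1, ...})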
Example 11
Project: fine-lm   Author: akzaidi   File: babi_qa.py    License: MIT License
def _build_vocab(generator, vocab_dir, vocab_name):
  """Build a vocabulary from examples.

  Args:
    generator: text generator for creating vocab.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if not tf.gfile.Exists(vocab_path):
    data = []
    for line in generator:
      data.extend(line.split())
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
    encoder.store_to_file(vocab_path)
  else:
    encoder = text_encoder.TokenTextEncoder(vocab_path)
  return encoder 
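
The sorted(counter.items(), key=lambda x: (-x[1], x[0])) idiom, which several of the _build_vocab examples share, orders tokens by descending count and breaks ties alphabetically. A toy run:

import collections

data = "b a c a b a".split()
counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*count_pairs))
print(words)   # ('a', 'b', 'c') -- highest count first, ties broken alphabetically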
Example 12
Project: fine-lm   Author: akzaidi   File: wikitext103.py    License: MIT License
def _build_vocab(filename, vocab_dir, vocab_name):
  """Reads a file to build a vocabulary.

  Args:
    filename: file to read list of words from.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(filename, "r") as f:
      data = f.read().split()
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
    encoder.store_to_file(vocab_path)
  else:
    encoder = text_encoder.TokenTextEncoder(vocab_path)
  return encoder 
Example 13
Project: fine-lm   Author: akzaidi   File: text_encoder_test.py    License: MIT License
def test_custom_reserved_tokens(self):
    """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
    corpus = "The quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    start_symbol = "<S>"
    end_symbol = "<E>"
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol,
                                                      end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus) 
Example 14
Project: fine-lm   Author: akzaidi   File: text_encoder_test.py    License: MIT License
def test_encodable_when_not_in_alphabet(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)
    original = "This has UPPER CASE letters that are out of alphabet"

    # Early versions could have an infinite loop when breaking into subtokens
    # if there was any out-of-alphabet characters in the encoded string.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)

    self.assertEqual(original, decoded)
    encoded_str = "".join(encoder.all_subtoken_strings[i] for i in encoded)
    self.assertIn("\\84;", encoded_str) 
Example 15
Project: fine-lm   Author: akzaidi   File: text_encoder_test.py    License: MIT License
def test_reserved_token_chars_not_in_alphabet(self):
    corpus = "dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 100)
    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder1.store_to_file(filename)
    encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

    self.assertEqual(encoder1._alphabet, encoder2._alphabet)

    for t in text_encoder.RESERVED_TOKENS:
      for c in t:
        # Verify that encoders can encode all reserved token chars.
        encoder1.encode(c)
        encoder2.encode(c) 
Example 16
Project: fine-lm   Author: akzaidi   File: text_encoder_test.py    License: MIT License
def test_save_and_reload_no_single_quotes(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    # Deliberately exclude some required encoding chars from the alphabet
    # and token list, making some strings unencodable.
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)

    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder.store_to_file(filename, add_single_quotes=False)
    new_encoder = text_encoder.SubwordTextEncoder(filename)

    self.assertEqual(encoder._alphabet, new_encoder._alphabet)
    self.assertEqual(encoder.all_subtoken_strings,
                     new_encoder.all_subtoken_strings)
    self.assertEqual(encoder._subtoken_string_to_id,
                     new_encoder._subtoken_string_to_id)
    self.assertEqual(encoder._max_subtoken_len, new_encoder._max_subtoken_len) 
Example 17
Project: fine-lm   Author: akzaidi   File: ptb.py    License: MIT License
def _build_vocab(filename, vocab_path, vocab_size):
  """Reads a file to build a vocabulary of `vocab_size` most common words.

   The vocabulary is sorted by occurrence count and has one word per line.
   Originally from:
   https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py

  Args:
    filename: file to read list of words from.
    vocab_path: path where to save the vocabulary.
    vocab_size: size of the vocabulary to generate.
  """
  data = _read_words(filename)
  counter = collections.Counter(data)
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
  words, _ = list(zip(*count_pairs))
  words = words[:vocab_size]
  with open(vocab_path, "w") as f:
    f.write("\n".join(words)) 
Example 18
Project: fine-lm   Author: akzaidi   File: tokenizer.py    License: MIT License
def corpus_token_counts(
    text_filepattern, corpus_max_lines, split_on_newlines=True):
  """Read the corpus and compute a dictionary of token counts.

  Args:
    text_filepattern: A pattern matching one or more files.
    corpus_max_lines: An integer; maximum total lines to read.
    split_on_newlines: A boolean. If true, then split files by lines and strip
        leading and trailing whitespace from each line. Otherwise, treat each
        file as a single string.

  Returns:
    a dictionary mapping token to count.
  """
  counts = collections.Counter()
  for doc in _read_filepattern(
      text_filepattern,
      max_lines=corpus_max_lines,
      split_on_newlines=split_on_newlines):
    counts.update(encode(_native_to_unicode(doc)))

  return counts 
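
Counter.update() accepts any iterable of tokens and accumulates across documents, which is what lets a single Counter cover a whole file pattern. A sketch with stand-ins for _read_filepattern and encode (both stand-ins are assumptions, not the real helpers):

import collections

docs = ["the quick fox", "the lazy dog"]   # stand-in for _read_filepattern(...)

def encode(text):
    # Stand-in for the real tokenizer.
    return text.split()

counts = collections.Counter()
for doc in docs:
    counts.update(encode(doc))
print(counts["the"])   # 2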
Example 19
Project: fine-lm   Author: akzaidi   File: subject_verb_agreement.py    License: MIT License
def _build_vocab(examples, example_field, vocab_dir, vocab_name):
  """Build a vocabulary from examples.

  Args:
    examples: a dict containing all the examples.
    example_field: field of example from which the vocabulary is built.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if not tf.gfile.Exists(vocab_path):
    data = []
    for e in examples:
      data.extend(e[example_field].split())
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
    encoder.store_to_file(vocab_path)
  else:
    encoder = text_encoder.TokenTextEncoder(vocab_path)
  return encoder 
Example 20
Project: trees   Author: gdanezis   File: forests.py    License: Apache License 2.0
def build_tree(train, features, levels=5, numfeatures=100):
    'Train a decision tree based on labeled data and features'
    if levels == 0:
        C1 = Counter([b for _, b in train])
        Leaf = (None, C1)
        return Leaf
    else:
        try:
            X = (split(train, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X)
            M1 = build_tree(L1, features, levels - 1, numfeatures)
            M2 = build_tree(L2, features, levels - 1, numfeatures)
            Branch = (F, M1, M2)
            return Branch
        except:
            return build_tree(train, features, levels=0) 
Example 21
Project: slot-filling   Author: llhthinker   File: builddataset.py    License: MIT License
def build_vocab(data, min_count=1):
    count = [("<UNK>", -1), ("<PAD>", -1)]
    words = []
    for sentence, _ in data: 
        words.extend(sentence)
  
    counter = Counter(words)
    counter_list = counter.most_common()
    for word, c in counter_list:
        if c >= min_count:
            count.append((word, c))
    word2idx = dict()
    for word, _ in count:
        word2idx[word] = len(word2idx)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
  
    return word2idx, idx2word 
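
The min_count filter simply walks most_common() output, which is already sorted by frequency, and keeps the words seen often enough; for example (toy data):

from collections import Counter

words = ["hello", "world", "hello", "hi"]
min_count = 2
kept = [(w, c) for w, c in Counter(words).most_common() if c >= min_count]
print(kept)   # [('hello', 2)]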
Example 22
Project: nlp-tensorflow   Author: bzantium   File: data_process.py    License: MIT License
def build_character(sentences):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for sentence in sentences:
        tokens = list(sentence)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<GO>'] = 1
    vocab['<UNK>'] = 2
    vocab_idx = 3

    for key, value in word_counter.most_common(len(word_counter)):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size 
Example 23
Project: nlp-tensorflow   Author: bzantium   File: data_process.py    License: MIT License
def build_vocab(sentences):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for sentence in sentences:
        tokens = tokenizer(sentence)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<GO>'] = 1
    vocab['<UNK>'] = 2
    vocab_idx = 3

    for key, value in word_counter.most_common(len(word_counter)):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size 
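
In Examples 22 and 23, most_common(len(word_counter)) just returns every token in frequency order, so after the three special tokens the lowest ids go to the most frequent tokens. A compact sketch (the sentences are made up):

from collections import Counter

word_counter = Counter()
for sentence in ["a b a", "b a c"]:
    word_counter.update(sentence.split())

vocab = {"<PAD>": 0, "<GO>": 1, "<UNK>": 2}
for idx, (token, _) in enumerate(word_counter.most_common(len(word_counter)), start=3):
    vocab[token] = idx
print(vocab)   # {'<PAD>': 0, '<GO>': 1, '<UNK>': 2, 'a': 3, 'b': 4, 'c': 5}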
Example 24
Project: svviz   Author: svviz   File: variants.py    License: MIT License
def commonSegments(self):
        """ return the segment IDs of the segments that are identical between 
        the ref and alt alleles (eg, flanking regions) """
        common = []
        refCounter = collections.Counter((segment.id for segment in self._segments("ref")))
        altCounter = collections.Counter((segment.id for segment in self._segments("alt")))
        if max(refCounter.values()) > 1 or max(altCounter.values()) > 1:
            logging.warn(" Same genomic region repeated multiple times within one allele; "
                "all flanking reads will be marked as ambiguous")
            return []


        refSegments = dict((segment.id, segment) for segment in self._segments("ref"))
        altSegments = dict((segment.id, segment) for segment in self._segments("alt"))

        for segmentID, refSegment in refSegments.items():
            if not segmentID in altSegments:
                continue
            altSegment = altSegments[segmentID]

            # Could remove the requirement to have the strand be the same
            # allowing the reads within the inversion to be plotted too
            if refSegment.chrom==altSegment.chrom and \
                refSegment.start == altSegment.start and \
                refSegment.end == altSegment.end and \
                refSegment.strand == altSegment.strand and \
                refSegment.source == altSegment.source:
                common.append(segmentID)

        return common 
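
The repeat check works because a Counter built from the generator of segment ids reports any id seen more than once; with invented ids:

import collections

segment_ids = ["flankL", "inv", "flankR", "flankL"]   # hypothetical ids
counter = collections.Counter(segment_ids)
print(max(counter.values()) > 1)   # True: 'flankL' occurs twice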
Example 25
Project: svviz   Author: svviz   File: datahub.py    License: MIT License
def getCounts(self):
        if self._counts is None:
            self._counts = collections.OrderedDict()
            for name, sample in self.samples.items():
                self._counts[name] = collections.Counter([alnCollection.choice for alnCollection in sample.alnCollections])
            self._counts["Total"] = dict((allele, sum(self._counts[name][allele] for name in self.samples)) 
                                          for allele in ["alt", "ref", "amb"])

        return self._counts 
Example 26
Project: svviz   Author: svviz   File: summarystats.py    License: MIT License
def addVariantResults(self, dataHub):
        variant = str(dataHub.variant)
        for sampleName, sample in dataHub.samples.items():
            counts = collections.Counter()
            reasons = {}
            alnScores = collections.defaultdict(list)
            insertSizes = collections.defaultdict(list)

            # collect stats
            for alnCollection in sample.alnCollections:
                allele = alnCollection.choice
                counts[allele] += 1

                if not allele in reasons:
                    reasons[allele] = collections.Counter()

                reasons[allele][alnCollection.why] += 1
                alnScores[allele].append(sum(aln.score for aln in alnCollection.chosenSet().getAlignments()))
                insertSizes[allele].append(len(alnCollection.chosenSet()))



            # record stats
            for allele, count in counts.items():
                self.stats.append([variant, sampleName, allele, "count", count])

            for allele in reasons:
                for reason in reasons[allele]:
                    self.stats.append([variant, sampleName, allele, "reason_{}".format(reason), reasons[allele][reason]])

            for allele in alnScores:
                self.stats.append([variant, sampleName, allele, "alnScore_mean", numpy.mean(alnScores[allele])])
                self.stats.append([variant, sampleName, allele, "alnScore_std", numpy.std(alnScores[allele])])

            for allele in insertSizes:
                self.stats.append([variant, sampleName, allele, "insertSize_mean", numpy.mean(insertSizes[allele])])
                self.stats.append([variant, sampleName, allele, "insertSize_std", numpy.std(insertSizes[allele])]) 
Example 27
Project: mutatest   Author: EvanKepner   File: report.py    License: MIT License
def get_status_summary(trial_results: List[MutantTrialResult]) -> Dict[str, Union[str, int]]:
    """Create a status summary dictionary for later formatting.

    Args:
        trial_results: list of mutant trials

    Returns:
        Dictionary with keys for formatting in the report
    """
    status: Dict[str, Union[str, int]] = dict(Counter([t.status for t in trial_results]))
    status["TOTAL RUNS"] = len(trial_results)
    status["RUN DATETIME"] = str(datetime.now())

    return status 
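
Wrapping the Counter in dict() turns the tallies into a plain dictionary to which extra keys can be added; with made-up trial statuses:

from collections import Counter

statuses = ["DETECTED", "SURVIVED", "DETECTED"]   # hypothetical trial statuses
status = dict(Counter(statuses))
status["TOTAL RUNS"] = len(statuses)
print(status)   # {'DETECTED': 2, 'SURVIVED': 1, 'TOTAL RUNS': 3}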
Example 28
Project: EDeN   Author: fabriziocosta   File: util.py    License: MIT License
def report_base_statistics(vec, separator='\n'):
    """report_base_statistics."""
    from collections import Counter
    c = Counter(vec)
    msg = ''
    for k in c:
        msg += "class: %s count:%d (%0.2f)%s" % (
            k, c[k], c[k] / float(len(vec)), separator)
    return msg 
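
The per-class proportion is just each count divided by the length of the vector; for a made-up label vector:

from collections import Counter

vec = ["pos", "neg", "pos", "pos"]
c = Counter(vec)
for k in c:
    print("class: %s count:%d (%0.2f)" % (k, c[k], c[k] / float(len(vec))))
# class: pos count:3 (0.75)
# class: neg count:1 (0.25)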
Example 29
Project: EDeN   Author: fabriziocosta   File: estimator_utils.py    License: MIT License
def balance(graphs, targets, estimator, ratio=2):
    """balance."""
    class_counts = Counter(targets)
    majority_class = None
    max_count = 0
    minority_class = None
    min_count = 1e6
    for class_key in class_counts:
        if max_count < class_counts[class_key]:
            majority_class = class_key
            max_count = class_counts[class_key]
        if min_count > class_counts[class_key]:
            minority_class = class_key
            min_count = class_counts[class_key]

    desired_size = int(min_count * ratio)

    tg = zip(targets, graphs)
    class_graphs = groupby(lambda x: first(x), tg)
    maj_graphs = [second(x) for x in class_graphs[majority_class]]
    min_graphs = [second(x) for x in class_graphs[minority_class]]

    if estimator:
        # select only the instances in the majority class that
        # have a small margin
        preds = estimator.decision_function(maj_graphs)
    else:
        # select at random
        preds = [random.random() for i in range(len(maj_graphs))]
    preds = [abs(pred) for pred in preds]
    pred_graphs = sorted(zip(preds, maj_graphs))[:desired_size]
    maj_graphs = [g for p, g in pred_graphs]

    bal_graphs = min_graphs + maj_graphs
    bal_pos = [minority_class] * len(min_graphs)
    bal_neg = [majority_class] * len(maj_graphs)
    bal_targets = bal_pos + bal_neg

    return paired_shuffle(bal_graphs, bal_targets) 
Example 30
Project: hgraph2graph   Author: wengong-jin   File: get_vocab.py    License: MIT License
def fragment_process(data):
    counter = Counter()
    for smiles in data:
        mol = get_mol(smiles)
        fragments = find_fragments(mol)
        for fsmiles, _ in fragments:
            counter[fsmiles] += 1
    return counter