Python collections.Counter() Examples

The following are 30 code examples of collections.Counter(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module collections , or try the search function .
Example #1
Source File: dtree.py    From decisiontrees with MIT License 7 votes vote down vote up
def value_counts(self, subset, attr, value, base=False):
        """
        Get the number of currences per value of the dependent variable when
        the given attribute is equal to the given value.

        FIXME: Can attr/value be eliminated??

        Args:
            subset: the subset with which to act upon.
            attr: the attribute of the value.
            value: the value with which to track counts.
            base: whether or not to calculate values based on the dependent
                value (default False).
        Returns:
            A Counter instance detailing the number of occurrences per
            dependent variable.

        """
        counts = Counter()
        for row in subset:
            if row[attr] == value or base:
                counts[row[self.dependent]] += 1
        return counts 
Example #2
Source File: errorcounter.py    From DOTA_models with Apache License 2.0 6 votes vote down vote up
def CountErrors(ocr_text, truth_text):
  """Counts the drops and adds between 2 bags of iterables.

  Simple bag of objects count returns the number of dropped and added
  elements, regardless of order, from anything that is iterable, eg
  a pair of strings gives character errors, and a pair of word lists give
  word errors.
  Args:
    ocr_text:    OCR text iterable (eg string for chars, word list for words).
    truth_text:  Truth text iterable.

  Returns:
    ErrorCounts named tuple.
  """
  counts = collections.Counter(truth_text)
  counts.subtract(ocr_text)
  drops = sum(c for c in counts.values() if c > 0)
  adds = sum(-c for c in counts.values() if c < 0)
  return ErrorCounts(drops, adds, len(truth_text), len(ocr_text)) 
Example #3
Source File: ggtnn_graph_parse.py    From gated-graph-transformer-network with MIT License 6 votes vote down vote up
def get_buckets(stories, max_ignore_unbatched=100, max_pad_amount=25):
    sentencecounts = [len(sents_graphs) for (sents_graphs, query, answer) in stories]
    countpairs = sorted(collections.Counter(sentencecounts).items())

    buckets = []
    smallest_left_val = 0
    num_unbatched = max_ignore_unbatched
    for val,ct in countpairs:
        num_unbatched += ct
        if val - smallest_left_val > max_pad_amount or num_unbatched > max_ignore_unbatched:
            buckets.append(val)
            smallest_left_val = val
            num_unbatched = 0
    if buckets[-1] != countpairs[-1][0]:
        buckets.append(countpairs[-1][0])

    return buckets 
Example #4
Source File: builddataset.py    From slot-filling with MIT License 6 votes vote down vote up
def build_vocab(data, min_count=1):
    count = [("<UNK>", -1), ("<PAD>", -1)]
    words = []
    for sentence, _ in data: 
        words.extend(sentence)
  
    counter = Counter(words)
    counter_list = counter.most_common()
    for word, c in counter_list:
        if c >= min_count:
            count.append((word, c))
    word2idx = dict()
    for word, _ in count:
        word2idx[word] = len(word2idx)
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
  
    return word2idx, idx2word 
Example #5
Source File: data_process.py    From nlp-tensorflow with MIT License 6 votes vote down vote up
def build_vocab(sentences):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for sentence in sentences:
        tokens = tokenizer(sentence)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<GO>'] = 1
    vocab['<UNK>'] = 2
    vocab_idx = 3

    for key, value in word_counter.most_common(len(word_counter)):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size 
Example #6
Source File: forests.py    From trees with Apache License 2.0 6 votes vote down vote up
def build_tree(train, features, levels=5, numfeatures=100):
    'Train a decision tree based on labeled data and features'
    if levels == 0:
        C1 = Counter([b for _, b in train])
        Leaf = (None, C1)
        return Leaf
    else:
        try:
            X = (split(train, F) for F in random.sample(features, numfeatures))
            H, L1, L2, F = max(X)
            M1 = build_tree(L1, features, levels - 1, numfeatures)
            M2 = build_tree(L2, features, levels - 1, numfeatures)
            Branch = (F, M1, M2)
            return Branch
        except:
            return build_tree(train, features, levels=0) 
Example #7
Source File: data_process.py    From nlp-tensorflow with MIT License 6 votes vote down vote up
def build_character(sentences):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for sentence in sentences:
        tokens = list(sentence)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<GO>'] = 1
    vocab['<UNK>'] = 2
    vocab_idx = 3

    for key, value in word_counter.most_common(len(word_counter)):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size 
Example #8
Source File: strainsimulationwrapper.py    From CAMISIM with Apache License 2.0 6 votes vote down vote up
def _get_genome_amounts(self, probability, max_genome_amount):
		"""
		Get amounts of genomes by original genome

		@param probability: Proportion of simulated original genomes
		@type probability: int | long | float
		@param max_genome_amount: Total number of genomes
		@type max_genome_amount: int | long

		@return: List of integers representing amount of strains
		@rtype: list[int]
		"""
		assert isinstance(probability, (int, long, float))
		assert 0 <= probability <= 1
		assert isinstance(max_genome_amount, (int, long))

		genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
		diverence = Counter(genome_amounts)[1] / float(len(genome_amounts))
		if max_genome_amount >= 10:
			while abs(diverence - probability) > 0.05:
				# print "need: {}, gotten: {}".format(probability, diverence)
				genome_amounts = self._get_genome_amounts_geometric(probability, max_genome_amount)
				diverence = Counter(genome_amounts)[1] / float(len(genome_amounts))
		return genome_amounts 
Example #9
Source File: subject_verb_agreement.py    From fine-lm with MIT License 6 votes vote down vote up
def _build_vocab(examples, example_field, vocab_dir, vocab_name):
  """Build a vocabulary from examples.

  Args:
    examples: a dict containing all the examples.
    example_field: field of example from which the vocabulary is built.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if not tf.gfile.Exists(vocab_path):
    data = []
    for e in examples:
      data.extend(e[example_field].split())
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
    encoder.store_to_file(vocab_path)
  else:
    encoder = text_encoder.TokenTextEncoder(vocab_path)
  return encoder 
Example #10
Source File: tokenizer.py    From fine-lm with MIT License 6 votes vote down vote up
def corpus_token_counts(
    text_filepattern, corpus_max_lines, split_on_newlines=True):
  """Read the corpus and compute a dictionary of token counts.

  Args:
    text_filepattern: A pattern matching one or more files.
    corpus_max_lines: An integer; maximum total lines to read.
    split_on_newlines: A boolean. If true, then split files by lines and strip
        leading and trailing whitespace from each line. Otherwise, treat each
        file as a single string.

  Returns:
    a dictionary mapping token to count.
  """
  counts = collections.Counter()
  for doc in _read_filepattern(
      text_filepattern,
      max_lines=corpus_max_lines,
      split_on_newlines=split_on_newlines):
    counts.update(encode(_native_to_unicode(doc)))

  return counts 
Example #11
Source File: ptb.py    From fine-lm with MIT License 6 votes vote down vote up
def _build_vocab(filename, vocab_path, vocab_size):
  """Reads a file to build a vocabulary of `vocab_size` most common words.

   The vocabulary is sorted by occurrence count and has one word per line.
   Originally from:
   https://github.com/tensorflow/models/blob/master/tutorials/rnn/ptb/reader.py

  Args:
    filename: file to read list of words from.
    vocab_path: path where to save the vocabulary.
    vocab_size: size of the vocabulary to generate.
  """
  data = _read_words(filename)
  counter = collections.Counter(data)
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
  words, _ = list(zip(*count_pairs))
  words = words[:vocab_size]
  with open(vocab_path, "w") as f:
    f.write("\n".join(words)) 
Example #12
Source File: w2v_utils.py    From deep-learning-note with MIT License 6 votes vote down vote up
def build_vocab(words, vocab_size, visual_fld):
    """ Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv
    """
    utils.safe_mkdir(visual_fld)
    file = open(os.path.join(visual_fld, 'vocab.tsv'), 'w')

    dictionary = dict()
    count = [('UNK', -1)]
    index = 0
    count.extend(Counter(words).most_common(vocab_size - 1))

    for word, _ in count:
        dictionary[word] = index
        index += 1
        file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    file.close()
    return dictionary, index_dictionary 
Example #13
Source File: stt_bi_graphemes_util.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def generate_bi_graphemes_dictionary(label_list):
    freqs = Counter()
    for label in label_list:
        label = label.split(' ')
        for i in label:
            for pair in split_every(2, i):
                if len(pair) == 2:
                    freqs[pair] += 1


    with open('resources/unicodemap_en_baidu_bi_graphemes.csv', 'w') as bigram_label:
        bigramwriter = csv.writer(bigram_label, delimiter = ',')
        baidu_labels = list('\' abcdefghijklmnopqrstuvwxyz')
        for index, key in enumerate(baidu_labels):
            bigramwriter.writerow((key, index+1))
        for index, key in enumerate(freqs.keys()):
            bigramwriter.writerow((key, index+len(baidu_labels)+1)) 
Example #14
Source File: dtree.py    From decisiontrees with MIT License 6 votes vote down vote up
def attr_counts(self, subset, attr):
        """
        Get the number of occurrences per value of the given attribute

        Args:
            subset: the subset with which to act upon.
            attr: the selected attribute.
        Returns:
            A Counter instance detailing the number of occurrences per
            attribute value.

        """
        counts = Counter()
        for row in subset:
            counts[row[attr]] += 1
        return counts 
Example #15
Source File: bleu_hook.py    From fine-lm with MIT License 6 votes vote down vote up
def _get_ngrams(segment, max_order):
  """Extracts all n-grams up to a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        methods.

  Returns:
    The Counter containing all n-grams up to max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i + order])
      ngram_counts[ngram] += 1
  return ngram_counts 
Example #16
Source File: babi_qa.py    From fine-lm with MIT License 6 votes vote down vote up
def _build_vocab(generator, vocab_dir, vocab_name):
  """Build a vocabulary from examples.

  Args:
    generator: text generator for creating vocab.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if not tf.gfile.Exists(vocab_path):
    data = []
    for line in generator:
      data.extend(line.split())
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
    encoder.store_to_file(vocab_path)
  else:
    encoder = text_encoder.TokenTextEncoder(vocab_path)
  return encoder 
Example #17
Source File: wikitext103.py    From fine-lm with MIT License 6 votes vote down vote up
def _build_vocab(filename, vocab_dir, vocab_name):
  """Reads a file to build a vocabulary.

  Args:
    filename: file to read list of words from.
    vocab_dir: directory where to save the vocabulary.
    vocab_name: vocab file name.

  Returns:
    text encoder.
  """
  vocab_path = os.path.join(vocab_dir, vocab_name)
  if not tf.gfile.Exists(vocab_path):
    with tf.gfile.GFile(filename, "r") as f:
      data = f.read().split()
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*count_pairs))
    encoder = text_encoder.TokenTextEncoder(None, vocab_list=words)
    encoder.store_to_file(vocab_path)
  else:
    encoder = text_encoder.TokenTextEncoder(vocab_path)
  return encoder 
Example #18
Source File: test_contrib_text.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def test_indices_to_tokens():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unknown>', reserved_tokens=None)
    i1 = vocab.to_tokens(1)
    assert i1 == 'c'

    i2 = vocab.to_tokens([1])
    assert i2 == ['c']

    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']

    i4 = vocab.to_tokens([3, 0, 3, 2])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    assertRaises(ValueError, vocab.to_tokens, 100) 
Example #19
Source File: test_contrib_text.py    From dynamic-training-with-apache-mxnet-on-aws with Apache License 2.0 6 votes vote down vote up
def test_tokens_to_indices():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1, unknown_token='<unk>',
                                  reserved_tokens=None)

    i1 = vocab.to_indices('c')
    assert i1 == 1

    i2 = vocab.to_indices(['c'])
    assert i2 == [1]

    i3 = vocab.to_indices(['<unk>', 'non-exist'])
    assert i3 == [0, 0]

    i4 = vocab.to_indices(['a', 'non-exist', 'a', 'b'])
    assert i4 == [3, 0, 3, 2] 
Example #20
Source File: text_encoder_test.py    From fine-lm with MIT License 6 votes vote down vote up
def test_custom_reserved_tokens(self):
    """Test that we can pass custom reserved tokens to SubwordTextEncoder."""
    corpus = "The quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    start_symbol = "<S>"
    end_symbol = "<E>"
    reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol,
                                                      end_symbol]
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        10, token_counts, 2, 10, reserved_tokens=reserved_tokens)

    # Make sure that reserved tokens appear in the right places.
    self.assertEqual(encoder.decode([2]), start_symbol)
    self.assertEqual(encoder.decode([3]), end_symbol)

    # Make sure that we haven't messed up the ability to reconstruct.
    reconstructed_corpus = encoder.decode(encoder.encode(corpus))
    self.assertEqual(corpus, reconstructed_corpus) 
Example #21
Source File: text_encoder_test.py    From fine-lm with MIT License 6 votes vote down vote up
def test_encodable_when_not_in_alphabet(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)
    original = "This has UPPER CASE letters that are out of alphabet"

    # Early versions could have an infinite loop when breaking into subtokens
    # if there was any out-of-alphabet characters in the encoded string.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)

    self.assertEqual(original, decoded)
    encoded_str = "".join(encoder.all_subtoken_strings[i] for i in encoded)
    self.assertIn("\\84;", encoded_str) 
Example #22
Source File: text_encoder_test.py    From fine-lm with MIT License 6 votes vote down vote up
def test_reserved_token_chars_not_in_alphabet(self):
    corpus = "dog"
    token_counts = collections.Counter(corpus.split(" "))
    encoder1 = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 100)
    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder1.store_to_file(filename)
    encoder2 = text_encoder.SubwordTextEncoder(filename=filename)

    self.assertEqual(encoder1._alphabet, encoder2._alphabet)

    for t in text_encoder.RESERVED_TOKENS:
      for c in t:
        # Verify that encoders can encode all reserved token chars.
        encoder1.encode(c)
        encoder2.encode(c) 
Example #23
Source File: text_encoder_test.py    From fine-lm with MIT License 6 votes vote down vote up
def test_save_and_reload_no_single_quotes(self):
    corpus = "the quick brown fox jumps over the lazy dog"
    token_counts = collections.Counter(corpus.split(" "))

    # Deliberately exclude some required encoding chars from the alphabet
    # and token list, making some strings unencodable.
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)

    filename = os.path.join(self.test_temp_dir, "out.voc")
    encoder.store_to_file(filename, add_single_quotes=False)
    new_encoder = text_encoder.SubwordTextEncoder(filename)

    self.assertEqual(encoder._alphabet, new_encoder._alphabet)
    self.assertEqual(encoder.all_subtoken_strings,
                     new_encoder.all_subtoken_strings)
    self.assertEqual(encoder._subtoken_string_to_id,
                     new_encoder._subtoken_string_to_id)
    self.assertEqual(encoder._max_subtoken_len, new_encoder._max_subtoken_len) 
Example #24
Source File: to_string.py    From QCElemental with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def formula_generator(elem):
    """Return simple chemical formula from element list `elem`.

    >>> formula_generator(['C', 'Ca', 'O', 'O', 'Ag']
    AgCCaO2

    """
    counted = collections.Counter(elem)
    return "".join((el if cnt == 1 else (el + str(cnt))) for el, cnt in sorted(counted.items())) 
Example #25
Source File: text_encoder_test.py    From fine-lm with MIT License 5 votes vote down vote up
def test_encode_decode(self):
    corpus = (
        "This is a corpus of text that provides a bunch of tokens from which "
        "to build a vocabulary. It will be used when strings are encoded "
        "with a TextEncoder subclass. The encoder was coded by a coder.")
    token_counts = collections.Counter(corpus.split(" "))
    alphabet = set(corpus) - {" "}

    original = "This is a coded sentence encoded by the SubwordTextEncoder."
    token_counts.update(original.split(" "))

    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        100, token_counts, 2, 10)

    # Encoding should be reversible.
    encoded = encoder.encode(original)
    decoded = encoder.decode(encoded)
    self.assertEqual(original, decoded)

    # The substrings coded and coder are frequent enough in the corpus that
    # they should appear in the vocabulary even though they are substrings
    # of other included strings.
    subtoken_strings = {encoder.all_subtoken_strings[i] for i in encoded}
    self.assertIn("encoded_", subtoken_strings)
    self.assertIn("coded_", subtoken_strings)
    self.assertIn("TextEncoder", encoder.all_subtoken_strings)
    self.assertIn("coder", encoder.all_subtoken_strings)

    # Every character in the corpus should be in the encoders alphabet and
    # its subtoken vocabulary.
    self.assertTrue(alphabet.issubset(encoder._alphabet))
    for a in alphabet:
      self.assertIn(a, encoder.all_subtoken_strings) 
Example #26
Source File: data_process.py    From nlp-tensorflow with MIT License 5 votes vote down vote up
def build_vocab(lines, max_vocab=None):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for line in lines:
        tokens = tokenizer(line)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab_idx = 2
    
    if max_vocab == None or max_vocab > len(word_counter):
        max_vocab = len(word_counter)
    
    for key, value in word_counter.most_common(max_vocab):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size 
Example #27
Source File: data_process.py    From nlp-tensorflow with MIT License 5 votes vote down vote up
def build_vocab_pos(lines, max_vocab=None):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for line in lines:
        tokens = pos_extractor(line)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab_idx = 2
    
    if max_vocab == None or max_vocab > len(word_counter):
        max_vocab = len(word_counter)
    
    for key, value in word_counter.most_common(max_vocab):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size 
Example #28
Source File: generate_vocab_file.py    From End-to-end-ASR-Pytorch with MIT License 5 votes vote down vote up
def main(args):

    if args.mode == "subword":
        logging.warn("Subword model is based on `sentencepiece`.")

        import sentencepiece as splib

        cmd = ("--input={} --model_prefix={} --model_type=bpe "
               "--vocab_size={} --character_coverage={} "
               "--pad_id=0 --eos_id=1 --unk_id=2 --bos_id=-1 "
               "--eos_piece=<eos> --remove_extra_whitespaces=true".format(
                   args.input_file, args.output_file,
                   args.vocab_size, args.character_coverage))

        splib.SentencePieceTrainer.Train(cmd)
    else:
        with open(args.input_file, "r") as f:
            lines = [line.strip("\r\n ") for line in f]
        counter = Counter()
        if args.mode == "word":
            for line in lines:
                counter.update(line.split())
            # In word mode, vocab_list is sorted by frequency
            # Only selected top `vocab_size` vocabularies
            vocab_list = sorted(
                counter.keys(), key=lambda k: counter[k], reverse=True)[:args.vocab_size]
        elif args.mode == "character":
            for line in lines:
                counter.update(line)
            # In character mode, vocab_list is sorted in alphabetical order
            vocab_list = sorted(counter)

        logging.info("Collected totally {} vocabularies.".format(len(counter)))
        logging.info("Selected {} vocabularies.".format(len(vocab_list)))

        with open(args.output_file, "w") as f:
            f.write("\n".join(vocab_list)) 
Example #29
Source File: jsonscheme.py    From lyrebird-api-coverage with MIT License 5 votes vote down vote up
def check_url_redundant(obj):
    repeat_urls = []
    url_list = list(map(lambda x: x.get('url'), obj.get('api_list')))
    url_count = dict(collections.Counter(url_list))
    for k, v in url_count.items():
        if v > 1:
            repeat_urls.append(k)
    return repeat_urls 
Example #30
Source File: data_process.py    From nlp-tensorflow with MIT License 5 votes vote down vote up
def build_vocab_morphs(lines, max_vocab=None):
    word_counter = Counter()
    vocab = dict()
    reverse_vocab = dict()

    for line in lines:
        tokens = morphs_extractor(line)
        word_counter.update(tokens)

    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    vocab_idx = 2
    
    if max_vocab == None or max_vocab > len(word_counter):
        max_vocab = len(word_counter)
    
    for key, value in word_counter.most_common(max_vocab):
        vocab[key] = vocab_idx
        vocab_idx += 1

    for key, value in vocab.items():
        reverse_vocab[value] = key

    vocab_size = len(vocab.keys())

    return vocab, reverse_vocab, vocab_size