Python collections.Counter() Examples

The following are code examples showing how to use collections.Counter(). They are taken from open source Python projects. You can vote up the examples you like or vote down the ones you don't like.
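
Before the project examples, here is a minimal, self-contained sketch of the Counter API they rely on (written for illustration; it is not taken from any of the projects below):

from collections import Counter

c = Counter('abracadabra')        # count hashable items, here characters
print(c.most_common(2))           # [('a', 5), ('b', 2)]
c.update(['a', 'z'])              # add further observations
print(c['a'], c['missing'])       # 6 0 -- missing keys count as zero instead of raising KeyError
d = c - Counter('aaaaaa')         # multiset subtraction keeps positive counts only
print(d['a'], d['b'])             # 0 2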

Example 1
Project: aurora   Author: carnby   File: tasks.py    MIT License 7 votes vote down vote up
def build_token_counts(characterizer, texts):
    tokenizer = Tokenizer(characterizer=characterizer)
    tokenizer.train([t['text'] for t in texts])

    token_counts = Counter()
    seq_matcher = difflib.SequenceMatcher()

    for t in texts:
        t['tokens'] = tokenizer.tokenize(t['text'])
        if not t['tokens']:
            continue

        if 'urls' in t['entities'] and t['entities']['urls']:
            #TODO: replace those urls instead of adding them
            for url in t['entities']['urls']:
                t['tokens'].append(url['display_url'])

        if t['__is_rt__']:
            t['tokens'].append(u'@{0}'.format(t['user']['screen_name']).lower())

        token_counts.update(t['tokens'])

    return token_counts 
Example 2
Project: fs_image   Author: facebookincubator   File: test_filesystem_storage.py    MIT License 6 votes vote down vote up
def test_write_and_read_back(self):
        expected_content_count = Counter()
        with self._temp_storage() as storage:
            for writes, _ in self.check_storage_impl(storage):
                expected_content_count[b''.join(writes)] += 1

            # Make a histogram of the contents of the output files
            content_count = Counter()
            for f in itertools.chain.from_iterable(
                [os.path.join(p, f) for f in fs]
                    for p, _, fs in os.walk(storage.base_dir) if fs
            ):
                with open(f, 'rb') as infile:
                    content_count[infile.read()] += 1

            # Did we produce the expected number of each kind of output?
            self.assertEqual(expected_content_count, content_count)

    # This test cannot be in the base since there's no generic way to check
    # if we left a trace on the storage system -- there's no ID to fetch. 
Example 3
Project: visidata-plugins   Author: jsvine   File: vdnormcol.py    MIT License 6 votes vote down vote up
def gen_normalize_names(names):
    """
    Given a list of strings, yield fully-normalized conversions of those
    strings, ensuring that each is unique.
    """
    base = list(map(normalize_name, names))
    counts = Counter(base)
    
    # Append __{i} to non-unique names
    seen = dict((key, 0) for key in counts.keys())
    for name in base:
        if counts[name] == 1 or name == "":
            norm_name = name
        else:
            norm_name = name + "__" + str(seen[name])
            seen[name] += 1
        yield norm_name 
Example 4
Project: DataComp   Author: Cojabi   File: utils.py    Apache License 2.0 6 votes vote down vote up
def create_contin_mat(data, dataset_labels, observation_col):
    """
    Creates a contingency table from a dictionary of observations.

    :param data: Dataframe containing observations.
    :param dataset_labels: Labels of the datasets used as keys in 'data' dict.
    :param observation_col: Name of the column in which the values of interest are stored. e.g. "Gender".
    :return: contingency matrix
    """
    contingency_matrix = dict()

    # count for each label
    for dataset_nr in data[dataset_labels].unique():
        # select subset of the dataframe, that belongs to one of the original datasets
        dataset = data[data[dataset_labels] == dataset_nr][::]
        # drop data points with missing values in value column
        dataset.dropna(subset=[observation_col], inplace=True)

        # count occurrences
        counts = Counter(dataset[observation_col])

        # add to confusion matrix
        contingency_matrix[dataset_nr] = counts

    return pd.DataFrame(contingency_matrix).transpose() 
Example 5
Project: DataComp   Author: Cojabi   File: utils.py    Apache License 2.0 6 votes vote down vote up
def get_cat_frequencies(series):
    """
    Counts the occurrences for each factor of a categorical variable and calculates the relative frequencies.

    :param series: Iterable storing the realisations of a categorical random variable / feature.
    :return freqs: Pandas Series storing the relative frequencies using the corresponding factor as index
    :return counts.sum(): Total number of realisations of the categorical variable
    :return counts: Pandas Series storing the counts using the corresponding factor as index
    """

    # count occurrences and store in Series
    counts = pd.Series(Counter(series))
    # calculate frequencies
    freqs = counts / counts.sum()

    return freqs, counts.sum(), counts 
Example 6
Project: google-tech-dev-guide   Author: alvinctk   File: is_anagram.py    Apache License 2.0 6 votes vote down vote up
def isAnagram(self, s: str, t: str) -> bool:
        """
        92.29% faster than Python3 online submission
        Time complexity: O(n) linear scan
        Space complexity: O(n) use of Counter objects
        """
        if not s and not t:
            return True
        elif not s:
            return False
        elif not t:
            return False

        n, m = len(s), len(t)
        if n != m:
            return False

        a, b = Counter(s), Counter(t)
        for k, v in a.items():
            if a[k] != b[k]:
                return False
        return True 
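
Note on Example 6: because Counter equality already checks that every character occurs the same number of times in both strings, the length test and the per-key loop can be collapsed into a single comparison. A minimal standalone sketch (not part of the project above):

from collections import Counter

def is_anagram(s: str, t: str) -> bool:
    # Counter equality compares the two strings as multisets of characters
    return Counter(s) == Counter(t)

assert is_anagram("listen", "silent")
assert not is_anagram("rat", "car")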
Example 7
Project: google-tech-dev-guide   Author: alvinctk   File: permutation_ii.py    Apache License 2.0 6 votes vote down vote up
def permuteUnique(self, nums):
        """
        Time complexity: (N - w) * O(Summation of P(N-w, k))
            w = number of duplicate elements
            P(N - w, k) := permutation of unique elements of k from N - w elements
            P(N - w, k) = (N-w)!/(N-w-k)!

        Space complexity: O((N-w)!) to store result
        """
        def helper(path, counter):
            if len(path) == len(nums):
                result.append(path.copy())

        for x in counter:  # don't pick duplicates
                if counter[x] > 0:
                    path.append(x)
                    counter[x] -= 1

                    helper(path, counter)
                    path.pop()
                    counter[x] += 1
        result = []
        helper([], Counter(nums))
        return result 
Example 8
Project: google-tech-dev-guide   Author: alvinctk   File: single_number_ii.py    Apache License 2.0 6 votes vote down vote up
def singleNumber2(self, nums: List[int]) -> int:
        """
        Every element appears three times except for one, which appears exactly once.
        Returns the single element.

        N = number of elements
        Time complexity: O(N) to iterate over the input array
        Space complexity: O(N) to keep the count of N/3 elements

        Runtime: 56 ms, faster than 99.64% of Python3 online submissions for Single Number II.
        Memory Usage: 15.7 MB, less than 6.67% of Python3 online submissions for Single Number II.
        """
        count = Counter(nums)
        for k, v in count.items():
            if v == 1:
                return k 
Example 9
Project: google-tech-dev-guide   Author: alvinctk   File: single_number_ii.py    Apache License 2.0 6 votes vote down vote up
def singleNumber2(self, nums: List[int]) -> int:
        """
        Every element appears three times except for one, which appears exactly once.
        Returns the single element.

        N = number of elements
        Time complexity: O(N) to iterate over the input array
        Space complexity: O(N) to keep the count of N/3 elements

        Runtime: 56 ms, faster than 99.64% of Python3 online submissions for Single Number II.
        Memory Usage: 15.7 MB, less than 6.67% of Python3 online submissions for Single Number II.
        """
        count = Counter(nums)
        for k, v in count.items():
            if v == 1:
                return k 
Example 10
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: vmware_dvs_host.py    MIT License 6 votes vote down vote up
def main():
    argument_spec = vmware_argument_spec()
    argument_spec.update(dict(esxi_hostname=dict(required=True, type='str'),
                              switch_name=dict(required=True, type='str'),
                              vmnics=dict(required=True, type='list'),
                              state=dict(default='present',
                                         choices=['present', 'absent'],
                                         type='str')
                              )
                         )

    module = AnsibleModule(argument_spec=argument_spec,
                           supports_check_mode=True)

    if not HAS_COLLECTIONS_COUNTER:
        module.fail_json(msg='collections.Counter from Python-2.7 is required for this module')

    vmware_dvs_host = VMwareDvsHost(module)
    vmware_dvs_host.process_state() 
Example 11
Project: deep-learning-note   Author: wdxtub   File: w2v_utils.py    MIT License 6 votes vote down vote up
def build_vocab(words, vocab_size, visual_fld):
    """ Build vocabulary of VOCAB_SIZE most frequent words and write it to
    visualization/vocab.tsv
    """
    utils.safe_mkdir(visual_fld)
    file = open(os.path.join(visual_fld, 'vocab.tsv'), 'w')

    dictionary = dict()
    count = [('UNK', -1)]
    index = 0
    count.extend(Counter(words).most_common(vocab_size - 1))

    for word, _ in count:
        dictionary[word] = index
        index += 1
        file.write(word + '\n')

    index_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    file.close()
    return dictionary, index_dictionary 
Example 12
Project: pnp   Author: HazardDede   File: gpio.py    MIT License 6 votes vote down vote up
def __init__(self, pins, default=CONST_RISING, **kwargs):
        super().__init__(**kwargs)
        self._mode_default = default
        Validator.one_of(
            CONST_RISING_OPTIONS
            + CONST_FALLING_OPTIONS
            + CONST_SWITCH_OPTIONS
            + CONST_MOTION_OPTIONS,
            mode_default=self._mode_default
        )
        self._pins = [Callback.from_str(pin_str, default=default) for pin_str in make_list(pins)]
        _without_duplicate = set(self._pins)
        if len(_without_duplicate) != len(self._pins):
            diff = list((Counter(self._pins) - Counter(_without_duplicate)).elements())
            self.logger.warning(
                "You provided duplicate gpio pin configurations. Will ignore '%s'", diff
            )
            self._pins = _without_duplicate 
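
The Counter difference used above is a compact way to list surplus duplicates. A standalone sketch with made-up pin names (not the plugin's real configuration) shows the idea:

from collections import Counter

pins = ['GPIO2', 'GPIO3', 'GPIO2', 'GPIO4', 'GPIO2']
without_duplicates = set(pins)
# Subtracting the de-duplicated multiset leaves only the surplus occurrences
dupes = list((Counter(pins) - Counter(without_duplicates)).elements())
print(dupes)  # ['GPIO2', 'GPIO2']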
Example 13
Project: mmi-tagger   Author: karlstratos   File: data.py    MIT License 6 votes vote down vote up
def __init__(self, data_path):
        self.data_path = data_path
        self.PAD = '<pad>'
        self.UNK = '<unk>'

        self.sents = []   # Index sequences
        self.golds = []
        self.w2i = {self.PAD: 0, self.UNK: 1}
        self.i2w = [self.PAD, self.UNK]
        self.c2i = {self.PAD: 0, self.UNK: 1}
        self.i2c = [self.PAD, self.UNK]
        self.word_counter = []
        self.char_counter = []
        self.label_counter = Counter()

        self.get_data() 
Example 14
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: stt_bi_graphemes_util.py    Apache License 2.0 6 votes vote down vote up
def generate_bi_graphemes_dictionary(label_list):
    freqs = Counter()
    for label in label_list:
        label = label.split(' ')
        for i in label:
            for pair in split_every(2, i):
                if len(pair) == 2:
                    freqs[pair] += 1


    with open('resources/unicodemap_en_baidu_bi_graphemes.csv', 'w') as bigram_label:
        bigramwriter = csv.writer(bigram_label, delimiter = ',')
        baidu_labels = list('\' abcdefghijklmnopqrstuvwxyz')
        for index, key in enumerate(baidu_labels):
            bigramwriter.writerow((key, index+1))
        for index, key in enumerate(freqs.keys()):
            bigramwriter.writerow((key, index+len(baidu_labels)+1)) 
Example 15
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: test_contrib_text.py    Apache License 2.0 6 votes vote down vote up
def test_tokens_to_indices():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1, unknown_token='<unk>',
                                  reserved_tokens=None)

    i1 = vocab.to_indices('c')
    assert i1 == 1

    i2 = vocab.to_indices(['c'])
    assert i2 == [1]

    i3 = vocab.to_indices(['<unk>', 'non-exist'])
    assert i3 == [0, 0]

    i4 = vocab.to_indices(['a', 'non-exist', 'a', 'b'])
    assert i4 == [3, 0, 3, 2] 
Example 16
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: test_contrib_text.py    Apache License 2.0 6 votes vote down vote up
def test_indices_to_tokens():
    counter = Counter(['a', 'b', 'b', 'c', 'c', 'c', 'some_word$'])

    vocab = text.vocab.Vocabulary(counter, most_freq_count=None, min_freq=1,
                                  unknown_token='<unknown>', reserved_tokens=None)
    i1 = vocab.to_tokens(1)
    assert i1 == 'c'

    i2 = vocab.to_tokens([1])
    assert i2 == ['c']

    i3 = vocab.to_tokens([0, 0])
    assert i3 == ['<unknown>', '<unknown>']

    i4 = vocab.to_tokens([3, 0, 3, 2])
    assert i4 == ['a', '<unknown>', 'a', 'b']

    assertRaises(ValueError, vocab.to_tokens, 100) 
Example 17
Project: DOTA_models   Author: ringringyi   File: errorcounter.py    Apache License 2.0 6 votes vote down vote up
def CountErrors(ocr_text, truth_text):
  """Counts the drops and adds between 2 bags of iterables.

  A simple bag-of-objects count returns the number of dropped and added
  elements, regardless of order, from anything that is iterable, e.g.
  a pair of strings gives character errors, and a pair of word lists gives
  word errors.
  Args:
    ocr_text:    OCR text iterable (eg string for chars, word list for words).
    truth_text:  Truth text iterable.

  Returns:
    ErrorCounts named tuple.
  """
  counts = collections.Counter(truth_text)
  counts.subtract(ocr_text)
  drops = sum(c for c in counts.values() if c > 0)
  adds = sum(-c for c in counts.values() if c < 0)
  return ErrorCounts(drops, adds, len(truth_text), len(ocr_text)) 
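
Counter.subtract() allows counts to go negative, which is what makes the drop/add accounting above work. A tiny standalone sketch with made-up strings:

import collections

counts = collections.Counter("kitten")   # "truth" text
counts.subtract("sitting")               # "OCR" text
drops = sum(c for c in counts.values() if c > 0)   # in the truth but missing from the OCR output
adds = sum(-c for c in counts.values() if c < 0)   # in the OCR output but absent from the truth
print(drops, adds)  # 2 3 -- 'k' and 'e' dropped; an extra 'i', plus 's' and 'g', added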
Example 18
Project: advent-of-code-2018   Author: badouralix   File: wenceslas.py    MIT License 6 votes vote down vote up
def run(self, s):
        # :param s: input in string format
        # :return: solution flag
        # Your code goes here

        ids_twice = 0
        ids_thrice = 0

        for box_id in s.splitlines():
            counter = Counter(box_id)
            if 2 in counter.values():
                ids_twice += 1
            if 3 in counter.values():
                ids_thrice += 1
    
        return ids_twice*ids_thrice 
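
This example (like Examples 60 through 66 further down) counts box IDs containing some letter exactly twice and some letter exactly three times, then multiplies the two tallies. A standalone sketch with sample IDs:

from collections import Counter

box_ids = ["abcdef", "bababc", "abbcde", "abcccd", "aabcdd", "abcdee", "ababab"]
twice = thrice = 0
for box_id in box_ids:
    letter_counts = Counter(box_id).values()
    twice += 2 in letter_counts    # booleans add as 0 or 1
    thrice += 3 in letter_counts
print(twice * thrice)  # 4 * 3 = 12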
Example 19
Project: aurora   Author: carnby   File: functions.py    MIT License 6 votes vote down vote up
def entropy(predictions, normalize=False, n_categories=None):
    counts = Counter(list(predictions)).values()

    if len(counts) <= 1:
        return 0.0

    total_count = float(len(predictions))

    if not total_count:
        return 0.0

    probabilities = []
    for count in counts:
        probabilities.append(count / total_count)

    probabilities = np.array(probabilities)
    value = - np.sum(probabilities * np.log(probabilities))
    if normalize is True:
        if n_categories is not None:
            value /= float(np.log(n_categories))
        else:
            value /= float(np.log(len(counts)))
    return value 
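
A quick worked run of the entropy computation above, using made-up predictions rather than project data:

import numpy as np
from collections import Counter

predictions = ['a', 'a', 'b', 'c']
counts = Counter(predictions).values()                              # counts 2, 1, 1
probabilities = np.array([c / len(predictions) for c in counts])    # 0.5, 0.25, 0.25
print(-np.sum(probabilities * np.log(probabilities)))               # ~1.04, vs. np.log(3) ~ 1.10 for a uniform spread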
Example 20
Project: aurora   Author: carnby   File: filtering.py    MIT License 6 votes vote down vote up
def __estimate_entropy__(self):
        counts = self.feature_vector_counts #Counter(self.timeline_feature_vectors)
        #print counts
        #N = float(sum(counts.values()))
        N = float(len(self.timeline) + 1)
        max_H = np.log(float(len(list(filter(lambda x: x, counts)))))

        if np.equal(max_H, 0.0):
            return 0.0

        entropy = 0.0

        for key in counts.keys():
            if counts[key] > 0:
                key_probability = counts[key] / N
                entropy += -(key_probability * np.log(key_probability))

        entropy /= max_H

        #print u'N={0}, |counts|={3}, max_H={1}, entropy={2}, counter={4}'.format(N, max_H, entropy, len(counts), counts)
        return entropy 
Example 21
Project: fs_image   Author: facebookincubator   File: inode_utils.py    MIT License 5 votes vote down vote up
def __init__(self, inodes: Iterator[Union['Inode', IncompleteInode]]):
        self.counter = Counter(
            ino.xattrs[_SELINUX_XATTR]
                for ino in inodes if _SELINUX_XATTR in ino.xattrs
        ) 
Example 22
Project: fs_image   Author: facebookincubator   File: send_stream.py    MIT License 5 votes vote down vote up
def get_frequency_of_selinux_xattrs(items):
    'Returns {"xattr_value": <count>}. Useful for ItemFilters.selinux_xattr.'
    counter = Counter()
    for item in items:
        if isinstance(item, SendStreamItems.set_xattr):
            if item.name == _SELINUX_XATTR:
                counter[item.data] += 1
    return counter 
Example 23
Project: fs_image   Author: facebookincubator   File: subvolume_set.py    MIT License 5 votes vote down vote up
def new(cls, **kwargs) -> 'SubvolumeSet':
        kwargs.setdefault('uuid_to_subvolume', {})
        kwargs.setdefault('name_uuid_prefix_counts', Counter())
        return cls(**kwargs) 
Example 24
Project: mutatest   Author: EvanKepner   File: report.py    MIT License 5 votes vote down vote up
def get_status_summary(trial_results: List[MutantTrialResult]) -> Dict[str, Union[str, int]]:
    """Create a status summary dictionary for later formatting.

    Args:
        trial_results: list of mutant trials

    Returns:
        Dictionary with keys for formatting in the report
    """
    status: Dict[str, Union[str, int]] = dict(Counter([t.status for t in trial_results]))
    status["TOTAL RUNS"] = len(trial_results)
    status["RUN DATETIME"] = str(datetime.now())

    return status 
Example 25
Project: explirefit   Author: codogogo   File: data_helper.py    Apache License 2.0 5 votes vote down vote up
def build_vocab(texts):
	"""
	Builds a vocabulary mapping from word to index based on the sentences.
	Returns vocabulary mapping and inverse vocabulary mapping.
	"""
	# Build vocabulary
	word_counts = Counter(itertools.chain(*texts))
	# Mapping from index to word
	vocabulary_invariable = [x[0] for x in word_counts.most_common()]
	vocabulary_invariable = list(sorted(vocabulary_invariable))
	# Mapping from word to index
	vocabulary = {x: i for i, x in enumerate(vocabulary_invariable)}
	inverse_vocabulary = {v: k for k, v in vocabulary.items()}
	return [vocabulary, inverse_vocabulary] 
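
The Counter(itertools.chain(*texts)) idiom flattens the nested token lists before counting. A minimal sketch with toy sentences:

import itertools
from collections import Counter

texts = [['the', 'cat', 'sat'], ['the', 'dog', 'sat']]
word_counts = Counter(itertools.chain(*texts))
print(word_counts.most_common())  # [('the', 2), ('sat', 2), ('cat', 1), ('dog', 1)]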
Example 26
Project: rhodonite   Author: nestauk   File: basic.py    MIT License 5 votes vote down vote up
def cooccurrence_graph(sequences, directed=False):
    '''cooccurrence_graph
    Creates a cooccurrence graph from a nested sequence.

    Parameters
    ----------
    sequences : :obj:`iter` of :obj:`iter` of :obj:`int` 
        A nested list-like object containing groups of vertices represented by
        integers.
    directed : :obj:`bool`, optional  
        If `True`, then each coocurring pair will be linked by a pair of 
        antiparallel edges, each with an equal weight. Defaults to `False`.
    
    Returns
    -------
    g : :obj:`graph_tool.Graph`
        A cooccurrence graph.
    o : :obj:`graph_tool.VertexPropertyMap`
        A vertex property map of vertex occurrence frequency.
    co : :obj:`graph_tool.EdgePropertyMap`
        An edge property map of vertex cooccurrence frequency. 
    '''
    g = Graph(directed=directed)

    o = Counter(flatten(sequences))
    n_vertices = len(o)
    g.add_vertex(n_vertices)
    o_vprop = dict_to_vertex_prop(g, o, 'int')

    co = cooccurrence_counts(sequences)
    if directed:
            co.update({k[::-1]: v for k, v in co.items()})
    edge_list = ((c[0], c[1], count) for c, count in co.items())
    co_eprop = g.new_edge_property('int')
    g.add_edge_list(edge_list, eprops=[co_eprop])

    return g, o_vprop, co_eprop 
Example 27
Project: pyblish-win   Author: pyblish   File: case.py    GNU Lesser General Public License v3.0 5 votes vote down vote up
def assertItemsEqual(self, expected_seq, actual_seq, msg=None):
        """An unordered sequence specific comparison. It asserts that
        actual_seq and expected_seq have the same element counts.
        Equivalent to::

            self.assertEqual(Counter(iter(actual_seq)),
                             Counter(iter(expected_seq)))

        Asserts that each element has the same count in both sequences.
        Example:
            - [0, 1, 1] and [1, 0, 1] compare equal.
            - [0, 0, 1] and [0, 1] compare unequal.
        """
        first_seq, second_seq = list(expected_seq), list(actual_seq)
        with warnings.catch_warnings():
            if sys.py3kwarning:
                # Silence Py3k warning raised during the sorting
                for _msg in ["(code|dict|type) inequality comparisons",
                             "builtin_function_or_method order comparisons",
                             "comparing unequal types"]:
                    warnings.filterwarnings("ignore", _msg, DeprecationWarning)
            try:
                first = collections.Counter(first_seq)
                second = collections.Counter(second_seq)
            except TypeError:
                # Handle case with unhashable elements
                differences = _count_diff_all_purpose(first_seq, second_seq)
            else:
                if first == second:
                    return
                differences = _count_diff_hashable(first_seq, second_seq)

        if differences:
            standardMsg = 'Element counts were not equal:\n'
            lines = ['First has %d, Second has %d:  %r' % diff for diff in differences]
            diffMsg = '\n'.join(lines)
            standardMsg = self._truncateMessage(standardMsg, diffMsg)
            msg = self._formatMessage(msg, standardMsg)
            self.fail(msg) 
Example 28
Project: python-samples   Author: dek-odoo   File: dek_program022.py    Apache License 2.0 5 votes vote down vote up
def main(sentence):

    res = collections.Counter()
    sortedwords = sorted(sentence.split(' '))
    for word in sortedwords:
        res[word] += 1

    result = sorted(res)
    print result
    for result_word in result:
        print result_word, ':', res[result_word] 
Example 29
Project: wikilinks   Author: trovdimi   File: WikipediaFedTextParser.py    MIT License 5 votes vote down vote up
def __set_fed_text(self, fed_text):
        links = self.get_links_position(fed_text)
        counts = Counter(links)  # so we have: {'name':3, 'state':1, 'city':1, 'zip':2}
        for s, num in counts.items():
            if num > 1:  # ignore strings that only appear once
                for suffix in range(1, num + 1):  # suffix starts at 1 and increases by 1 each time
                    links[links.index(s)] = s + '-----##$$$##-----' + str(suffix)  # replace each appearance of s
        for s in links:  # replace each not unique link in fed_text with unique one
            if s.find('-----##$$$##-----')!=-1:
                search = s.split('-----##$$$##-----')[0]
                next_occ = fed_text.find('[['+search+']]')
                fed_text = fed_text[:next_occ] + '[[' + s + ']]' + fed_text[next_occ+len('[['+search+']]'):]
        return fed_text 
Example 30
Project: 2016adventofcode   Author: bildzeitung   File: p.py    GNU General Public License v3.0 5 votes vote down vote up
def isreal(sector, room, checksum):
    strack = ''.join([z[0] for z in sorted(Counter(room).most_common(),
                                           key=lambda (x, y): (-y, x)
                                           )][0:5]) 
Example 31
Project: Neural-LP   Author: fanyangxyz   File: data.py    MIT License 5 votes vote down vote up
def resplit(train, facts, no_link_percent):
    num_train = len(train)
    num_facts = len(facts)
    all = train + facts
    
    if no_link_percent == 0.:
        np.random.shuffle(all)
        new_train = all[:num_train]
        new_facts = all[num_train:]
    else:
        link_cntr = Counter()
        for tri in all:
            link_cntr[(tri[1], tri[2])] += 1
        tmp_train = []
        tmp_facts = []
        for tri in all:
            if link_cntr[(tri[1], tri[2])] + link_cntr[(tri[2], tri[1])] > 1:
                if np.random.random() < no_link_percent:
                    tmp_facts.append(tri)
                else:
                    tmp_train.append(tri)
            else:
                tmp_train.append(tri)
        
        if len(tmp_train) > num_train:
            np.random.shuffle(tmp_train)
            new_train = tmp_train[:num_train]
            new_facts = tmp_train[num_train:] + tmp_facts
        else:
            np.random.shuffle(tmp_facts)
            num_to_fill = num_train - len(tmp_train)
            new_train = tmp_train + tmp_facts[:num_to_fill]
            new_facts = tmp_facts[num_to_fill:]
    
    assert(len(new_train) == num_train)
    assert(len(new_facts) == num_facts)

    return new_train, new_facts 
Example 32
Project: Neural-LP   Author: fanyangxyz   File: data.py    MIT License 5 votes vote down vote up
def _count_batch(self, samples, batch_size):
        relations = zip(*samples)[0]
        relations_counts = Counter(relations)
        num_batches = [ceil(1. * x / batch_size) for x in relations_counts.values()]
        return int(sum(num_batches)) 
Example 33
Project: zmirror   Author: aploium   File: zmirror.py    MIT License 5 votes vote down vote up
def _regex_generate__basic_mirrorlization():
    """产生 regex_basic_mirrorlization
    用一个函数包裹起来是因为在 try_match_and_add_domain_to_rewrite_white_list()
    中需要动态修改 external_domains, 修改以后可能需要随之生成新的正则, 包裹一下比较容易调用
    """
    from collections import Counter

    # Count how often each TLD suffix occurs and sort them in descending order of frequency, which helps regex efficiency
    c = Counter(re.escape(x.split(".")[-1]) for x in allowed_domains_set)
    regex_all_remote_tld = sorted(list(c.keys()), key=lambda x: c[x], reverse=True)

    regex_all_remote_tld = "(?:" + "|".join(regex_all_remote_tld) + ")"
    return re.compile(
        r"""(?:""" +
        (  # [[http(s):]//] or [\?["']] or %27 %22 or &quot;
            r"""(?P<scheme>""" +
            (  # [[http(s):]//]
                (  # [http(s):]
                    r"""(?:https?(?P<colon>{REGEX_COLON}))?""".format(REGEX_COLON=REGEX_COLON)  # https?:
                ) +
                r"""(?P<scheme_slash>%s)(?P=scheme_slash)""" % REGEX_SLASH  # //
            ) +
            r""")""" +
            r"""|""" +
            # [\?["']] or %27 %22 or &quot
            r"""(?P<quote>{REGEX_QUOTE})""".format(REGEX_QUOTE=REGEX_QUOTE)
        ) +
        r""")""" +
        # End prefix.
        # Begin domain
        r"""(?P<domain>([a-zA-Z0-9-]+\.){1,5}%s)\b""" % regex_all_remote_tld +
        # Optional suffix slash
        r"""(?P<suffix_slash>(?(scheme_slash)(?P=scheme_slash)|{SLASH}))?""".format(SLASH=REGEX_SLASH) +

        # right quote (if we have left quote)
        r"""(?(quote)(?P=quote))"""
    ) 
Example 34
Project: DataComp   Author: Cojabi   File: utils.py    Apache License 2.0 5 votes vote down vote up
def _categorical_table(data):
    """
    Returns the number of occurrences for the categories. Is used to build the observation table
    for a chi square test.

    :param data:
    :return:
    """
    # count occurrences
    c = Counter(data)
    # delete NaNs
    c = {key: c[key] for key in c if not pd.isnull(key)}

    return pd.Series(c) 
Example 35
Project: google-tech-dev-guide   Author: alvinctk   File: minimum_substring_window.py    Apache License 2.0 5 votes vote down vote up
def minWindow(self, s:str, t: str) -> str:
        n, m = len(s), len(t)
        if (s == "" and t == "") or m > n:
            print("minWindow({}, {}) = \"{}\"".format(s, t, ""))
            return ""
        left = right = min_left = min_right = 0
        count = Counter(t)
        window = defaultdict(int)
        w = 0
        length = float("inf")
        while right < n:
            if w == 0 and s[right] not in count:
                left += 1
                right += 1
                continue
            if w < m and s[right] in count:
                window[s[right]] += 1
                if window[s[right]] <= count[s[right]]:
                    w += 1

            if w == m:
                #print(s[left:right+1], right - left + 1)
                if right - left + 1 < length:
                    length = right - left + 1
                    min_left = left
                    min_right = right
                if s[left] in count:
                    window[s[left]] -= 1
                    if window[s[left]] + 1 <= count[s[left]]:
                        w -= 1
                        right += 1

                left += 1
            else:
                right += 1
        result =  "" if length == float("inf") else s[min_left:min_right+1]
        print("minWindow({}, {}) = {}".format(s, t, result))
        return result 
Example 36
Project: google-tech-dev-guide   Author: alvinctk   File: longest_palindrome.py    Apache License 2.0 5 votes vote down vote up
def longestPalindrome2(self, s: str) -> int:
        """
        Runtime: 36 ms, faster than 84.30% of Python3 online submissions for Longest Palindrome.
        Memory Usage: 13.8 MB, less than 8.33% of Python3 online submissions for Longest Palindrome.
        """
        odd = 0
        for k, v in Counter(s).items():
            if v % 2 == 1:
                odd += 1
        return len(s) if odd <= 1 else len(s) - odd + 1 
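
The formula in longestPalindrome2 keeps every pair of repeated characters and, if any character has an odd count, one extra character for the palindrome's center. A worked sketch:

from collections import Counter

s = "abccccdd"
odd = sum(1 for v in Counter(s).values() if v % 2 == 1)    # 'a' and 'b' have odd counts
print(len(s) if odd <= 1 else len(s) - odd + 1)            # 7, e.g. "dccaccd"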
Example 37
Project: google-tech-dev-guide   Author: alvinctk   File: longest_palindrome.py    Apache License 2.0 5 votes vote down vote up
def longestPalindrome3(self, s: str) -> int:

        unique = Counter(s)
        odd = [k for k, v in unique.items() if v % 2 == 1]
        if not odd or len(odd) == 1:
            return len(s)
        else:
            return len(s) - len(odd) + 1 
Example 38
Project: google-tech-dev-guide   Author: alvinctk   File: palindrome_permutation.py    Apache License 2.0 5 votes vote down vote up
def canPermutePalindrome(self, s: str) -> bool:
        unique = Counter(s)
        odd = [k for k, v in unique.items() if v % 2 == 1]
        if odd and len(odd) > 1:
            return False
        else:
            return True 
Example 39
Project: models   Author: kipoi   File: model.py    MIT License 5 votes vote down vote up
def _count_gc_content(self, seq):
        import collections
        count_gc = collections.Counter(seq)
        return (count_gc['g'] + count_gc['G'] + count_gc['c'] + count_gc['C']) / len(seq) 
Example 40
Project: models   Author: kipoi   File: model.py    MIT License 5 votes vote down vote up
def _count_gc_content(self, seq):
        import collections
        count_gc = collections.Counter(seq)
        return (count_gc['g'] + count_gc['G'] + count_gc['c'] + count_gc['C']) / len(seq) 
Example 41
Project: programsynthesishunting   Author: flexgp   File: baselines.py    GNU General Public License v3.0 5 votes vote down vote up
def fit_maj_class(train_X, train_y, test_X):
    """
    Use the majority class, for a binary problem...
    
    :param train_X: An array of input (X) training data.
    :param train_y: An array of expected output (Y) training data.
    :param test_X: An array of input (X) testing data.
    :return:
    """
    
    # Set training Y data to int type.
    train_y = train_y.astype(int)
    
    # Get all classes from training Y data, often just {0, 1} or {-1, 1}.
    classes = set(train_y)
    
    # Get majority class.
    maj = Counter(train_y).most_common(1)[0][0]
    
    # Generate model.
    model = "Majority class %d" % maj
    
    # Generate training and testing output values.
    yhat_train = maj * np.ones(len(train_y))
    yhat_test = maj * np.ones(len(test_X))
    
    return model, yhat_train, yhat_test 
Example 42
Project: Ansible-Example-AB2018   Author: umit-ozturk   File: vmware_dvs_host.py    MIT License 5 votes vote down vote up
def check_uplinks(self):
        pnic_device = []

        for dvs_host_member in self.dv_switch.config.host:
            if dvs_host_member.config.host == self.host:
                for pnicSpec in dvs_host_member.config.backing.pnicSpec:
                    pnic_device.append(pnicSpec.pnicDevice)

        return Counter(pnic_device) == Counter(self.vmnics) 
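
The final comparison treats the two vmnic lists as multisets: ordering is ignored, but multiplicity matters. A standalone illustration with hypothetical NIC names:

from collections import Counter

print(Counter(['vmnic0', 'vmnic1']) == Counter(['vmnic1', 'vmnic0']))   # True: same elements, order ignored
print(Counter(['vmnic0']) == Counter(['vmnic0', 'vmnic0']))             # False: counts differ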
Example 43
Project: tensorflow_generate_headlines   Author: FanWan   File: data_utils.py    GNU General Public License v3.0 5 votes vote down vote up
def build_dict(train=False, word2index_path=None):
    article_list, title_list = get_init_data(train)
    if not os.path.exists(os.path.dirname(word2index_path)):
        # get word2index dictionary
        words = list()
        for sent in article_list + title_list:
            for word in sent:
                words.append(word)

        word_counter = collections.Counter(words).most_common(n=50000)
        word_dict = dict()
        word_dict["<padding>"] = 0
        word_dict["<unk>"] = 1
        word_dict["<s>"] = 2
        word_dict["</s>"] = 3
        for word, _ in word_counter:
            word_dict[word] = len(word_dict)

        # save word2index dictionary
        os.makedirs(os.path.dirname(word2index_path))
        with open(word2index_path, "wb") as f:
            pickle.dump(word_dict, f)
    else:
        with open(word2index_path, "rb") as f:
            word_dict = pickle.load(f)

    reversed_dict = dict(zip(word_dict.values(), word_dict.keys()))
    return word_dict, reversed_dict, article_list, title_list 
Example 44
Project: VSE-C   Author: ExplorerFreda   File: vocab.py    MIT License 5 votes vote down vote up
def build_vocab(data_path, data_name, jsons, threshold):
    """Build a simple vocabulary wrapper."""
    counter = Counter()
    for path in jsons[data_name]:
        full_path = os.path.join(os.path.join(data_path, data_name), path)
        if data_name == 'coco':
            captions = from_coco_json(full_path)
        elif data_name == 'f8k' or data_name == 'f30k':
            captions = from_flickr_json(full_path)
        else:
            captions = from_txt(full_path)
        for i, caption in enumerate(captions):
            tokens = nltk.tokenize.word_tokenize(
                caption.lower().decode('utf-8'))
            counter.update(tokens)

            if i % 1000 == 0:
                print(("[%d/%d] tokenized the captions." % (i, len(captions))))

    # Discard if the occurrence of the word is less than min_word_cnt.
    words = [word for word, cnt in list(counter.items()) if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab 
Example 45
Project: deep-learning-note   Author: wdxtub   File: utils.py    MIT License 5 votes vote down vote up
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return torchtext.vocab.Vocab(counter, min_freq=5) 
Example 46
Project: deep-learning-note   Author: wdxtub   File: 53_machine_translation.py    MIT License 5 votes vote down vote up
def build_data(all_tokens, all_seqs):
    vocab = Vocab.Vocab(collections.Counter(all_tokens),
                        specials=[PAD, BOS, EOS])
    indices = [[vocab.stoi[w] for w in seq] for seq in all_seqs]
    return vocab, torch.tensor(indices) 
Example 47
Project: L   Author: vaultah   File: records.py    MIT License 5 votes vote down vote up
def __enter__(self):
        ''' Return the instance.
            A caller can update the fields; the updated document will be put
            into the database on __exit__() '''
        self._updates = self._updates or {}
        self._removals = self._removals or []
        self._counters = self._counters or collections.Counter()
        # Must be set at the end of this method
        self.__context_depth__ += 1
        return self 
Example 48
Project: mmi-tagger   Author: karlstratos   File: data.py    MIT License 5 votes vote down vote up
def get_data(self):
        wcount = Counter()
        ccount = Counter()
        def add(w):
            wcount[w] += 1
            if w not in self.w2i:
                self.i2w.append(w)
                self.w2i[w] = len(self.i2w) - 1
            for c in w:
                ccount[c] += 1
                if c not in self.c2i:
                    self.i2c.append(c)
                    self.c2i[c] = len(self.i2c) - 1
            return self.w2i[w]

        with open(self.data_path, 'r') as data_file:
            for line in data_file:
                toks = line.split()
                if toks:
                    self.sents.append([add(tok) for tok in toks])

        self.word_counter = [wcount[self.i2w[i]] for i in range(len(self.i2w))]
        self.char_counter = [ccount[self.i2c[i]] for i in range(len(self.i2c))]

        gold_path = self.data_path[:-5] + 'tags'
        assert os.path.isfile(gold_path)
        self.get_golds(gold_path) 
Example 49
Project: mmi-tagger   Author: karlstratos   File: evaluate.py    MIT License 5 votes vote down vote up
def compute_v_measure(tseqs, zseqs):
    num_instances = 0
    t2i = {}
    z2i = {}
    cocount = Counter()
    for i in range(len(tseqs)):
        for (t, z) in zip(tseqs[i], zseqs[i]):
            num_instances += 1
            if not t in t2i: t2i[t] = len(t2i)
            if not z in z2i: z2i[z] = len(z2i)
            cocount[(t2i[t], z2i[z])] += 1

    B = np.empty([len(t2i), len(z2i)])
    for i in range(len(t2i)):
        for j in range(len(z2i)):
            B[i, j] = cocount[(i, j)] / num_instances

    p_T = np.sum(B, axis=1)
    p_Z = np.sum(B, axis=0)
    H_T = sum([- p_T[i] * np.log2(p_T[i]) for i in range(len(t2i))])
    H_Z = sum([- p_Z[i] * np.log2(p_Z[i]) for i in range(len(z2i))])

    H_T_given_Z = 0
    for j in range(len(z2i)):
        for i in range(len(t2i)):
            if B[i, j] > 0.0:
                H_T_given_Z -= B[i, j] * \
                               (np.log2(B[i, j]) - np.log2(p_Z[j]))
    H_Z_given_T = 0
    for j in range(len(t2i)):
        for i in range(len(z2i)):
            if B[j, i] > 0.0:
                H_Z_given_T -= B[j, i] * \
                               (np.log2(B[j, i]) - np.log2(p_T[j]))

    h = 1 if len(t2i) == 1 else 1 - H_T_given_Z / H_T
    c = 1 if len(z2i) == 1 else 1 - H_Z_given_T / H_Z

    return 2 * h * c / (h + c) * 100.0 
Example 50
Project: mmi-tagger   Author: karlstratos   File: evaluate.py    MIT License 5 votes vote down vote up
def count_cooccurence(tseqs, zseqs):
    cooccur = {}
    assert len(tseqs) == len(zseqs)
    for i in range(len(tseqs)):
        assert len(tseqs[i]) == len(zseqs[i])
        for (t, z) in zip(tseqs[i], zseqs[i]):
            if not z in cooccur: cooccur[z] = Counter()
            cooccur[z][t] += 1
    return cooccur 
Example 51
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: data.py    Apache License 2.0 5 votes vote down vote up
def __init__(self):
        self._token_to_id = {}
        self._token_to_count = collections.Counter()
        self._id_to_token = []
        self._num_tokens = 0
        self._total_count = 0
        self._s_id = None
        self._unk_id = None 
Example 52
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: ner.py    Apache License 2.0 5 votes vote down vote up
def build_vocab(nested_list):
    """
    :param nested_list: list of list of string
    :return: dictionary mapping from string to int, inverse of that dictionary
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*nested_list))

    # Mapping from index to label
    vocabulary_inv = [x[0] for x in word_counts.most_common()]

    # Mapping from label to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv 
Example 53
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: data_helpers.py    Apache License 2.0 5 votes vote down vote up
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv] 
Example 54
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: data_helpers.py    Apache License 2.0 5 votes vote down vote up
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv] 
Example 55
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: test_contrib_text.py    Apache License 2.0 5 votes vote down vote up
def _test_count_tokens_from_str_with_delims(token_delim, seq_delim):
    source_str = _get_test_str_of_tokens(token_delim, seq_delim)

    cnt1 = text.utils.count_tokens_from_str(
        source_str, token_delim, seq_delim, to_lower=False)
    assert cnt1 == Counter(
        {'is': 2, 'life': 2, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt2 = text.utils.count_tokens_from_str(
        source_str, token_delim, seq_delim, to_lower=True)
    assert cnt2 == Counter(
        {'life': 3, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1})

    counter_to_update = Counter({'life': 2})

    cnt3 = text.utils.count_tokens_from_str(
        source_str, token_delim, seq_delim, to_lower=False,
        counter_to_update=counter_to_update.copy())
    assert cnt3 == Counter(
        {'is': 2, 'life': 4, '.': 2, 'Life': 1, 'great': 1, '!': 1, 'good': 1, "isn't": 1,
         'bad': 1})

    cnt4 = text.utils.count_tokens_from_str(
        source_str, token_delim, seq_delim, to_lower=True,
        counter_to_update=counter_to_update.copy())
    assert cnt4 == Counter(
        {'life': 5, 'is': 2, '.': 2, 'great': 1, '!': 1, 'good': 1, "isn't": 1, 'bad': 1}) 
Example 56
Project: directed-probe-matching   Author: rc1035   File: clusterSimilarSSIDSets.py    MIT License 5 votes vote down vote up
def filter_false_pos_tokens_from_cluster(token_to_probes, cluster):
    """ Remove any token from a cluster that does not have the most common fingerprint.
    :param token_to_probes: Dictionary of token to list of probe dictionary
    :param cluster: set of tokens
    """
    token_to_fingerprint = {}

    # First match each token to its probe's fingerprints
    for token in cluster:
        fingerprints = set()
        fingerprints |= {probe["fingerprint"] for probe in token_to_probes[token]}

        # We only care about a token if its fingerprint is stable
        # i.e. it does not change.
        if len(fingerprints) == 1:
            token_to_fingerprint[token] = fingerprints.pop()

    if not token_to_fingerprint:
        # Do nothing - no token has a stable fingerprint
        return cluster

    # Now remove any token whose fingerprint is not consistent with the
    # most common fingerprint.
    most_common_fingerprint = Counter(token_to_fingerprint.values()).most_common(1)[0][0]

    return cluster - {token for token in token_to_fingerprint.keys() 
           if token_to_fingerprint[token] != most_common_fingerprint} 
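
The most_common(1)[0][0] idiom above extracts the single most frequent value. A short sketch with invented fingerprints:

from collections import Counter

token_to_fingerprint = {'tok1': 'fpA', 'tok2': 'fpB', 'tok3': 'fpA'}
most_common_fingerprint = Counter(token_to_fingerprint.values()).most_common(1)[0][0]
print(most_common_fingerprint)  # 'fpA'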
Example 57
Project: DOTA_models   Author: ringringyi   File: preprocess_dataset.py    Apache License 2.0 5 votes vote down vote up
def main(unused_argv):
  if not FLAGS.input_files:
    raise ValueError("--input_files is required.")
  if not FLAGS.output_dir:
    raise ValueError("--output_dir is required.")

  if not tf.gfile.IsDirectory(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  input_files = []
  for pattern in FLAGS.input_files.split(","):
    match = tf.gfile.Glob(FLAGS.input_files)
    if not match:
      raise ValueError("Found no files matching %s" % pattern)
    input_files.extend(match)
  tf.logging.info("Found %d input files.", len(input_files))

  vocab = _build_vocabulary(input_files)

  tf.logging.info("Generating dataset.")
  stats = collections.Counter()
  dataset = []
  for filename in input_files:
    dataset.extend(_process_input_file(filename, vocab, stats))
    if FLAGS.max_sentences and stats["sentences_output"] >= FLAGS.max_sentences:
      break

  tf.logging.info("Generated dataset with %d sentences.", len(dataset))
  for k, v in stats.items():
    tf.logging.info("%s: %d", k, v)

  tf.logging.info("Shuffling dataset.")
  np.random.seed(123)
  shuffled_indices = np.random.permutation(len(dataset))
  val_indices = shuffled_indices[:FLAGS.num_validation_sentences]
  train_indices = shuffled_indices[FLAGS.num_validation_sentences:]

  _write_dataset("train", dataset, train_indices, FLAGS.train_output_shards)
  _write_dataset("validation", dataset, val_indices,
                 FLAGS.validation_output_shards) 
Example 58
Project: DOTA_models   Author: ringringyi   File: reader.py    Apache License 2.0 5 votes vote down vote up
def _build_vocab(filename):
  data = _read_words(filename)

  counter = collections.Counter(data)
  count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

  words, _ = list(zip(*count_pairs))
  word_to_id = dict(zip(words, range(len(words))))

  return word_to_id 
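
The sort key (-x[1], x[0]) orders words by descending frequency and breaks ties alphabetically. A small standalone sketch:

import collections

data = ['b', 'a', 'a', 'c', 'b']
counter = collections.Counter(data)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
print(count_pairs)   # [('a', 2), ('b', 2), ('c', 1)]
word_to_id = dict(zip([w for w, _ in count_pairs], range(len(count_pairs))))
print(word_to_id)    # {'a': 0, 'b': 1, 'c': 2}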
Example 59
Project: advent-of-code-2018   Author: badouralix   File: ludoge.py    MIT License 5 votes vote down vote up
def run(self, s):
        # :param s: input in string format
        # :return: solution flag
        # Your code goes here
        lines = sorted(s.splitlines())
        guards = {}
        for line in lines:
            minute = int(line.split()[1][3:5])
            if "#" in line:
                guard_id = int(line.split()[3][1:])
                if guard_id not in guards:
                    guards[guard_id] = []
                wakes_up = minute
            if "falls asleep" in line:
                falls_asleep = minute
            if "wakes up" in line:
                wakes_up = minute
                guards[guard_id] += list(range(falls_asleep, wakes_up))


        guard_minute_frequency = {k: dict(Counter(v)) for k, v in guards.items() if Counter(v)}
        guard_minute_max = {k: max(v.values()) for k, v in guard_minute_frequency.items()}
        max_guard = max(guard_minute_frequency, key=guard_minute_max.get)
        max_guard_sleeps = guards[max_guard]
        max_minute = max(set(max_guard_sleeps), key=max_guard_sleeps.count)

        return max_minute * max_guard

        pass 
Example 60
Project: advent-of-code-2018   Author: badouralix   File: silvestre.py    MIT License 5 votes vote down vote up
def run(self, s):
        # :param s: input in string format
        # :return: solution flag
        lines = s.splitlines()
        twice = three_times = 0
        for row in lines:
            c = Counter(row)
            if 2 in c.values():
                twice += 1
            if 3 in c.values():
                three_times += 1
        return twice * three_times 
Example 61
Project: advent-of-code-2018   Author: badouralix   File: bebert.py    MIT License 5 votes vote down vote up
def run(self, s: str):
        twos = 0
        threes = 0
        for line in s.splitlines():
            count = Counter(line.strip())
            if 2 in count.values():
                twos += 1
            if 3 in count.values():
                threes += 1
        return twos * threes 
Example 62
Project: advent-of-code-2018   Author: badouralix   File: david.py    MIT License 5 votes vote down vote up
def run(self, s):
        words = s.split("\n")
        appears2, appears3 = (0,0)
        for w in words:
            c = Counter(w)
            if 2 in c.values():
                appears2 += 1
            if 3 in c.values():
                appears3 += 1

        return appears2 * appears3 
Example 63
Project: advent-of-code-2018   Author: badouralix   File: ludoge.py    MIT License 5 votes vote down vote up
def count(self, s):
        for line in s.splitlines():
            counter = Counter(line)
            y1 = 1 if [c for c in counter if counter[c] == 2] else 0
            y2 = 1 if [c for c in counter if counter[c] == 3] else 0
            yield (y1, y2) 
Example 64
Project: advent-of-code-2018   Author: badouralix   File: jon.py    MIT License 5 votes vote down vote up
def run(self, s):

        c2 = 0
        c3 = 0

        for l in s.splitlines():
            c = Counter(l.strip())
            if 2 in c.values():
                c2 += 1
            if 3 in c.values():
                c3 += 1

        return c2 * c3 
Example 65
Project: advent-of-code-2018   Author: badouralix   File: mathieu.py    MIT License 5 votes vote down vote up
def run(self, s):
        # :param s: input in string format
        # :return: solution flag
        # Your code goes here
        lines = s.split('\n')
        count_twice = 0
        count_three = 0
        for line in lines:
            counter = Counter(line)
            if 2 in counter.values():
                count_twice += 1
            if 3 in counter.values():
                count_three += 1

        return count_three * count_twice 
Example 66
Project: advent-of-code-2018   Author: badouralix   File: thomas.py    MIT License 5 votes vote down vote up
def run(self, s):
        two = 0
        three = 0

        for word in s.split('\n'):
            letters = Counter(word)
            two += 2 in letters.values()
            three += 3 in letters.values()

        return two * three 
Example 67
Project: geopar   Author: ebraude   File: tfvalidator.py    MIT License 5 votes vote down vote up
def rule_pairing(a_tf):
        """
        Checks whether a rule of pairing is valid in a triangulated figure a_tf.

        PRE
        a_tf is an instance of TriangulatedFigure class containing at least one triangle.

        POST
        True is returned if the rule is valid, False otherwise.
        """

        ########################################################################
        if a_tf.is_empty():
            raise Exception('a_tf is empty! See precondition PRE')
        ########################################################################

        following, preceding = [], []
        for point in a_tf.get_interior_points():
            for tri in a_tf.triangles_at(point):
                following.append(tri.angle_of_point(tri.point_following(point)))
                preceding.append(tri.angle_of_point(tri.point_preceding(point)))

            if Counter(following) != Counter(preceding):
                return False

        return True 
Example 68
Project: aurora   Author: carnby   File: filtering.py    MIT License 5 votes vote down vote up
def reset(self):
        del self.timeline[:]
        del self.timeline_feature_vectors[:]
        self.feature_vector_counts = Counter()
        self.timeline_ids.clear()
        self.user_ids.clear()
        self.timeline_urls.clear()
        self.discarded_ids.clear() 
Example 69
Project: lyrebird-api-coverage   Author: Meituan-Dianping   File: jsonscheme.py    MIT License 5 votes vote down vote up
def check_url_redundant(obj):
    repeat_urls = []
    url_list = list(map(lambda x: x.get('url'), obj.get('api_list')))
    url_count = dict(collections.Counter(url_list))
    for k, v in url_count.items():
        if v > 1:
            repeat_urls.append(k)
    return repeat_urls 
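
The same duplicate-URL check can be written as a single comprehension over the Counter. A minimal sketch with made-up paths:

import collections

url_list = ['/login', '/items', '/login', '/cart', '/items', '/login']
repeat_urls = [url for url, n in collections.Counter(url_list).items() if n > 1]
print(repeat_urls)  # ['/login', '/items']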
Example 70
Project: bigquerylayers   Author: smandaric   File: lfu.py    GNU General Public License v3.0 5 votes vote down vote up
def __init__(self, maxsize, getsizeof=None):
        Cache.__init__(self, maxsize, getsizeof)
        self.__counter = collections.Counter() 
Example 71
Project: ImageQA   Author: codedecde   File: reader.py    MIT License 5 votes vote down vote up
def get_vocab(data):
    vocab=Counter()
    for ex in data:
        tokens=tokenize(ex[0])
        tokens+=tokenize(ex[1])
        vocab.update(tokens)
    lst = ["unk", "delimiter"] + [ x for x, y in vocab.iteritems() if y > 0]
    vocab = dict([ (y,x) for x,y in enumerate(lst) ])
    return vocab 
Example 72
Project: fs_image   Author: facebookincubator   File: deepcopy_test.py    MIT License 4 votes vote down vote up
def _check_deepcopy(
        self,
        gen_fn: Callable[[], Generator[Tuple[str, Any], Any, List[str]]],
        replace_step=None, expected_name=None, *, _replace_by=None,
    ) -> List[str]:
        '''
        Steps through `deepcopy_original`, optionally replacing the ID map
        by deepcopy at a specific step of the test.
        '''
        obj = None
        steps = []
        deepcopy_original = None

        with while_not_exited(gen_fn()) as ctx:
            while True:
                step, obj = ctx.send(obj)
                if len(steps) == replace_step:
                    self.assertEqual(expected_name, step)
                    if _replace_by is None:
                        deepcopy_original = obj
                        obj = copy.deepcopy(obj)
                    else:
                        obj = _replace_by
                steps.append(step)

        # Don't repeat step names
        self.assertEqual([], [s for s, n in Counter(steps).items() if n > 1])

        # We just replaced the map with a deepcopy at a specific step.  Now,
        # we run the test one more time up to the same step, and replace the
        # map with the pre-deepcopy original to ensure it has not changed.
        if replace_step is not None and _replace_by is None:
            self.assertIsNotNone(deepcopy_original)
            with self.subTest(deepcopy_original=True):
                self.assertEqual(steps, self._check_deepcopy(
                    gen_fn,
                    replace_step,
                    expected_name,
                    _replace_by=deepcopy_original,
                ))

        return steps 
Example 73
Project: wikilinks   Author: trovdimi   File: WikiBrowser.py    MIT License 4 votes vote down vote up
def _load_finished(self):
        #print "finished"
        frame = self.page().mainFrame()
        size = QSize(self.resolution[0], self.resolution[1])
        self.page().setPreferredContentsSize(size)
        self.resize(frame.contentsSize())
        self.page().setViewportSize(frame.contentsSize())
        html = frame.documentElement()

        # two modes for these lines of code: page length and links position mode; page length activated
        keys = []
        values = []
        for link in html.findAll('a'):
            href = unicode(link.attribute('href'))
            if href.startswith('./'):
                key = href.split('./')[-1].split('#')[0]
                keys.append(key)
                value = link.geometry().topLeft().x(), link.geometry().topLeft().y()
                values.append(value)

        counts = Counter(keys)  # so we have: {'name':3, 'state':1, 'city':1, 'zip':2}
        for s, num in counts.items():
            if num > 1:  # ignore strings that only appear once
                for suffix in range(1, num + 1):  # suffix starts at 1 and increases by 1 each time
                    keys[keys.index(s)] = s + '-----##$$$##-----' + str(suffix)
        self.positions = {k: v for k, v in zip(keys, values)}
        img = QImage(frame.contentsSize(), QImage.Format_ARGB32)
        paint = QPainter(img)
        print("rendering...")
        frame.render(paint)
        paint.end()
        img.save(self.out+"_"+str(self.resolution[0])+".png")
        print("... done")
        print("result: %s"%self.out)
        #print  html.findFirst("div[class=pyqt_is_shit]").geometry().topLeft().y()
        self.page_length = html.findFirst("div[class=pyqt_is_shit]").geometry().topLeft().y()
        print(self.page_length)
        #self.punched.emit(self.page_length)
        #self.close()
        #print "done"
        self.finished = True
        #self.quit() 
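The suffixing loop above is easier to read in isolation: count every key first, then rename each occurrence of any key that appears more than once. A small sketch of the same idea with a plain separator and example data:

from collections import Counter

keys = ['name', 'state', 'name', 'zip', 'name', 'zip']
counts = Counter(keys)  # Counter({'name': 3, 'zip': 2, 'state': 1})
for key, num in counts.items():
    if num > 1:  # only keys that appear more than once get renamed
        for suffix in range(1, num + 1):
            # keys.index() always finds the first not-yet-renamed occurrence
            keys[keys.index(key)] = '{}-{}'.format(key, suffix)
print(keys)  # ['name-1', 'state', 'name-2', 'zip-1', 'name-3', 'zip-2']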
Example 74
Project: Neural-LP   Author: fanyangxyz   File: data.py    MIT License 4 votes vote down vote up
def __init__(self, folder, seed):
        np.random.seed(seed)
        self.seed = seed
        self.kb_relation_file = os.path.join(folder, "kb_relations.txt")
        self.kb_entity_file = os.path.join(folder, "kb_entities.txt")
        self.query_vocab_file = os.path.join(folder, "query_vocabs.txt")

        self.kb_relation_to_number = self._numerical_encode(self.kb_relation_file)
        self.kb_entity_to_number = self._numerical_encode(self.kb_entity_file)
        self.query_vocab_to_number = self._numerical_encode(self.query_vocab_file)

        self.test_file = os.path.join(folder, "test.txt")
        self.train_file = os.path.join(folder, "train.txt")
        self.valid_file = os.path.join(folder, "valid.txt")
        self.facts_file = os.path.join(folder, "facts.txt")

        self.test, self.num_test = self._parse_examples(self.test_file)
        self.train, self.num_train = self._parse_examples(self.train_file)
        self.valid, self.num_valid = self._parse_examples(self.valid_file)
        self.facts, self.num_fact = self._parse_facts(self.facts_file)
        self.all_exams = set([tuple(q + [h, t]) for (q, h, t) in self.train + self.test + self.valid])

        self.num_word = len(self.test[0][0])
        self.num_vocab = len(self.query_vocab_to_number)
        self.num_relation = len(self.kb_relation_to_number)
        self.num_operator = 2 * self.num_relation
        self.num_entity = len(self.kb_entity_to_number)

        self.matrix_db = self._db_to_matrix_db(self.facts)
        self.matrix_db_train = self.matrix_db
        self.matrix_db_test = self.matrix_db
        self.matrix_db_valid = self.matrix_db

        self.type_check = False
        self.domain_size = None
        self.use_extra_facts = False
        self.query_include_reverse = False
        self.share_db = False

        self.parser = self._create_parser()
        #self.query_for_rules = [list(q) for q in Counter([tuple(q) for (q, _, _) in self.test]).keys()]
        self.query_for_rules = [list(q) for q in set([tuple(q) for (q, _, _) in self.test + self.train])] 
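The commented-out line and its replacement collect the same unique queries: Counter(...).keys() and set(...) yield identical unique items, the Counter version just also remembers how often each appeared. A tiny illustration with made-up query tuples:

from collections import Counter

queries = [('born_in',), ('works_for',), ('born_in',)]
unique_via_counter = [list(q) for q in Counter(queries).keys()]
unique_via_set = [list(q) for q in set(queries)]
assert sorted(unique_via_counter) == sorted(unique_via_set)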
Example 75
Project: dynamic-training-with-apache-mxnet-on-aws   Author: awslabs   File: utils.py    Apache License 2.0 4 votes vote down vote up
def count_tokens_from_str(source_str, token_delim=' ', seq_delim='\n',
                          to_lower=False, counter_to_update=None):
    """Counts tokens in the specified string.

    For token_delim=\'<td>\' and seq_delim=\'<sd>\', a specified string of two sequences of
    tokens may look like::

    <td>token1<td>token2<td>token3<td><sd><td>token4<td>token5<td><sd>

    <td> and <sd> are regular expressions. Make use of \\\\ to allow special characters as
    delimiters. The list of
    special characters can be found at https://docs.python.org/3/library/re.html.

    Parameters
    ----------
    source_str : str
        A source string of tokens.
    token_delim : str, default ' '
        A token delimiter.
    seq_delim : str, default '\\\\n'
        A sequence delimiter.
    to_lower : bool, default False
        Whether to convert the source source_str to the lower case.
    counter_to_update : collections.Counter or None, default None
        The collections.Counter instance to be updated with the token counts of `source_str`. If
        None, return a new collections.Counter instance counting tokens from `source_str`.


    Returns
    -------
    collections.Counter
        The `counter_to_update` collections.Counter instance after being updated with the token
        counts of `source_str`. If `counter_to_update` is None, return a new collections.Counter
        instance counting tokens from `source_str`.


    Examples
    --------
    >>> source_str = ' Life is great ! \\n life is good . \\n'
    >>> count_tokens_from_str(source_str, ' ', '\\n', True)
    Counter({'!': 1, '.': 1, 'good': 1, 'great': 1, 'is': 2, 'life': 2})


    >>> source_str = '*Life*is*great*!*\\n*life*is*good*.*\\n'
    >>> count_tokens_from_str(source_str, '\\*', '\\n', True)
    Counter({'is': 2, 'life': 2, '!': 1, 'great': 1, 'good': 1, '.': 1})
    """

    source_str = filter(None,
                        re.split(token_delim + '|' + seq_delim, source_str))
    if to_lower:
        source_str = [t.lower() for t in source_str]

    if counter_to_update is None:
        return collections.Counter(source_str)
    else:
        counter_to_update.update(source_str)
        return counter_to_update 
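Passing counter_to_update lets one Counter accumulate token counts across many strings. The core of that pattern is just repeated update() calls on the same instance, as in this short sketch:

import collections

counter = collections.Counter()
for line in ['life is good', 'life is great']:
    counter.update(line.split())  # the same counter keeps accumulating
print(counter)  # Counter({'life': 2, 'is': 2, 'good': 1, 'great': 1})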
Example 76
Project: DOTA_models   Author: ringringyi   File: preprocess_dataset.py    Apache License 2.0 4 votes vote down vote up
def _build_vocabulary(input_files):
  """Loads or builds the model vocabulary.

  Args:
    input_files: List of pre-tokenized input .txt files.

  Returns:
    vocab: A dictionary of word to id.
  """
  if FLAGS.vocab_file:
    tf.logging.info("Loading existing vocab file.")
    vocab = collections.OrderedDict()
    with tf.gfile.GFile(FLAGS.vocab_file, mode="r") as f:
      for i, line in enumerate(f):
        word = line.decode("utf-8").strip()
        assert word not in vocab, "Attempting to add word twice: %s" % word
        vocab[word] = i
    tf.logging.info("Read vocab of size %d from %s",
                    len(vocab), FLAGS.vocab_file)
    return vocab

  tf.logging.info("Creating vocabulary.")
  num = 0
  wordcount = collections.Counter()
  for input_file in input_files:
    tf.logging.info("Processing file: %s", input_file)
    for sentence in tf.gfile.FastGFile(input_file):
      wordcount.update(sentence.split())

      num += 1
      if num % 1000000 == 0:
        tf.logging.info("Processed %d sentences", num)

  tf.logging.info("Processed %d sentences total", num)

  words = list(wordcount.keys())    # materialize the views so they can be indexed below
  freqs = list(wordcount.values())
  sorted_indices = np.argsort(freqs)[::-1]

  vocab = collections.OrderedDict()
  vocab[special_words.EOS] = special_words.EOS_ID
  vocab[special_words.UNK] = special_words.UNK_ID
  for w_id, w_index in enumerate(sorted_indices[0:FLAGS.num_words - 2]):
    vocab[words[w_index]] = w_id + 2  # 0: EOS, 1: UNK.

  tf.logging.info("Created vocab with %d words", len(vocab))

  vocab_file = os.path.join(FLAGS.output_dir, "vocab.txt")
  with tf.gfile.FastGFile(vocab_file, "w") as f:
    f.write("\n".join(vocab.keys()))
  tf.logging.info("Wrote vocab file to %s", vocab_file)

  word_counts_file = os.path.join(FLAGS.output_dir, "word_counts.txt")
  with tf.gfile.FastGFile(word_counts_file, "w") as f:
    for i in sorted_indices:
      f.write("%s %d\n" % (words[i], freqs[i]))
  tf.logging.info("Wrote word counts file to %s", word_counts_file)

  return vocab 
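Sorting the vocabulary by frequency with np.argsort works, but Counter.most_common() already returns (word, count) pairs in descending-count order without NumPy. A hedged sketch of that alternative (not the project's code):

import collections

wordcount = collections.Counter()
for sentence in ['the cat sat', 'the cat ran']:
    wordcount.update(sentence.split())

for word, freq in wordcount.most_common():  # highest counts first
    print(word, freq)  # the 2, cat 2, sat 1, ran 1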
Example 77
Project: DOTA_models   Author: ringringyi   File: preprocess_dataset.py    Apache License 2.0 4 votes vote down vote up
def _process_input_file(filename, vocab, stats):
  """Processes the sentences in an input file.

  Args:
    filename: Path to a pre-tokenized input .txt file.
    vocab: A dictionary of word to id.
    stats: A Counter object for statistics.

  Returns:
    processed: A list of serialized Example protos
  """
  tf.logging.info("Processing input file: %s", filename)
  processed = []

  predecessor = None  # Predecessor sentence (list of words).
  current = None  # Current sentence (list of words).
  successor = None  # Successor sentence (list of words).

  for successor_str in tf.gfile.FastGFile(filename):
    stats.update(["sentences_seen"])
    successor = successor_str.split()

    # The first 2 sentences per file will be skipped.
    if predecessor and current and successor:
      stats.update(["sentences_considered"])

      # Note that we are going to insert <EOS> later, so we only allow
      # sentences with strictly less than max_sentence_length to pass.
      if FLAGS.max_sentence_length and (
          len(predecessor) >= FLAGS.max_sentence_length or len(current) >=
          FLAGS.max_sentence_length or len(successor) >=
          FLAGS.max_sentence_length):
        stats.update(["sentences_too_long"])
      else:
        serialized = _create_serialized_example(predecessor, current, successor,
                                                vocab)
        processed.append(serialized)
        stats.update(["sentences_output"])

    predecessor = current
    current = successor

    sentences_seen = stats["sentences_seen"]
    sentences_output = stats["sentences_output"]
    if sentences_seen and sentences_seen % 100000 == 0:
      tf.logging.info("Processed %d sentences (%d output)", sentences_seen,
                      sentences_output)
    if FLAGS.max_sentences and sentences_output >= FLAGS.max_sentences:
      break

  tf.logging.info("Completed processing file %s", filename)
  return processed 
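The stats Counter above acts as a running event tally: stats.update([...]) bumps the named counters, and reading a key that was never updated returns 0 instead of raising KeyError. A minimal sketch of that behaviour:

from collections import Counter

stats = Counter()
stats.update(['sentences_seen'])
stats.update(['sentences_seen', 'sentences_output'])
print(stats['sentences_seen'])      # 2
print(stats['sentences_too_long'])  # 0 -- missing keys default to zero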
Example 78
Project: fishroom   Author: tuna   File: hualao.py    GNU General Public License v3.0 4 votes vote down vote up
def hualao(cmd, *args, **kwargs):
    if 'room' not in kwargs:
        return None
    room = kwargs['room']
    log_key_tmpl = ChatLogger.LOG_QUEUE_TMPL

    if rlimiter.check(room, cmd, period=30, count=2) is False:
        return

    days = 7
    topn = 10

    if len(args) == 1:
        topn = int(args[0])
    elif len(args) == 2:
        topn, days = map(int, args)
    elif len(args) > 2:
        return "hualao: invalid arguments"

    if topn > 10:
        return "hualao: toooooo many hualaos"

    days = min(days, 21)

    c = Counter()
    today = get_now()
    for _ in range(days):
        key = log_key_tmpl.format(date=today.strftime("%Y-%m-%d"), channel=room)
        senders = [Message.loads(bmsg).sender for bmsg in r.lrange(key, 0, -1)]
        c.update(senders)
        today -= timedelta(days=1)

    hualaos = c.most_common(topn)
    most = hualaos[0][1]

    def to_star(n):
        return '⭐️' * round(5 * n / most) or '⭐️'

    head = "Most talkative {} individuals within {} days:\n".format(topn, days)
    return head + "\n".join(
        ["{}: {} {}".format(u, to_star(c), c) for u, c in hualaos])


# vim: ts=4 sw=4 sts=4 expandtab 
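The ranking comes straight from Counter.most_common(topn), which returns the topn highest-count (sender, count) pairs in descending order, for example:

from collections import Counter

c = Counter(['alice', 'bob', 'alice', 'carol', 'alice', 'bob'])
print(c.most_common(2))  # [('alice', 3), ('bob', 2)]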
Example 79
Project: geopar   Author: ebraude   File: tfpreprocessor.py    MIT License 4 votes vote down vote up
def theorem_3(a_tf):
        # traversing through interior points
        for point in a_tf.get_interior_points():

            # triangles around interior point
            triangles = a_tf.triangles_at(point)

            angle_following_list = []
            angle_preceding_list = []

            unknown_following_count = 0
            unknown_preceding_count = 0
            sum_angles = 0

            points_of_unknown_angles = []

            # traverse through triangles around interior point
            for t in triangles:
                point_following = t.point_following(point)
                point_preceding = t.point_preceding(point)

                angle_following = t.angle_of_point(point_following)
                angle_preceding = t.angle_of_point(point_preceding)

                if angle_following.is_known():
                    angle_following_list.append(angle_following)
                if angle_preceding.is_known():
                    angle_preceding_list.append(angle_preceding)

                if not angle_following.is_known():
                    unknown_following_count += 1
                    points_of_unknown_angles.append(t.get_angle_points_by_point(point_following))
                else:
                    sum_angles += angle_following

                if not angle_preceding.is_known():
                    unknown_preceding_count += 1
                    points_of_unknown_angles.append(t.get_angle_points_by_point(point_preceding))
                else:
                    sum_angles += angle_preceding

            if unknown_following_count == 1 and unknown_preceding_count == 1 and \
                    Counter(angle_following_list) == Counter(angle_preceding_list):
                angle_to_set = ((len(triangles) - 2) * 180 - sum_angles) / 2
                a_tf.set_angle_by_angle_points(*points_of_unknown_angles[0], angle_to_set)
                a_tf.set_angle_by_angle_points(*points_of_unknown_angles[1], angle_to_set) 
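The comparison Counter(angle_following_list) == Counter(angle_preceding_list) treats the two lists as multisets: order is ignored, but the multiplicity of every value must match. A short illustration:

from collections import Counter

print(Counter([60, 60, 90]) == Counter([90, 60, 60]))  # True  -- same values, same counts
print(Counter([60, 60, 90]) == Counter([60, 90]))      # False -- multiplicities differ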
Example 80
Project: aurora   Author: carnby   File: filtering.py    MIT License 4 votes vote down vote up
def __init__(self, characterizer, skip_fields=None, max_entropy_percentile=100.0, time_bucket_size=10,
                 start_strategy=None, pick_strategy=None, approve_tweet_fn=None,
                 min_date=None, allow_repeated_users=False, allow_repeated_urls=False, similarity_threshold=0.85,
                 n_candidates=None, target_entropy=0.99):
        self.characterizer = characterizer
        self.skip_fields = skip_fields
        self.time_bucket_size = time_bucket_size
        self.max_entropy_percentile = max_entropy_percentile
        self.min_date = min_date
        self.allow_repeated_users = allow_repeated_users
        self.allow_repeated_urls = allow_repeated_urls
        self.similarity_threshold = similarity_threshold

        if start_strategy is None:
            self.start_strategy = TimelineFilter.starting_tweet
        else:
            self.start_strategy = start_strategy

        if pick_strategy is None:
            self.pick_strategy = TimelineFilter.select_tweet
        else:
            self.pick_strategy = pick_strategy

        if approve_tweet_fn is None:
            self.approve_tweet_fn = lambda x: True
        else:
            self.approve_tweet_fn = approve_tweet_fn


        feature_names = ['popularity', 'followers', 'friends', 'n_tweets', 'hub', 'diffusion',
                         'reply', 'geography', 'time', 'url', 'topics']
        if skip_fields is not None:
            self.feature_keys = [p for p in feature_names if p not in skip_fields]
        else:
            self.feature_keys = list(feature_names)

        self.timeline = []
        self.timeline_ids = set()
        self.user_ids = set()
        self.timeline_urls = set()
        self.timeline_feature_vectors = []
        self.feature_vector_counts = Counter()
        self.discarded_ids = set()
        self.sequence_matcher = difflib.SequenceMatcher()
        self.n_candidates = n_candidates
        self.target_entropy = target_entropy
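feature_vector_counts starts out as an empty Counter here; hashable feature vectors (e.g. tuples) can later be tallied as timeline entries are accepted. A generic sketch of that counting pattern, not taken from the project:

from collections import Counter

feature_vector_counts = Counter()
for vec in [(1, 0, 1), (0, 0, 1), (1, 0, 1)]:
    feature_vector_counts[vec] += 1
print(feature_vector_counts.most_common(1))  # [((1, 0, 1), 2)]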