Python toolz.partition_all() Examples

The following are 11 code examples of toolz.partition_all(), extracted from open-source projects. The source file, project, and license are listed above each example.
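For orientation, partition_all(n, seq) splits an iterable into tuples of at most n items, with only the final tuple possibly shorter. A minimal illustration (not taken from any of the projects below):

from toolz import partition_all

# chunk a sequence into tuples of at most 3 items; the last chunk is shorter
print(list(partition_all(3, range(8))))
# [(0, 1, 2), (3, 4, 5), (6, 7)]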
Example #1
Source File: umis.py    From umis with MIT License
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read) 
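Examples #4, #5 and #6 below repeat this same chunk-and-map pattern, and all of them rely on umis-specific helpers (read_fastq and the various filter functions) that are not shown here. As a rough, self-contained sketch of the underlying idea, with a hypothetical process_chunk worker standing in for the real filters:

import multiprocessing

import toolz as tz


def process_chunk(chunk):
    # hypothetical worker; in umis this is one of the filter functions
    return [item.upper() for item in chunk]


def process_in_parallel(items, cores):
    # chunks: groups of 10,000 items; bigchunks: groups of `cores` chunks,
    # so each p.map call hands one chunk to every worker process
    chunks = tz.partition_all(10000, items)
    bigchunks = tz.partition_all(cores, chunks)
    with multiprocessing.Pool(cores) as p:
        for bigchunk in bigchunks:
            for chunk in p.map(process_chunk, list(bigchunk)):
                for item in chunk:
                    yield item

Only process_chunk needs to change to mirror what sb_filter, cb_filter, mb_filter or add_uid do with each chunk.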
Example #2
Source File: accounts.py    From hivemind with MIT License
def _cache_accounts(cls, accounts, steem, trx=True):
    """Fetch all `accounts` and write to db."""
    timer = Timer(len(accounts), 'account', ['rps', 'wps'])
    for name_batch in partition_all(1000, accounts):
        cached_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')

        timer.batch_start()
        batch = steem.get_accounts(name_batch)

        timer.batch_lap()
        sqls = [cls._sql(acct, cached_at) for acct in batch]
        DB.batch_queries(sqls, trx)

        timer.batch_finish(len(batch))
        if trx or len(accounts) > 1000:
            log.info(timer.batch_status()) 
Example #3
Source File: scraper.py    From steemdata-mongo with MIT License
def scrape_blockchain(mongo):
    s = Steem()
    # see how far behind we are
    missing = list(range(last_block_num(mongo), s.last_irreversible_block_num))

    # if we are far behind blockchain head
    # split work in chunks of 100
    if len(missing) > 100:
        for batch in partition_all(100, missing):
            results = s.get_blocks(batch)
            insert_blocks(mongo, results)

    # otherwise continue as normal
    blockchain = Blockchain(mode="irreversible")
    hist = blockchain.stream_from(start_block=last_block_num(mongo), full_blocks=True)
    insert_blocks(mongo, hist) 
Example #4
Source File: umis.py    From umis with MIT License
def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''
    with open_gzipsafe(bc1) as bc1_fh:
        bc1 = set(cb.strip() for cb in bc1_fh)

    if bc2:
        with open_gzipsafe(bc2) as bc2_fh:
            bc2 = set(cb.strip() for cb in bc2_fh)
    if bc3:
        with open_gzipsafe(bc3) as bc3_fh:
            bc3 = set(cb.strip() for cb in bc3_fh)

    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3,
                            re_string=re_string)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        bc3hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        if bc3:
            bc3hash = MutationHash(bc3, nedit)
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash, bc3hash=bc3hash, re_string=re_string)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read) 
Example #5
Source File: umis.py    From umis with MIT License
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read) 
Example #6
Source File: umis.py    From umis with MIT License
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''

    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read) 
Example #7
Source File: discovery.py    From pyquarkchain with MIT License
def send_topic_nodes(
    self, node: kademlia.Node, echo: Hash32, nodes: Tuple[kademlia.Node, ...]
) -> None:
    encoded_nodes = tuple(
        n.address.to_endpoint() + [n.pubkey.to_bytes()] for n in nodes
    )
    max_neighbours = self._get_max_neighbours_per_packet()
    for batch in toolz.partition_all(max_neighbours, encoded_nodes):
        message = _pack_v5(CMD_TOPIC_NODES.id, (echo, batch), self.privkey)
        self.logger.trace(">>> topic_nodes to %s: %s", node, batch)
        self.send_v5(node, message) 
Example #8
Source File: prepare_penobscot.py    From seismic-deeplearning with MIT License
def split_inline(data_dir, val_ratio, test_ratio, overwrite=False, exclude_files=None):
    """Splits the inline data into train, val and test.

    Args:
        data_dir (str): path to directory that holds the data
        val_ratio (float): the ratio of the partition that will be used for validation
        test_ratio (float): the ratio of the partition that will be used for testing
        overwrite (bool): whether to overwrite existing output directories
        exclude_files (list[str]): filenames to exclude from the dataset, such as ones that contain
            artifacts. Example: ['image1.tiff']
    """
    num_partitions = 5
    image_dir = os.path.join(data_dir, "inlines")
    dir_paths = (os.path.join(image_dir, ddir) for ddir in ("train", "val", "test"))
    locations_list = [_create_directory(d, overwrite=overwrite) for d in dir_paths]  # train, val, test

    images_iter = glob.iglob(os.path.join(image_dir, "*.tiff"))

    if exclude_files is not None:
        images_list = list(itertools.filterfalse(lambda x: x in exclude_files, images_iter))
    else:
        images_list = list(images_iter)

    num_elements = math.ceil(len(images_list) / num_partitions)
    for partition in partition_all(num_elements, images_list):  # Partition files into N partitions
        for files_list, dest_dir in zip(_split_train_val_test(partition, val_ratio, test_ratio), locations_list):
            _copy_files(files_list, dest_dir) 
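The ceiling division above is what bounds the number of groups: with a chunk size of ceil(len(items) / n), partition_all returns at most n partitions of near-equal size. A small sketch with hypothetical filenames:

import math

from toolz import partition_all

files = ["image%02d.tiff" % i for i in range(1, 14)]  # 13 hypothetical files
num_partitions = 5

chunk_size = math.ceil(len(files) / num_partitions)   # ceil(13 / 5) == 3
for part in partition_all(chunk_size, files):
    print(len(part), part[0], "...", part[-1])
# 3 image01.tiff ... image03.tiff
# 3 image04.tiff ... image06.tiff
# 3 image07.tiff ... image09.tiff
# 3 image10.tiff ... image12.tiff
# 1 image13.tiff ... image13.tiff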
Example #9
Source File: sync.py    From hivemind with MIT License
def from_checkpoints(self, chunk_size=1000):
    """Initial sync strategy: read from blocks on disk.

    This method scans for files matching ./checkpoints/*.json.lst
    and uses them for hive's initial sync. Each line must contain
    exactly one block in JSON format.
    """
    # pylint: disable=no-self-use
    last_block = Blocks.head_num()

    tuplize = lambda path: [int(path.split('/')[-1].split('.')[0]), path]
    basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
    files = glob.glob(basedir + "/checkpoints/*.json.lst")
    tuples = sorted(map(tuplize, files), key=lambda f: f[0])

    last_read = 0
    for (num, path) in tuples:
        if last_block < num:
            log.info("[SYNC] Load %s. Last block: %d", path, last_block)
            with open(path) as f:
                # each line in the file represents one block;
                # skip the blocks we already have
                skip_lines = last_block - last_read
                remaining = drop(skip_lines, f)
                for lines in partition_all(chunk_size, remaining):
                    Blocks.process_multi(map(json.loads, lines), True)
            last_block = num
        last_read = num 
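The drop/partition_all combination above skips the lines that were already processed and then streams the rest of a large .json.lst file in fixed-size batches, without ever holding the whole file in memory. A stripped-down sketch of the same idea, with a print standing in for Blocks.process_multi:

import json

from toolz import drop, partition_all


def process_remaining(path, skip_lines, chunk_size=1000):
    with open(path) as f:
        remaining = drop(skip_lines, f)          # lazily skip known lines
        for lines in partition_all(chunk_size, remaining):
            blocks = [json.loads(line) for line in lines]
            print("processing %d blocks" % len(blocks))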
Example #10
Source File: date_utils.py    From catalyst with Apache License 2.0
def compute_date_range_chunks(sessions, start_date, end_date, chunksize):
    """Compute the start and end dates to run a pipeline for.

    Parameters
    ----------
    sessions : DatetimeIndex
        The available dates.
    start_date : pd.Timestamp
        The first date in the pipeline.
    end_date : pd.Timestamp
        The last date in the pipeline.
    chunksize : int or None
        The size of the chunks to run. Setting this to None returns one chunk.

    Returns
    -------
    ranges : iterable[(np.datetime64, np.datetime64)]
        A sequence of start and end dates to run the pipeline for.
    """
    if start_date not in sessions:
        raise KeyError("Start date %s is not found in calendar." %
                       (start_date.strftime("%Y-%m-%d"),))
    if end_date not in sessions:
        raise KeyError("End date %s is not found in calendar." %
                       (end_date.strftime("%Y-%m-%d"),))
    if end_date < start_date:
        raise ValueError("End date %s cannot precede start date %s." %
                         (end_date.strftime("%Y-%m-%d"),
                          start_date.strftime("%Y-%m-%d")))

    if chunksize is None:
        return [(start_date, end_date)]

    start_ix, end_ix = sessions.slice_locs(start_date, end_date)
    return (
        (r[0], r[-1]) for r in partition_all(
            chunksize, sessions[start_ix:end_ix]
        )
    ) 
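Because each chunk produced by partition_all preserves order, collapsing it to (r[0], r[-1]) yields contiguous, non-overlapping date ranges. A small illustration with a hypothetical pandas calendar:

import pandas as pd

from toolz import partition_all

sessions = pd.date_range("2018-01-01", "2018-01-10")  # hypothetical calendar

ranges = [(r[0], r[-1]) for r in partition_all(4, sessions)]
for start, end in ranges:
    print(start.date(), "->", end.date())
# 2018-01-01 -> 2018-01-04
# 2018-01-05 -> 2018-01-08
# 2018-01-09 -> 2018-01-10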
Example #11
Source File: CML.py    From CollMetric with GNU General Public License v3.0
def optimize(model, sampler, train, valid):
    """
    Optimize the model. TODO: implement early-stopping
    :param model: model to optimize
    :param sampler: mini-batch sampler
    :param train: train user-item matrix
    :param valid: validation user-item matrix
    :return: None
    """
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    if model.feature_projection is not None:
        # initialize item embedding with feature projection
        sess.run(tf.assign(model.item_embeddings, model.feature_projection))

    # sample some users to calculate recall validation
    valid_users = numpy.random.choice(list(set(valid.nonzero()[0])), size=1000, replace=False)

    while True:
        # create evaluator on validation set
        validation_recall = RecallEvaluator(model, train, valid)
        # compute recall on the validation set
        valid_recalls = []

        # compute recall in chunks to utilize the speedup provided by TensorFlow
        for user_chunk in toolz.partition_all(100, valid_users):
            valid_recalls.extend([validation_recall.eval(sess, user_chunk)])
        print("\nRecall on (sampled) validation set: {}".format(numpy.mean(valid_recalls)))
        # TODO: early stopping based on validation recall

        # train model
        losses = []
        # run n mini-batches
        for _ in tqdm(range(EVALUATION_EVERY_N_BATCHES), desc="Optimizing..."):
            user_pos, neg = sampler.next_batch()
            _, loss = sess.run((model.optimize, model.loss),
                               {model.user_positive_items_pairs: user_pos,
                                model.negative_samples: neg})

            losses.append(loss)

        print("\nTraining loss {}".format(numpy.mean(losses)))