Python toolz.first() Examples

The following are 18 code examples of toolz.first(), drawn from open-source projects; the source file, project, and license are noted above each example. You may also want to check out all other available functions/classes of the module toolz.
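toolz.first(seq) returns the first element of any iterable; it behaves like next(iter(seq)) and raises StopIteration on an empty sequence. A minimal doctest-style illustration (not taken from any of the projects below):

>>> import toolz
>>> toolz.first([10, 20, 30])
10
>>> toolz.first(x * x for x in range(1, 4))
1
>>> toolz.first({'a': 1, 'b': 2})  # dicts iterate over their keys (insertion order on Python 3.7+)
'a'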
Example #1
Source File: experimental.py    From sidekick with MIT License 6 votes
def peek(seq: Seq, default=NOT_GIVEN) -> (object, Seq):
    """
    Same as peek_with(first).

    Peek first element of sequence and return (first, seq).

    >>> fst, seq = peek(range(5))
    >>> fst, list(seq)
    (0, [0, 1, 2, 3, 4])
    """
    try:
        x, seq = uncons(seq)
    except ValueError:
        if default is NOT_GIVEN:
            raise
        return default, iter(())
    return x, toolz.cons(x, seq) 
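Note how peek pushes the consumed element back with toolz.cons, so the returned sequence still yields every element. The default path, which the docstring does not show (a sketch; uncons and NOT_GIVEN are sidekick internals):

>>> fst, rest = peek(iter([]), default=None)
>>> fst, list(rest)
(None, [])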
Example #2
Source File: experimental.py    From sidekick with MIT License 6 votes
def first_repeated(key: Func, seq: Seq):
    """
    Return the index and value of first repeated element in sequence.

    Raises a ValueError if no repeated element is found.

    Examples:
        >>> first_repeated(None, [1, 2, 3, 1])
        (3, 1)
    """

    key = to_callable(key)
    seen = set()
    add = seen.add
    for i, x in enumerate(seq):
        tag = key(x)
        if tag in seen:
            return i, x
        add(tag)
    raise ValueError("no repeated element in sequence") 
Example #3
Source File: asset_writer.py    From catalyst with Apache License 2.0 6 votes
def _write_df_to_table(
        self,
        tbl,
        df,
        txn,
        chunk_size,
        idx=True,
        idx_label=None,
    ):
        df.to_sql(
            tbl.name,
            txn.connection,
            index=idx,
            index_label=(
                idx_label
                if idx_label is not None else
                first(tbl.primary_key.columns).name
            ),
            if_exists='append',
            chunksize=chunk_size,
        ) 
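Here first(tbl.primary_key.columns) picks the first primary-key column of a SQLAlchemy Table as the fallback index label. The same pattern in isolation, with a made-up schema:

import sqlalchemy as sa
from toolz import first

metadata = sa.MetaData()
tbl = sa.Table(
    'assets', metadata,
    sa.Column('sid', sa.Integer, primary_key=True),  # hypothetical columns
    sa.Column('symbol', sa.String),
)
assert first(tbl.primary_key.columns).name == 'sid'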
Example #4
Source File: __init__.py    From attention-lvcsr with MIT License 5 votes
def _print_attributes(self, attribute_tuples):
        for attr, value in sorted(attribute_tuples.items(), key=first):
            if not self._attribute_filter(attr):
                print("\t", "{}:".format(attr), value) 
Example #5
Source File: _incremental.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes
def _adapt(self, info):
        # First, have an adaptive algorithm
        if self.n_initial_parameters == "grid":
            start = len(ParameterGrid(self.parameters))
        else:
            start = self.n_initial_parameters

        def inverse(time):
            """ Decrease target number of models inversely with time """
            return int(start / (1 + time) ** self.decay_rate)

        example = toolz.first(info.values())
        time_step = example[-1]["partial_fit_calls"]

        current_time_step = time_step + 1
        next_time_step = current_time_step

        if inverse(current_time_step) == 0:
            # we'll never get out of here
            next_time_step = 1

        while inverse(current_time_step) == inverse(next_time_step) and (
            self.decay_rate
            and not self.patience
            or next_time_step - current_time_step < self.fits_per_score
        ):
            next_time_step += 1

        target = max(1, inverse(next_time_step))
        best = toolz.topk(target, info, key=lambda k: info[k][-1]["score"])

        if len(best) == 1:
            [best] = best
            return {best: 0}
        steps = next_time_step - current_time_step
        instructions = {b: steps for b in best}
        return instructions 
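toolz.first(info.values()) reads one arbitrary entry of the info dict to learn the current number of partial_fit calls, and toolz.topk(k, seq, key=...) then keeps the k best-scoring models. Both calls in isolation, with a made-up info layout:

>>> import toolz
>>> info = {'model-0': [{'score': 0.7}], 'model-1': [{'score': 0.9}]}
>>> toolz.first(info.values())
[{'score': 0.7}]
>>> toolz.topk(1, info, key=lambda k: info[k][-1]['score'])
('model-1',)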
Example #6
Source File: experimental.py    From sidekick with MIT License 5 votes
def index(x, seq):
    try:
        return toolz.first(i for i, y in enumerate(seq) if x == y)
    except StopIteration:  # toolz.first raises StopIteration, not ValueError, on an empty sequence
        raise IndexError("element not found in sequence") 
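toolz.first raises StopIteration when the generator yields nothing, which is why the except clause above catches StopIteration:

>>> import toolz
>>> toolz.first(i for i in range(0))
Traceback (most recent call last):
    ...
StopIteration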
Example #7
Source File: experimental.py    From sidekick with MIT License 5 votes
def indexed_map(func: Func, *seqs: Seq, start=0) -> Seq:
    """
    Like map, but pass the index of each element as the first argument to
    func.

    Examples:
        >>> ''.join(indexed_map((X * Y), 'hello', start=1))
        'heelllllllooooo'

    See Also:
        map
    """
    return _map(func, itertools.count(start), *seqs) 
Example #8
Source File: umis.py    From umis with MIT License 5 votes
def detect_fastq_annotations(fastq_file):
    """
    detects annotations present in a FASTQ file by examining the first read
    """
    annotations = set()
    queryread = tz.first(read_fastq(fastq_file))
    for k, v in BARCODEINFO.items():
        if v.readprefix in queryread:
            annotations.add(k)
    return annotations 
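tz.first(read_fastq(fastq_file)) consumes a single record from the read generator, which is harmless here because the generator is discarded afterwards. With a stand-in generator (read_fastq and BARCODEINFO are umis internals):

>>> import toolz as tz
>>> reads = ('@read%d CELL_ACGT' % i for i in range(3))  # hypothetical records
>>> tz.first(reads)
'@read0 CELL_ACGT'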
Example #9
Source File: __init__.py    From attention-lvcsr with MIT License 5 votes
def main_loop(self):
        if not hasattr(self, '_main_loop'):
            raise ValueError("main loop must be assigned to extension first")
        return self._main_loop 
Example #10
Source File: groupby.py    From ibis with Apache License 2.0 5 votes
def __init__(
        self, table, by, having=None, order_by=None, window=None, **expressions
    ):
        self.table = table
        self.by = util.promote_list(by if by is not None else []) + [
            _get_group_by_key(table, v).name(k)
            for k, v in sorted(expressions.items(), key=toolz.first)
        ]
        self._order_by = order_by or []
        self._having = having or []
        self._window = window 
Example #11
Source File: strings.py    From ibis with Apache License 2.0 5 votes
def execute_string_group_by_find_in_set(op, needle, haystack, **kwargs):
    # `list` could contain series, series groupbys, or scalars
    # mixing series and series groupbys is not allowed
    series_in_haystack = [
        type(piece)
        for piece in haystack
        if isinstance(piece, (pd.Series, SeriesGroupBy))
    ]

    if not series_in_haystack:
        return ibis.util.safe_index(haystack, needle)

    try:
        (collection_type,) = frozenset(map(type, series_in_haystack))
    except ValueError:
        raise ValueError('Mixing Series and SeriesGroupBy is not allowed')

    pieces = haystack_to_series_of_lists(
        [getattr(piece, 'obj', piece) for piece in haystack]
    )

    result = pieces.map(toolz.flip(ibis.util.safe_index)(needle))
    if issubclass(collection_type, pd.Series):
        return result

    assert issubclass(collection_type, SeriesGroupBy)

    return result.groupby(
        toolz.first(
            piece.grouper.groupings
            for piece in haystack
            if hasattr(piece, 'grouper')
        )
    ) 
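toolz.flip(func) is curried and calls func with its two arguments swapped, so toolz.flip(ibis.util.safe_index)(needle) is a one-argument function that maps each list to safe_index(list, needle). The flip behavior on its own:

>>> import operator, toolz
>>> toolz.flip(operator.sub)(10, 3)  # computes operator.sub(3, 10)
-7
>>> subtract_from_ten = toolz.flip(operator.sub)(10)
>>> subtract_from_ten(3)
-7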
Example #12
Source File: strings.py    From ibis with Apache License 2.0 5 votes
def haystack_to_series_of_lists(haystack, index=None):
    if index is None:
        index = toolz.first(
            piece.index for piece in haystack if hasattr(piece, 'index')
        )
    pieces = reduce(
        operator.add,
        (
            pd.Series(getattr(piece, 'values', piece), index=index).map(
                ibis.util.promote_list
            )
            for piece in haystack
        ),
    )
    return pieces 
Example #13
Source File: selection.py    From ibis with Apache License 2.0 5 votes
def remap_overlapping_column_names(table_op, root_table, data_columns):
    """Return an ``OrderedDict`` mapping possibly suffixed column names to
    column names without suffixes.

    Parameters
    ----------
    table_op : TableNode
        The ``TableNode`` we're selecting from.
    root_table : TableNode
        The root table of the expression we're selecting from.
    data_columns : set or frozenset
        The available columns to select from

    Returns
    -------
    mapping : OrderedDict[str, str]
        A map from possibly-suffixed column names to column names without
        suffixes.
    """
    if not isinstance(table_op, ops.Join):
        return None

    left_root, right_root = ops.distinct_roots(table_op.left, table_op.right)
    suffixes = {
        left_root: constants.LEFT_JOIN_SUFFIX,
        right_root: constants.RIGHT_JOIN_SUFFIX,
    }
    column_names = [
        ({name, name + suffixes[root_table]} & data_columns, name)
        for name in root_table.schema.names
    ]
    mapping = OrderedDict(
        (first(col_name), final_name)
        for col_name, final_name in column_names
        if col_name
    )
    return mapping 
Example #14
Source File: client.py    From ibis with Apache License 2.0 5 votes
def create_table(self, table_name, obj=None, schema=None):
        """Create a table."""
        if obj is None and schema is None:
            raise com.IbisError('Must pass expr or schema')

        if obj is not None:
            df = pd.DataFrame(obj)
        else:
            dtypes = ibis_schema_to_pandas(schema)
            df = schema.apply_to(
                pd.DataFrame(columns=list(map(toolz.first, dtypes)))
            )

        self.dictionary[table_name] = df 
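map(toolz.first, dtypes) pulls the column names out of (name, dtype) pairs to build an empty DataFrame with the right columns. The extraction step by itself, with hypothetical pairs:

>>> import toolz
>>> dtypes = [('sid', 'int64'), ('symbol', 'object')]
>>> list(map(toolz.first, dtypes))
['sid', 'symbol']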
Example #15
Source File: umis.py    From umis with MIT License 5 votes
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from given
    cellular barcodes
    """
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    queryalignment = next(track)  # track.next() works only on Python 2
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln) 
Example #16
Source File: asset_writer.py    From catalyst with Apache License 2.0 4 votes
def _split_symbol_mappings(df):
    """Split out the symbol: sid mappings from the raw data.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe with multiple rows for each symbol: sid pair.

    Returns
    -------
    asset_info : pd.DataFrame
        The asset info with one row per asset.
    symbol_mappings : pd.DataFrame
        The dataframe of just symbol: sid mappings. The index will be
        the sid, then there will be three columns: symbol, start_date, and
        end_date.
    """
    mappings = df[list(mapping_columns)]
    ambigious = {}
    for symbol in mappings.symbol.unique():
        persymbol = mappings[mappings.symbol == symbol]
        intersections = list(intersecting_ranges(map(
            from_tuple,
            zip(persymbol.start_date, persymbol.end_date),
        )))
        if intersections:
            ambigious[symbol] = (
                intersections,
                persymbol[['start_date', 'end_date']].astype('datetime64[ns]'),
            )

    if ambigious:
        raise ValueError(
            'Ambiguous ownership for %d symbol%s, multiple assets held the'
            ' following symbols:\n%s' % (
                len(ambigious),
                '' if len(ambigious) == 1 else 's',
                '\n'.join(
                    '%s:\n  intersections: %s\n  %s' % (
                        symbol,
                        tuple(map(_format_range, intersections)),
                        # indent the dataframe string
                        '\n  '.join(str(df).splitlines()),
                    )
                    for symbol, (intersections, df) in sorted(
                        ambigious.items(),
                        key=first,
                    ),
                ),
            )
        )
    return (
        df.groupby(level=0).apply(_check_asset_group),
        df[list(mapping_columns)],
    ) 
Example #17
Source File: core.py    From dask-lightgbm with BSD 3-Clause "New" or "Revised" License 4 votes
def train(client, data, label, params, model_factory, weight=None, **kwargs):
    # Split arrays/dataframes into parts. Arrange parts into tuples to enforce co-locality
    data_parts = _split_to_parts(data, is_matrix=True)
    label_parts = _split_to_parts(label, is_matrix=False)
    if weight is None:
        parts = list(map(delayed, zip(data_parts, label_parts)))
    else:
        weight_parts = _split_to_parts(weight, is_matrix=False)
        parts = list(map(delayed, zip(data_parts, label_parts, weight_parts)))

    # Start computation in the background
    parts = client.compute(parts)
    wait(parts)

    for part in parts:
        if part.status == 'error':
            return part  # trigger error locally

    # Find locations of all parts and map them to particular Dask workers
    key_to_part_dict = dict([(part.key, part) for part in parts])
    who_has = client.who_has(parts)
    worker_map = defaultdict(list)
    for key, workers in who_has.items():
        worker_map[first(workers)].append(key_to_part_dict[key])

    master_worker = first(worker_map)
    worker_ncores = client.ncores()

    if 'tree_learner' not in params or params['tree_learner'].lower() not in {'data', 'feature', 'voting'}:
        logger.warning('Parameter tree_learner not set or set to incorrect value '
                       f'({params.get("tree_learner", None)}), using "data" as default')
        params['tree_learner'] = 'data'

    # Tell each worker to train on the parts that it has locally
    futures_classifiers = [client.submit(_train_part,
                                         model_factory=model_factory,
                                         params=assoc(params, 'num_threads', worker_ncores[worker]),
                                         list_of_parts=list_of_parts,
                                         worker_addresses=list(worker_map.keys()),
                                         local_listen_port=params.get('local_listen_port', 12400),
                                         time_out=params.get('time_out', 120),
                                         return_model=(worker == master_worker),
                                         **kwargs)
                           for worker, list_of_parts in worker_map.items()]

    results = client.gather(futures_classifiers)
    results = [v for v in results if v]
    return results[0] 
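first is used twice above: first(workers) picks one address from the list of workers holding a part, and first(worker_map) takes the first key of the dict as the master worker, since iterating a dict yields its keys:

>>> from toolz import first
>>> worker_map = {'tcp://w1:8786': ['part-0'], 'tcp://w2:8786': ['part-1']}
>>> first(worker_map)
'tcp://w1:8786'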
Example #18
Source File: umis.py    From umis with MIT License 4 votes
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations
    if is_python3():
        queryalignment = next(track)
    else:
        queryalignment = track.next()
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    for count, aln in enumerate(track, start=1):
        if count and not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)
        tags = aln.tags

        if "cellular" in annotations:
            aln.tags += [('XC', match.group('CB'))]
        if "molecular" in annotations:
            aln.tags += [('RX', match.group('MB'))]
        if "sample" in annotations:
            aln.tags += [('XS', match.group('SB'))]

        out_file.write(aln)

    total_time = time.time() - start_time
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, int(60. * count / total_time)))
    logger.info("Processed %d alignments." % count)