Example 1
Source: emodb project (Apache License 2.0), 6 votes
void readAll(String channel, SlabFilter filter, EventSink sink, boolean weak) {
    // PeekingIterator is needed so that we can look ahead and see the next slab Id
    PeekingIterator<Column<ByteBuffer>> manifestColumns = Iterators.peekingIterator(readManifestForChannel(channel, weak));

    while (manifestColumns.hasNext()) {
        Column<ByteBuffer> manifestColumn =;
        ByteBuffer slabId = manifestColumn.getName();
        ByteBuffer nextSlabId = manifestColumns.hasNext() ? manifestColumns.peek().getName() : null;
        boolean open = manifestColumn.getBooleanValue();
        if (filter != null && !filter.accept(slabId, open, nextSlabId)) {
        if (!readSlab(channel, slabId, new SlabCursor(), open, sink)) {
Example 2
Source: usergrid project (Apache License 2.0), 6 votes
/**
 * Converts a stored column into an {@code MvccEntity}.
 *
 * The column name is used as the entity version and the column value is de-serialized with
 * {@code entityJsonSerializer}. {@code id} is not declared in this block; it comes from the enclosing scope.
 *
 * NOTE(review): this excerpt is incomplete. Several closing braces and the opening of the logging call in the
 * catch branch are missing, so the block does not compile as shown. Restore it from the upstream source.
 *
 * @param column column to parse
 * @return an entity built from the de-serialized status and payload; when de-serialization throws
 *         {@code DataCorruptionException}, an entity with status {@code DELETED} and an absent payload
 */
public MvccEntity parseColumn( Column<UUID> column ) {

    final EntityWrapper deSerialized;
    final UUID version = column.getName();

    try {
        deSerialized = column.getValue( entityJsonSerializer );
    // NOTE(review): the closing brace of the try block is missing here.
    catch ( DataCorruptionException e ) {
        // NOTE(review): the start of the log statement is missing; presumably an error-level logger call
        // whose first argument is the message below. Confirm against upstream.
                "DATA CORRUPTION DETECTED when de-serializing entity with Id {} and version {}.  This means the write was truncated.",
                id, version, e );
        //return an empty entity, we can never load this one, and we don't want it to bring the system
        //to a grinding halt
        return new MvccEntityImpl( id, version, MvccEntity.Status.DELETED, Optional.<Entity>absent(),0 );
    // NOTE(review): the closing brace of the catch block is missing here.

    //Inject the id into it.
    if ( deSerialized.entity.isPresent() ) {
        EntityUtils.setId( deSerialized.entity.get(), id );
    // NOTE(review): the closing brace of the if block is missing here.

    return new MvccEntityImpl( id, version, deSerialized.status, deSerialized.entity, 0 );
// NOTE(review): the closing brace of the method is missing here.
Example 3
Source: emodb project (Apache License 2.0), 5 votes
/**
 * Moves slabs from one channel to another in batches via {@code _manifestPersister.move} and reports whether
 * everything could be moved.
 *
 * The result starts out as true and is set to false when an open slab is encountered, because open slabs are
 * not re-assigned.
 *
 * NOTE(review): this excerpt is incomplete. The end of the manifest query, the right-hand side of the
 * {@code manifestColumn} assignment, the statements that fill and reset {@code closedSlabs}, and several
 * closing braces are missing, so the block does not compile as shown. Restore it from the upstream source.
 *
 * @param fromChannel channel the slabs are moved out of
 * @param toChannel   channel the slabs are moved into
 * @return true if no open slab was encountered, false otherwise
 */
public boolean moveIfFast(String fromChannel, String toChannel) {
    // Manifest columns are requested 50 at a time (the range limit) with local-quorum consistency.
    Iterator<Column<ByteBuffer>> manifestColumns = executePaginated(
            _keyspace.prepareQuery(ColumnFamilies.MANIFEST, ConsistencyLevel.CL_LOCAL_QUORUM)
                    .withColumnRange(new RangeBuilder().setLimit(50).build())
    // NOTE(review): the remainder of this statement is missing (at least the closing ");"). No row key is
    // selected in the visible lines either; presumably a key-selection call was lost. Confirm.

    List<ByteBuffer> closedSlabs = Lists.newArrayList();
    boolean movedAll = true;
    while (manifestColumns.hasNext()) {
        // NOTE(review): the right-hand side is missing; presumably manifestColumns.next(). Confirm.
        Column<ByteBuffer> manifestColumn =;
        ByteBuffer slabId = manifestColumn.getName();
        boolean open = manifestColumn.getBooleanValue();
        if (open) {
            // Can't safely re-assign open slabs to another channel since writers may still be writing.
            movedAll = false;  // All events in the open slab might be deleted, but don't check for that here.
        // NOTE(review): the end of the if block is missing, and no visible statement adds slabId to closedSlabs.
        // Flush a full batch of closed slabs.
        if (closedSlabs.size() >= SLAB_MOVE_BATCH) {
            _manifestPersister.move(fromChannel, toChannel, closedSlabs, false);
    // NOTE(review): the end of the batch branch and of the while loop is missing; presumably the list is
    // cleared after each batch move. Confirm.
    // Move whatever is left over after the loop.
    if (!closedSlabs.isEmpty()) {
        _manifestPersister.move(fromChannel, toChannel, closedSlabs, false);
    // NOTE(review): the closing brace of the if block is missing here.

    return movedAll;
// NOTE(review): the closing brace of the method is missing here.
Example 4
Source: emodb project (Apache License 2.0), 4 votes
@ParameterizedTimed(type = "AstyanaxEventReaderDAO")
public void readNewer(String channel, EventSink sink) {
    // Algorithm notes:
    // In general this method tries to read sequentially all events for a channel across multiple calls to
    // readNewer().  At the end of call N it remembers its position X using a SlabCursor object and on the next
    // call to readNewer() N+1 it starts reading at position X+1 where it left off.
    // The desire to read sequentially must be balanced by the fact that reading may skip events for various
    // reasons (event already claimed, write isn't yet visible due to out-of-order writes, etc.) so this
    // method must occasionally rewind and re-read events it skipped previously in case those events are now
    // interesting.
    // Two caches are used to implement this: one for a cursor position in "open" slabs (slabs that writers may
    // still be appending to) and second for a cursor position in "closed" slabs (slabs that writers are no
    // longer appending to):
    // 1. The biggest performance advantage comes from the "closed" slab cursor cache: once a closed slab has been
    //    read completely (cursor.get()==END) we can skip it entirely on the next few calls to readNewer() since we
    //    only need to re-read to handle race conditions & retry in the event of claim expiration.  So the closed
    //    slab cursor cache uses a longer TTL (10 seconds) so uninteresting closed slabs will generally be read at
    //    most every 10 seconds.
    // 2. If a slab is "open" then new content may be written at any time.  In that case, we must continuously poll
    //    the slab to discover new content.  Since we're issuing a Cassandra read no matter what, there is minimal
    //    benefit from using a cursor to start the read part-way through the slab.  So the open slab cursor cache
    //    has a very short TTL (250ms) to reduce memory requirements and minimize the latency between the time
    //    data is written and first read.
    // Additionally a third cache is used to track the oldest known slab in channel's manifest.  If the events
    // are written and acknowledged quickly and frequently then the head of the manifest row for the channel may
    // accrue tombstones as older slabs are fully read and deleted.  By caching the oldest slab in the manifest
    // reading the older tombstones can be minimized.  Since slabs can be written out-of-order across the cluster
    // we still occasionally (10 seconds) re-reads all slabs to pick up any of these newer-older slabs we may
    // have missed.

    Iterator<Column<ByteBuffer>> manifestColumns = readManifestForChannel(channel, true);

    while (manifestColumns.hasNext()) {
        Column<ByteBuffer> manifestColumn =;
        ByteBuffer slabId = manifestColumn.getName();
        boolean open = manifestColumn.getBooleanValue();

        ChannelSlab channelSlab = new ChannelSlab(channel, slabId);
        SlabCursor cursor = (open ? _openSlabCursors : _closedSlabCursors).getUnchecked(channelSlab);

        // Optimistic "can we skip this slab?" check outside the synchronized block.
        if (cursor.get() == SlabCursor.END) {

        // If multiple pollers try to query the same slab at the same time there's no reason they should do so
        // in parallel--they'll find the same events and compete for claims.  Might as well just serialize.
        // A smarter algorithm might randomize the order slabs are read to reduce contention between parallel
        // pollers, but be careful to avoid starvation.

        //noinspection SynchronizationOnLocalVariableOrMethodParameter
        synchronized (cursor) {
            if (!readSlab(channel, slabId, cursor, open, sink)) {
Example 5
Source: emodb project (Apache License 2.0), 4 votes
/**
 * Reads the events of one slab starting at the cursor position and hands them to the sink.
 * Returns true to keep searching for more events, false to stop searching for events.
 *
 * NOTE(review): this excerpt is incomplete. The bodies of several if statements, the statement that follows
 * the "Next time we query this slab" comment, and most closing braces are missing, so the block does not
 * compile as shown. Restore it from the upstream source.
 *
 * @param channel channel the slab belongs to
 * @param slabId  id of the slab to read
 * @param cursor  supplies the column index at which the read starts
 * @param open    whether the manifest lists the slab as open
 * @param sink    receiver of each event id and payload; its return value decides whether searching continues
 * @return true immediately when the cursor is at {@code SlabCursor.END}; otherwise the value of
 *         {@code searching}, which is true initially and is overwritten by each {@code sink.accept} result
 */
private boolean readSlab(String channel, ByteBuffer slabId, SlabCursor cursor, boolean open, EventSink sink) {
    int start = cursor.get();
    if (start == SlabCursor.END) {
        return true;
    // NOTE(review): the closing brace of the if block is missing here.

    boolean recent = isRecent(slabId);

    // Event add and delete write with local quorum, so read with local quorum to get a consistent view of things.
    // Using a lower consistency level could result in (a) duplicate events because we miss deletes and (b)
    // incorrectly closing or deleting slabs when slabs look empty if we miss adds.
    // NOTE(review): no row key is selected in the visible query; presumably a key-selection call was lost. Confirm.
    ColumnList<Integer> eventColumns = execute(
            _keyspace.prepareQuery(ColumnFamilies.SLAB, ConsistencyLevel.CL_LOCAL_QUORUM)
                    .withColumnRange(start, Constants.OPEN_SLAB_MARKER, false, Integer.MAX_VALUE));

    boolean searching = true;
    boolean empty = (start == 0);  // If we skipped events in the query we must assume the slab isn't empty.
    boolean more = false;
    int next = start;
    for (Column<Integer> eventColumn : eventColumns) {
        int eventIdx = eventColumn.getName();

        // Open slabs have a dummy entry at maxint that indicates that this slab is still open.
        if (eventIdx == Constants.OPEN_SLAB_MARKER) {
        // NOTE(review): the body of this if is missing; presumably it leaves the loop so the marker is not
        // treated as an event. Confirm.

        // Found at least one data item.
        empty = false;

        if (!searching) {
            more = true;  // There are more events to be found next time we poll this slab.
        // NOTE(review): the rest of this if block is missing; presumably it leaves the loop. Confirm.

        // Pass the data on to the EventSink.  It will tell us whether or not to keep searching.
        EventId eventId = AstyanaxEventId.create(channel, slabId, eventIdx);
        ByteBuffer eventData = eventColumn.getByteBufferValue();
        searching = sink.accept(eventId, eventData);
        next = eventIdx;
    // NOTE(review): the closing brace of the for loop is missing here.

    // Next time we query this slab start the search with last event received by the sink, repeating it.
    // NOTE(review): the statement this comment describes is missing; presumably it stores `next` in the cursor. Confirm.

    // Stale open slab?  Rare, should only happen when a writer crashes without cleaning up and closing its open
    // slabs.  Normally writers re-write the OPEN_SLAB_MARKER column on every write as a sort of heartbeat.  Readers
    // detect "stale" slabs when the open slab markers expire, and they close those slabs on behalf of the crashed writers.
    // The marker, when present, is the last column returned by the query.
    boolean hasOpenSlabMarker = !eventColumns.isEmpty() &&
            eventColumns.getColumnByIndex(eventColumns.size() - 1).getName() == Constants.OPEN_SLAB_MARKER;
    boolean stale = open && !recent && !hasOpenSlabMarker;
    if (stale) {
    // NOTE(review): the body of this if is missing in the excerpt.

    // If the slab is currently closed or should be closed then it will never receive more data so check to see if
    // we can (a) delete it (it's empty) or at least (b) close it.
    if (empty && (!open || stale)) {
        deleteEmptySlabAsync(channel, slabId);
        open = false;
    } else if (stale) {
        closeStaleSlabAsync(channel, slabId);
        open = false;
    // NOTE(review): the closing brace of the else-if block is missing here.

    // If we ran through all the data in a closed slab, skip this slab next time.  This is especially common with
    // badly-behaving Databus listeners that poll repeatedly but don't ack.
    if (!more && !open) {
    // NOTE(review): the body of this if is missing; presumably it sets the cursor to SlabCursor.END. Confirm.

    return searching;
// NOTE(review): the closing brace of the method is missing here.
Example 6
Source: usergrid project (Apache License 2.0), 4 votes
public String parseColumn( final Column<String> column ) {
    return column.getName();
Example 7
Source: usergrid project (Apache License 2.0), 4 votes
private static ColumnNameIterator<Long, Long> createIterator( final String rowKey, final boolean reversed ) {

        final ColumnParser<Long, Long> longParser = new ColumnParser<Long, Long>() {
            public Long parseColumn( final Column<Long> column ) {
                return column.getName();

        final RangeBuilder forwardRange = new RangeBuilder().setLimit( 720 ).setReversed( reversed );

        final RowQuery<String, Long> forwardQuery =
                keyspace.prepareQuery( COLUMN_FAMILY ).getKey( rowKey ).withColumnRange( );

        ColumnNameIterator<Long, Long> itr = new ColumnNameIterator<>( forwardQuery, longParser, false );

        return itr;
Example 8
Source: usergrid project (Apache License 2.0), 4 votes
/** Example CQL schema for this table
 * CREATE TABLE "Usergrid_Applications"."Edge_Shards" (
 *    key blob,
 *    column1 bigint,
 *    value blob,
 *    PRIMARY KEY (key, column1)

public Shard parseColumn( final Column<Long> column ) {

    // A custom serializer was introduced to handle parsing multiple column formats without re-writing the data.
    // The column can be stored as a legacy, single boolean, value OR a new, composite, value which contains
    // every item in the shard. If the legacy value is seen, we return a shard with Long.MIN for index and
    // createdTime so it can be identified later and handled.

    try {

        return column.getValue(SHARD_SERIALIZER);

    } catch ( Exception e) {

        // unable to parse the new format so return the old format
        return new Shard(column.getName(), column.getTimestamp(), column.getBooleanValue());


Example 9
Source: usergrid project (Apache License 2.0), 4 votes
public T parseColumn( final Column<C> column ) {
    final C edge = column.getName();

    return createEdge( edge, column.getBooleanValue() );
Example 10
Source: usergrid project (Apache License 2.0), 3 votes
private List<MvccLogEntry> parseResults( final ColumnList<UUID> columns, final Id entityId ) {

        List<MvccLogEntry> results = new ArrayList<MvccLogEntry>( columns.size() );

        for ( Column<UUID> col : columns ) {
            final UUID storedVersion = col.getName();
            final StageStatus stage = col.getValue( SER );

            results.add( new MvccLogEntryImpl( entityId, storedVersion, stage.stage, stage.state ) );

        return results;