Java Code Examples for org.apache.spark.api.java.JavaPairRDD#values()

The following examples show how to use org.apache.spark.api.java.JavaPairRDD#values(). Each example is taken from an open-source project; the source file and project are noted above each one.
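
Before the project examples, here is a minimal, self-contained sketch of what values() does: it discards the keys of a JavaPairRDD<K, V> and returns a JavaRDD<V> of the values alone. The class name, the local[2] master, and the sample data are illustrative only, not taken from the projects below.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class ValuesSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("values-sketch").setMaster("local[2]");
        try (JavaSparkContext sc = new JavaSparkContext(conf)) {
            List<Tuple2<String, Integer>> pairs = Arrays.asList(
                    new Tuple2<>("a", 1),
                    new Tuple2<>("b", 2),
                    new Tuple2<>("c", 3));
            JavaPairRDD<String, Integer> pairRdd = sc.parallelizePairs(pairs, 2);
            // values() keeps only the second element of each key/value tuple.
            JavaRDD<Integer> values = pairRdd.values();
            System.out.println(values.collect()); // prints [1, 2, 3]
        }
    }
}
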
Example 1
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBasicStream() throws Exception {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
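    // FailsFunction injects a failure partway through the job; the assertions below expect the stream to recover and still deliver all ten rows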
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(tenRows, 2).mapToPair(new FailsFunction(3));
    StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    srdd.submit();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        LOG.trace(execRow);
        count++;
        assertNotNull(execRow);
        assertTrue(execRow.getColumn(1).getInt() < 10);
    }
    assertEquals(10, count);
}
 
Example 2
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureBoundary() throws Exception {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(tenRows, 20).mapToPair(new FailsFunction(4));
    StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    srdd.submit();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        LOG.trace(execRow);
        count++;
        assertNotNull(execRow);
        assertTrue(execRow.getColumn(1).getInt() < 10);
    }
    assertEquals(10, count);
}
 
Example 3
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBasicStream() throws Exception {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);
    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(tenRows, 10);
    StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    srdd.submit();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        LOG.trace(execRow);
        count++;
        assertNotNull(execRow);
        assertTrue(execRow.getColumn(1).getInt() < 10);
    }
    assertEquals(10, count);
}
 
Example 4
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testOffset() throws StandardException {
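    // No limit (-1); offset of 60000 rows, matching the assertions below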
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 60000);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 60000;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(100000-60000, count);
}
 
Example 5
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testLimit() throws StandardException {
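    // Limit of 400 rows, no offset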
    StreamListener<ExecRow> sl = new StreamListener<>(400, 0);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(400, count);
}
 
Example 6
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testOffsetLimit() throws StandardException {
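    // Limit of 400 rows after an offset of 30000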
    StreamListener<ExecRow> sl = new StreamListener<>(400, 30000);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 30000;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(400, count);
}
 
Example 7
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testSmallLimit() throws StandardException {
    int limit = 2000;
    int offset = 0;
    int total = 4000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(limit, offset, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < total; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 1);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = offset;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(limit, count);
}
 
Example 8
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testSmallOffsetLimit() throws StandardException {
    int limit = 100;
    int offset = 2000;
    int total = 4000;
    StreamListener<ExecRow> sl = new StreamListener<>(limit, offset);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < total; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 1);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = offset;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(limit, count);
}
 
Example 9
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingLargeOddPartitions() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        count++;
        assertNotNull(execRow);
    }
    assertEquals(100000, count);
}
 
Example 10
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingLarge() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 12);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        count++;
        assertNotNull(execRow);
    }
    assertEquals(100000, count);
}
 
Example 11
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlocking() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 10000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 6);
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                LOG.error(e);
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        count++;
        assertNotNull(execRow);
    }
    assertEquals(10000, count);
}
 
Example 12
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureAfterRecoveryWarmup() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 100000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 2).sortByKey().mapToPair(new FailsTwiceFunction(10000, 2000));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
 
Example 13
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureDuringRecoveryWarmup() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 100000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 2).sortByKey().mapToPair(new FailsTwiceFunction(10000, 100));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
 
Example 14
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureAfterLimit() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(40000, 300);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13).mapToPair(new FailsFunction(40301));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 300;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(40000, count);
}
 
Example 15
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureBeforeOffset() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(40000, 300);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13).mapToPair(new FailsFunction(200));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 300;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(40000, count);
}
 
Example 16
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testFailureBeforeLargeOffset() throws StandardException {
    StreamListener<ExecRow> sl = new StreamListener<>(400, 30000);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 13).mapToPair(new FailsFunction(29500));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    int first = 30000;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        assertEquals(count+first, execRow.getColumn(1).getInt());
        count++;
    }
    assertEquals(400, count);
}
 
Example 17
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0
@Test
public void testBlockingLarge() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 100000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 12).sortByKey().mapToPair(new FailsFunction(10000));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
 
Example 18
Source File: StreamableRDDTest_Failures.java    From spliceengine with GNU Affero General Public License v3.0 5 votes vote down vote up
@Test
public void testBlockingMedium() throws StandardException, FileNotFoundException, UnsupportedEncodingException {
    int size = 20000;
    int batches = 2;
    int batchSize = 512;
    StreamListener<ExecRow> sl = new StreamListener<>(-1, 0, batches, batchSize);
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < size; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 2).sortByKey().mapToPair(new FailsFunction(5000));
    final StreamableRDD srdd = new StreamableRDD(rdd.values(), null, sl.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort(), batches, batchSize);
    new Thread() {
        @Override
        public void run() {
            try {
                srdd.submit();
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

        }
    }.start();
    Iterator<ExecRow> it = sl.getIterator();
    int count = 0;
    while (it.hasNext()) {
        ExecRow execRow = it.next();
        assertNotNull(execRow);
        count++;
    }
    assertEquals(size, count);
}
 
Example 19
Source File: StreamableRDDTest.java    From spliceengine with GNU Affero General Public License v3.0 4 votes vote down vote up
@Test
public void testConcurrentQueries() throws StandardException, ExecutionException, InterruptedException {
    final StreamListener<ExecRow> sl1 = new StreamListener<>();
    final StreamListener<ExecRow> sl2 = new StreamListener<>();
    final StreamListener<ExecRow> sl3 = new StreamListener<>();
    HostAndPort hostAndPort = server.getHostAndPort();
    server.register(sl1);
    server.register(sl2);
    server.register(sl3);

    List<Tuple2<ExecRow,ExecRow>> manyRows = new ArrayList<>();
    for(int i = 0; i < 100000; ++i) {
        manyRows.add(new Tuple2<ExecRow, ExecRow>(getExecRow(i, 1), getExecRow(i, 2)));
    }

    JavaPairRDD<ExecRow, ExecRow> rdd = SpliceSpark.getContextUnsafe().parallelizePairs(manyRows, 12);
    final StreamableRDD srdd1 = new StreamableRDD(rdd.values(), sl1.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    final StreamableRDD srdd2 = new StreamableRDD(rdd.values().map(new Function<ExecRow,ExecRow>() {
        @Override
        public ExecRow call(ExecRow o) throws Exception {
            o.getColumn(1).setValue(0);
            return o;
        }
    }), sl2.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    final StreamableRDD srdd3 = new StreamableRDD(rdd.values(), sl3.getUuid(), hostAndPort.getHostText(), hostAndPort.getPort());
    for (final StreamableRDD srdd : Arrays.asList(srdd1, srdd2, srdd3)) {
        new Thread() {
            @Override
            public void run() {
                try {
                    srdd.submit();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }

            }
        }.start();
    }
    // We collect the results asynchronously into memory so that we can iterate over all three at once. Otherwise,
    // tasks for the third RDD might be blocked by tasks from the other RDDs, and since we would not yet be consuming
    // elements from the other iterators, those tasks would never become unblocked.
    ExecutorService executor = Executors.newFixedThreadPool(3);
    Future<List<ExecRow>> future1 = executor.submit(new Callable<List<ExecRow>>() {
        @Override
        public List<ExecRow> call() throws Exception {
            return IteratorUtils.toList(sl1.getIterator());
        }
    });
    Future<List<ExecRow>> future2 = executor.submit(new Callable<List<ExecRow>>() {
        @Override
        public List<ExecRow> call() throws Exception {
            return IteratorUtils.toList(sl2.getIterator());
        }
    });
    Future<List<ExecRow>> future3 = executor.submit(new Callable<List<ExecRow>>() {
        @Override
        public List<ExecRow> call() throws Exception {
            return IteratorUtils.toList(sl3.getIterator());
        }
    });
    Iterator<ExecRow> it1 = future1.get().iterator();
    Iterator<ExecRow> it2 = future2.get().iterator();
    Iterator<ExecRow> it3 = future3.get().iterator();
    int count = 0;
    while (it1.hasNext()) {
        ExecRow r1 = it1.next();
        ExecRow r2 = it2.next();
        ExecRow r3 = it3.next();
        count++;
        assertNotNull(r1);
        assertNotNull(r2);
        assertNotNull(r3);
        assertEquals(0, r2.getColumn(1).getInt());
        assertEquals(r1.getColumn(1), r3.getColumn(1));
        assertEquals(r1.getColumn(2), r2.getColumn(2));
    }
    assertEquals(100000, count);
}
 
Example 20
Source File: MLUpdate.java    From oryx with Apache License 2.0 4 votes vote down vote up
@Override
public void runUpdate(JavaSparkContext sparkContext,
                      long timestamp,
                      JavaPairRDD<Object,M> newKeyMessageData,
                      JavaPairRDD<Object,M> pastKeyMessageData,
                      String modelDirString,
                      TopicProducer<String,String> modelUpdateTopic)
    throws IOException, InterruptedException {

  Objects.requireNonNull(newKeyMessageData);

  JavaRDD<M> newData = newKeyMessageData.values();
  JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

  if (newData != null) {
    newData.cache();
    // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
    // when many workers try to materialize the RDDs at once. Hence the workaround.
    newData.foreachPartition(p -> {});
  }
  if (pastData != null) {
    pastData.cache();
    pastData.foreachPartition(p -> {});
  }

  List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(
      getHyperParameterValues(), hyperParamSearch, candidates);

  Path modelDir = new Path(modelDirString);
  Path tempModelPath = new Path(modelDir, ".temporary");
  Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

  FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
  fs.mkdirs(candidatesPath);

  Path bestCandidatePath = findBestCandidatePath(
      sparkContext, newData, pastData, hyperParameterCombos, candidatesPath);

  Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
  if (bestCandidatePath == null) {
    log.info("Unable to build any model");
  } else {
    // Move best model into place
    fs.rename(bestCandidatePath, finalPath);
  }
  // Then delete everything else
  fs.delete(candidatesPath, true);

  if (modelUpdateTopic == null) {
    log.info("No update topic configured, not publishing models to a topic");
  } else {
    // Push PMML model onto update topic, if it exists
    Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
    if (fs.exists(bestModelPath)) {
      FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
      PMML bestModel = null;
      boolean modelNeededForUpdates = canPublishAdditionalModelData();
      boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
      if (modelNeededForUpdates || modelNotTooLarge) {
        // The model is needed either for publishAdditionalModelData, or because it is going to
        // be serialized to Kafka
        try (InputStream in = fs.open(bestModelPath)) {
          bestModel = PMMLUtils.read(in);
        }
      }

      if (modelNotTooLarge) {
        modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
      } else {
        modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
      }

      if (modelNeededForUpdates) {
        publishAdditionalModelData(
            sparkContext, bestModel, newData, pastData, finalPath, modelUpdateTopic);
      }
    }
  }

  if (newData != null) {
    newData.unpersist();
  }
  if (pastData != null) {
    pastData.unpersist();
  }
}