org.kitesdk.data.Dataset Java Examples

The following examples show how to use org.kitesdk.data.Dataset. They are drawn from the open-source kite and kite-examples projects; the source file and license for each snippet are noted above it.
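Before the individual examples, here is a minimal, self-contained sketch of the typical Dataset lifecycle: create, write, read, delete. It uses only core API calls that also appear in the examples below; the URI, schema, and record values are illustrative assumptions, not taken from any one example.

import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.DatasetWriter;
import org.kitesdk.data.Datasets;

public class DatasetLifecycleSketch {
  public static void main(String[] args) {
    // Describe the dataset with an inline Avro schema (illustrative)
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"User\",\"fields\":"
            + "[{\"name\":\"username\",\"type\":\"string\"}]}")
        .build();

    // Create the dataset at a hypothetical local filesystem URI
    Dataset<GenericRecord> users = Datasets.create(
        "dataset:file:/tmp/data/users", descriptor, GenericRecord.class);

    // Write one record, closing the writer in a finally block as the
    // examples below do
    DatasetWriter<GenericRecord> writer = users.newWriter();
    try {
      GenericRecord user = new GenericData.Record(
          users.getDescriptor().getSchema());
      user.put("username", "test-0");
      writer.write(user);
    } finally {
      writer.close();
    }

    // Read everything back
    DatasetReader<GenericRecord> reader = users.newReader();
    try {
      for (GenericRecord record : reader) {
        System.out.println(record);
      }
    } finally {
      reader.close();
    }

    // Clean up
    Datasets.delete("dataset:file:/tmp/data/users");
  }
}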
Example #1
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testTargetView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("username", "test-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #2
Source File: TestCrunchDatasetsHBase.java    From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  View<GenericRecord> inputView = inputDataset
      .from("part1", new Utf8("part1_2")).to("part1", new Utf8("part1_7"))
      .from("part2", new Utf8("part2_2")).to("part2", new Utf8("part2_7"));
  Assert.assertEquals(6, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 6, 2);
}
 
Example #3
Source File: TestProjection.java    From kite with Apache License 2.0
@Test
public void testSpecificProjectionLoad() throws IOException {
  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = unbounded.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<SmallEvent> dataset = repo.load(
      "ns", unbounded.getDataset().getName(),
      SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(toSmallEvent(sepEvent),
      toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
 
Example #4
Source File: TestExternalBackwardCompatibility.java    From kite with Apache License 2.0
@Test
public void testUpdateWithUpdatedURI() {
  Dataset<GenericRecord> updated = Datasets.update(
      "dataset:hive:/tmp/datasets/default/test",
      new DatasetDescriptor.Builder(descriptor)
          .property("added.property", "true")
          .build());
  Assert.assertNotNull("Update should succeed", updated);

  DatasetDescriptor stored =
      HiveUtils.descriptorForTable(conf, metastore.getTable("default", "test"));

  Assert.assertEquals("Should update default.test descriptor",
      stored, updated.getDescriptor());

  Assert.assertEquals("Added property should be present",
      stored.getProperty("added.property"), "true");
}
 
Example #5
Source File: TestHiveDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testExternalHDFSQueryOptions() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data?" + hdfsQueryArgs);
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets
      .<Object, Dataset<Object>>load("dataset:hive:/tmp/data/ns/test?" + hdfsQueryArgsOld, Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
 
Example #6
Source File: ReadProductDatasetPojo.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Load the products dataset
  Dataset<Product> products = Datasets.load(
      "dataset:hdfs:/tmp/data/products", Product.class);

  // Get a reader for the dataset and read all the products
  DatasetReader<Product> reader = null;
  try {
    reader = products.newReader();
    for (Product product : reader) {
      System.out.println(product);
    }
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  return 0;
}
 
Example #7
Source File: ReadDataset.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
 
Example #8
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testGenericParquet() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  // write two files, each of 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Example #9
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
 
Example #10
Source File: TestHiveExternalDatasetRepository.java    From kite with Apache License 2.0
@SuppressWarnings("deprecation")
@Test
public void testNewPartitionIsVisibleToHive() throws Exception {
  final String NAME2 = "test2";

  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .partitionStrategy(partitionStrategy)
      .build();

  Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor);

  HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2);
  HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2);
  Assert.assertTrue("No partitions yet",
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty());

  writeRecord(dataset, 0);

  Assert.assertEquals("Should be one partition", 1,
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size());

}
 
Example #11
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testPartitionedSourceAndTarget() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  PartitionKey key = new PartitionKey(0);
  Dataset<Record> inputPart0 =
      ((PartitionedDataset<Record>) inputDataset).getPartition(key, false);
  Dataset<Record> outputPart0 =
      ((PartitionedDataset<Record>) outputDataset).getPartition(key, true);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(data, CrunchDatasets.asTarget(outputPart0), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputPart0));
}
 
Example #12
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #13
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0
@Test
public void testUnpartitionedDataset() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset);
  writeUserToView(dataset);

  DatasetDescriptor expected = dataset.getDescriptor();
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      expected, actual);
}
 
Example #14
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testTargetViewProvidedPartition() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .provided("version").build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  View<Record> inputView = inputDataset.with("version", "test-version-0");

  writeTestUsers(inputView, 1);

  Assert.assertEquals(1, datasetSize(inputView));
  View<Record> outputView = outputDataset.with("version", "test-version-0");

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #15
Source File: TestFileSystemDatasetRepository.java    From kite with Apache License 2.0
@Test
public void testUpdateFailsWithLocationChange() {
  ensureCreated();
  Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
  URI location = dataset.getDescriptor().getLocation();

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .location(new Path(testDirectory, "newDataLocation").toUri())
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to data location change");
  } catch (ValidationException ex) {
    // expected
  }

  Assert.assertEquals(
      location, repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
}
 
Example #16
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testDatasetUris() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());

  writeTestUsers(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(new URIBuilder(repo.getUri(), "ns", "in").build(),
          GenericData.Record.class));
  pipeline.write(data, CrunchDatasets.asTarget(
      new URIBuilder(repo.getUri(), "ns", "out").build()), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(10, datasetSize(outputDataset));
}
 
Example #17
Source File: TestCreateDatasetWithExistingData.java    From kite with Apache License 2.0
@Test
public void testCreateFromExistingPartitioned() throws Exception {
  command.datasets = Lists.newArrayList(existingPartitionedURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingPartitionedURI));

  PartitionStrategy providedVersionStrategy = new PartitionStrategy.Builder()
      .provided("version", "int")
      .build();

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingPartitionedURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertEquals("Should be partitioned with a provided partitioner",
      providedVersionStrategy, users.getDescriptor().getPartitionStrategy());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
 
Example #18
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0
@Test
public void testLoadChangedRelativePathURIMissingNamespace() {
  // this used to be a relative external URI, but is now a managed URI
  String uri = "dataset:hive:ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);
  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);

  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
 
Example #19
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java    From kite with Apache License 2.0
@Test
public void testExternal() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive:/tmp/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive:/tmp/data/ns/test");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
 
Example #20
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0
@Test
public void testLoadChangedAbsolutePathURIMissingNamespace() {
  // this used to be an absolute external URI, but is now a managed URI
  String uri = "dataset:hive:/ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);
  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);

  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
 
Example #21
Source File: TestFileSystemDatasetRepository.java    From kite with Apache License 2.0
@Test
public void testReadNullsWithPrimitivesAllowNullSchema() {
  final String name = "allowNullPrimitives";
  try {
    repo.create(NAMESPACE, name, new DatasetDescriptor.Builder()
        .schema(ReflectData.AllowNull.get().getSchema(ObjectPoJo.class))
        .build(), ObjectPoJo.class);

    // should load the dataset because PrimitivePoJo can be used to write
    final Dataset<PrimitivePoJo> dataset = repo.load(
        NAMESPACE, name, PrimitivePoJo.class);
    TestHelpers.assertThrows("AllowNull primitives cannot read nullable type",
        IncompatibleSchemaException.class, new Runnable() {
          @Override
          public void run() {
            dataset.newReader();
          }
        });

  } catch (RuntimeException e) {
    throw e;
  } finally {
    repo.delete(NAMESPACE, name);
  }
}
 
Example #22
Source File: DatasetKeyOutputFormat.java    From kite with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public void commitTask(TaskAttemptContext taskContext) throws IOException {
  DatasetRepository repo = getDatasetRepository(taskContext);
  boolean inTempRepo = repo instanceof TemporaryDatasetRepository;

  Dataset<E> jobDataset = repo.load(TEMP_NAMESPACE, getJobDatasetName(taskContext));
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    // merge this task attempt's output into the shared job dataset; the
    // attempt dataset is deleted below unless the whole temporary repo
    // will be cleaned up at job commit
    Dataset<E> taskAttemptDataset = repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
    ((Mergeable<Dataset<E>>) jobDataset).merge(taskAttemptDataset);
    if (!inTempRepo) {
      repo.delete(TEMP_NAMESPACE, taskAttemptDatasetName);
    }
  }
}
 
Example #23
Source File: TestKiteURIHandler.java    From kite with Apache License 2.0
@Test
public void checkURIExistsView() throws URIHandlerException, IOException {
  DatasetRepository repository = newRepo();
  Dataset<GenericRecord> dataset = repository.create("data", "readymailbox", testDescriptor);

  View<GenericRecord> view = dataset.with("message", "hello");
  ((Signalable<GenericRecord>)view).signalReady();

  Assert.assertTrue(uriHandler.exists(view.getUri(), null));
}
 
Example #24
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testSignalReadyOutputView() {
  Assume.assumeTrue(!Hadoop.isHadoop1());
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-8", "test-9");
  View<Record> outputView = outputDataset.with("username", "test-8", "test-9");
  Assert.assertEquals(2, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputView), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(2, datasetSize(outputView));

  Assert.assertFalse("Output dataset should not be signaled ready",
      ((Signalable)outputDataset).isReady());
  Assert.assertTrue("Output view should be signaled ready",
      ((Signalable)outputView).isReady());
}
 
Example #25
Source File: TestHiveExternalDatasetRepository.java    From kite with Apache License 2.0
private void writeRecord(Dataset<GenericRecord> dataset, int partition) {
  PartitionKey key = new PartitionKey(partition);
  DatasetWriter<GenericRecord> writer =
      ((PartitionedDataset<GenericRecord>) dataset).getPartition(key, true).newWriter();
  try {
    GenericRecordBuilder recordBuilder = new GenericRecordBuilder(
        dataset.getDescriptor().getSchema())
        .set("username", partition + "").set("email", partition + "@example.com");
    writer.write(recordBuilder.build());
  } finally {
    writer.close();
  }
}
 
Example #26
Source File: DatasetTestUtilities.java    From kite with Apache License 2.0
@SuppressWarnings("deprecation")
public static <E> void testPartitionKeysAreEqual(PartitionedDataset<E> ds,
    PartitionKey... expectedKeys) {
  Set<PartitionKey> expected = Sets.newHashSet(expectedKeys);
  Set<PartitionKey> actual = Sets.newHashSet(Iterables.transform(ds.getPartitions(),
      new Function<Dataset, PartitionKey>() {
    @Override
    public PartitionKey apply(Dataset input) {
      return ((FileSystemDataset) input).getPartitionKey();
    }
  }));
  Assert.assertEquals(expected, actual);
}
 
Example #27
Source File: TestFileSystemDatasetRepository.java    From kite with Apache License 2.0
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder()
      .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME,
      dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema, dataset
      .getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }
  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema, dataset
      .getDescriptor().getSchema());
}
 
Example #28
Source File: FileSystemDatasetRepository.java    From kite with Apache License 2.0
@Override
public <E> Dataset<E> update(String namespace, String name,
                             DatasetDescriptor descriptor, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  DatasetDescriptor oldDescriptor = metadataProvider.load(namespace, name);

  // oldDescriptor is valid if load didn't throw NoSuchDatasetException
  Compatibility.checkUpdate(oldDescriptor, descriptor);

  DatasetDescriptor updatedDescriptor = metadataProvider.update(namespace, name, descriptor);

  LOG.debug("Updated dataset: {} schema: {} location: {}", new Object[] {
      name, updatedDescriptor.getSchema(), updatedDescriptor.getLocation() });

  return new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(updatedDescriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(updatedDescriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();
}
 
Example #29
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java    From kite with Apache License 2.0
@Test
public void testManaged() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive?dataset=test&namespace=ns");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
 
Example #30
Source File: TestProjection.java    From kite with Apache License 2.0
@Test
public void testSpecificProjectionAsType() throws IOException {
  Dataset<GenericRecord> original = Datasets.load(unbounded.getUri());

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.asType(StandardEvent.class).newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  final View<SmallEvent> smallEvents = original.asType(SmallEvent.class);

  Set<SmallEvent> expected = Sets.newHashSet(toSmallEvent(sepEvent),
      toSmallEvent(octEvent), toSmallEvent(novEvent));

  assertContentEquals(expected, smallEvents);

  TestHelpers.assertThrows("Should not be able to write small events",
      IncompatibleSchemaException.class, new Runnable() {
        @Override
        public void run() {
          smallEvents.newWriter();
        }
      });
}