Java Code Examples for org.kitesdk.data.Dataset#newWriter()

The following examples show how to use org.kitesdk.data.Dataset#newWriter(). Each example is taken from an open-source project; the source file and license are noted above each listing.
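All of the examples below follow the same core pattern: newWriter() returns a DatasetWriter bound to the dataset's schema, records are appended with write(), and the writer is closed in a finally block so that buffered data is flushed. Here is a minimal sketch of that pattern, assuming a generic-record dataset; the URI "dataset:hdfs:/tmp/data/events" and the "message" field are illustrative placeholders, not taken from the examples:

Dataset<GenericRecord> events = Datasets.load(
    "dataset:hdfs:/tmp/data/events", GenericRecord.class);

DatasetWriter<GenericRecord> writer = null;
try {
  // newWriter() returns an appender for new records in this dataset
  writer = events.newWriter();
  GenericRecordBuilder builder = new GenericRecordBuilder(
      events.getDescriptor().getSchema());
  // "message" is a hypothetical field; use the fields of your own schema
  writer.write(builder.set("message", "hello").build());
} finally {
  if (writer != null) {
    writer.close(); // flushes buffered records and releases resources
  }
}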
Example 1
Source File: TestProjection.java    From kite with Apache License 2.0
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
 
Example 2
Source File: TestWriteReflectReadGeneric.java    From kite with Apache License 2.0
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo = new FileSystemDatasetRepository(fs.getConf(),
      testDirectory);
  Dataset<MyRecord> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(MyRecord.class)
          .build(), MyRecord.class);
  DatasetWriter<MyRecord> writer = writerDataset.newWriter();
  for (int i = 0; i < totalRecords; i++) {
    writer.write(new MyRecord(String.valueOf(i), i));
  }
  writer.close();

  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
 
Example 3
Source File: TestReadCustomGeneric.java    From kite with Apache License 2.0
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo = new FileSystemDatasetRepository(fs.getConf(),
      testDirectory);
  Dataset<MyRecord> writerDataset = repo.create("ns", "test",
      new DatasetDescriptor.Builder()
          .schema(MyRecord.class)
          .build(), MyRecord.class);
  DatasetWriter<MyRecord> writer = writerDataset.newWriter();
  for (int i = 0; i < totalRecords; i++) {
    writer.write(new MyRecord(String.valueOf(i), i));
  }
  writer.close();

  readerDataset = repo.load("ns", "test", TestGenericRecord.class);
}
 
Example 4
Source File: CreateUserDatasetGeneric.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
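
The null-checked try/finally in this example is the pre-Java-7 idiom. In Kite versions where DatasetWriter implements java.io.Closeable (an assumption to verify against the version in use), the same loop can be written more compactly with try-with-resources:

try (DatasetWriter<Record> writer = users.newWriter()) {
  Random rand = new Random();
  GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
  for (int i = 0; i < 100; i++) {
    writer.write(builder.set("username", "user-" + i)
        .set("creationDate", System.currentTimeMillis())
        .set("favoriteColor", colors[rand.nextInt(colors.length)])
        .build());
  }
}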
 
Example 5
Source File: CreateUserDatasetGenericParquet.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
 
Example 6
Source File: CreateProductDatasetPojo.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

  // Create a dataset of products with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(Product.class)
      .build();
  Dataset<Product> products = Datasets.create(
      "dataset:hdfs:/tmp/data/products", descriptor, Product.class);

  // Get a writer for the dataset and write some products to it
  DatasetWriter<Product> writer = null;
  try {
    writer = products.newWriter();
    int i = 0;
    for (String name : names) {
      Product product = new Product();
      product.setName(name);
      product.setId(i++);
      writer.write(product);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
 
Example 7
Source File: CreateHiveUserDatasetGeneric.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create("dataset:hive?dataset=users",
      descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }

  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
 
Example 8
Source File: CreateUserDatasetGenericPartitioned.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // Create a partition strategy that partitions on the identity of favoriteColor
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("favoriteColor", "favorite_color")
      .build();

  // Create a dataset of users with the Avro schema
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(partitionStrategy)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Get a writer for the dataset and write some users to it
  DatasetWriter<Record> writer = null;
  try {
    writer = users.newWriter();
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record record = builder.set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)]).build();
      writer.write(record);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}
 
Example 9
Source File: TestCrunchDatasetsHBase.java    From kite with Apache License 2.0
private void writeRecords(Dataset<GenericRecord> dataset, int count) {
  DatasetWriter<GenericRecord> writer = dataset.newWriter();
  try {
    for (int i = 0; i < count; ++i) {
      GenericRecord entity = HBaseDatasetRepositoryTest.createGenericEntity(i);
      writer.write(entity);
    }
  } finally {
    writer.close();
  }
}
 
Example 10
Source File: TestSimpleView.java    From kite with Apache License 2.0
@Test
public void testRefineIdentity() throws Exception {
    PartitionStrategy strategy = new PartitionStrategy.Builder()
            .identity("user_id")
            .build();

    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
            .schemaUri("resource:standard_event.avsc")
            .partitionStrategy(strategy)
            .build();

    // Create a separate dataset to avoid conflicts with the above.
    Dataset<StandardEvent> identityDataset = repo.create(
        "ns", "test_identity", descriptor);

    DatasetWriter<StandardEvent> writer = null;

    try {
        writer = identityDataset.newWriter();
        writer.write(sepEvent);
        writer.write(octEvent);
        writer.write(novEvent);
    } finally {
        Closeables.close(writer, false);
    }

    assertContentEquals(Sets.newHashSet(sepEvent, novEvent),
            identityDataset.with("user_id", 0L));
}
 
Example 11
Source File: TestProjection.java    From kite with Apache License 2.0
@Test
public void testReflectProjectionLoad() throws IOException {
  Dataset<ReflectStandardEvent> original = repo.create(
      "ns", "reflectProjection",
      new DatasetDescriptor.Builder()
          .schema(ReflectStandardEvent.class)
          .build(),
      ReflectStandardEvent.class);

  DatasetWriter<ReflectStandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(new ReflectStandardEvent(sepEvent));
    writer.write(new ReflectStandardEvent(octEvent));
    writer.write(new ReflectStandardEvent(novEvent));
  } finally {
    Closeables.close(writer, false);
  }

  View<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
 
Example 12
Source File: TestHiveExternalDatasetRepository.java    From kite with Apache License 2.0
@SuppressWarnings("deprecation")
@Test
public void testDeletedPartitionRemovedFromHive() throws Exception {
  final String NAME2 = "test2";

  // use a multi-item partition strategy to ensure the system
  // can convert it to the corresponding Hive partition
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("username")
      .identity("email").build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .partitionStrategy(partitionStrategy)
      .build();

  Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor);

  HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2);
  HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2);
  Assert.assertTrue("No partitions yet",
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty());

  GenericData.Record record1 = new GenericRecordBuilder(
      dataset.getDescriptor().getSchema())
      .set("username", "0").set("email", "0").build();

  GenericData.Record record2 = new GenericRecordBuilder(
      dataset.getDescriptor().getSchema())
      .set("username", "1").set("email", "1").build();

  DatasetWriter<GenericRecord> writer = dataset.newWriter();
  try {
    writer.write(record1);
    writer.write(record2);
  } finally {
    writer.close();
  }

  Assert.assertEquals("Should be two partitions", 2,
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size());

  RefinableView<GenericRecord> view =
      dataset.with("username", "0").with("email", "0");

  view.deleteAll();

  Assert.assertEquals("Should be one partition", 1,
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size());

  view = dataset.with("username", "1").with("email", "1");

  view.deleteAll();

  Assert.assertEquals("Should be no partitions", 0,
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size());
}
 
Example 13
Source File: TestS3Dataset.java    From kite with Apache License 2.0
@Test
public void testBasics3a() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3a://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();

  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}
 
Example 14
Source File: TestS3Dataset.java    From kite with Apache License 2.0
@Test
public void testBasics3n() {
  // only run this test if credentials are present
  Assume.assumeTrue(ID != null && !ID.isEmpty());

  String uri = "dataset:s3n://" + BUCKET + "/ns/test";

  // make sure the dataset doesn't already exist
  Datasets.delete(uri);

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build();

  Dataset<String> dataset = Datasets.create(uri, descriptor, String.class);

  List<String> expected = Lists.newArrayList("a", "b", "time");
  DatasetWriter<String> writer = null;
  try {
    writer = dataset.newWriter();
    for (String s : expected) {
      writer.write(s);
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  DatasetReader<String> reader = null;
  try {
    reader = dataset.newReader();
    Assert.assertEquals("Should match written strings",
        expected, Lists.newArrayList((Iterator<String>) reader));
  } finally {
    if (reader != null) {
      reader.close();
    }
  }

  // clean up
  Datasets.delete(uri);
}
 
Example 15
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testUseReaderSchemaParquet() throws IOException {

  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());

  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .format(Formats.PARQUET).schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
      NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
      Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  // open the reader only after the pipeline has run and written the data
  DatasetReader<Record> reader = out.newReader();
  try {
    // there should be one record that is equal to our old user generic record
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 16
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0
@Test
public void testUseReaderSchema() throws IOException {

  // Create a schema with only a username, so we can test reading it
  // with an enhanced record structure.
  Schema oldRecordSchema = SchemaBuilder.record("org.kitesdk.data.user.OldUserRecord")
      .fields()
      .requiredString("username")
      .endRecord();

  // create the dataset
  Dataset<Record> in = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Dataset<Record> out = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(oldRecordSchema).build());
  Record oldUser = new Record(oldRecordSchema);
  oldUser.put("username", "user");

  DatasetWriter<Record> writer = in.newWriter();
  try {
    writer.write(oldUser);
  } finally {
    writer.close();
  }

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);

  // read data from updated dataset that has the new schema.
  // At this point, User class has the old schema
  PCollection<NewUserRecord> data = pipeline.read(CrunchDatasets.asSource(in.getUri(),
      NewUserRecord.class));

  PCollection<NewUserRecord> processed = data.parallelDo(new UserRecordIdentityFn(),
      Avros.records(NewUserRecord.class));

  pipeline.write(processed, CrunchDatasets.asTarget(out));

  Assert.assertTrue("Pipeline failed.", pipeline.run().succeeded());

  // open the reader only after the pipeline has run and written the data
  DatasetReader<Record> reader = out.newReader();
  try {
    // there should be one record that is equal to our old user generic record
    Assert.assertEquals(oldUser, reader.next());
    Assert.assertFalse(reader.hasNext());
  } finally {
    reader.close();
  }
}
 
Example 17
Source File: TestSpark.java    From kite with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testSparkJob() throws Exception {
  Dataset<Record> inputDataset = repo.create("ns", "in",
      new DatasetDescriptor.Builder()
        .property("kite.allow.csv", "true")
        .schema(TestMapReduce.STRING_SCHEMA)
        .format(format)
        .build(), Record.class);
  DatasetWriter<Record> writer = inputDataset.newWriter();
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("banana"));
  writer.write(newStringRecord("carrot"));
  writer.write(newStringRecord("apple"));
  writer.write(newStringRecord("apple"));
  writer.close();

  Dataset<Record> outputDataset = repo.create("ns", "out",
      new DatasetDescriptor.Builder()
        .property("kite.allow.csv", "true")
        .schema(TestMapReduce.STATS_SCHEMA)
        .format(format)
        .build(), Record.class);

  Job job = Job.getInstance();
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset);

  @SuppressWarnings("unchecked")
  JavaPairRDD<Record, Void> inputData = SparkTestHelper.getSparkContext()
      .newAPIHadoopRDD(job.getConfiguration(), DatasetKeyInputFormat.class,
          Record.class, Void.class);

  JavaPairRDD<String, Integer> mappedData = inputData.mapToPair(new ToJava());
  JavaPairRDD<String, Integer> sums = mappedData.reduceByKey(new Sum());
  JavaPairRDD<Record, Void> outputData = sums.mapToPair(new ToAvro());

  outputData.saveAsNewAPIHadoopDataset(job.getConfiguration());

  DatasetReader<Record> reader = outputDataset.newReader();
  Map<String, Integer> counts = new HashMap<String, Integer>();
  for (Record record : reader) {
    counts.put(record.get("name").toString(), (Integer) record.get("count"));
  }
  reader.close();

  Assert.assertEquals(3, counts.get("apple").intValue());
  Assert.assertEquals(2, counts.get("banana").intValue());
  Assert.assertEquals(1, counts.get("carrot").intValue());
}
 
Example 18
Source File: GenerateSimpleLogs.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {
  // going to generate a lot of random log messages
  final Random rand = new Random();

  // data is written to the staging dataset
  Dataset<Record> staging = Datasets.load(
      "dataset:file:/tmp/data/logs_staging", Record.class);

  // this is going to build our simple log records
  GenericRecordBuilder builder = new GenericRecordBuilder(
      staging.getDescriptor().getSchema());

  // generate timestamps 5 seconds apart, starting 1 day ago
  final Calendar now = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
  final long yesterday = now.getTimeInMillis() - DAY_IN_MILLIS;

  DatasetWriter<Record> writer = null;
  try {
    writer = staging.newWriter();

    // generate 15,000 messages, each 5 seconds apart, starting 24 hours ago
    // this is a little less than 24 hours worth of messages
    for (int second : Ranges.closed(0, 15000).asSet(DiscreteDomains.integers())) {
      LOG.info("Generating log message " + second);

      builder.set("timestamp", yesterday + second * 5000);
      builder.set("component", "GenerateSimpleLogs");

      int level = rand.nextInt(LOG_LEVELS.length);
      builder.set("level", LOG_LEVELS[level]);
      builder.set("message", LOG_MESSAGES[level]);

      writer.write(builder.build());
    }

    if (writer instanceof Flushable) {
      ((Flushable) writer).flush();
    }
  } finally {
    if (writer != null) {
      writer.close();
    }
  }

  return 0;
}