org.kitesdk.data.DatasetDescriptor Java Examples

The following examples show how to use org.kitesdk.data.DatasetDescriptor. They are drawn from open source projects; the source file and originating project are listed above each example.
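As a quick orientation before the examples: a DatasetDescriptor is an immutable description of a dataset's structural settings (schema, format, partition strategy, location, custom properties), assembled with DatasetDescriptor.Builder. A minimal sketch using an inline Avro schema; the custom property name is illustrative:

DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schema(SchemaBuilder.record("Event").fields()
        .requiredLong("timestamp")
        .requiredString("message")
        .endRecord())
    .format(Formats.AVRO)                     // storage format
    .partitionStrategy(new PartitionStrategy.Builder()
        .year("timestamp")
        .build())                             // optional partitioning
    .property("my.custom.property", "value")  // carried through create/load
    .build();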
Example #1
Source File: TestFileSystemDatasetRepository.java (from kite, Apache License 2.0)
@Test
public void testUpdateFailsWithFormatChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME,
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build());

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
      .format(Formats.PARQUET)
      .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException e) {
    // expected
  }

  Assert.assertEquals(
      Formats.AVRO, repo.load(NAMESPACE, NAME).getDescriptor().getFormat());
}
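For contrast with the rejected format change above, a backward-compatible change passes validation. A hedged sketch, assuming the dataset uses the same user schema (username and email fields) as the tests later on this page:

  // Sketch only: adding a nullable field with a default value is a
  // compatible schema change, so update() is expected to succeed.
  DatasetDescriptor compatible =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .schema(SchemaBuilder.record("user").fields()
              .requiredString("username")
              .requiredString("email")
              .nullableString("favoriteColor", "") // default => compatible
              .endRecord())
          .build();
  repo.update(NAMESPACE, NAME, compatible);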
 
Example #2
Source File: TestWriteReflectReadGeneric.java (from kite, Apache License 2.0)
@BeforeClass
public static void setup() throws IOException {
  fs = LocalFileSystem.getInstance();
  testDirectory = new Path(Files.createTempDir().getAbsolutePath());
  FileSystemDatasetRepository repo = new FileSystemDatasetRepository(fs.getConf(),
      testDirectory);
  Dataset<MyRecord> writerDataset = repo.create("ns", "test", new DatasetDescriptor.Builder()
                                 .schema(MyRecord.class)
                                 .build(), MyRecord.class);
  DatasetWriter<MyRecord> writer = writerDataset.newWriter();
  for (int i = 0; i < totalRecords; i++) {
    writer.write(new MyRecord(String.valueOf(i), i));
  }
  writer.close();

  readerDataset = repo.load("ns", "test", GenericRecord.class);
}
 
Example #3
Source File: HBaseMetadataProvider.java (from kite, Apache License 2.0)
@Override
public DatasetDescriptor load(String namespace, String name) {
  Preconditions.checkArgument(DEFAULT_NAMESPACE.equals(namespace),
      "Non-default namespaces are not supported");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");

  if (!exists(namespace, name)) {
    throw new DatasetNotFoundException("No such dataset: " + name);
  }
  String tableName = getTableName(name);
  String entityName = getEntityName(name);
  return new DatasetDescriptor.Builder()
      .schemaLiteral(schemaManager.getEntitySchema(tableName, entityName)
          .getRawSchema())
      .build();
}
 
Example #4
Source File: TestHiveRepositoryURIs.java (from kite, Apache License 2.0)
@Test
public void testExternalURI() {
  URI hdfsUri = getDFS().getUri();
  URI repoUri = URI.create("repo:hive:/tmp/hive-repo?hdfs:host=" +
      hdfsUri.getHost() + "&hdfs:port=" + hdfsUri.getPort());
  DatasetRepository repo = DatasetRepositories.repositoryFor(repoUri);

  Assert.assertNotNull("Received a repository", repo);
  Assert.assertTrue("Repo should be a HiveExternalDatasetRepository",
      repo instanceof HiveExternalDatasetRepository);
  Assert.assertEquals("Repository URI", repoUri, repo.getUri());

  // verify location
  DatasetDescriptor created = repo.create("tmp", "test",
      new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .build()).getDescriptor();
  Assert.assertEquals("Location should be in HDFS",
      "hdfs", created.getLocation().getScheme());
  Assert.assertEquals("Location should have the correct HDFS host",
      hdfsUri.getHost(), created.getLocation().getHost());
  Assert.assertEquals("Location should have the correct HDFS port",
      hdfsUri.getPort(), created.getLocation().getPort());
  Assert.assertTrue("Location should be in the repo path",
      created.getLocation().getPath().startsWith("/tmp/hive-repo"));
}
 
Example #5
Source File: FileSystemDatasetRepository.java (from kite, Apache License 2.0)
@Override
public <E> Dataset<E> load(String namespace, String name, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");

  LOG.debug("Loading dataset: {}", name);

  DatasetDescriptor descriptor = metadataProvider.load(namespace, name);

  FileSystemDataset<E> ds = new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(descriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(descriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();

  LOG.debug("Loaded dataset:{}", ds);

  return ds;
}
 
Example #6
Source File: TestMetadataProviders.java (from kite, Apache License 2.0)
@Test
public void testCustomProperties() {
  final String propName = "my.custom.property";
  final String propValue = "string";
  DatasetDescriptor descriptorWithProp =
      new DatasetDescriptor.Builder(testDescriptor)
      .property(propName, propValue)
      .build();

  DatasetDescriptor created = provider.create(NAMESPACE, NAME, descriptorWithProp);
  Assert.assertTrue("Should have custom property",
      created.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, created.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      created.listProperties().contains(propName));

  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);
  Assert.assertTrue("Should have custom property",
      loaded.hasProperty(propName));
  Assert.assertEquals("Should have correct custom property value",
      propValue, loaded.getProperty(propName));
  Assert.assertTrue("List should contain property name",
      created.listProperties().contains(propName));
}
 
Example #7
Source File: TestHiveDatasetURIsCompatibility.java (from kite, Apache License 2.0)
@Test
public void testLoadChangedAbsolutePathURICompatibility() {
  // this used to be a relative external URI, but is now a managed URI
  String uri = "dataset:hive:/data/ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);
  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);

  Assert.assertEquals("URI should use apparent namespace",
      "dataset:hive:data/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
 
Example #8
Source File: TestHiveExternalDatasetRepository.java (from kite, Apache License 2.0)
@SuppressWarnings("deprecation")
@Test
public void testNewPartitionIsVisibleToHive() throws Exception {
  final String NAME2 = "test2";

  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .hash("username", 2).build();

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(testSchema)
      .partitionStrategy(partitionStrategy)
      .build();

  Dataset<GenericRecord> dataset = repo.create(NAMESPACE, NAME2, descriptor);

  HiveTestUtils.assertTableExists(client, NAMESPACE, NAME2);
  HiveTestUtils.assertTableIsExternal(client, NAMESPACE, NAME2);
  Assert.assertTrue("No partitions yet",
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).isEmpty());

  writeRecord(dataset, 0);

  Assert.assertEquals("Should be one partition", 1,
      client.listPartitionNames(NAMESPACE, NAME2, (short) 10).size());

}
 
Example #9
Source File: TestFileSystemDatasetRepository.java (from kite, Apache License 2.0)
@Test
public void testUpdateFailsWithLocationChange() {
  ensureCreated();
  Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
  URI location = dataset.getDescriptor().getLocation();

  DatasetDescriptor changed =
      new DatasetDescriptor.Builder(dataset.getDescriptor())
          .location(new Path(testDirectory, "newDataLocation").toUri())
          .build();

  try {
    repo.update(NAMESPACE, NAME, changed);
    Assert.fail("Should fail due to data location change");
  } catch (ValidationException ex) {
    // expected
  }

  Assert.assertEquals(
      location, repo.load(NAMESPACE, NAME).getDescriptor().getLocation());
}
 
Example #10
Source File: TestUpdateDatasetCommand.java (from kite, Apache License 2.0)
@Test
public void testUpdateSchema() throws Exception {
  File avroSchemaFile = new File("target/schema_update.avsc");
  new FileWriter(avroSchemaFile).append(schema2).close();

  command.datasets = Lists.newArrayList("users");
  command.avroSchemaFile = avroSchemaFile.toString();
  command.run();

  DatasetDescriptor updated = new DatasetDescriptor.Builder(original)
      .schemaLiteral(schema2)
      .build();

  verify(repo).load("default", "users"); // need to load the current dataset
  verify(ds).getDescriptor(); // should inspect and use its descriptor
  verify(repo).update(eq("default"), eq("users"), argThat(TestUtil.matches(updated)));
  verify(console).debug(contains("Updated"), eq("users"));
}
 
Example #11
Source File: TestCreateDatasetCommandCluster.java (from kite, Apache License 2.0)
@Test
public void testBasicUseLocalSchema() throws Exception {
  String avsc = "target/localUser.avsc";
  FSDataOutputStream out = getFS()
      .create(new Path(avsc), true /* overwrite */ );
  ByteStreams.copy(Resources.getResource("test-schemas/user.avsc").openStream(), out);
  out.close();
  command.avroSchemaFile = avsc;
  command.datasets = Lists.newArrayList("users");
  command.run();

  DatasetDescriptor expectedDescriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:test-schemas/user.avsc")
      .build();

  verify(getMockRepo()).create("default", "users", expectedDescriptor);
  verify(console).debug(contains("Created"), eq("users"));
}
 
Example #12
Source File: TestCrunchDatasetsHBase.java (from kite, Apache License 2.0)
@Test
public void testGeneric() throws IOException {
  String datasetName = tableName + ".TestGenericEntity";

  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();

  Dataset<GenericRecord> inputDataset = repo.create("default", "in", descriptor);
  Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);

  writeRecords(inputDataset, 10);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasetsHBase.class, HBaseTestUtils.getConf());
  PCollection<GenericRecord> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkRecords(outputDataset, 10, 0);
}
 
Example #13
Source File: TestCrunchDatasets.java (from kite, Apache License 2.0)
@Test
public void testGeneric() throws IOException {
  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).build());

  // write two files, each with 5 records
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Example #14
Source File: TestMetadataProviders.java (from kite, Apache License 2.0)
@Test
public void testCreateWithLocation() throws URISyntaxException {
  Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, NAME));

  String auth = getDFS().getUri().getAuthority();
  URI requestedLocation = new URI("hdfs://" + auth + "/tmp/data/my_data_set");
  DatasetDescriptor requested = new DatasetDescriptor.Builder(testDescriptor)
      .location(requestedLocation)
      .build();

  final DatasetDescriptor created;
  try {
    created = provider.create(NAMESPACE, NAME, requested);
  } catch (UnsupportedOperationException ex) {
    // this is expected if the provider doesn't support requested locations
    return;
  }

  // if supported, the location should be unchanged.
  Assert.assertNotNull("Descriptor should be returned", created);
  Assert.assertTrue("Descriptor should exist", provider.exists(NAMESPACE, NAME));
  Assert.assertEquals("Requested locations should match",
      requestedLocation, created.getLocation());
}
 
Example #15
Source File: HBaseDatasetReaderTest.java (from kite, Apache License 2.0)
@BeforeClass
public static void beforeClass() throws Exception {
  HBaseTestUtils.getMiniCluster();
  // managed table should be created by HBaseDatasetRepository
  HBaseTestUtils.util.deleteTable(Bytes.toBytes(managedTableName));
  HBaseDatasetRepository repo = new HBaseDatasetRepository.Builder()
      .configuration(HBaseTestUtils.getConf()).build();
  String testGenericEntity = AvroUtils.inputStreamToString(
      HBaseDatasetRepositoryTest.class.getResourceAsStream("/TestGenericEntity.avsc"));
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(testGenericEntity)
      .build();
  dataset = repo.create("default", "testtable", descriptor);
  for (int i = 0; i < 10; i++) {
    dataset.put(HBaseDatasetRepositoryTest.createGenericEntity(i));
  }
}
 
Example #16
Source File: PartitionedDatasetWriter.java (from kite, Apache License 2.0)
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
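The branch taken for Parquet is controlled by a descriptor property. A hedged sketch of requesting the durable (incremental) writer, assuming FileSystemProperties.NON_DURABLE_PARQUET_PROP is visible to the caller:

  // Sketch: setting the non-durable-Parquet property to "false" makes
  // DescriptorUtil.isDisabled(...) true above, selecting the
  // IncrementalPartitionedDatasetWriter instead of the non-durable one.
  DatasetDescriptor durableParquet = new DatasetDescriptor.Builder(descriptor)
      .format(Formats.PARQUET)
      .property(FileSystemProperties.NON_DURABLE_PARQUET_PROP, "false")
      .build();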
 
Example #17
Source File: FileSystemWriter.java (from kite, Apache License 2.0)
private FileSystemWriter(FileSystem fs, Path path, long rollIntervalMillis,
                         long targetFileSize, DatasetDescriptor descriptor, Schema writerSchema) {
  Preconditions.checkNotNull(fs, "File system is not defined");
  Preconditions.checkNotNull(path, "Destination directory is not defined");
  Preconditions.checkNotNull(descriptor, "Descriptor is not defined");

  this.fs = fs;
  this.directory = path;
  this.rollIntervalMillis = rollIntervalMillis;
  this.targetFileSize = targetFileSize;
  this.descriptor = descriptor;
  this.conf = new Configuration(fs.getConf());
  this.state = ReaderWriterState.NEW;
  this.schema = writerSchema;

  // copy file format settings from custom properties to the Configuration
  for (String prop : descriptor.listProperties()) {
    conf.set(prop, descriptor.getProperty(prop));
  }

  // For performance reasons we will skip temp file creation if the file system does not support
  // efficient renaming, and write the file directly.
  this.useTempPath = FileSystemUtil.supportsRename(fs.getUri(), conf);
}
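Because every descriptor property is copied into the writer's Configuration, file-format tuning can be done per dataset. A hedged sketch using the standard parquet-mr block size key (assumption: the dataset's format actually honors this key):

  // Sketch: this property lands in the Hadoop Configuration via the loop
  // above, so a Parquet writer would pick it up as its row-group size.
  DatasetDescriptor tuned = new DatasetDescriptor.Builder(descriptor)
      .property("parquet.block.size", String.valueOf(256 * 1024 * 1024))
      .build();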
 
Example #18
Source File: TestProjection.java (from kite, Apache License 2.0)
@Test
public void testMixedProjection() throws IOException {
  Dataset<StandardEvent> original = repo.create("ns", "mixedProjection",
      new DatasetDescriptor.Builder()
          .schema(StandardEvent.class)
          .build(), StandardEvent.class);

  DatasetWriter<StandardEvent> writer = null;
  try {
    writer = original.newWriter();
    writer.write(sepEvent);
    writer.write(octEvent);
    writer.write(novEvent);
  } finally {
    Closeables.close(writer, false);
  }

  Dataset<ReflectSmallEvent> dataset = repo.load("ns", original.getName(),
      ReflectSmallEvent.class);

  Set<ReflectSmallEvent> expected = Sets.newHashSet(
      new ReflectSmallEvent(sepEvent), new ReflectSmallEvent(octEvent),
      new ReflectSmallEvent(novEvent));

  assertContentEquals(expected, dataset);
}
 
Example #19
Source File: DatasetKeyOutputFormat.java (from kite, Apache License 2.0)
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
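For context, uriOptions here comes from the query string of the configured output URI, so writing to a view URI restricts output with Constraints. A hedged sketch; the dataset name and field are illustrative:

  // Sketch: the year=2014 option becomes a Constraint on the view.
  View<GenericRecord> view = Datasets.load(
      "view:hive:ns/events?year=2014", GenericRecord.class);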
 
Example #20
Source File: TestHiveUtils.java (from kite, Apache License 2.0)
@Test
public void testUpdateChangesDDL() throws Exception {
  DatasetDescriptor original = new DatasetDescriptor.Builder()
      .schema(SchemaBuilder.record("Test").fields()
          .requiredLong("id")
          .requiredString("data")
          .endRecord())
      .build();
  boolean external = false;
  Table table = HiveUtils.tableForDescriptor("ns", "test", original, external);

  DatasetDescriptor updated = new DatasetDescriptor.Builder()
      .schema(SchemaBuilder.record("Test").fields()
          .requiredLong("id")
          .requiredString("data")
          .nullableString("data2", "")
          .endRecord())
      .build();

  HiveUtils.updateTableSchema(table, updated);

  Assert.assertEquals("Should update the table DDL",
      table.getSd().getCols(),
      HiveSchemaConverter.convertSchema(updated.getSchema()));
}
 
Example #21
Source File: Compatibility.java (from kite, Apache License 2.0)
/**
 * Checks that the {@code existing} {@link DatasetDescriptor} is compatible
 * with {@code test}.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param test a new {@code DatasetDescriptor} for the same dataset
 */
public static void checkCompatible(DatasetDescriptor existing,
                                   DatasetDescriptor test) {
  checkNotChanged("format", existing.getFormat(), test.getFormat());

  checkNotChanged("partitioning",
      existing.isPartitioned(), test.isPartitioned());

  if (existing.isPartitioned()) {
    checkStrategyUpdate(
        existing.getPartitionStrategy(),
        test.getPartitionStrategy(),
        test.getSchema());
  }

  // check can read records written with old schema using new schema
  Schema oldSchema = existing.getSchema();
  Schema testSchema = test.getSchema();
  if (!SchemaValidationUtil.canRead(oldSchema, testSchema)) {
    throw new IncompatibleSchemaException("Schema cannot read data " +
        "written using existing schema. Schema: " + testSchema.toString(true) +
        "\nExisting schema: " + oldSchema.toString(true));
  }
}
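A short usage sketch; the names repo and newSchema are illustrative, and the check signals failure by throwing rather than returning a value:

  // Sketch: validate a proposed descriptor against the stored one
  // before attempting an update.
  DatasetDescriptor existing = repo.load("ns", "events").getDescriptor();
  DatasetDescriptor proposed = new DatasetDescriptor.Builder(existing)
      .schema(newSchema) // hypothetical evolved schema
      .build();
  try {
    Compatibility.checkCompatible(existing, proposed);
    repo.update("ns", "events", proposed);
  } catch (IncompatibleSchemaException e) {
    // the new schema cannot read existing data; reject the change
  }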
 
Example #22
Source File: CSVFileReader.java (from kite, Apache License 2.0)
@SuppressWarnings("unchecked")
public CSVFileReader(FileSystem fileSystem, Path path,
                     DatasetDescriptor descriptor,
                     EntityAccessor<E> accessor) {
  this.fs = fileSystem;
  this.path = path;
  this.schema = accessor.getReadSchema();
  this.recordClass = accessor.getType();
  this.state = ReaderWriterState.NEW;
  this.props = CSVProperties.fromDescriptor(descriptor);
  // defaults to false: assume that callers will not make defensive copies
  this.reuseRecords = DescriptorUtil.isEnabled(REUSE_RECORDS, descriptor);

  Preconditions.checkArgument(Schema.Type.RECORD.equals(schema.getType()),
      "Schemas for CSV files must be records of primitive types");
}
 
Example #23
Source File: TestCrunchDatasets.java (from kite, Apache License 2.0)
@Test
public void testSourceView() throws IOException {
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder().hash(
      "username", 2).build();

  Dataset<Record> inputDataset = repo.create("ns", "in", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).partitionStrategy(partitionStrategy).build());
  Dataset<Record> outputDataset = repo.create("ns", "out", new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA).format(Formats.PARQUET).build());

  writeTestUsers(inputDataset, 10);

  View<Record> inputView = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(inputView));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> data = pipeline.read(
      CrunchDatasets.asSource(inputView));
  pipeline.write(data, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #24
Source File: TestCompatibilityChecks.java (from kite, Apache License 2.0)
@Test
public void testAllowedPartitionSchemaCombinations() {
  Compatibility.checkDescriptor(
      new DatasetDescriptor.Builder()
          .schema(schema)
          .partitionStrategy(new PartitionStrategy.Builder()
              .year("timestamp")
              .month("timestamp")
              .day("timestamp")
              .hour("timestamp")
              .minute("timestamp")
              .identity("message", "message_copy")
              .identity("timestamp", "ts")
              .identity("number", "num")
              .hash("message", 48)
              .hash("timestamp", 48)
              .hash("number", 48)
              .hash("payload", 48)
              .hash("float", 48)
              .hash("double", 48)
              .hash("bool", 48)
              .range("number", 5, 10, 15, 20)
              .range("message", "m", "z", "M", "Z")
              .build())
          .build());
}
 
Example #25
Source File: TestConfigurationProperty.java (from nifi, Apache License 2.0)
@Before
public void createDataset() throws Exception {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
            .schema(TestUtil.USER_SCHEMA)
            .build();
    this.datasetUri = "dataset:file:" + temp.newFolder("ns", "temp").toString();
    this.dataset = Datasets.create(datasetUri, descriptor, Record.class);
}
 
Example #26
Source File: DescriptorUtil.java (from kite, Apache License 2.0)
/**
 * Returns whether the value of the descriptor property is {@code true}.
 *
 * @param property a String property name
 * @param descriptor a {@link DatasetDescriptor}
 * @return {@code true} if set and "true", {@code false} otherwise.
 */
public static boolean isEnabled(String property, DatasetDescriptor descriptor) {
  if (descriptor.hasProperty(property)) {
    // return true if and only if the property value is "true"
    return Boolean.valueOf(descriptor.getProperty(property));
  }
  return false;
}
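A brief usage sketch; the property name is illustrative:

  // Sketch: only the exact value "true" enables a flag; anything else,
  // including an absent property, reads as false.
  DatasetDescriptor d = new DatasetDescriptor.Builder()
      .schemaLiteral("\"string\"")
      .property("example.flag", "true") // hypothetical property
      .build();
  boolean enabled = DescriptorUtil.isEnabled("example.flag", d); // true
  boolean missing = DescriptorUtil.isEnabled("other.flag", d);   // false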
 
Example #27
Source File: AvroKeyEntitySchemaParser.java (from kite, Apache License 2.0)
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
 
Example #28
Source File: TestFileSystemUtil.java (from kite, Apache License 2.0)
@Test
public void testMultipleAvroFilesAtDifferentDepths() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Avro files at different depths: one under part=1, one at the parent
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, new Path(parent, "part=1"));
  createAvroUserFile(fs, parent);

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", descriptor));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      strategy, descriptor.getPartitionStrategy());
}
 
Example #29
Source File: TestKiteURIHandler.java (from kite, Apache License 2.0)
@Before
public void setUp() throws IOException, URISyntaxException {
  this.conf = (distributed ?
      MiniDFSTest.getConfiguration() :
      new Configuration());

  this.fs = FileSystem.get(conf);

  this.testDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(SchemaBuilder.record("Event").fields()
          .requiredLong("timestamp")
          .requiredString("message")
          .endRecord())
      .partitionStrategy(new PartitionStrategy.Builder()
          .year("timestamp")
          .month("timestamp")
          .day("timestamp")
          .build())
      .build();

  uriHandler = new KiteURIHandler();

  startingConf = DefaultConfiguration.get();

  startingOozieHome = System.getProperty("oozie.home.dir");
}
 
Example #30
Source File: TestFileSystemDatasetRepository.java (from kite, Apache License 2.0)
@Test
public void testUpdateFailsWithIncompatibleSchemaChange() {
  Dataset<Record> dataset = repo.create(NAMESPACE, NAME, new DatasetDescriptor.Builder()
      .schema(testSchema).build());

  Assert.assertEquals("Dataset name is propagated", NAME,
      dataset.getName());
  Assert.assertEquals("Dataset schema is propagated", testSchema, dataset
      .getDescriptor().getSchema());

  Schema testSchemaV2 = SchemaBuilder.record("user").fields()
      .requiredString("username")
      .requiredString("email")
      .requiredString("favoriteColor") // incompatible - no default
      .endRecord();

  try {
    repo.update(NAMESPACE, NAME, new DatasetDescriptor.Builder(
        dataset.getDescriptor()).schema(testSchemaV2).build());
    Assert.fail("Should fail due to incompatible update");
  } catch (ValidationException e) {
    // expected
  }
  dataset = repo.load(NAMESPACE, NAME);
  Assert.assertEquals("Dataset schema is unchanged", testSchema, dataset
      .getDescriptor().getSchema());
}