org.kitesdk.data.Formats Java Examples

The following examples show how to use org.kitesdk.data.Formats. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Writes two Avro files into the same directory and checks that dataset
 * discovery reports exactly one unpartitioned Avro dataset located at that
 * directory.
 */
@Test
public void testMultipleAvroFilesInOneFolder() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // two Avro files side by side in the leaf directory
  Path dataDir = new Path(leaf.toURI());
  for (int i = 0; i < 2; i++) {
    createAvroUserFile(localFs, dataDir);
  }

  // discovery must yield exactly one candidate
  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertFalse("Should not flag at mixed depth",
      found.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri(), found.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, found.getFormat());
  Assert.assertFalse("Should not be partitioned", found.isPartitioned());
}
 
Example #2
Source File: AvroAppender.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Translates the configured {@code compressionType} into the matching Avro
 * {@link CodecFactory}.
 *
 * @return the codec factory for Snappy, Deflate (created with level argument 9)
 *         or Bzip2
 * @throws IllegalArgumentException for any other compression type; the message
 *         lists the types reported by {@code Formats.AVRO}
 */
private CodecFactory getCodecFactory() {
  switch (compressionType) {
    case Snappy:
      return CodecFactory.snappyCodec();
    case Deflate:
      return CodecFactory.deflateCodec(9);
    case Bzip2:
      return CodecFactory.bzip2Codec();
    default:
      // fall out of the switch and report the unsupported type below
      break;
  }

  String requested = compressionType.getName();
  String supported = Arrays.toString(
      Formats.AVRO.getSupportedCompressionTypes().toArray());
  throw new IllegalArgumentException(String.format(
      "Unsupported compression format %s. Supported formats: %s",
      requested, supported));
}
 
Example #3
Source File: FileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Inspects a single file and, when a schema can be read from it, reports it
 * as a table.
 *
 * @param fs the file system that holds {@code path}
 * @param path the file to inspect; its format is derived by {@code formatFromExt}
 * @return a {@code Result.Table} carrying the path, format, schema and path
 *         depth, or a {@code Result.Unknown} when the format is not Avro,
 *         Parquet or JSON, or when no schema was obtained
 * @throws IOException declared by this method; presumably raised while the
 *         schema is read from the file (confirm against {@code Schemas})
 */
@Override
Result file(FileSystem fs, Path path) throws IOException {
  Format format = formatFromExt(path);
  Schema schema = null;
  // Compare with equals() instead of reference identity so that an equal
  // Format that is not the shared constant instance is still recognized.
  // Calling equals on the constant keeps the check safe for a null format.
  if (Formats.AVRO.equals(format)) {
    schema = Schemas.fromAvro(fs, path);
  } else if (Formats.PARQUET.equals(format)) {
    schema = Schemas.fromParquet(fs, path);
  } else if (Formats.JSON.equals(format)) {
    schema = Schemas.fromJSON("record", fs, path);
  }

  // no recognized format, or the reader returned no schema
  if (schema == null) {
    return new Result.Unknown();
  }

  return new Result.Table(path, format, schema, path.depth());
}
 
Example #4
Source File: MultiFileDatasetReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Advances to the next file from {@code filesIter}: creates the reader that
 * matches the descriptor's format, initializes it, and replaces
 * {@code readerIterator} with a view of that reader filtered by the
 * constraints' entity predicate.
 *
 * Each branch calls {@code filesIter.next()} exactly once, so one call to this
 * method consumes one file.
 */
@SuppressWarnings("unchecked") // See https://github.com/Parquet/parquet-mr/issues/106
private void openNextReader() {
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    this.reader = new ParquetFileSystemDatasetReader(fileSystem,
        filesIter.next(), accessor.getReadSchema(), accessor.getType());
  } else if (Formats.JSON.equals(descriptor.getFormat())) {
    this.reader = new JSONFileReader<E>(
        fileSystem, filesIter.next(), accessor);
  } else if (Formats.CSV.equals(descriptor.getFormat())) {
    this.reader = new CSVFileReader<E>(fileSystem, filesIter.next(),
        descriptor, accessor);
  } else if (Formats.INPUTFORMAT.equals(descriptor.getFormat())) {
    this.reader = new InputFormatReader(fileSystem, filesIter.next(), descriptor);
  } else {
    // fallback for every other format; presumably Avro -- TODO confirm
    this.reader = new FileSystemDatasetReader<E>(fileSystem, filesIter.next(),
        accessor.getReadSchema(), accessor.getType());
  }
  reader.initialize();
  // the storage key is only available when a path iterator is present;
  // otherwise the predicate is built with a null key
  this.readerIterator = Iterators.filter(reader,
      constraints.toEntityPredicate(
          (pathIter != null ? pathIter.getStorageKey() : null), accessor));
}
 
Example #5
Source File: ParquetAppender.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Translates the configured {@code compressionType} into the matching Parquet
 * {@link CompressionCodecName}.
 *
 * @return SNAPPY for Snappy, LZO for Lzo, and GZIP for Deflate
 * @throws IllegalArgumentException for any other compression type; the message
 *         lists the types reported by {@code Formats.PARQUET}
 */
private CompressionCodecName getCompressionCodecName() {
  switch (compressionType) {
    case Snappy:
      return CompressionCodecName.SNAPPY;
    case Lzo:
      return CompressionCodecName.LZO;
    case Deflate:
      return CompressionCodecName.GZIP;
    default:
      // fall out of the switch and report the unsupported type below
      break;
  }

  String requested = compressionType.getName();
  String supported = Arrays.toString(
      Formats.PARQUET.getSupportedCompressionTypes().toArray());
  throw new IllegalArgumentException(String.format(
      "Unsupported compression format %s. Supported formats: %s",
      requested, supported));
}
 
Example #6
Source File: PartitionedDatasetWriter.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Chooses the partitioned writer implementation for a view based on the
 * format of the view's dataset.
 *
 * Avro and CSV always get the incremental writer. Parquet gets the incremental
 * writer only when the non-durable Parquet property is disabled on the
 * descriptor. Every other case gets the non-durable writer.
 *
 * @param view the view to write to
 * @return a new partitioned writer for {@code view}
 */
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();

  final boolean incremental;
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    incremental = DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor);
  } else {
    incremental = Formats.AVRO.equals(format) || Formats.CSV.equals(format);
  }

  if (incremental) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  }
  return new NonDurablePartitionedDatasetWriter<E>(view);
}
 
Example #7
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one Parquet file and checks that dataset discovery reports a single
 * unpartitioned Parquet dataset whose location ends in ".parquet" and sits
 * directly inside the directory the file was written to.
 */
@Test
public void testSingleParquetFile() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // exactly one Parquet file in the leaf directory
  Path dataDir = new Path(leaf.toURI());
  createParquetEventFile(localFs, dataDir);

  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertFalse("Should not flag at mixed depth",
      found.hasProperty("kite.filesystem.mixed-depth"));

  // the location is compared through its parent, then checked by extension
  URI location = found.getLocation();
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri().getPath(), parent(location).getPath());
  Assert.assertTrue("Should be a .parquet file",
      location.toString().endsWith(".parquet"));

  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, found.getFormat());
  Assert.assertFalse("Should not be partitioned", found.isPartitioned());
}
 
Example #8
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Writes two Parquet files into the same directory and checks that dataset
 * discovery reports exactly one unpartitioned Parquet dataset located at that
 * directory.
 */
@Test
public void testMultipleParquetFilesInOneFolder() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // two Parquet files side by side in the leaf directory
  Path dataDir = new Path(leaf.toURI());
  for (int i = 0; i < 2; i++) {
    createParquetEventFile(localFs, dataDir);
  }

  // discovery must yield exactly one candidate
  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertFalse("Should not flag at mixed depth",
      found.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri(), found.getLocation());
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, found.getFormat());
  Assert.assertFalse("Should not be partitioned", found.isPartitioned());
}
 
Example #9
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Reads a single-username view of a hash-partitioned dataset through a Crunch
 * pipeline and appends it to a Parquet dataset, expecting one record to
 * arrive in the output.
 */
@Test
public void testSourceView() throws IOException {
  PartitionStrategy hashByUsername = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> inputDataset = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> outputDataset = repo.create("ns", "out", outputDescriptor);

  writeTestUsers(inputDataset, 10);

  // restrict the source to one username and confirm the view holds one record
  View<Record> singleUser = inputDataset.with("username", "test-0");
  Assert.assertEquals(1, datasetSize(singleUser));

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(singleUser));
  pipeline.write(
      records, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(1, datasetSize(outputDataset));
}
 
Example #10
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the partition with key 0 of a hash-partitioned dataset through a
 * Crunch pipeline and appends it to a Parquet dataset, expecting five of the
 * ten written records to arrive in the output.
 */
@Test
public void testPartitionedSource() throws IOException {
  PartitionStrategy hashByUsername = new PartitionStrategy.Builder()
      .hash("username", 2)
      .build();

  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(hashByUsername)
      .build();
  Dataset<Record> inputDataset = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> outputDataset = repo.create("ns", "out", outputDescriptor);

  writeTestUsers(inputDataset, 10);

  // select the existing partition for key 0 (second argument: do not create it)
  PartitionKey firstKey = new PartitionKey(0);
  PartitionedDataset<Record> partitionedInput =
      (PartitionedDataset<Record>) inputDataset;
  Dataset<Record> inputPart0 = partitionedInput.getPartition(firstKey, false);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(inputPart0));
  pipeline.write(
      records, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  Assert.assertEquals(5, datasetSize(outputDataset));
}
 
Example #11
Source File: TestCrunchDatasets.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Copies a Parquet dataset to another Parquet dataset through a Crunch
 * pipeline and verifies that all ten test users arrive in the output.
 */
@Test
public void testGenericParquet() throws IOException {
  DatasetDescriptor inputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> inputDataset = repo.create("ns", "in", inputDescriptor);

  DatasetDescriptor outputDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> outputDataset = repo.create("ns", "out", outputDescriptor);

  // two separate writes of 5 records each, starting at offsets 0 and 5
  writeTestUsers(inputDataset, 5, 0);
  writeTestUsers(inputDataset, 5, 5);

  Pipeline pipeline = new MRPipeline(TestCrunchDatasets.class);
  PCollection<GenericData.Record> records = pipeline.read(
      CrunchDatasets.asSource(inputDataset));
  pipeline.write(
      records, CrunchDatasets.asTarget(outputDataset), Target.WriteMode.APPEND);
  pipeline.run();

  checkTestUsers(outputDataset, 10);
}
 
Example #12
Source File: TestFileSystemDatasetRepository.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an Avro dataset, tries to update it with a Parquet descriptor, and
 * checks that the update is rejected and the stored format is still Avro.
 */
@Test
public void testUpdateFailsWithFormatChange() {
  DatasetDescriptor avroDescriptor =
      new DatasetDescriptor.Builder(testDescriptor)
          .format(Formats.AVRO)
          .build();
  Dataset<Record> created = repo.create(NAMESPACE, NAME, avroDescriptor);

  // same descriptor as the created dataset, except for the format
  DatasetDescriptor parquetDescriptor =
      new DatasetDescriptor.Builder(created.getDescriptor())
          .format(Formats.PARQUET)
          .build();

  try {
    repo.update(NAMESPACE, NAME, parquetDescriptor);
    Assert.fail("Should fail due to format change");
  } catch (ValidationException expected) {
    // the format change must be rejected
  }

  // reload to confirm the rejected update left the format untouched
  DatasetDescriptor reloaded = repo.load(NAMESPACE, NAME).getDescriptor();
  Assert.assertEquals(Formats.AVRO, reloaded.getFormat());
}
 
Example #13
Source File: TestCreateDatasetWithExistingData.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the create command against the existing partitioned data URI and
 * checks the schema, partition strategy and format of the resulting dataset.
 */
@Test
public void testCreateFromExistingPartitioned() throws Exception {
  command.datasets = Lists.newArrayList(existingPartitionedURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingPartitionedURI));

  // reload the dataset the command just created
  Dataset<GenericRecord> loaded = Datasets.load(existingPartitionedURI);

  Assert.assertEquals("Schema should match",
      USER_SCHEMA, loaded.getDescriptor().getSchema());

  // expected strategy: a single provided "version" partition of type int
  PartitionStrategy expectedStrategy = new PartitionStrategy.Builder()
      .provided("version", "int")
      .build();
  Assert.assertEquals("Should be partitioned with a provided partitioner",
      expectedStrategy, loaded.getDescriptor().getPartitionStrategy());

  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, loaded.getDescriptor().getFormat());
}
 
Example #14
Source File: TestCreateDatasetWithExistingData.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the create command with an explicit location and checks that the
 * resulting dataset is unpartitioned Parquet with the user schema and a
 * location ending in the requested path.
 */
@Test
public void testCreateFromExistingWithLocation() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.location = existingPartitionedPathWithPartition.toString();
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // reload the dataset the command just created
  Dataset<GenericRecord> loaded = Datasets.load(existingDataURI);

  Assert.assertEquals("Schema should match",
      USER_SCHEMA, loaded.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      loaded.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, loaded.getDescriptor().getFormat());

  // compare by suffix: the stored location is only required to end with the path
  String actualLocation =
      String.valueOf(loaded.getDescriptor().getLocation());
  String expectedSuffix = existingPartitionedPathWithPartition.toString();
  Assert.assertTrue("Location should point to the partitioned data",
      actualLocation.endsWith(expectedSuffix));
}
 
Example #15
Source File: TestCreateDatasetWithExistingData.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Runs the create command against the existing data URI and checks that the
 * resulting dataset is unpartitioned Parquet with the user schema.
 */
@Test
public void testCreateFromExisting() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // reload the dataset the command just created
  Dataset<GenericRecord> loaded = Datasets.load(existingDataURI);

  Assert.assertEquals("Schema should match",
      USER_SCHEMA, loaded.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      loaded.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, loaded.getDescriptor().getFormat());
}
 
Example #16
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one Avro file and checks that dataset discovery reports a single
 * unpartitioned Avro dataset whose location ends in ".avro" and sits directly
 * inside the directory the file was written to.
 */
@Test
public void testSingleAvroFile() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // exactly one Avro file in the leaf directory
  Path dataDir = new Path(leaf.toURI());
  createAvroUserFile(localFs, dataDir);

  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertFalse("Should not flag at mixed depth",
      found.hasProperty("kite.filesystem.mixed-depth"));

  // the location is compared through its parent, then checked by extension
  URI location = found.getLocation();
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri().getPath(), parent(location).getPath());
  Assert.assertTrue("Should be a .avro file",
      location.toString().endsWith(".avro"));

  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, found.getFormat());
  Assert.assertFalse("Should not be partitioned", found.isPartitioned());
}
 
Example #17
Source File: FileSystemWriter.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Creates the file appender that matches the descriptor's format.
 *
 * Parquet yields a durable appender when the non-durable Parquet property is
 * disabled on the descriptor, and the plain Parquet appender otherwise. Avro
 * yields an Avro appender. CSV yields a CSV appender only when the allow-CSV
 * property is enabled on the descriptor.
 *
 * @param temp the path handed to the new appender
 * @return a new appender writing to {@code temp}
 * @throws UnknownFormatException for any other case; the writer state is set
 *         to ERROR before the exception is thrown
 */
@VisibleForTesting
@SuppressWarnings("unchecked")
<E> FileAppender<E> newAppender(Path temp) {
  Format format = descriptor.getFormat();

  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    boolean durable = DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor);
    if (durable) {
      return (FileAppender<E>) new DurableParquetAppender(
          fs, temp, schema, conf, descriptor.getCompressionType());
    }
    return (FileAppender<E>) new ParquetAppender(
        fs, temp, schema, conf, descriptor.getCompressionType());
  }

  if (Formats.AVRO.equals(format)) {
    return new AvroAppender<E>(
        fs, temp, schema, descriptor.getCompressionType());
  }

  if (Formats.CSV.equals(format)
      && DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)) {
    return new CSVAppender<E>(fs, temp, descriptor);
  }

  // nothing matched: record the failure on the writer before throwing
  this.state = ReaderWriterState.ERROR;
  throw new UnknownFormatException("Unknown format " + descriptor);
}
 
Example #18
Source File: FileSystemWriter.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Chooses the writer implementation based on the descriptor's format.
 *
 * Avro and CSV always get an {@code IncrementalWriter}. Parquet gets one only
 * when the non-durable Parquet property is disabled on the descriptor. Every
 * other case gets a plain {@code FileSystemWriter}. All arguments are passed
 * through unchanged to the chosen constructor.
 *
 * @return a new writer for {@code path}
 */
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
                                         long rollIntervalMillis,
                                         long targetFileSize,
                                         DatasetDescriptor descriptor, Schema writerSchema) {
  Format format = descriptor.getFormat();

  final boolean incremental;
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    incremental = DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor);
  } else {
    incremental = Formats.AVRO.equals(format) || Formats.CSV.equals(format);
  }

  if (incremental) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  }
  return new FileSystemWriter<E>(
      fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
}
 
Example #19
Source File: TestFileSystemDataset.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Builds two datasets that differ only in format (Avro and Parquet) and
 * expects merging one into the other to raise a ValidationException.
 */
@Test(expected = ValidationException.class)
public void testCannotMergeDatasetsWithDifferentFormats() throws IOException {
  DatasetDescriptor avroDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.AVRO)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(avroDescriptor)
      .type(Record.class)
      .build();

  // identical to the first dataset apart from the Parquet format
  DatasetDescriptor parquetDescriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .format(Formats.PARQUET)
      .location(testDirectory)
      .build();
  FileSystemDataset<Record> dsUpdate = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("users")
      .configuration(getConfiguration())
      .descriptor(parquetDescriptor)
      .type(Record.class)
      .build();

  ds.merge(dsUpdate);
}
 
Example #20
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Writes two records into a provided partition whose value contains a slash
 * and checks what dataset discovery reports for the resulting directory tree:
 * a single unpartitioned Avro dataset located at the escaped partition
 * directory.
 */
@Test
public void testPartitionedDatasetWithEscapedChars() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e/dataset_name");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();
  URI datasetUri = URI.create("dataset:file:" + folder.getAbsolutePath());
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schema(USER_SCHEMA)
      .partitionStrategy(new PartitionStrategy.Builder()
          .provided("s")
          .build())
      .build();

  Dataset<GenericRecord> dataset = Datasets.create(datasetUri, descriptor);

  // write two so that the descriptor uses the directory rather than a file
  writeUserToView(dataset.with("s", "test/-0"));
  writeUserToView(dataset.with("s", "test/-0"));

  // the slash in the partition value is expected to appear escaped as %2F
  Path datasetPath = new Path(folder.toURI());
  Path partitionPath = new Path(datasetPath, "s=test%2F-0");

  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  // Check the discovered descriptor, not the one built above: the locally
  // built descriptor never carries the mixed-depth property, so asserting on
  // it could not fail.
  Assert.assertFalse("Should not flag at mixed depth",
      actual.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Location should be at the partition directory",
      URI.create(partitionPath.toString()), actual.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, actual.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, actual.getFormat());
  Assert.assertFalse("Should not be partitioned", actual.isPartitioned());
}
 
Example #21
Source File: FileSystemDataset.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset backed by a directory on a file system.
 *
 * @param fileSystem the file system stored on this dataset
 * @param directory the dataset directory; also used to derive the signals path
 * @param namespace the dataset namespace
 * @param name the dataset name
 * @param descriptor supplies the schema, format and optional partition strategy
 * @param uri the URI stored on this dataset
 * @param partitionListener optional listener, may be null; passed on to the
 *        unbounded partition view
 * @param type the entity class; for Parquet it must be assignable to
 *        {@code IndexedRecord} or be {@code Object.class}
 * @throws IllegalArgumentException if the format is Parquet and {@code type}
 *         fails the check above (raised by {@code Preconditions.checkArgument})
 */
FileSystemDataset(FileSystem fileSystem, Path directory,
                  String namespace, String name,
                  DatasetDescriptor descriptor, URI uri,
                  @Nullable PartitionListener partitionListener,
                  Class<E> type) {
  super(type, descriptor.getSchema());
  // validate the entity type up front, before any field is assigned
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  // the strategy stays null for unpartitioned datasets
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  // signals are kept under SIGNALS_DIRECTORY_NAME, resolved against the
  // result of getDirectory(fileSystem, directory)
  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  // NOTE(review): `this` is handed to the view before construction finishes;
  // the view is created after the fields above are assigned -- confirm it
  // reads nothing that is set later
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
 
Example #22
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one Avro file into each of two sibling sub-directories ("part=1" and
 * "2") and checks that dataset discovery reports a single Avro dataset at the
 * common parent, partitioned by a provided int field named "part".
 */
@Test
public void testMultipleAvroFilesInSeparateFolders() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // one Avro file in each of two sibling sub-directories
  Path dataDir = new Path(leaf.toURI());
  createAvroUserFile(localFs, new Path(dataDir, "part=1"));
  createAvroUserFile(localFs, new Path(dataDir, "2"));

  PartitionStrategy expectedStrategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertFalse("Should not flag at mixed depth",
      found.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri(), found.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, found.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      expectedStrategy, found.getPartitionStrategy());
}
 
Example #23
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one Avro file into a "part=1" sub-directory and one directly into
 * the parent, then checks that dataset discovery reports a single Avro
 * dataset at the parent with the mixed-depth property enabled and a provided
 * int partition named "part".
 */
@Test
public void testMultipleAvroFilesAtDifferentDepths() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // one Avro file in a sub-directory, one directly in the parent
  Path dataDir = new Path(leaf.toURI());
  createAvroUserFile(localFs, new Path(dataDir, "part=1"));
  createAvroUserFile(localFs, dataDir);

  PartitionStrategy expectedStrategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", found));
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri(), found.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, found.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      expectedStrategy, found.getPartitionStrategy());
}
 
Example #24
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Writes two Avro files into a "part=1" sub-directory and a third directly
 * into the parent, then checks that dataset discovery reports a single Avro
 * dataset at the parent with the mixed-depth property enabled and a provided
 * int partition named "part".
 */
@Test
public void testMultipleMergeTablesAtDifferentDepths() throws Exception {
  File leaf = temp.newFolder("a/b/c/d/e");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();

  // two Avro files in the sub-directory, plus one directly in the parent
  Path dataDir = new Path(leaf.toURI());
  for (int i = 0; i < 2; i++) {
    createAvroUserFile(localFs, new Path(dataDir, "part=1"));
  }
  createAvroUserFile(localFs, dataDir);

  PartitionStrategy expectedStrategy = new PartitionStrategy.Builder()
      .provided("part", "int")
      .build();

  DatasetDescriptor found = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertTrue("Should flag data at mixed depth in the directory tree",
      DescriptorUtil.isEnabled("kite.filesystem.mixed-depth", found));
  Assert.assertEquals("Should be directly under parent",
      dataDir.toUri(), found.getLocation());
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, found.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, found.getFormat());
  Assert.assertEquals("Should be partitioned by part=int",
      expectedStrategy, found.getPartitionStrategy());
}
 
Example #25
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Writes one Parquet file into each of two sibling sub-directories ("part"
 * and "2") and checks that dataset discovery reports a single Parquet dataset
 * at the common parent, partitioned by a provided string field named
 * "partition_1".
 */
@Test
public void testMultipleParquetFilesInSeparateFolders() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create two Parquet files under separate folders
  Path parent = new Path(folder.toURI());
  createParquetEventFile(fs, new Path(parent, "part"));
  createParquetEventFile(fs, new Path(parent, "2"));

  DatasetDescriptor descriptor = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(fs, root));

  // neither directory name has a name=value form, so the expected partition
  // is the string-typed "partition_1"
  PartitionStrategy strategy = new PartitionStrategy.Builder()
      .provided("partition_1", "string")
      .build();

  Assert.assertFalse("Should not flag at mixed depth",
      descriptor.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(), descriptor.getLocation());
  // failure messages below name what is actually compared
  Assert.assertEquals("Should use event schema",
      EVENT_SCHEMA, descriptor.getSchema());
  Assert.assertEquals("Should have Parquet format",
      Formats.PARQUET, descriptor.getFormat());
  Assert.assertEquals("Should be partitioned by partition_1=string",
      strategy, descriptor.getPartitionStrategy());
}
 
Example #26
Source File: TestMetadataProviders.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the configuration, two deliberately different descriptors (an Avro
 * "Event" partitioned by year/month/day and a Parquet "Record" partitioned by
 * hash), and the provider under test.
 */
@Before
public void setUp() throws IOException, URISyntaxException {
  if (distributed) {
    this.conf = MiniDFSTest.getConfiguration();
  } else {
    this.conf = new Configuration();
  }

  Schema eventSchema = SchemaBuilder.record("Event").fields()
      .requiredLong("timestamp")
      .requiredString("message")
      .endRecord();
  PartitionStrategy byDate = new PartitionStrategy.Builder()
      .year("timestamp")
      .month("timestamp")
      .day("timestamp")
      .build();
  this.testDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(eventSchema)
      .partitionStrategy(byDate)
      .build();

  // something completely different
  Schema recordSchema = SchemaBuilder.record("Record").fields()
      .requiredBytes("some_field")
      .requiredString("another_field")
      .endRecord();
  PartitionStrategy byHash = new PartitionStrategy.Builder()
      .hash("some_field", 20000)
      .build();
  this.anotherDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.PARQUET)
      .schema(recordSchema)
      .partitionStrategy(byHash)
      .build();

  this.provider = newProvider(conf);
}
 
Example #27
Source File: TestMetadataProviders.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset whose schema has 1000 required string fields and checks
 * that the descriptor returned by the provider carries the same schema.
 */
@Test
public void testLargeSchema() {
  // Only run this test in distributed mode, since non-HDFS schema URLs result
  // in the schema being loaded into the Hive metastore, and large schemas
  // can exceed the size limit of that.
  Assume.assumeTrue(distributed);

  final String datasetName = "large_schema_test";
  Assert.assertFalse("Sanity check", provider.exists(NAMESPACE, datasetName));

  // Create a schema with many fields to ensure the underlying store can handle it.
  final int fieldCount = 1000;
  SchemaBuilder.FieldAssembler<Schema> fields =
      SchemaBuilder.record("Event").fields();
  for (int index = 0; index < fieldCount; index++) {
    fields.requiredString("field_" + index);
  }

  DatasetDescriptor largeDescriptor = new DatasetDescriptor.Builder()
      .format(Formats.AVRO)
      .schema(fields.endRecord())
      .build();

  DatasetDescriptor stored =
      provider.create(NAMESPACE, datasetName, largeDescriptor);

  Assert.assertEquals("Large schemas should match",
      largeDescriptor.getSchema(), stored.getSchema());
}
 
Example #28
Source File: TestExternalBackwardCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Expects creating the test dataset with a Parquet copy of the existing
 * descriptor to be rejected with a ValidationException.
 */
@Test
public void testCreateFailsIfNotCompatible() {
  // this will fail because the new descriptor uses a different format
  // the old descriptor is found and used to validate the change
  Runnable createWithParquet = new Runnable() {
    @Override
    public void run() {
      DatasetDescriptor parquetDescriptor =
          new DatasetDescriptor.Builder(descriptor)
              .format(Formats.PARQUET)
              .build();
      Datasets.create("dataset:hive:/tmp/datasets/test", parquetDescriptor);
    }
  };

  TestHelpers.assertThrows("Create should fail because of a format change",
      ValidationException.class, createWithParquet);
}
 
Example #29
Source File: TestExternalBackwardCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Expects creating the test dataset to succeed when the Parquet descriptor
 * also names an explicit location.
 */
@Test
public void testCreateIncompatibleSucceedsWithLocation() {
  // if there is a requested location then the default table isn't checked
  // because only the default location would have been used
  DatasetDescriptor relocatedParquet =
      new DatasetDescriptor.Builder(descriptor)
          .location(URI.create("file:/tmp/test-data/test"))
          .format(Formats.PARQUET)
          .build();

  Assert.assertNotNull("Create should succeed if location doesn't match",
      Datasets.create("dataset:hive:/tmp/datasets/test", relocatedParquet));
}
 
Example #30
Source File: TestExternalBackwardCompatibility.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Expects updating the test dataset with a Parquet copy of the existing
 * descriptor to be rejected with a ValidationException.
 */
@Test
public void testUpdateValidatesAgainstDefaultNamespace() {
  Runnable updateWithParquet = new Runnable() {
    @Override
    public void run() {
      DatasetDescriptor parquetDescriptor =
          new DatasetDescriptor.Builder(descriptor)
              .format(Formats.PARQUET)
              .build();
      Datasets.update("dataset:hive:/tmp/datasets/test", parquetDescriptor);
    }
  };

  TestHelpers.assertThrows("Update should fail because of a format change",
      ValidationException.class, updateWithParquet);
}