Java Code Examples for org.kitesdk.data.DatasetDescriptor#getFormat()

The following examples show how to use org.kitesdk.data.DatasetDescriptor#getFormat(). Each snippet is taken from an open source project; the source file and license are noted above each example.
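
For orientation, getFormat() returns the org.kitesdk.data.Format the descriptor was built with (Formats.AVRO, Formats.PARQUET, Formats.CSV, and so on). Here is a minimal sketch using the public DatasetDescriptor.Builder API; the inline Avro schema is illustrative only:

// build a descriptor with an explicit format, then read it back
DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"id\", \"type\": \"long\"}]}")
    .format(Formats.PARQUET)
    .build();

Format format = descriptor.getFormat();  // Formats.PARQUET
String name = format.getName();          // "parquet"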
Example 1
Source File: PartitionedDatasetWriter.java    From kite with Apache License 2.0
static <E> PartitionedDatasetWriter<E, ?> newWriter(FileSystemView<E> view) {
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalPartitionedDatasetWriter<E>(view);
    } else {
      return new NonDurablePartitionedDatasetWriter<E>(view);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalPartitionedDatasetWriter<E>(view);
  } else {
    return new NonDurablePartitionedDatasetWriter<E>(view);
  }
}
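
This example and the next both branch on DescriptorUtil.isDisabled, an internal Kite helper. Its effect is roughly the sketch below, an assumption about its behavior rather than its actual source: a property counts as disabled only when it is explicitly present with a false value, which is why Parquet stays on the non-durable path by default.

// approximate behavior of DescriptorUtil.isDisabled (sketch, not Kite source)
static boolean isDisabled(String property, DatasetDescriptor descriptor) {
  return descriptor.hasProperty(property)
      && !Boolean.parseBoolean(descriptor.getProperty(property));
}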
 
Example 2
Source File: FileSystemWriter.java    From kite with Apache License 2.0
static <E> FileSystemWriter<E> newWriter(FileSystem fs, Path path,
                                         long rollIntervalMillis,
                                         long targetFileSize,
                                         DatasetDescriptor descriptor, Schema writerSchema) {
  Format format = descriptor.getFormat();
  if (Formats.PARQUET.equals(format)) {
    // by default, Parquet is not durable
    if (DescriptorUtil.isDisabled(
        FileSystemProperties.NON_DURABLE_PARQUET_PROP, descriptor)) {
      return new IncrementalWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
    } else {
      return new FileSystemWriter<E>(
          fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
    }
  } else if (Formats.AVRO.equals(format) || Formats.CSV.equals(format)) {
    return new IncrementalWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  } else {
    return new FileSystemWriter<E>(
        fs, path, rollIntervalMillis, targetFileSize, descriptor, writerSchema);
  }
}
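
The dispatch mirrors Example 1, but for non-partitioned writers: formats that can be written incrementally (Avro, CSV, or Parquet with its non-durable default explicitly disabled) get an IncrementalWriter, while everything else falls back to the plain FileSystemWriter, which only guarantees data once a file is rolled and closed.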
 
Example 3
Source File: FileSystemWriter.java    From kite with Apache License 2.0
static boolean isSupportedFormat(DatasetDescriptor descriptor) {
  Format format = descriptor.getFormat();
  return (SUPPORTED_FORMATS.contains(format) || (Formats.CSV.equals(format) &&
      DescriptorUtil.isEnabled(FileSystemProperties.ALLOW_CSV_PROP, descriptor)
  ));
}
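
Here getFormat() gates format support: CSV is opt-in per dataset. A sketch of how a caller might opt in follows; note that ALLOW_CSV_PROP lives in Kite's internal FileSystemProperties class, so the symbolic reference below is an assumption made for illustration:

// hypothetical: enable the opt-in CSV format on a descriptor
DatasetDescriptor csvDescriptor = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
        + "{\"name\": \"id\", \"type\": \"long\"}]}")
    .format(Formats.CSV)
    .property(FileSystemProperties.ALLOW_CSV_PROP, "true")  // internal constant; assumption
    .build();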
 
Example 4
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0
@Test
public void testIncompatibleFormatFilesInSameFolder() throws Exception {
  File folder = temp.newFolder("a/b/c/d/e");
  Path root = new Path(temp.getRoot().toURI());
  FileSystem fs = LocalFileSystem.getInstance();

  // create Avro and Parquet files in the same folder, with the same schema
  Path parent = new Path(folder.toURI());
  createAvroUserFile(fs, parent);
  createParquetUserFile(fs, parent);

  Collection<DatasetDescriptor> descriptors = FileSystemUtil
      .findPotentialDatasets(fs, root);

  Assert.assertEquals("Should have 2 descriptors", 2, descriptors.size());
  DatasetDescriptor avro;
  DatasetDescriptor parquet;
  DatasetDescriptor first = Iterables.getFirst(descriptors, null);
  if (first.getFormat() == Formats.AVRO) {
    avro = first;
    parquet = Iterables.getLast(descriptors, null);
  } else {
    parquet = first;
    avro = Iterables.getLast(descriptors, null);
  }

  Assert.assertFalse("Should not flag at mixed depth",
      avro.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(),
      parent(avro.getLocation()));
  Assert.assertTrue("Should be a .avro file",
      avro.getLocation().toString().endsWith(".avro"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, avro.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.AVRO, avro.getFormat());
  Assert.assertFalse("Should not be partitioned",
      avro.isPartitioned());

  Assert.assertFalse("Should not flag at mixed depth",
      parquet.hasProperty("kite.filesystem.mixed-depth"));
  Assert.assertEquals("Should be directly under parent",
      parent.toUri(),
      parent(parquet.getLocation()));
  Assert.assertTrue("Should be a .parquet file",
      parquet.getLocation().toString().endsWith(".parquet"));
  Assert.assertEquals("Should use user schema",
      USER_SCHEMA, parquet.getSchema());
  Assert.assertEquals("Should have Avro format",
      Formats.PARQUET, parquet.getFormat());
  Assert.assertFalse("Should not be partitioned",
      parquet.isPartitioned());
}
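
One detail worth noting: the test distinguishes the two descriptors with a reference comparison (first.getFormat() == Formats.AVRO). That works because Formats exposes shared constants, but the Formats.AVRO.equals(format) style used in the other examples is the safer habit when a Format instance may have come from elsewhere.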
 
Example 5
Source File: HiveUtils.java    From kite with Apache License 2.0
static Table tableForDescriptor(String namespace, String name,
                                DatasetDescriptor descriptor,
                                boolean external,
                                boolean includeSchema) {
  final Table table = createEmptyTable(namespace, name);

  if (external) {
    // you'd think this would do it...
    table.setTableType(TableType.EXTERNAL_TABLE.toString());
    // but it doesn't work without some additional magic:
    table.getParameters().put("EXTERNAL", "TRUE");
    table.getSd().setLocation(descriptor.getLocation().toString());
  } else {
    table.setTableType(TableType.MANAGED_TABLE.toString());
  }

  addPropertiesForDescriptor(table, descriptor);

  // translate from Format to SerDe
  final Format format = descriptor.getFormat();
  if (FORMAT_TO_SERDE.containsKey(format)) {
    table.getSd().getSerdeInfo().setSerializationLib(FORMAT_TO_SERDE.get(format));
    table.getSd().setInputFormat(FORMAT_TO_INPUT_FORMAT.get(format));
    table.getSd().setOutputFormat(FORMAT_TO_OUTPUT_FORMAT.get(format));
  } else {
    throw new UnknownFormatException(
        "No known serde for format: " + format.getName());
  }

  if (includeSchema) {
    URL schemaURL = descriptor.getSchemaUrl();
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
          AVRO_SCHEMA_URL_PROPERTY_NAME,
          descriptor.getSchemaUrl().toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }
  }

  table.getParameters().put(COMPRESSION_TYPE_PROPERTY_NAME,
      descriptor.getCompressionType().getName());

  // convert the schema to Hive columns
  table.getSd().setCols(HiveSchemaConverter.convertSchema(descriptor.getSchema()));

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    table.setPartitionKeys(partitionColumns(ps, descriptor.getSchema()));
  }

  return table;
}
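
tableForDescriptor reads getFormat() to pick the Hive SerDe and input/output format classes from static lookup maps that are not shown in the snippet. A sketch of the shape such a map takes; the Hive class names vary across Hive and Parquet versions, so treat the exact strings as assumptions:

// sketch of the format-to-SerDe lookup consulted above (class names are assumptions)
static final Map<Format, String> FORMAT_TO_SERDE = ImmutableMap.of(
    Formats.AVRO, "org.apache.hadoop.hive.serde2.avro.AvroSerDe",
    Formats.PARQUET, "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe");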