Java Code Examples for org.kitesdk.data.DatasetDescriptor#getSchema()

The following examples show how to use org.kitesdk.data.DatasetDescriptor#getSchema(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: DatasetKeyOutputFormat.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Loads or creates the task attempt dataset and, when possible, narrows it
 * using the query options found in the configured output URI.
 *
 * @param taskContext the task attempt context whose configuration is read
 * @return the dataset filtered by constraints built from the URI options when
 *         it is an {@code AbstractDataset}; otherwise the dataset itself
 */
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  // the output URI wraps the dataset URI in its scheme-specific part
  URI outputUri = URI.create(conf.get(KITE_OUTPUT_URI));
  URI datasetUri = URI.create(outputUri.getSchemeSpecificPart());
  Map<String, String> queryOptions =
      Registration.lookupDatasetUri(datasetUri).second();
  Dataset<E> target = loadOrCreateTaskAttemptDataset(taskContext);

  // only AbstractDataset instances are filtered here
  if (!(target instanceof AbstractDataset)) {
    return target;
  }

  DatasetDescriptor descriptor = target.getDescriptor();
  Schema schema = descriptor.getSchema();
  PartitionStrategy strategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  Constraints fromUri = Constraints.fromQueryMap(schema, strategy, queryOptions);
  return ((AbstractDataset<E>) target).filter(fromUri);
}
 
Example 2
Source File: Compatibility.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that {@code test} can replace the {@code existing}
 * {@link DatasetDescriptor} of a dataset.
 * <p>
 * The format and the partitioned/unpartitioned state are passed to
 * {@code checkNotChanged}, the partition strategies are passed to
 * {@code checkStrategyUpdate} when the existing descriptor is partitioned,
 * and finally the new schema must be able to read data written with the
 * existing schema.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param test a new {@code DatasetDescriptor} for the same dataset
 * @throws IncompatibleSchemaException if the schema of {@code test} cannot
 *         read data written with the schema of {@code existing}
 */
public static void checkCompatible(DatasetDescriptor existing,
                                   DatasetDescriptor test) {
  checkNotChanged("format", existing.getFormat(), test.getFormat());
  checkNotChanged("partitioning",
      existing.isPartitioned(), test.isPartitioned());

  if (existing.isPartitioned()) {
    checkStrategyUpdate(
        existing.getPartitionStrategy(),
        test.getPartitionStrategy(),
        test.getSchema());
  }

  // records written with the old schema must be readable with the new one
  Schema writtenWith = existing.getSchema();
  Schema readWith = test.getSchema();
  if (SchemaValidationUtil.canRead(writtenWith, readWith)) {
    return;
  }

  String message =
      "Schema cannot read data written using existing schema. Schema: "
          + readWith.toString(true)
          + "\nExisting schema: " + writtenWith.toString(true);
  throw new IncompatibleSchemaException(message);
}
 
Example 3
Source File: TestGetSchema.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code AbstractKiteProcessor.getSchema} returns, for a
 * {@code resource:} URI, a schema equal to the one a
 * {@code DatasetDescriptor} built from the same URI reports.
 */
@Test
public void testSchemaFromResourceURI() throws IOException {
    // in kite-data-core test-jar
    final String schemaUri = "resource:schema/user.avsc";

    Schema fromDescriptor = new DatasetDescriptor.Builder()
            .schemaUri(schemaUri)
            .build()
            .getSchema();
    Schema fromProcessor = AbstractKiteProcessor.getSchema(
            schemaUri, DefaultConfiguration.get());

    Assert.assertEquals("Schema from resource URI should match",
            fromDescriptor, fromProcessor);
}
 
Example 4
Source File: HBaseMetadataProvider.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the descriptor's schema with its column mapping and partition
 * strategy embedded, when the descriptor has them.
 * <p>
 * The SchemaManager stores schemas, so the extra metadata travels inside the
 * schema; the result is parsed by AvroKeyEntitySchemaParser.
 *
 * @param descriptor the descriptor to read the schema and metadata from
 * @return the schema, with the column mapping embedded if the descriptor is
 *         column mapped and the partition strategy embedded if it is
 *         partitioned
 */
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) {
  Schema embedded = descriptor.getSchema();

  if (descriptor.isColumnMapped()) {
    embedded = ColumnMappingParser.embedColumnMapping(
        embedded, descriptor.getColumnMapping());
  }

  if (descriptor.isPartitioned()) {
    embedded = PartitionStrategyParser.embedPartitionStrategy(
        embedded, descriptor.getPartitionStrategy());
  }

  return embedded;
}
 
Example 5
Source File: AvroKeyEntitySchemaParser.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code AvroEntitySchema} from a raw schema literal and a column
 * mapping.
 *
 * @param rawSchema the schema literal, also passed through unchanged
 * @param columnMapping the column mapping to combine with the schema
 * @return an entity schema made of the descriptor's schema, the raw literal
 *         and the descriptor's column mapping
 */
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema,
    ColumnMapping columnMapping) {
  // go through DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor validated = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .columnMapping(columnMapping)
      .build();

  return new AvroEntitySchema(
      validated.getSchema(),
      rawSchema,
      validated.getColumnMapping());
}
 
Example 6
Source File: AvroKeyEntitySchemaParser.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code AvroEntitySchema} from a raw schema literal alone.
 *
 * @param rawSchema the schema literal, also passed through unchanged
 * @return an entity schema made of the descriptor's schema, the raw literal
 *         and whatever column mapping the built descriptor reports
 */
@Override
public AvroEntitySchema parseEntitySchema(String rawSchema) {
  DatasetDescriptor fromLiteral = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();

  return new AvroEntitySchema(
      fromLiteral.getSchema(),
      rawSchema,
      fromLiteral.getColumnMapping());
}
 
Example 7
Source File: AvroKeyEntitySchemaParser.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code AvroKeySchema} from a raw schema literal and a partition
 * strategy.
 *
 * @param rawSchema the schema literal to parse
 * @param partitionStrategy the partition strategy to combine with the schema
 * @return a key schema made of the descriptor's schema and partition strategy
 */
@Override
public AvroKeySchema parseKeySchema(String rawSchema,
    PartitionStrategy partitionStrategy) {
  // go through DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor validated = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .partitionStrategy(partitionStrategy)
      .build();

  return new AvroKeySchema(
      validated.getSchema(),
      validated.getPartitionStrategy());
}
 
Example 8
Source File: AvroKeyEntitySchemaParser.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@code AvroKeySchema} from a raw schema literal alone.
 *
 * @param rawSchema the schema literal to parse
 * @return a key schema made of the descriptor's schema and whatever partition
 *         strategy the built descriptor reports
 */
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  DatasetDescriptor fromLiteral = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();

  return new AvroKeySchema(
      fromLiteral.getSchema(),
      fromLiteral.getPartitionStrategy());
}
 
Example 9
Source File: DaoDataset.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset that wraps the given {@code Dao}.
 *
 * @param namespace the namespace stored on this dataset
 * @param name the name stored on this dataset
 * @param dao the DAO stored on this dataset
 * @param descriptor the descriptor stored on this dataset; its schema is
 *        passed to the superclass constructor
 * @param uri the URI stored on this dataset
 * @param type the entity class; must be {@code Object.class} or assignable to
 *        {@code IndexedRecord}, otherwise the argument check below rejects it
 */
public DaoDataset(String namespace, String name, Dao<E> dao, DatasetDescriptor descriptor,
    URI uri, Class<E> type) {
  super(type, descriptor.getSchema());
  // only IndexedRecord implementations (or the untyped Object.class) are accepted
  Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
          type == Object.class,
      "HBase only supports the generic and specific data models. The entity"
          + " type must implement IndexedRecord");
  this.namespace = namespace;
  this.name = name;
  this.dao = dao;
  this.descriptor = descriptor;
  this.uri = uri;
  // created last: the view receives this dataset after its fields are assigned
  this.unbounded = new DaoView<E>(this, type);
}
 
Example 10
Source File: AbstractRefinableView.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Initializes the view's constraints, comparator, per-thread storage keys,
 * entity accessor and read/write compatibility flags from the dataset's
 * descriptor.
 *
 * @param dataset the dataset this view belongs to
 * @param type the entity class used to obtain the accessor
 * @throws IncompatibleSchemaException presumably raised by the final check
 *         when the type can neither read from nor write to the dataset
 */
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (descriptor.isPartitioned()) {
    this.constraints = new Constraints(
        descriptor.getSchema(), descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    // each thread gets its own StorageKey, created on first use
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  } else {
    // unpartitioned: constraints use the schema only; no comparator or keys
    this.constraints = new Constraints(descriptor.getSchema());
    this.comparator = null;
    this.keys = null;
  }
  this.accessor = DataModelUtil.accessor(type, descriptor.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  Schema datasetSchema = descriptor.getSchema();
  // canRead: dataset schema checked against the accessor's read schema
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  // canWrite: accessor's write schema checked against the dataset schema
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);

  // the type must be usable in at least one direction
  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), descriptor.getSchema());
}
 
Example 11
Source File: FileSystemDataset.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset stored under {@code directory} on {@code fileSystem}.
 *
 * @param fileSystem the file system stored on this dataset and used for the
 *        signal manager
 * @param directory the dataset directory
 * @param namespace the namespace stored on this dataset
 * @param name the name stored on this dataset
 * @param descriptor the descriptor; supplies the schema, the format and,
 *        when partitioned, the partition strategy
 * @param uri the URI stored on this dataset
 * @param partitionListener an optional listener, may be {@code null}
 * @param type the entity class; for Parquet it must be {@code Object.class}
 *        or assignable to {@code IndexedRecord}
 */
FileSystemDataset(FileSystem fileSystem, Path directory,
                  String namespace, String name,
                  DatasetDescriptor descriptor, URI uri,
                  @Nullable PartitionListener partitionListener,
                  Class<E> type) {
  super(type, descriptor.getSchema());
  // the entity type restriction is only applied to the Parquet format
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  // null when the descriptor is not partitioned
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  // signals are kept in a dedicated directory resolved from the dataset directory
  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  // created after the fields above are assigned: the view receives this dataset
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
 
Example 12
Source File: CSVAppender.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a CSV appender for the given file system, path and dataset
 * descriptor.
 *
 * @param fs the file system stored on this appender
 * @param path the path stored on this appender
 * @param descriptor the descriptor that supplies the schema and the CSV
 *        properties
 * @throws IllegalStateException if the descriptor's schema is not a record
 */
public CSVAppender(FileSystem fs, Path path, DatasetDescriptor descriptor) {
  this.fs = fs;
  this.path = path;
  this.schema = descriptor.getSchema();
  // Preconditions message templates only substitute %s; an SLF4J-style "{}"
  // would be left verbatim and the schema appended in brackets instead
  Preconditions.checkState(schema.getType() == Schema.Type.RECORD,
      "Unsupported schema (not a record): %s", schema);
  this.props = CSVProperties.fromDescriptor(descriptor);
}
 
Example 13
Source File: FileSystemView.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an iterator over this view's partitions, using the dataset's
 * partition strategy and schema together with this view's key predicate.
 *
 * @return a new {@code FileSystemPartitionIterator}
 * @throws DatasetException if creating the iterator fails with an
 *         {@code IOException}
 */
private FileSystemPartitionIterator partitionIterator() {
  DatasetDescriptor descriptor = dataset.getDescriptor();
  FileSystemPartitionIterator partitions;
  try {
    partitions = new FileSystemPartitionIterator(
        fs,
        root,
        descriptor.getPartitionStrategy(),
        descriptor.getSchema(),
        getKeyPredicate());
  } catch (IOException ex) {
    // surface listing failures as an unchecked exception, keeping the cause
    throw new DatasetException("Cannot list partitions in view:" + this, ex);
  }
  return partitions;
}
 
Example 14
Source File: DatasetSink.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the target view as the given user and opens a writer on it,
 * refreshing the cached target schema, reuse flag and dataset name.
 *
 * @param login the user to run the dataset load as
 * @param uri the URI of the dataset or view to load
 * @return a new writer for the loaded view
 */
private DatasetWriter<GenericRecord> newWriter(
    final UserGroupInformation login, final URI uri) {
  PrivilegedExceptionAction<Dataset<GenericRecord>> loadTarget =
      new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
        @Override
        public Dataset<GenericRecord> run() {
          return Datasets.load(uri);
        }
      };
  View<GenericRecord> view = KerberosUtil.runPrivileged(login, loadTarget);

  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  String formatName = descriptor.getFormat().getName();
  Preconditions.checkArgument(allowedFormats().contains(formatName),
      "Unsupported format: " + formatName);

  Schema currentSchema = descriptor.getSchema();
  boolean schemaChanged =
      targetSchema == null || !currentSchema.equals(targetSchema);
  if (schemaChanged) {
    this.targetSchema = descriptor.getSchema();
    // target dataset schema has changed, invalidate all readers based on it
    readers.invalidateAll();
  }

  // datum reuse is turned off only for the parquet format
  this.reuseDatum = !"parquet".equals(formatName);
  this.datasetName = view.getDataset().getName();

  return view.newWriter();
}
 
Example 15
Source File: CreateHiveUserDatasetGeneric.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the Hive-backed {@code users} dataset from {@code user.avsc} and
 * writes 100 generated user records to it.
 *
 * @param args command-line arguments (not used)
 * @return 0 once all records are written and the writer is closed
 */
@Override
public int run(String[] args) throws Exception {
  // Describe the dataset with the Avro schema loaded from the classpath
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create("dataset:hive?dataset=users",
      descriptor, Record.class);

  // Open a writer; it is closed in the finally block whatever happens below
  DatasetWriter<Record> writer = users.newWriter();
  try {
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record user = builder
          .set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)])
          .build();
      writer.write(user);
    }
  } finally {
    writer.close();
  }

  return 0;
}
 
Example 16
Source File: CreateUserDatasetGenericParquet.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Parquet-format users dataset at
 * {@code dataset:hdfs:/tmp/data/users} from {@code user.avsc} and writes 100
 * generated user records to it.
 *
 * @param args command-line arguments (not used)
 * @return 0 once all records are written and the writer is closed
 */
@Override
public int run(String[] args) throws Exception {
  // Describe the dataset: Avro schema from the classpath, stored as Parquet
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .format(Formats.PARQUET)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Open a writer; it is closed in the finally block whatever happens below
  DatasetWriter<Record> writer = users.newWriter();
  try {
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record user = builder
          .set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)])
          .build();
      writer.write(user);
    }
  } finally {
    writer.close();
  }

  return 0;
}
 
Example 17
Source File: CreateUserDatasetGeneric.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a users dataset at {@code dataset:hdfs:/tmp/data/users} from
 * {@code user.avsc} and writes 100 generated user records to it.
 *
 * @param args command-line arguments (not used)
 * @return 0 once all records are written and the writer is closed
 */
@Override
public int run(String[] args) throws Exception {
  // Describe the dataset with the Avro schema loaded from the classpath
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Open a writer; it is closed in the finally block whatever happens below
  DatasetWriter<Record> writer = users.newWriter();
  try {
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record user = builder
          .set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)])
          .build();
      writer.write(user);
    }
  } finally {
    writer.close();
  }

  return 0;
}
 
Example 18
Source File: CreateUserDatasetGenericPartitioned.java    From kite-examples with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a partitioned users dataset at
 * {@code dataset:hdfs:/tmp/data/users} from {@code user.avsc} and writes 100
 * generated user records to it.
 *
 * @param args command-line arguments (not used)
 * @return 0 once all records are written and the writer is closed
 */
@Override
public int run(String[] args) throws Exception {
  // Build an identity partition strategy from "favoriteColor" / "favorite_color"
  PartitionStrategy partitionStrategy = new PartitionStrategy.Builder()
      .identity("favoriteColor", "favorite_color")
      .build();

  // Describe the dataset: Avro schema from the classpath plus the strategy
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaUri("resource:user.avsc")
      .partitionStrategy(partitionStrategy)
      .build();
  Dataset<Record> users = Datasets.create(
      "dataset:hdfs:/tmp/data/users", descriptor, Record.class);

  // Open a writer; it is closed in the finally block whatever happens below
  DatasetWriter<Record> writer = users.newWriter();
  try {
    Random rand = new Random();
    GenericRecordBuilder builder = new GenericRecordBuilder(descriptor.getSchema());
    for (int i = 0; i < 100; i++) {
      Record user = builder
          .set("username", "user-" + i)
          .set("creationDate", System.currentTimeMillis())
          .set("favoriteColor", colors[rand.nextInt(colors.length)])
          .build();
      writer.write(user);
    }
  } finally {
    writer.close();
  }

  return 0;
}
 
Example 19
Source File: TestGetSchema.java    From localization_nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code AbstractKiteProcessor.getSchema} returns, for a
 * {@code resource:} URI, a schema equal to the one a
 * {@code DatasetDescriptor} built from the same URI reports.
 */
@Test
public void testSchemaFromResourceURI() throws IOException {
    // in kite-data-core test-jar
    final String schemaUri = "resource:schema/user.avsc";

    Schema fromDescriptor = new DatasetDescriptor.Builder()
            .schemaUri(schemaUri)
            .build()
            .getSchema();
    Schema fromProcessor = AbstractKiteProcessor.getSchema(
            schemaUri, DefaultConfiguration.get());

    Assert.assertEquals("Schema from resource URI should match",
            fromDescriptor, fromProcessor);
}
 
Example 20
Source File: HiveUtils.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Updates the table's schema parameters, partition expression, custom
 * properties and columns from the given descriptor.
 * <p>
 * Which schema parameter is written depends on what the table already has:
 * a table with a schema literal either switches to the schema URL or gets a
 * refreshed literal; a table with a schema URL gets a refreshed URL; a table
 * with neither gets the URL when {@code useSchemaURL} accepts it and the
 * literal otherwise.
 *
 * @param table the table whose parameters and columns are modified in place
 * @param descriptor the descriptor supplying the schema, schema URL and
 *        partition strategy
 * @throws DatasetOperationException if the table uses a schema URL but the
 *         descriptor has none
 * @throws DatasetException if the table has no schema parameter and the
 *         descriptor provides neither a usable URL nor a schema
 */
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  URL schemaURL = descriptor.getSchemaUrl();

  if (table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    // the table currently carries a schema literal
    if (useSchemaURL(schemaURL)) {
      // switch from the literal to the URL
      table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
      table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME,
          schemaURL.toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }

  } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    // the table currently carries a schema URL, so a new URL is required
    if (schemaURL == null) {
      throw new DatasetOperationException(
          "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " since descriptor schema URL is not set.");
    }
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME,
        schemaURL.toExternalForm());

  } else {
    // neither the literal nor the URL is set, so add the URL if specified
    // and the schema literal if not.
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
              AVRO_SCHEMA_URL_PROPERTY_NAME,
              schemaURL.toExternalForm());

    } else if (descriptor.getSchema() != null) {
      table.getParameters().put(
              AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
              descriptor.getSchema().toString());
    } else {
      throw new DatasetException("Table schema cannot be updated since it is" +
              " not set on the descriptor.");
    }
  }

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    // no need to set the partition columns; no changes to the Hive side
  }

  // keep the custom properties up-to-date
  addPropertiesForDescriptor(table, descriptor);

  // keep the table DDL up-to-date with the Schema
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));
}