Java Code Examples for org.kitesdk.data.Dataset#getDescriptor()

The following examples show how to use org.kitesdk.data.Dataset#getDescriptor(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You can also check out the related API usage in the sidebar.
Example 1
Source File: DatasetKeyOutputFormat.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the view associated with a task attempt.
 *
 * <p>The task-attempt dataset comes from
 * {@code loadOrCreateTaskAttemptDataset}. When it is an
 * {@code AbstractDataset}, it is filtered by constraints built from the
 * options looked up for the URI stored under {@code KITE_OUTPUT_URI}; any
 * other dataset implementation is returned unchanged.
 *
 * @param taskContext the task attempt context to read the configuration from
 * @return the filtered view, or the dataset itself when it is not an
 *         {@code AbstractDataset}
 */
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);

  // The configured output URI carries the URI to look up in its
  // scheme-specific part.
  URI outputUri = URI.create(conf.get(KITE_OUTPUT_URI));
  URI innerUri = URI.create(outputUri.getSchemeSpecificPart());
  Map<String, String> uriOptions = Registration.lookupDatasetUri(innerUri).second();

  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);
  if (!(dataset instanceof AbstractDataset)) {
    return dataset;
  }

  DatasetDescriptor descriptor = dataset.getDescriptor();
  Schema schema = descriptor.getSchema();
  // Unpartitioned datasets pass a null strategy to the constraints factory.
  PartitionStrategy strategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;

  Constraints constraints = Constraints.fromQueryMap(schema, strategy, uriOptions);
  return ((AbstractDataset<E>) dataset).filter(constraints);
}
 
Example 2
Source File: TestFileSystemUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Creates an unpartitioned dataset several directories below the temporary
 * root, writes to it twice, and checks that
 * {@code FileSystemUtil.findPotentialDatasets} reports exactly one descriptor
 * equal to the dataset's own descriptor.
 */
@Test
public void testUnpartitionedDataset() throws Exception {
  File datasetDir = temp.newFolder("a/b/c/d/e/dataset_name");
  Path searchRoot = new Path(temp.getRoot().toURI());
  FileSystem localFs = LocalFileSystem.getInstance();
  URI uri = URI.create("dataset:file:" + datasetDir.getAbsolutePath());

  DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();
  builder.schema(USER_SCHEMA);
  DatasetDescriptor requested = builder.build();

  Dataset<GenericRecord> created = Datasets.create(uri, requested);

  // Two writes, so that the descriptor uses the directory rather than a file.
  for (int write = 0; write < 2; write++) {
    writeUserToView(created);
  }

  DatasetDescriptor expected = created.getDescriptor();
  DatasetDescriptor actual = Iterables.getOnlyElement(
      FileSystemUtil.findPotentialDatasets(localFs, searchRoot));

  Assert.assertEquals("Should succeed and find an equivalent descriptor",
      expected, actual);
}
 
Example 3
Source File: InfoCommand.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Logs a summary of a dataset at info level: its name, URI, schema,
 * partition strategy (or a note that it is not partitioned), column mapping
 * when present, and any descriptor properties as {@code name=value} lines.
 *
 * @param console the logger that receives the output
 * @param dataset the dataset to describe
 */
private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor descriptor = dataset.getDescriptor();

  // The schema is run through removeEmbeddedStrategy and then
  // removeEmbeddedMapping before being rendered.
  String schemaText = ColumnMappingParser
      .removeEmbeddedMapping(
          PartitionStrategyParser.removeEmbeddedStrategy(descriptor.getSchema()))
      .toString(true);
  Collection<String> propertyNames = descriptor.listProperties();

  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schemaText));

  if (!descriptor.isPartitioned()) {
    console.info("\tNot partitioned");
  } else {
    String strategyText = descriptor.getPartitionStrategy().toString(true);
    console.info("\tPartition strategy: {}", indent(strategyText));
  }

  if (descriptor.isColumnMapped()) {
    String mappingText = descriptor.getColumnMapping().toString(true);
    console.info("\tColumn mapping: {}", indent(mappingText));
  }

  if (propertyNames.isEmpty()) {
    return;
  }

  // One "name=value" entry per property, each on its own indented line.
  StringBuilder lines = new StringBuilder();
  for (String name : propertyNames) {
    lines.append("\n\t\t");
    lines.append(name);
    lines.append("=");
    lines.append(descriptor.getProperty(name));
  }
  console.info("\tProperties:{}", lines.toString());
}
 
Example 4
Source File: AbstractRefinableView.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a view over a dataset with constraints built only from the
 * dataset's schema and, when partitioned, its partition strategy.
 *
 * <p>For a partitioned dataset a marker comparator and a per-thread
 * {@code StorageKey} are also set up; otherwise both are left {@code null}.
 * The constructor then resolves the entity accessor for {@code type} and
 * records whether the type's schemas can read from and write to the dataset.
 *
 * @param dataset the dataset this view refines
 * @param type the entity class used to read and write records
 * @throws IncompatibleSchemaException presumably raised by the final check
 *         when the type can neither read from nor write to the dataset
 */
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  final Schema datasetSchema = descriptor.getSchema();

  if (!descriptor.isPartitioned()) {
    this.constraints = new Constraints(datasetSchema);
    this.comparator = null;
    this.keys = null;
  } else {
    this.constraints = new Constraints(
        datasetSchema, descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    // Each thread gets its own key, created on first use.
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  }

  this.accessor = DataModelUtil.accessor(type, datasetSchema);
  this.entityTest = constraints.toEntityPredicate(accessor);

  // Readable when the dataset schema can be read with the type's read schema;
  // writable when the type's write schema can be read with the dataset schema.
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);

  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), datasetSchema);
}
 
Example 5
Source File: UpdateDatasetCommand.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Updates the descriptor of the single dataset named on the command line.
 *
 * <p>A new descriptor is built from the current one, optionally overriding
 * the schema URI, the partition strategy URI, and individual properties.
 * It is applied through {@code Datasets.update} when the name is a dataset
 * or view URI, and through the dataset repository otherwise.
 *
 * @return {@code 0} once the update has been applied
 * @throws IllegalArgumentException if the number of dataset names is not
 *         exactly one
 * @throws IOException declared by the command contract
 */
@Override
public int run() throws IOException {
  boolean exactlyOneDataset = datasets != null && datasets.size() == 1;
  if (!exactlyOneDataset) {
    throw new IllegalArgumentException(
        "Exactly one dataset name must be specified.");
  }

  // Note: this removes the name from the argument list.
  String dataset = datasets.remove(0);
  Dataset<GenericRecord> existing = load(dataset).getDataset();

  // Start from the existing descriptor so that unspecified settings carry over.
  DatasetDescriptor.Builder updated =
      new DatasetDescriptor.Builder(existing.getDescriptor());

  if (avroSchemaFile != null) {
    updated.schemaUri(qualifiedURI(avroSchemaFile));
  }

  if (partitionStrategyFile != null) {
    updated.partitionStrategyUri(qualifiedURI(partitionStrategyFile));
  }

  if (properties != null) {
    for (String propValue : properties) {
      Iterator<String> parts = PROP_VALUE_SEP.split(propValue).iterator();
      // Either part is null when the split yields fewer pieces.
      String propName = Iterators.getNext(parts, null);
      String propSetting = Iterators.getNext(parts, null);
      updated.property(propName, propSetting);
    }
  }

  DatasetDescriptor descriptor = updated.build();

  if (!isDatasetOrViewUri(dataset)) {
    getDatasetRepository().update(namespace, dataset, descriptor);
  } else {
    Datasets.<GenericData.Record, Dataset<GenericData.Record>> update(
        dataset, descriptor, GenericData.Record.class);
  }

  console.debug("Updated {}", dataset);

  return 0;
}