Java Code Examples for org.kitesdk.data.DatasetDescriptor#isPartitioned()

The following examples show how to use org.kitesdk.data.DatasetDescriptor#isPartitioned(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. Related API usage is listed in the sidebar.
Example 1
Source File: DatasetKeyOutputFormat.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Loads (or creates) the dataset for this task attempt and narrows it to the
 * view described by the options of the configured output URI.
 * <p>
 * The output URI is read from the {@code KITE_OUTPUT_URI} configuration
 * entry. Its scheme-specific part is resolved through
 * {@code Registration.lookupDatasetUri}, and the second element of the result
 * is used as the option map. Only {@code AbstractDataset} instances are
 * filtered; any other dataset is returned unchanged.
 *
 * @param taskContext the context of the current task attempt
 * @param <E> the entity type of the dataset
 * @return the dataset filtered by the URI options, or the dataset itself
 *         when it is not an {@code AbstractDataset}
 */
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  URI outputUri = URI.create(conf.get(KITE_OUTPUT_URI));
  URI datasetUri = URI.create(outputUri.getSchemeSpecificPart());
  Map<String, String> queryOptions =
      Registration.lookupDatasetUri(datasetUri).second();
  Dataset<E> taskDataset = loadOrCreateTaskAttemptDataset(taskContext);

  // datasets that are not AbstractDataset instances are handed back as-is
  if (!(taskDataset instanceof AbstractDataset)) {
    return taskDataset;
  }

  DatasetDescriptor descriptor = taskDataset.getDescriptor();
  Schema schema = descriptor.getSchema();
  // an unpartitioned dataset has no strategy to pass along
  PartitionStrategy strategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  Constraints fromUri = Constraints.fromQueryMap(schema, strategy, queryOptions);
  return ((AbstractDataset<E>) taskDataset).filter(fromUri);
}
 
Example 2
Source File: Compatibility.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that {@code test} can replace the {@code existing}
 * {@link DatasetDescriptor} of a dataset.
 * <p>
 * The checks run in this order: the format must not change, the
 * partitioned/unpartitioned state must not change, the partition strategy
 * update is validated when the existing descriptor is partitioned, and
 * finally the new schema must be able to read data written with the
 * existing schema.
 *
 * @param existing the current {@code DatasetDescriptor} for a dataset
 * @param test a new {@code DatasetDescriptor} for the same dataset
 * @throws IncompatibleSchemaException if the schema of {@code test} cannot
 *         read records written with the schema of {@code existing}
 */
public static void checkCompatible(DatasetDescriptor existing,
                                   DatasetDescriptor test) {
  checkNotChanged("format", existing.getFormat(), test.getFormat());

  checkNotChanged("partitioning",
      existing.isPartitioned(), test.isPartitioned());

  if (existing.isPartitioned()) {
    checkStrategyUpdate(
        existing.getPartitionStrategy(),
        test.getPartitionStrategy(),
        test.getSchema());
  }

  // records written with the old schema must be readable with the new one
  Schema writtenWith = existing.getSchema();
  Schema readWith = test.getSchema();
  if (SchemaValidationUtil.canRead(writtenWith, readWith)) {
    return;
  }

  String message = "Schema cannot read data "
      + "written using existing schema. Schema: " + readWith.toString(true)
      + "\nExisting schema: " + writtenWith.toString(true);
  throw new IncompatibleSchemaException(message);
}
 
Example 3
Source File: InfoCommand.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Logs a summary of {@code dataset} to {@code console}: its name, URI,
 * schema, partition strategy (or a note that it is not partitioned), column
 * mapping when present, and descriptor properties when any are set.
 *
 * @param console the logger that receives the summary
 * @param dataset the dataset to describe
 */
private static void printInfo(Logger console, Dataset<?> dataset) {
  DatasetDescriptor descriptor = dataset.getDescriptor();
  // print the schema without the embedded partition strategy and column mapping
  String schemaText = ColumnMappingParser.removeEmbeddedMapping(
      PartitionStrategyParser.removeEmbeddedStrategy(descriptor.getSchema()))
      .toString(true);
  Collection<String> propertyNames = descriptor.listProperties();

  console.info("\nDataset \"{}\":", dataset.getName());
  console.info("\tURI: \"{}\"", dataset.getUri());
  console.info("\tSchema: {}", indent(schemaText));

  if (!descriptor.isPartitioned()) {
    console.info("\tNot partitioned");
  } else {
    String strategyText = descriptor.getPartitionStrategy().toString(true);
    console.info("\tPartition strategy: {}", indent(strategyText));
  }

  if (descriptor.isColumnMapped()) {
    String mappingText = descriptor.getColumnMapping().toString(true);
    console.info("\tColumn mapping: {}", indent(mappingText));
  }

  if (propertyNames.isEmpty()) {
    return;
  }

  // one "name=value" entry per line, indented under the heading
  StringBuilder listing = new StringBuilder();
  for (String propertyName : propertyNames) {
    listing.append("\n\t\t");
    listing.append(propertyName);
    listing.append("=");
    listing.append(descriptor.getProperty(propertyName));
  }
  console.info("\tProperties:{}", listing.toString());
}
 
Example 4
Source File: FileSystemDataset.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a dataset backed by {@code directory} on {@code fileSystem}.
 * <p>
 * When the descriptor's format is Parquet, {@code type} must either be
 * assignable to {@code IndexedRecord} or be {@code Object.class}; any other
 * type is rejected by the precondition check below.
 *
 * @param fileSystem the file system that holds the dataset
 * @param directory the dataset's directory
 * @param namespace the dataset's namespace
 * @param name the dataset's name
 * @param descriptor the descriptor supplying the schema, format and
 *        (optional) partition strategy
 * @param uri the dataset's URI
 * @param partitionListener an optional listener, may be {@code null}; it is
 *        stored and also passed to the unbounded partition view
 * @param type the entity class handled by this dataset
 */
FileSystemDataset(FileSystem fileSystem, Path directory,
                  String namespace, String name,
                  DatasetDescriptor descriptor, URI uri,
                  @Nullable PartitionListener partitionListener,
                  Class<E> type) {
  super(type, descriptor.getSchema());
  // Parquet is restricted to IndexedRecord-based types (or Object)
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  // null when the descriptor is not partitioned
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  // signals are kept in a SIGNALS_DIRECTORY_NAME child of the dataset directory
  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  // the view receives `this`, so it is created only after the fields above
  // have been assigned
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
 
Example 5
Source File: AbstractRefinableView.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a view over {@code dataset} for entities of class {@code type}.
 * <p>
 * For a partitioned dataset the constraints, marker comparator and
 * per-thread storage keys are built from the partition strategy. For an
 * unpartitioned dataset the constraints use the schema alone, and the
 * comparator and keys are {@code null}. The constructor also records
 * whether the type's schemas can read from and write to the dataset, and
 * fails when neither is possible.
 *
 * @param dataset the dataset this view belongs to
 * @param type the entity class used to read and write
 */
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor desc = dataset.getDescriptor();
  if (!desc.isPartitioned()) {
    this.constraints = new Constraints(desc.getSchema());
    this.comparator = null;
    this.keys = null;
  } else {
    this.constraints = new Constraints(
        desc.getSchema(), desc.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(desc.getPartitionStrategy());
    // each thread lazily gets its own StorageKey for the strategy
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(desc.getPartitionStrategy());
      }
    };
  }
  this.accessor = DataModelUtil.accessor(type, desc.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  // reading: dataset schema -> accessor read schema
  // writing: accessor write schema -> dataset schema
  Schema storedSchema = desc.getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      storedSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), storedSchema);

  // the type must be usable in at least one direction
  boolean usable = canRead || canWrite;
  String template =
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s";
  IncompatibleSchemaException.check(usable, template,
      getSchema(), desc.getSchema());
}
 
Example 6
Source File: HBaseMetadataProvider.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the descriptor's schema with its column mapping and partition
 * strategy embedded, when the descriptor has them.
 * <p>
 * The SchemaManager stores schemas, so the extra metadata travels inside the
 * schema; the result is parsed by AvroKeyEntitySchemaParser.
 *
 * @param descriptor the descriptor whose schema should carry its metadata
 * @return the schema, with the column mapping embedded first and the
 *         partition strategy second, each only when present
 */
private static Schema getEmbeddedSchema(DatasetDescriptor descriptor) {
  Schema embedded = descriptor.getSchema();

  if (descriptor.isColumnMapped()) {
    embedded = ColumnMappingParser.embedColumnMapping(
        embedded, descriptor.getColumnMapping());
  }

  if (descriptor.isPartitioned()) {
    embedded = PartitionStrategyParser.embedPartitionStrategy(
        embedded, descriptor.getPartitionStrategy());
  }

  return embedded;
}
 
Example 7
Source File: CrunchDatasets.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Partitions {@code collection} to be stored efficiently in {@code View}.
 * <p>
 * For a partitioned dataset, the parallel collection is restructured so
 * that the entities destined for a given partition are evenly distributed
 * across {@code numPartitionWriters} writers. The grouping uses
 * {@code numWriters} when it is positive and the default grouping otherwise.
 * <p>
 * For a dataset that is not partitioned, the entities are structured to
 * produce a number of files equal to {@code numWriters}.
 *
 * @param collection a collection of entities
 * @param view a {@link View} of a dataset to partition the collection for
 * @param numWriters the number of writers that should be used
 * @param numPartitionWriters the number of writers that the data for a
 *        single partition will be distributed across
 * @param <E> the type of entities in the collection and underlying dataset
 * @return an equivalent collection of entities partitioned for the view
 * @see #partition(PCollection, View)
 *
 * @since 1.1.0
 */
public static <E> PCollection<E> partition(PCollection<E> collection,
                                           View<E> view,
                                           int numWriters, int numPartitionWriters) {
  // ensure the number of writers is honored whether it is per partition or total
  DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  if (!descriptor.isPartitioned()) {
    return partition(collection, numWriters);
  }

  // key each entity by (storage key record, int) before grouping
  GetStorageKey<E> keyFn = new GetStorageKey<E>(view, numPartitionWriters);
  PTable<Pair<GenericData.Record, Integer>, E> keyed = collection
      .by(keyFn, Avros.pairs(Avros.generics(keyFn.schema()), Avros.ints()));

  PGroupedTable<Pair<GenericData.Record, Integer>, E> byKey;
  if (numWriters > 0) {
    byKey = keyed.groupByKey(numWriters);
  } else {
    byKey = keyed.groupByKey();
  }
  return byKey.ungroup().values();
}
 
Example 8
Source File: HiveUtils.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Builds a Hive {@code Table} for a dataset from its descriptor.
 * <p>
 * The table type, descriptor properties, SerDe and input/output formats,
 * compression type, columns and (for partitioned descriptors) partition
 * keys are all filled in. The Avro schema is attached only when
 * {@code includeSchema} is set: as a URL when {@code useSchemaURL} accepts
 * the descriptor's schema URL, and as a literal otherwise.
 *
 * @param namespace the namespace passed to {@code createEmptyTable}
 * @param name the table name passed to {@code createEmptyTable}
 * @param descriptor the dataset descriptor to translate
 * @param external whether to create an external table located at the
 *        descriptor's location, rather than a managed table
 * @param includeSchema whether to store the Avro schema (URL or literal) in
 *        the table parameters
 * @return the populated table
 * @throws UnknownFormatException if no SerDe is known for the descriptor's
 *         format
 */
static Table tableForDescriptor(String namespace, String name,
                                DatasetDescriptor descriptor,
                                boolean external,
                                boolean includeSchema) {
  final Table table = createEmptyTable(namespace, name);

  if (!external) {
    table.setTableType(TableType.MANAGED_TABLE.toString());
  } else {
    // you'd think this would do it...
    table.setTableType(TableType.EXTERNAL_TABLE.toString());
    // but it doesn't work without some additional magic:
    table.getParameters().put("EXTERNAL", "TRUE");
    table.getSd().setLocation(descriptor.getLocation().toString());
  }

  addPropertiesForDescriptor(table, descriptor);

  // translate from Format to SerDe
  final Format format = descriptor.getFormat();
  if (!FORMAT_TO_SERDE.containsKey(format)) {
    throw new UnknownFormatException(
        "No known serde for format:" + format.getName());
  }
  table.getSd().getSerdeInfo().setSerializationLib(FORMAT_TO_SERDE.get(format));
  table.getSd().setInputFormat(FORMAT_TO_INPUT_FORMAT.get(format));
  table.getSd().setOutputFormat(FORMAT_TO_OUTPUT_FORMAT.get(format));

  if (includeSchema) {
    // store a schema URL when it is usable, the schema literal otherwise
    URL schemaURL = descriptor.getSchemaUrl();
    boolean byUrl = useSchemaURL(schemaURL);
    String schemaKey = byUrl
        ? AVRO_SCHEMA_URL_PROPERTY_NAME
        : AVRO_SCHEMA_LITERAL_PROPERTY_NAME;
    String schemaValue = byUrl
        ? schemaURL.toExternalForm()
        : descriptor.getSchema().toString();
    table.getParameters().put(schemaKey, schemaValue);
  }

  table.getParameters().put(COMPRESSION_TYPE_PROPERTY_NAME,
      descriptor.getCompressionType().getName());

  // convert the schema to Hive columns
  table.getSd().setCols(HiveSchemaConverter.convertSchema(descriptor.getSchema()));

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy strategy = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(strategy));
    table.setPartitionKeys(partitionColumns(strategy, descriptor.getSchema()));
  }

  return table;
}
 
Example 9
Source File: HiveUtils.java    From kite with Apache License 2.0 4 votes vote down vote up
/**
 * Updates the schema-related settings of an existing Hive {@code table} so
 * that they match {@code descriptor}.
 * <p>
 * The Avro schema parameter is chosen as follows:
 * <ul>
 *   <li>if the table has a schema literal, it is replaced by the schema URL
 *       when {@code useSchemaURL} accepts it, and refreshed otherwise;</li>
 *   <li>otherwise, if the table has a schema URL, it is replaced by the
 *       descriptor's schema URL, which must be set;</li>
 *   <li>otherwise the schema URL is added when usable, else the schema
 *       literal.</li>
 * </ul>
 * Afterwards the partition expression (for partitioned descriptors), the
 * custom properties and the table columns are refreshed.
 *
 * @param table the Hive table to modify in place
 * @param descriptor the descriptor holding the new schema
 * @throws DatasetOperationException if the table uses a schema URL but the
 *         descriptor has none
 * @throws DatasetException if the table has neither schema parameter and
 *         the descriptor has no usable schema URL and no schema
 */
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  URL schemaURL = descriptor.getSchemaUrl();

  if (table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    if (!useSchemaURL(schemaURL)) {
      // keep using a literal, refreshed from the descriptor
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    } else {
      // switch from the literal to the URL
      table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
      table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME,
          schemaURL.toExternalForm());
    }

  } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    if (schemaURL == null) {
      throw new DatasetOperationException(
          "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " since descriptor schema URL is not set.");
    }
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME,
        schemaURL.toExternalForm());

  } else if (useSchemaURL(schemaURL)) {
    // neither the literal nor the URL is set, so add the URL if specified
    // and the schema literal if not.
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME,
        schemaURL.toExternalForm());

  } else if (descriptor.getSchema() != null) {
    table.getParameters().put(
        AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
        descriptor.getSchema().toString());

  } else {
    throw new DatasetException("Table schema cannot be updated since it is" +
        " not set on the descriptor.");
  }

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy strategy = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(strategy));
    // no need to set the partition columns; no changes to the Hive side
  }

  // keep the custom properties up-to-date
  addPropertiesForDescriptor(table, descriptor);

  // keep the table DDL up-to-date with the Schema
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));
}