Java Code Examples for org.kitesdk.data.DatasetDescriptor#getPartitionStrategy()

The following examples show how to use org.kitesdk.data.DatasetDescriptor#getPartitionStrategy(). Each example is taken from the Kite SDK; the source file and license are noted above the code.
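
Before the examples, here is a minimal, self-contained sketch of the API being demonstrated: build a PartitionStrategy, attach it to a DatasetDescriptor, and read it back with getPartitionStrategy(). The record schema and field names are invented for illustration; note the isPartitioned() guard, which the examples below also use because getPartitionStrategy() fails on an unpartitioned descriptor.

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.PartitionStrategy;

public class PartitionStrategyDemo {
  public static void main(String[] args) {
    // Hypothetical strategy: bucket by user, then partition by event time.
    PartitionStrategy strategy = new PartitionStrategy.Builder()
        .hash("user_id", 16)
        .year("created_at")
        .month("created_at")
        .build();

    // The builder validates that each source field exists in the schema.
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\": \"record\", \"name\": \"Event\", \"fields\": ["
            + "{\"name\": \"user_id\", \"type\": \"long\"},"
            + "{\"name\": \"created_at\", \"type\": \"long\"}]}")
        .partitionStrategy(strategy)
        .build();

    // Guard the getter, as the examples below do.
    if (descriptor.isPartitioned()) {
      System.out.println(descriptor.getPartitionStrategy());
    }
  }
}
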
Example 1
Source File: DatasetKeyOutputFormat.java    From kite with Apache License 2.0
private static <E> View<E> loadOrCreateTaskAttemptView(TaskAttemptContext taskContext) {
  // Hadoop.JobContext.getConfiguration is Kite's cross-version compatibility
  // shim; invoke() reflectively calls taskContext.getConfiguration().
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(taskContext);
  // Strip the URI scheme, then look up the query options (e.g. partition
  // constraints) attached to the output dataset URI.
  Map<String, String> uriOptions = Registration.lookupDatasetUri(
      URI.create(URI.create(conf.get(KITE_OUTPUT_URI)).getSchemeSpecificPart())).second();
  Dataset<E> dataset = loadOrCreateTaskAttemptDataset(taskContext);

  if (dataset instanceof AbstractDataset) {
    DatasetDescriptor descriptor = dataset.getDescriptor();
    Schema schema = descriptor.getSchema();
    PartitionStrategy strategy = null;
    if (descriptor.isPartitioned()) {
      strategy = descriptor.getPartitionStrategy();
    }
    Constraints constraints = Constraints.fromQueryMap(
        schema, strategy, uriOptions);
    return ((AbstractDataset<E>) dataset).filter(constraints);
  } else {
    return dataset;
  }
}
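
For context on uriOptions: those are the query options carried by the job's output dataset URI, which Constraints.fromQueryMap() turns into a filter so the task writes through a constrained view. A hedged sketch of the user-facing side of that flow (the view URI, namespace, and dataset name are invented):

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Datasets;
import org.kitesdk.data.View;

// Hypothetical view URI: the "year=2015" query option is the kind of
// entry that ends up in the uriOptions map above.
View<GenericRecord> events = Datasets.load(
    "view:hive:default/events?year=2015", GenericRecord.class);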
 
Example 2
Source File: PartitionedDatasetWriter.java    From kite with Apache License 2.0
private PartitionedDatasetWriter(FileSystemView<E> view) {
  final DatasetDescriptor descriptor = view.getDataset().getDescriptor();
  Preconditions.checkArgument(descriptor.isPartitioned(),
      "Dataset " + view.getDataset() + " is not partitioned");

  this.view = view;
  this.partitionStrategy = descriptor.getPartitionStrategy();

  // Size the writer cache from the strategy's cardinality; a negative value
  // signals an unbounded strategy (or overflow) and triggers the fallback.
  int defaultMaxWriters = partitionStrategy.getCardinality();
  if (defaultMaxWriters < 0 || defaultMaxWriters > DEFAULT_WRITER_CACHE_SIZE) {
    defaultMaxWriters = DEFAULT_WRITER_CACHE_SIZE;
  }
  this.maxWriters = DescriptorUtil.getInt(WRITER_CACHE_SIZE_PROP, descriptor,
      defaultMaxWriters);

  this.state = ReaderWriterState.NEW;
  this.reusedKey = new StorageKey(partitionStrategy);
  this.accessor = view.getAccessor();
  this.provided = view.getProvidedValues();

  // get file rolling properties
  if (!Formats.PARQUET.equals(descriptor.getFormat())) {
    this.targetFileSize = DescriptorUtil.getLong(
        TARGET_FILE_SIZE_PROP, descriptor, -1);
  } else {
    targetFileSize = -1;
  }
  this.rollIntervalMillis = 1000 * DescriptorUtil.getLong(
      ROLL_INTERVAL_S_PROP, descriptor, -1);
}
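
The writer cache sizing above hinges on PartitionStrategy.getCardinality(), which combines the cardinalities of the individual field partitioners. A small sketch (field name and bucket count invented):

PartitionStrategy hashed = new PartitionStrategy.Builder()
    .hash("user_id", 16)
    .build();

// For a pure hash strategy this is the bucket count (16 here). Strategies
// with unbounded partitioners do not produce a useful positive value, which
// is why the constructor above falls back to DEFAULT_WRITER_CACHE_SIZE when
// the result is negative or exceeds the cap.
int cardinality = hashed.getCardinality();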
 
Example 3
Source File: FileSystemView.java    From kite with Apache License 2.0
private FileSystemPartitionIterator partitionIterator() {
  DatasetDescriptor descriptor = dataset.getDescriptor();
  try {
    return new FileSystemPartitionIterator(
        fs, root, descriptor.getPartitionStrategy(), descriptor.getSchema(),
        getKeyPredicate());
  } catch (IOException ex) {
    throw new DatasetException("Cannot list partitions in view: " + this, ex);
  }
}
 
Example 4
Source File: FileSystemDataset.java    From kite with Apache License 2.0
FileSystemDataset(FileSystem fileSystem, Path directory,
                  String namespace, String name,
                  DatasetDescriptor descriptor, URI uri,
                  @Nullable PartitionListener partitionListener,
                  Class<E> type) {
  super(type, descriptor.getSchema());
  if (Formats.PARQUET.equals(descriptor.getFormat())) {
    Preconditions.checkArgument(IndexedRecord.class.isAssignableFrom(type) ||
        type == Object.class,
        "Parquet only supports generic and specific data models, type"
        + " parameter must implement IndexedRecord");
  }

  this.fileSystem = fileSystem;
  this.directory = directory;
  this.namespace = namespace;
  this.name = name;
  this.descriptor = descriptor;
  this.partitionStrategy =
      descriptor.isPartitioned() ? descriptor.getPartitionStrategy() : null;
  this.partitionListener = partitionListener;
  this.convert = new PathConversion(descriptor.getSchema());
  this.uri = uri;

  Path signalsPath = new Path(getDirectory(fileSystem, directory),
      SIGNALS_DIRECTORY_NAME);
  this.signalManager = new SignalManager(fileSystem, signalsPath);
  this.unbounded = new FileSystemPartitionView<E>(
      this, partitionListener, signalManager, type);

  // remove this.partitionKey for 0.14.0
  this.partitionKey = null;
}
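
The Parquet branch above rejects entity types that do not implement IndexedRecord. A hedged sketch of a descriptor that exercises that check (eventSchemaJson is a hypothetical Avro schema literal):

DatasetDescriptor parquetDescriptor = new DatasetDescriptor.Builder()
    .schemaLiteral(eventSchemaJson)  // hypothetical schema literal
    .format(Formats.PARQUET)
    .build();
// GenericRecord and generated specific classes implement IndexedRecord and
// pass the precondition; an arbitrary POJO entity type would not.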
 
Example 5
Source File: AbstractRefinableView.java    From kite with Apache License 2.0
protected AbstractRefinableView(Dataset<E> dataset, Class<E> type) {
  this.dataset = dataset;
  final DatasetDescriptor descriptor = dataset.getDescriptor();
  if (descriptor.isPartitioned()) {
    this.constraints = new Constraints(
        descriptor.getSchema(), descriptor.getPartitionStrategy());
    // TODO: is comparator used anywhere?
    this.comparator = new MarkerComparator(descriptor.getPartitionStrategy());
    this.keys = new ThreadLocal<StorageKey>() {
      @Override
      protected StorageKey initialValue() {
        return new StorageKey(descriptor.getPartitionStrategy());
      }
    };
  } else {
    this.constraints = new Constraints(descriptor.getSchema());
    this.comparator = null;
    this.keys = null;
  }
  this.accessor = DataModelUtil.accessor(type, descriptor.getSchema());
  this.entityTest = constraints.toEntityPredicate(accessor);

  Schema datasetSchema = descriptor.getSchema();
  this.canRead = SchemaValidationUtil.canRead(
      datasetSchema, accessor.getReadSchema());
  this.canWrite = SchemaValidationUtil.canRead(
      accessor.getWriteSchema(), datasetSchema);

  IncompatibleSchemaException.check(canRead || canWrite,
      "The type cannot be used to read from or write to the dataset:\n" +
      "Type schema: %s\nDataset schema: %s",
      getSchema(), descriptor.getSchema());
}
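
SchemaValidationUtil.canRead() wraps Avro's schema-resolution check. If you need the same test outside Kite's spi package, plain Avro offers an equivalent; this is a substitute sketch, not the code Kite runs:

import org.apache.avro.Schema;
import org.apache.avro.SchemaCompatibility;

// True when data written with writerSchema can be read as readerSchema.
static boolean canRead(Schema writerSchema, Schema readerSchema) {
  return SchemaCompatibility
      .checkReaderWriterCompatibility(readerSchema, writerSchema)
      .getType() == SchemaCompatibility.SchemaCompatibilityType.COMPATIBLE;
}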
 
Example 6
Source File: AvroKeyEntitySchemaParser.java    From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema) {
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
 
Example 7
Source File: AvroKeyEntitySchemaParser.java    From kite with Apache License 2.0
@Override
public AvroKeySchema parseKeySchema(String rawSchema,
    PartitionStrategy partitionStrategy) {
  // use DatasetDescriptor.Builder because it checks consistency
  DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
      .schemaLiteral(rawSchema)
      .partitionStrategy(partitionStrategy)
      .build();
  return new AvroKeySchema(
      descriptor.getSchema(), descriptor.getPartitionStrategy());
}
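
Both overloads route through DatasetDescriptor.Builder so the schema and strategy are validated together. A hedged usage sketch (entitySchemaJson, the "part1" field, and the no-arg parser constructor are assumptions):

PartitionStrategy strategy = new PartitionStrategy.Builder()
    .identity("part1")
    .build();
AvroKeySchema keySchema =
    new AvroKeyEntitySchemaParser().parseKeySchema(entitySchemaJson, strategy);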
 
Example 8
Source File: HBaseMetadataProviderTest.java    From kite with Apache License 2.0
@Test
public void testBasic() {
  DatasetDescriptor desc = provider.create("default", tableName + ".TestEntity",
      new DatasetDescriptor.Builder().schemaLiteral(testEntity).build());
  ColumnMapping columnMapping = desc.getColumnMapping();
  PartitionStrategy partStrat = desc.getPartitionStrategy();
  assertEquals(9, columnMapping.getFieldMappings().size());
  assertEquals(2, Accessor.getDefault().getFieldPartitioners(partStrat).size());
}
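
The second assertion counts the field partitioners behind the derived strategy. A hedged sketch of inspecting them the same way (the getSourceName()/getName() accessors on the spi FieldPartitioner class are assumptions):

for (FieldPartitioner<?, ?> fp :
    Accessor.getDefault().getFieldPartitioners(partStrat)) {
  // assumed accessors: source field name and partition name
  System.out.println(fp.getSourceName() + " -> " + fp.getName());
}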
 
Example 9
Source File: HiveUtils.java    From kite with Apache License 2.0
static Table tableForDescriptor(String namespace, String name,
                                DatasetDescriptor descriptor,
                                boolean external,
                                boolean includeSchema) {
  final Table table = createEmptyTable(namespace, name);

  if (external) {
    // you'd think this would do it...
    table.setTableType(TableType.EXTERNAL_TABLE.toString());
    // but it doesn't work without some additional magic:
    table.getParameters().put("EXTERNAL", "TRUE");
    table.getSd().setLocation(descriptor.getLocation().toString());
  } else {
    table.setTableType(TableType.MANAGED_TABLE.toString());
  }

  addPropertiesForDescriptor(table, descriptor);

  // translate from Format to SerDe
  final Format format = descriptor.getFormat();
  if (FORMAT_TO_SERDE.containsKey(format)) {
    table.getSd().getSerdeInfo().setSerializationLib(FORMAT_TO_SERDE.get(format));
    table.getSd().setInputFormat(FORMAT_TO_INPUT_FORMAT.get(format));
    table.getSd().setOutputFormat(FORMAT_TO_OUTPUT_FORMAT.get(format));
  } else {
    throw new UnknownFormatException(
        "No known serde for format:" + format.getName());
  }

  if (includeSchema) {
    URL schemaURL = descriptor.getSchemaUrl();
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
          AVRO_SCHEMA_URL_PROPERTY_NAME,
          descriptor.getSchemaUrl().toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }
  }

  table.getParameters().put(COMPRESSION_TYPE_PROPERTY_NAME,
      descriptor.getCompressionType().getName());

  // convert the schema to Hive columns
  table.getSd().setCols(HiveSchemaConverter.convertSchema(descriptor.getSchema()));

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    table.setPartitionKeys(partitionColumns(ps, descriptor.getSchema()));
  }

  return table;
}
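
The strategy is persisted in the table parameters as a serialized expression. When Kite later loads the table it presumably parses the same property back into a PartitionStrategy; fromExpression() below is an assumption mirroring toExpression() above:

// Assumption: the inverse of toExpression(), applied when reading the table.
String expression = table.getParameters().get(PARTITION_EXPRESSION_PROPERTY_NAME);
PartitionStrategy restored = Accessor.getDefault().fromExpression(expression);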
 
Example 10
Source File: HiveUtils.java    From kite with Apache License 2.0
public static void updateTableSchema(Table table, DatasetDescriptor descriptor) {
  URL schemaURL = descriptor.getSchemaUrl();

  if (table.getParameters().get(AVRO_SCHEMA_LITERAL_PROPERTY_NAME) != null) {
    if (useSchemaURL(schemaURL)) {
      table.getParameters().remove(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
      table.getParameters().put(AVRO_SCHEMA_URL_PROPERTY_NAME,
          schemaURL.toExternalForm());
    } else {
      table.getParameters().put(
          AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
          descriptor.getSchema().toString());
    }

  } else if (table.getParameters().get(AVRO_SCHEMA_URL_PROPERTY_NAME) != null) {
    if (schemaURL == null) {
      throw new DatasetOperationException(
          "Cannot update " + AVRO_SCHEMA_URL_PROPERTY_NAME +
          " since descriptor schema URL is not set.");
    }
    table.getParameters().put(
        AVRO_SCHEMA_URL_PROPERTY_NAME,
        schemaURL.toExternalForm());

  } else {
    // neither the literal nor the URL is set, so add the URL if specified
    // and the schema literal otherwise.
    if (useSchemaURL(schemaURL)) {
      table.getParameters().put(
              AVRO_SCHEMA_URL_PROPERTY_NAME,
              schemaURL.toExternalForm());

    } else if (descriptor.getSchema() != null) {
      table.getParameters().put(
              AVRO_SCHEMA_LITERAL_PROPERTY_NAME,
              descriptor.getSchema().toString());
    } else {
      throw new DatasetException("Table schema cannot be updated since it is" +
              " not set on the descriptor.");
    }
  }

  // copy partitioning info
  if (descriptor.isPartitioned()) {
    PartitionStrategy ps = descriptor.getPartitionStrategy();
    table.getParameters().put(PARTITION_EXPRESSION_PROPERTY_NAME,
        Accessor.getDefault().toExpression(ps));
    // no need to set the partition columns; no changes to the Hive side
  }

  // keep the custom properties up-to-date
  addPropertiesForDescriptor(table, descriptor);

  // keep the table DDL up to date with the schema
  table.getSd().setCols(
      HiveSchemaConverter.convertSchema(descriptor.getSchema()));
}