Java Code Examples for org.kitesdk.data.DatasetDescriptor#getLocation()

The following examples show how to use org.kitesdk.data.DatasetDescriptor#getLocation(). All of them are drawn from the Kite SDK project, and you can go to the original source file by following the link above each example.
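
Before looking at the project examples, here is a minimal, self-contained sketch of the method itself (not taken from the Kite sources; the schema literal and location URI are made up for illustration). A descriptor's location is optional, so getLocation() may return null:

import java.net.URI;

import org.kitesdk.data.DatasetDescriptor;

public class GetLocationSketch {
  public static void main(String[] args) {
    // build a descriptor with an explicit storage location
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\": \"string\"}")
        .location(URI.create("hdfs://namenode:8020/data/default/users"))
        .build();

    // getLocation() returns the storage location as a URI,
    // or null if no location was ever set
    URI location = descriptor.getLocation();
    System.out.println("Dataset stored at: " + location);
  }
}
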
Example 1
Source File: TestFileSystemMetadataProvider.java    From kite with Apache License 2.0
@Test
public void testDeleteRemovesMetadataFiles() throws IOException {
  testCreateMetadataFiles();

  DatasetDescriptor loaded = provider.load(NAMESPACE, NAME);

  Path namedDirectory = new Path(loaded.getLocation());
  Path metadataDirectory = new Path(namedDirectory, ".metadata");
  Path propertiesFile = new Path(metadataDirectory, "descriptor.properties");
  Path schemaDirectory = new Path(metadataDirectory, "schemas");

  boolean result = provider.delete(NAMESPACE, NAME);
  Assert.assertTrue(result);
  Assert.assertFalse("Descriptor properties file should not exist",
      fileSystem.exists(propertiesFile));
  Assert.assertFalse("Descriptor schema directory should not exist",
      fileSystem.exists(schemaDirectory));
  Assert.assertFalse("Metadata directory should not exist",
      fileSystem.exists(metadataDirectory));
  Assert.assertTrue("Named directory should still exist for name:" + NAME,
      fileSystem.exists(namedDirectory));
}
 
Example 2
Source File: TestFileSystemMetadataProvider.java    From kite with Apache License 2.0
@Test
public void testUpdatePreviousFormat() throws IOException {

  useOldRepositoryFormat();

  DatasetDescriptor oldFormatDescriptor = provider.load(NAMESPACE, NAME);

  Path namedDirectory = new Path(oldFormatDescriptor.getLocation());
  Path metadataDirectory = new Path(namedDirectory, ".metadata");
  Path schemaDirectory = new Path(metadataDirectory, "schemas");
  Path newSchemaLocation = new Path(schemaDirectory, "1.avsc");

  // Performing an update against a dataset in the old location should bring it
  // into the new location.
  DatasetDescriptor updated = new DatasetDescriptor.Builder(oldFormatDescriptor).build();

  provider.update(NAMESPACE, NAME, updated);

  Assert.assertEquals(testDescriptor.getSchema(), oldFormatDescriptor.getSchema());

  Assert.assertTrue("Schema should exist at the new location.",
      fileSystem.exists(newSchemaLocation));
}
 
Example 3
Source File: FileSystemDatasetRepository.java    From kite with Apache License 2.0
@Override
public <E> Dataset<E> create(String namespace, String name,
                             DatasetDescriptor descriptor, Class<E> type) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Dataset name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  // suggest a location for this dataset: <root>/<namespace>/<name>/
  Path suggestedLocation = pathForDataset(namespace, name);

  DatasetDescriptor newDescriptor = descriptor;
  if (descriptor.getLocation() == null) {
    newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(suggestedLocation) // may be overridden by MetadataProvider
        .build();
  }

  newDescriptor = metadataProvider.create(namespace, name, newDescriptor);

  FileSystemUtil.ensureLocationExists(newDescriptor, conf);

  LOG.debug("Created dataset: {} schema: {} datasetPath: {}", new Object[] {
      name, newDescriptor.getSchema(), newDescriptor.getLocation() });

  FileSystemDataset<E> dataset = new FileSystemDataset.Builder<E>()
      .namespace(namespace)
      .name(name)
      .configuration(conf)
      .descriptor(newDescriptor)
      .type(type)
      .uri(new URIBuilder(getUri(), namespace, name).build())
      .partitionKey(newDescriptor.isPartitioned() ? new PartitionKey() : null)
      .partitionListener(getPartitionListener())
      .build();

  // notify the partition listener about any existing data partitions
  dataset.addExistingPartitions();

  return dataset;
}
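
As a usage sketch for the create() method above: calling it without a location lets the repository suggest one, and getLocation() on the resulting dataset's descriptor reveals what was actually assigned. The repository instance, schema literal, and record type below are assumptions for illustration, not part of the Kite sources:

import java.net.URI;

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.DatasetDescriptor;
// note: the package of FileSystemDatasetRepository varies across Kite versions
import org.kitesdk.data.spi.filesystem.FileSystemDatasetRepository;

public class CreateUsageSketch {
  // `repo` is assumed to be an already-configured repository
  static URI createAndInspect(FileSystemDatasetRepository repo) {
    DatasetDescriptor descriptor = new DatasetDescriptor.Builder()
        .schemaLiteral("{\"type\":\"record\",\"name\":\"User\",\"fields\":[]}")
        .build(); // no location: create() will suggest <root>/<namespace>/<name>/

    Dataset<GenericRecord> users =
        repo.create("default", "users", descriptor, GenericRecord.class);

    // the metadata provider may have overridden the suggested location
    return users.getDescriptor().getLocation();
  }
}
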
 
Example 4
Source File: TestFileSystemMetadataProvider.java    From kite with Apache License 2.0
/**
 * Converts the test repository to the old format.
 */
private void useOldRepositoryFormat() throws IOException {
  testCreateMetadataFiles();

  // Create a placeholder descriptor that we'll modify to
  // look like the old layout.
  DatasetDescriptor placeholder = provider.load(NAMESPACE, NAME);

  Path namedDirectory = new Path(placeholder.getLocation());
  Path metadataDirectory = new Path(namedDirectory, ".metadata");
  Path propertiesFile = new Path(metadataDirectory, "descriptor.properties");
  Path schemaDirectory = new Path(metadataDirectory, "schemas");
  Path oldSchemaLocation = new Path(metadataDirectory, "schema.avsc");

  // Delete the new schema directory to simulate a dataset
  // written using the older format. This works because
  // the metadata provider writes schema to both the old and new locations.
  fileSystem.delete(schemaDirectory, true);

  Assert.assertTrue("Named directory should exist for name:" + NAME,
      fileSystem.exists(namedDirectory));
  Assert.assertTrue("Metadata directory should exist",
      fileSystem.exists(metadataDirectory));
  Assert.assertTrue("Descriptor properties file should exist",
      fileSystem.exists(propertiesFile));
  Assert.assertTrue("Old schema location should exist.",
      fileSystem.exists(oldSchemaLocation));
}
 
Example 5
Source File: MemoryMetadataProvider.java    From kite with Apache License 2.0
@Override
public DatasetDescriptor create(String namespace, String name, DatasetDescriptor descriptor) {
  Preconditions.checkNotNull(namespace, "Namespace cannot be null");
  Preconditions.checkNotNull(name, "Name cannot be null");
  Preconditions.checkNotNull(descriptor, "Descriptor cannot be null");

  if (exists(namespace, name)) {
    throw new DatasetExistsException(
        "Dataset already exists for name:" + name);
  }

  DatasetDescriptor newDescriptor;
  if (descriptor.getLocation() == null) {
    newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(fs.makeQualified(new Path(newLocation(name))))
        .build();
  } else {
    // don't need to modify it
    newDescriptor = descriptor;
  }

  // save and return
  if (!descriptors.containsKey(namespace)) {
    descriptors.put(namespace, Maps.<String, DatasetDescriptor>newHashMap());
  }
  Map<String, DatasetDescriptor> datasets = descriptors.get(namespace);
  datasets.put(name, newDescriptor);

  return newDescriptor;
}
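
A brief usage sketch for this provider (the provider instance and schema literal are assumed for illustration): because the caller sets no location, create() fills in a fully qualified one, so getLocation() on the returned descriptor is non-null:

// a minimal sketch, assuming `provider` is a configured MemoryMetadataProvider
DatasetDescriptor request = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\": \"string\"}")
    .build(); // the caller sets no location

DatasetDescriptor stored = provider.create("default", "events", request);

// create() filled in a qualified location because the request had none
assert stored.getLocation() != null;
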
 
Example 6
Source File: HiveExternalMetadataProvider.java    From kite with Apache License 2.0
@Override
public DatasetDescriptor create(String namespace, String name, DatasetDescriptor descriptor) {
  Compatibility.checkDatasetName(namespace, name);
  Compatibility.checkDescriptor(descriptor);

  String resolved = resolveNamespace(
      namespace, name, descriptor.getLocation());
  if (resolved != null) {
    if (resolved.equals(namespace)) {
      // the requested dataset already exists
      throw new DatasetExistsException(
          "Metadata already exists for dataset: " + namespace + "." + name);
    } else {
      // replacing old default.name table
      LOG.warn("Creating table {}.{} for {}: replaces default.{}",
          new Object[]{
              namespace, name, pathForDataset(namespace, name), name});
      // validate that the new metadata can read the existing data
      Compatibility.checkUpdate(load(resolved, name), descriptor);
    }
  }

  LOG.info("Creating an external Hive table: {}.{}", namespace, name);

  DatasetDescriptor newDescriptor = descriptor;

  if (descriptor.getLocation() == null) {
    // create a new descriptor with the dataset's location
    newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(pathForDataset(namespace, name))
        .build();
  }

  Path managerPath = new Path(new Path(newDescriptor.getLocation().toString()),
      SCHEMA_DIRECTORY);

  // Store the schema with the schema manager and use the
  // managed URI moving forward.
  SchemaManager manager = SchemaManager.create(conf, managerPath);

  URI managedSchemaUri = manager.writeSchema(descriptor.getSchema());

  try {
    newDescriptor = new DatasetDescriptor.Builder(newDescriptor)
        .schemaUri(managedSchemaUri)
        .build();
  } catch (IOException e) {
    throw new DatasetIOException("Unable to load schema", e);
  }

  // create the data directory first so it is owned by the current user, not Hive
  FileSystemUtil.ensureLocationExists(newDescriptor, conf);

  // this object will be the table metadata
  Table table = HiveUtils.tableForDescriptor(
      namespace, name, newDescriptor, true /* external table */);

  // create the table in the metastore, assigning its location
  getMetaStoreUtil().createTable(table);

  return newDescriptor;
}
 
Example 7
Source File: HiveManagedMetadataProvider.java    From kite with Apache License 2.0
@Override
public DatasetDescriptor create(String namespace, String name, DatasetDescriptor descriptor) {
  Compatibility.checkDatasetName(namespace, name);
  Compatibility.checkDescriptor(descriptor);

  URI location = descriptor.getLocation();
  String resolved = resolveNamespace(namespace, name, location);
  if (resolved != null) {
    if (resolved.equals(namespace)) {
      // the requested dataset already exists
      throw new DatasetExistsException(
          "Metadata already exists for dataset: " + namespace + "." + name);
    } else if (location != null) {
      // if the location was set and matches an existing dataset, then this
      // dataset is replacing the existing and using its data
      DatasetDescriptor loaded = load(resolved, name);
      // replacing old default.name table
      LOG.warn("Creating table managed table {}.{}: replaces default.{}",
          new Object[]{namespace, name, name});
      // validate that the new metadata can read the existing data
      Compatibility.checkUpdate(loaded, descriptor);
      // if the table in the default namespace matches, then the location is
      // either null (and should be set to the existing) or matches. either
      // way, use the loaded location.
      location = loaded.getLocation();
    }
  }

  LOG.info("Creating a managed Hive table named: " + name);

  boolean isExternal = (location != null);

  DatasetDescriptor toCreate = descriptor;
  if (isExternal) {
    // add the location to the descriptor that will be used
    toCreate = new DatasetDescriptor.Builder(descriptor)
        .location(location)
        .build();
  }

  // construct the table metadata from a descriptor, but without the Avro schema
  // since we don't yet know its final location.
  Table table = HiveUtils.tableForDescriptor(
      namespace, name, toCreate, isExternal, false);

  // create it
  getMetaStoreUtil().createTable(table);

  // load the created table to get the final data location
  Table newTable = getMetaStoreUtil().getTable(namespace, name);

  Path managerPath = new Path(new Path(newTable.getSd().getLocation()), SCHEMA_DIRECTORY);

  // Write the Avro schema to that location and update the table appropriately.
  SchemaManager manager = SchemaManager.create(conf, managerPath);

  URI schemaLocation = manager.writeSchema(descriptor.getSchema());

  DatasetDescriptor newDescriptor = null;

  try {
    newDescriptor = new DatasetDescriptor.Builder(descriptor)
        .location(newTable.getSd().getLocation())
        .schemaUri(schemaLocation)
        .build();
  } catch (IOException e) {
    throw new DatasetIOException("Unable to set schema.", e);
  }

  // Add the schema now that it exists at an established URI.
  HiveUtils.updateTableSchema(newTable, newDescriptor);

  getMetaStoreUtil().alterTable(newTable);

  if (isExternal) {
    FileSystemUtil.ensureLocationExists(newDescriptor, conf);
  }

  return newDescriptor;
}
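
A design point worth noting across Examples 6 and 7: for the Hive providers, whether descriptor.getLocation() is null decides between a managed and an external table. A minimal sketch (the URI and schema literal are illustrative assumptions):

// a descriptor with an explicit location yields an EXTERNAL table,
// because HiveManagedMetadataProvider.create() computes:
//   boolean isExternal = (location != null);
DatasetDescriptor external = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\": \"string\"}")
    .location(URI.create("hdfs://namenode:8020/archive/events"))
    .build();

// omitting the location instead yields a fully managed table whose data
// directory is chosen by Hive
DatasetDescriptor managed = new DatasetDescriptor.Builder()
    .schemaLiteral("{\"type\": \"string\"}")
    .build();
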