Java Code Examples for org.kitesdk.data.Datasets#load()

The following examples show how to use org.kitesdk.data.Datasets#load(). They are drawn from open-source projects; the source file and project for each are noted above the example.
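Before the examples, here is a minimal sketch of the call itself. The events dataset URI is an illustrative assumption; any dataset or view URI that Kite understands works the same way:

import org.apache.avro.generic.GenericRecord;
import org.kitesdk.data.Dataset;
import org.kitesdk.data.Datasets;

// Load a dataset by URI; the Class argument selects the entity type.
// "dataset:hive:default/events" is a hypothetical URI for illustration.
Dataset<GenericRecord> events = Datasets.load(
    "dataset:hive:default/events", GenericRecord.class);
System.out.println(events.getDescriptor().getSchema());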
Example 1
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0
@Test
public void testLoadChangedAbsolutePathURIMissingNamespace() {
  // this used to be an absolute external URI, but is now a managed URI
  String uri = "dataset:hive:/ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);
  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);

  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
 
Example 2
Source File: ReadDataset.java    From kite-examples with Apache License 2.0
@Override
public int run(String[] args) throws Exception {

  // Load the events dataset
  Dataset<GenericRecord> events = Datasets.load("dataset:hive:/tmp/data/default/events");

  // Get a reader for the dataset and read all the events
  DatasetReader<GenericRecord> reader = events.newReader();
  try {
    for (GenericRecord event : reader) {
      System.out.println(event);
    }
  } finally {
    reader.close();
  }

  return 0;
}
 
Example 3
Source File: TestHiveDatasetURIsCompatibility.java    From kite with Apache License 2.0
@Test
public void testLoadChangedRelativePathURIMissingNamespace() {
  // this used to be a relative external URI, but is now a managed URI
  String uri = "dataset:hive:ds";

  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/tmp/data");
  DatasetDescriptor withLocation = new DatasetDescriptor.Builder(DESCRIPTOR)
      .location("file:/tmp/data/ds") // old location
      .build();
  Dataset<GenericRecord> expected = repo.create(
      "default", "ds", withLocation, GenericRecord.class);

  Dataset<GenericRecord> actual = Datasets.load(uri);
  Assert.assertEquals("Should load existing dataset default.ds",
      expected, actual);

  Assert.assertEquals("URI should use actual namespace",
      "dataset:hive:default/ds", actual.getUri().toString());

  Assert.assertTrue(Datasets.delete(uri));
}
 
Example 4
Source File: TestLocalDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testAbsolute() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:file:/tmp/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Record> ds = Datasets.<Record, Dataset<Record>>
      load("dataset:file:/tmp/data/ns/test", Record.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("file:/tmp/data/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", ds.getName());

  repo.delete("ns", "test");
}
 
Example 5
Source File: TestHiveDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testManagedHDFSQueryOptions() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive?" + hdfsQueryArgs);
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets
      .<Object, Dataset<Object>>load("dataset:hive?dataset=test&namespace=ns&" + hdfsQueryArgsOld, Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", ds.getName());

  repo.delete("ns", "test");
}
 
Example 6
Source File: TestHiveDatasetURIsWithDefaultConfiguration.java    From kite with Apache License 2.0
@Test
public void testExternalRelative() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:hive:data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<GenericRecord> ds = Datasets.load("dataset:hive:data/ns/test");

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Path cwd = getDFS().makeQualified(new Path("."));
  Assert.assertEquals("Locations should match",
      new Path(cwd, "data/ns/test").toUri(), ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());

  repo.delete("ns", "test");
}
 
Example 7
Source File: TestHiveDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testExternalRoot() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive:/?" + hdfsQueryArgs);
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets
      .<Object, Dataset<Object>>load("dataset:hive:/ns/test?" + hdfsQueryArgs, Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", ds.getName());

  repo.delete("ns", "test");
}
 
Example 8
Source File: TestHiveDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testManaged() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hive?" + hdfsQueryArgs);
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets
      .<Object, Dataset<Object>>load("dataset:hive?dataset=test&namespace=ns&" + hdfsQueryArgs, Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", ds.getName());

  repo.delete("ns", "test");
}
 
Example 9
Source File: TestCreateDatasetWithExistingData.java    From kite with Apache License 2.0
@Test
public void testCreateFromExisting() throws Exception {
  command.datasets = Lists.newArrayList(existingDataURI);
  command.run();

  verify(console).debug(contains("Created"), eq(existingDataURI));

  // load the new dataset and verify it
  Dataset<GenericRecord> users = Datasets.load(existingDataURI);
  Assert.assertEquals("Schema should match",
      USER_SCHEMA, users.getDescriptor().getSchema());
  Assert.assertFalse("Should not be partitioned",
      users.getDescriptor().isPartitioned());
  Assert.assertEquals("Should be Parquet",
      Formats.PARQUET, users.getDescriptor().getFormat());
}
 
Example 10
Source File: TestHDFSDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testRelative() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hdfs://" + hdfsAuth + "/data?absolute=false");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets.<Object, Dataset<Object>>
      load("dataset:hdfs://" + hdfsAuth + "/data/ns/test?absolute=false",
      Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Path cwd = getDFS().makeQualified(new Path("."));
  Assert.assertEquals("Locations should match",
      new Path(cwd, "data/ns/test").toUri(), ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", ds.getName());

  repo.delete("ns", "test");
}
 
Example 11
Source File: TestHDFSDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testAbsoluteRoot() {
  DatasetRepository repo = DatasetRepositories
      .repositoryFor("repo:hdfs://" + hdfsAuth + "/");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  Dataset<Object> ds = Datasets.<Object, Dataset<Object>>
      load("dataset:hdfs://" + hdfsAuth + "/ns/test",
      Object.class);

  Assert.assertNotNull("Should load dataset", ds);
  Assert.assertTrue(ds instanceof FileSystemDataset);
  Assert.assertEquals("Locations should match",
      URI.create("hdfs://" + hdfsAuth + "/ns/test"),
      ds.getDescriptor().getLocation());
  Assert.assertEquals("Descriptors should match",
      repo.load("ns", "test").getDescriptor(), ds.getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", ds.getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", ds.getName());

  repo.delete("ns", "test");
}
 
Example 12
Source File: TestHDFSDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testMissingHDFSAuthority() {
  try {
    Datasets.load("dataset:hdfs:/tmp/data/ns/test", Object.class);
    Assert.fail("Shouldn't be able to connect to HDFS");
  } catch (DatasetIOException e) {
    Assert.assertTrue("Should have helpful error message",
        e.getMessage().contains("make sure the default hdfs URI is configured"));
  }
}
 
Example 13
Source File: TestLocalDatasetURIs.java    From kite with Apache License 2.0
@Test
public void testViewConstraints() {
  DatasetRepository repo = DatasetRepositories.repositoryFor("repo:file:/tmp/data");
  repo.delete("ns", "test");
  repo.create("ns", "test", descriptor);

  RefinableView<Record> v = Datasets.<Record, RefinableView<Record>>
      load("view:file:/tmp/data/ns/test?username=user", Record.class);

  Assert.assertNotNull("Should load view", v);
  Assert.assertTrue(v instanceof FileSystemView);
  Assert.assertEquals("Locations should match",
      URI.create("file:/tmp/data/ns/test"),
      v.getDataset().getDescriptor().getLocation());

  DatasetDescriptor loaded = repo.load("ns", "test").getDescriptor();
  Assert.assertEquals("Descriptors should match",
      loaded, v.getDataset().getDescriptor());
  Assert.assertEquals("Should report correct namespace",
      "ns", v.getDataset().getNamespace());
  Assert.assertEquals("Should report correct name",
      "test", v.getDataset().getName());

  Constraints withUser = new Constraints(loaded.getSchema())
      .with("username", new Utf8("user"));
  Assert.assertEquals("Constraints should be username=user",
      withUser, ((FileSystemView) v).getConstraints());

  repo.delete("ns", "test");
}
 
Example 14
Source File: BaseDatasetCommand.java    From kite with Apache License 2.0
protected <E> View<E> load(String uriOrName, Class<E> type) {
  if (isDatasetOrViewUri(uriOrName)) {
    return Datasets.<E, View<E>>load(uriOrName, type);
  } else {
    return getDatasetRepository().load(namespace, uriOrName, type);
  }
}
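This helper lets the same command-line argument accept either a full dataset/view URI, which goes through Datasets.load(), or a bare dataset name, which is resolved against the configured repository and namespace.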
 
Example 15
Source File: KiteDatasetExecutor.java    From sqoop-on-spark with Apache License 2.0
/**
 * Merges the dataset at the given URI into this executor's dataset.
 */
public void mergeDataset(String uri) {
  FileSystemDataset<GenericRecord> update = Datasets.load(uri);
  if (dataset instanceof FileSystemDataset) {
    ((FileSystemDataset<GenericRecord>) dataset).merge(update);
    // And let's completely drop the temporary dataset
    Datasets.delete(uri);
  } else {
    throw new SqoopException(
        KiteConnectorError.GENERIC_KITE_CONNECTOR_0000, uri);
  }
}
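Note that merge() is only defined on FileSystemDataset, so the executor's own dataset is type-checked first; the temporary dataset at uri is dropped only after its contents have been merged in.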
 
Example 16
Source File: KiteFromInitializer.java    From sqoop-on-spark with Apache License 2.0
@Override
public Schema getSchema(InitializerContext context,
    LinkConfiguration linkConfig, FromJobConfiguration fromJobConfig) {
  String uri = ConfigUtil.buildDatasetUri(
      linkConfig.linkConfig, fromJobConfig.fromJobConfig.uri);
  Dataset<GenericRecord> dataset = Datasets.load(uri);
  org.apache.avro.Schema avroSchema = dataset.getDescriptor().getSchema();
  return AvroDataTypeUtil.createSqoopSchema(avroSchema);
}
 
Example 17
Source File: TestHiveImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private void verifyHiveDataset(String tableName, Object[][] valsArray) {
  String datasetUri = String.format("dataset:hive:default/%s",
      tableName.toLowerCase());
  assertTrue(Datasets.exists(datasetUri));
  Dataset<GenericRecord> dataset = Datasets.load(datasetUri);
  assertFalse(dataset.isEmpty());

  DatasetReader<GenericRecord> reader = dataset.newReader();
  try {
    List<String> expectations = new ArrayList<String>();
    if (valsArray != null) {
      for (Object[] vals : valsArray) {
        expectations.add(Arrays.toString(vals));
      }
    }

    while (reader.hasNext() && expectations.size() > 0) {
      String actual = Arrays.toString(
          convertGenericRecordToArray(reader.next()));
      assertTrue("Expect record: " + actual, expectations.remove(actual));
    }
    assertFalse(reader.hasNext());
    assertEquals(0, expectations.size());
  } finally {
    reader.close();
  }
}
 
Example 18
Source File: DatasetTarget.java    From kite with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public boolean handleExisting(WriteMode writeMode, long lastModForSource,
    Configuration entries) {
  outputConf(
      DatasetKeyOutputFormat.KITE_WRITE_MODE,
      kiteWriteMode(writeMode).toString());

  if (view == null) {
    try {
      view = Datasets.load(uri);
    } catch (DatasetNotFoundException e) {
      LOG.info("Writing to new dataset/view: " + uri);
      return true;
    }
  }

  boolean ready = false;
  if (view instanceof Signalable) {
    ready = ((Signalable)view).isReady();
  }
  // a view exists if it isn't empty, or if it has been marked ready
  boolean exists = ready || !view.isEmpty();
  if (exists) {
    switch (writeMode) {
      case DEFAULT:
        LOG.error("Dataset/view " + view + " already exists!");
        throw new CrunchRuntimeException("Dataset/view already exists: " + view);
      case OVERWRITE:
        LOG.info("Overwriting existing dataset/view: " + view);
        break;
      case APPEND:
        LOG.info("Appending to existing dataset/view: " + view);
        break;
      case CHECKPOINT:
        long lastModForTarget = -1;
        if (view instanceof LastModifiedAccessor) {
          lastModForTarget = ((LastModifiedAccessor) view).getLastModified();
        }

        if (ready && (lastModForTarget > lastModForSource)) {
          LOG.info("Re-starting pipeline from checkpoint dataset/view: " + view);
          break;
        } else {
          if (!ready) {
            LOG.info("Checkpoint is not ready. Deleting data from existing " +
                "checkpoint dataset/view: " + view);
          } else {
            LOG.info("Source data has recent updates. Deleting data from existing " +
                "checkpoint dataset/view: " + view);
          }
          delete(view);
          return false;
        }
      default:
        throw new CrunchRuntimeException("Unknown WriteMode:  " + writeMode);
    }
  } else {
    LOG.info("Writing to empty dataset/view: " + view);
  }
  return exists;
}
 
Example 19
Source File: TestParquetImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private Dataset<GenericRecord> getDataset() {
  String uri = "dataset:file:" + getTablePath();
  return Datasets.load(uri, GenericRecord.class);
}
 
Example 20
Source File: ParquetJob.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Configure the import job. The import process will use a Kite dataset to
 * write data records into Parquet format internally. The input key class is
 * {@link org.apache.sqoop.lib.SqoopRecord}. The output key is
 * {@link org.apache.avro.generic.GenericRecord}.
 */
public static void configureImportJob(JobConf conf, Schema schema,
    String uri, WriteMode writeMode) throws IOException {
  Dataset dataset;

  // Add hive delegation token only if we don't already have one.
  if (uri.startsWith("dataset:hive")) {
    Configuration hiveConf = HiveConfig.getHiveConf(conf);
    if (isSecureMetastore(hiveConf)) {
      // Copy hive configs to job config
      HiveConfig.addHiveConfigs(hiveConf, conf);

      if (conf.getCredentials().getToken(new Text(HIVE_METASTORE_TOKEN_ALIAS)) == null) {
        addHiveDelegationToken(conf);
      }
    }
  }

  if (Datasets.exists(uri)) {
    if (WriteMode.DEFAULT.equals(writeMode)) {
      throw new IOException("Destination exists! " + uri);
    }

    dataset = Datasets.load(uri);
    Schema writtenWith = dataset.getDescriptor().getSchema();
    if (!SchemaValidationUtil.canRead(writtenWith, schema)) {
      throw new IOException(
          String.format("Expected schema: %s%nActual schema: %s",
              writtenWith, schema));
    }
  } else {
    dataset = createDataset(schema, getCompressionType(conf), uri);
  }
  conf.set(CONF_AVRO_SCHEMA, schema.toString());

  DatasetKeyOutputFormat.ConfigBuilder builder =
      DatasetKeyOutputFormat.configure(conf);
  if (WriteMode.OVERWRITE.equals(writeMode)) {
    builder.overwrite(dataset);
  } else if (WriteMode.APPEND.equals(writeMode)) {
    builder.appendTo(dataset);
  } else {
    builder.writeTo(dataset);
  }
}
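For context, a caller might configure an import roughly as follows. This is a sketch under assumptions: the schema literal and dataset URI are made up for illustration, and WriteMode refers to the enum used in the snippet above:

import org.apache.avro.Schema;
import org.apache.hadoop.mapred.JobConf;

JobConf conf = new JobConf();
// Hypothetical Avro schema for the imported records.
Schema schema = new Schema.Parser().parse(
    "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
        + "[{\"name\":\"id\",\"type\":\"long\"}]}");
// APPEND reuses an existing dataset when the schemas are read-compatible;
// DEFAULT would fail if the destination already exists.
ParquetJob.configureImportJob(conf, schema, "dataset:hive:default/users",
    WriteMode.APPEND);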