org.kitesdk.data.DatasetIOException Java Examples

The following examples show how to use org.kitesdk.data.DatasetIOException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: FileSystemDatasetRepository.java    From kite with Apache License 2.0 6 votes vote down vote up
public FileSystemDatasetRepository(
    Configuration conf, Path rootDirectory, MetadataProvider provider) {
  Preconditions.checkNotNull(conf, "Configuration cannot be null");
  Preconditions.checkNotNull(rootDirectory, "Root directory cannot be null");
  Preconditions.checkNotNull(provider, "Metadata provider cannot be null");

  try {
    this.fs = rootDirectory.getFileSystem(conf);
  } catch (IOException e) {
    throw new DatasetIOException(
        "Cannot get FileSystem for repository location: " + rootDirectory, e);
  }

  this.conf = conf;
  this.rootDirectory = fs.makeQualified(rootDirectory);
  this.repositoryUri = URI.create("repo:" + this.rootDirectory.toUri());
  this.metadataProvider = provider;
}
 
Example #2
Source File: JSONFileReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Closes this reader. Does nothing unless the reader is currently open.
 *
 * @throws DatasetIOException if the incoming stream cannot be closed
 */
@Override
public void close() {
  if (state.equals(ReaderWriterState.OPEN)) {
    LOG.debug("Closing reader on path:{}", path);

    // drop the iterator reference, then close the input
    iterator = null;
    try {
      incoming.close();
    } catch (IOException e) {
      throw new DatasetIOException("Unable to close reader path:" + path, e);
    }

    state = ReaderWriterState.CLOSED;
  }
}
 
Example #3
Source File: BaseEntityBatch.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Obtains an HTable from the given pool and configures it for batched puts
 * by disabling auto-flush and setting the write buffer size.
 *
 * @param clientTemplate
 *          The client template to use
 * @param entityMapper
 *          The EntityMapper to use for mapping
 * @param pool
 *          The HBase table pool
 * @param tableName
 *          The name of the HBase table
 * @param writeBufferSize
 *          The batch buffer size in bytes.
 * @throws DatasetIOException
 *           if setting the write buffer size fails with an IOException
 */
public BaseEntityBatch(HBaseClientTemplate clientTemplate,
    EntityMapper<E> entityMapper, HTablePool pool, String tableName,
    long writeBufferSize) {
  this.clientTemplate = clientTemplate;
  this.entityMapper = entityMapper;
  this.state = ReaderWriterState.NEW;

  this.table = pool.getTable(tableName);
  this.table.setAutoFlush(false);

  // Per the original note: if writeBufferSize is smaller than the current
  // buffer size, HBase flushes the buffer automatically. That should never
  // happen here because the table is fresh from the pool and its write
  // buffer should be empty.
  try {
    this.table.setWriteBufferSize(writeBufferSize);
  } catch (IOException e) {
    throw new DatasetIOException("Error flushing commits for table ["
        + table + "]", e);
  }
}
 
Example #4
Source File: Loader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Registers the {@code s3n} and {@code s3a} URI patterns with a shared
 * repository builder.
 *
 * @throws DatasetIOException if the local file system cannot be obtained
 */
@Override
public void load() {
  try {
    // load hdfs-site.xml by loading HdfsConfiguration
    FileSystem.getLocal(DefaultConfiguration.get());
  } catch (IOException e) {
    throw new DatasetIOException("Cannot load default config", e);
  }

  final OptionBuilder<DatasetRepository> repoBuilder = new URIBuilder();

  // username and secret are the same; host is the bucket
  final URIPattern s3nRepo = new URIPattern("s3n:/*path");
  final URIPattern s3nDataset = new URIPattern("s3n:/*path/:namespace/:dataset");
  Registration.register(s3nRepo, s3nDataset, repoBuilder);

  final URIPattern s3aRepo = new URIPattern("s3a:/*path");
  final URIPattern s3aDataset = new URIPattern("s3a:/*path/:namespace/:dataset");
  Registration.register(s3aRepo, s3aDataset, repoBuilder);
}
 
Example #5
Source File: Loader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a {@code FileSystemDatasetRepository} from matched URI options.
 *
 * <p>The {@code "path"} option, when present and non-empty, is resolved
 * against {@code "/"}; otherwise the root is {@code "/"}.
 *
 * @param match the options matched from the repository URI
 * @return a repository rooted at the qualified root path
 * @throws DatasetIOException if the file system cannot be obtained
 */
@Override
public DatasetRepository getFromOptions(Map<String, String> match) {
  String path = match.get("path");
  final Path root = (path == null || path.isEmpty()) ?
      new Path("/") : new Path("/", path);

  Configuration conf = DefaultConfiguration.get();
  FileSystem fs;
  try {
    fs = FileSystem.get(fileSystemURI(match), conf);
  } catch (IOException e) {
    // "Incomplete HDFS URI, no host" => add a helpful suggestion.
    // An IOException may carry no message, so guard against null to avoid
    // replacing the real failure with a NullPointerException.
    String message = e.getMessage();
    if (message != null && message.startsWith("Incomplete")) {
      throw new DatasetIOException("Could not get a FileSystem: " +
          "make sure the credentials for " + match.get(URIPattern.SCHEME) +
          " URIs are configured.", e);
    }
    throw new DatasetIOException("Could not get a FileSystem", e);
  }
  return new FileSystemDatasetRepository.Builder()
      .configuration(new Configuration(conf)) // make a modifiable copy
      .rootDirectory(fs.makeQualified(root))
      .build();
}
 
Example #6
Source File: FileSystemView.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the most recent modification time found among the entries listed
 * in each directory of this view, or the view's ready timestamp if that is
 * more recent.
 *
 * @return the latest timestamp found, or -1 if none was found
 * @throws DatasetIOException if a directory cannot be listed
 */
@Override
public long getLastModified() {
  long lastMod = -1;
  for (Iterator<Path> i = dirIterator(); i.hasNext(); ) {
    Path dir = i.next();
    try {
      for (FileStatus st : fs.listStatus(dir)) {
        lastMod = Math.max(lastMod, st.getModificationTime());
      }
    } catch (IOException e) {
      throw new DatasetIOException("Cannot find last modified time of " + dir, e);
    }
  }

  // if view was marked ready more recently count it as the modified time
  if (signalManager != null) {
    lastMod = Math.max(lastMod,
        signalManager.getReadyTimestamp(getConstraints()));
  }

  return lastMod;
}
 
Example #7
Source File: CSVFileReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Closes the underlying reader if this reader is open; otherwise a no-op.
 *
 * @throws DatasetIOException if closing the underlying reader fails
 */
@Override
public void close() {
  final boolean open = state.equals(ReaderWriterState.OPEN);
  if (open) {
    LOG.debug("Closing reader on path:{}", path);

    try {
      reader.close();
    } catch (IOException e) {
      throw new DatasetIOException("Unable to close reader path:" + path, e);
    }

    // only reached when the close succeeded
    state = ReaderWriterState.CLOSED;
  }
}
 
Example #8
Source File: ParquetFileSystemDatasetReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the Parquet reader for this path and advances to the first record.
 * May only be called while the reader is in the NEW state.
 *
 * @throws IllegalStateException if the reader is not in the NEW state
 * @throws DatasetIOException if the reader cannot be created
 */
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
    "A reader may not be opened more than once - current state:%s", state);

  LOG.debug("Opening reader on path:{}", path);

  try {
    final Configuration readConf = fileSystem.getConf();
    // register the read schema on the configuration handed to the reader
    AvroReadSupport.setAvroReadSchema(readConf, readerSchema);
    reader = new AvroParquetReader<E>(readConf, fileSystem.makeQualified(path));
  } catch (IOException e) {
    throw new DatasetIOException("Unable to create reader path:" + path, e);
  }

  advance();

  state = ReaderWriterState.OPEN;
}
 
Example #9
Source File: ParquetFileSystemDatasetReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Closes the underlying reader if this reader is open; otherwise a no-op.
 * On failure the reader is moved to the ERROR state before the exception is
 * thrown.
 *
 * @throws DatasetIOException if closing the underlying reader fails
 */
@Override
public void close() {
  if (state.equals(ReaderWriterState.OPEN)) {
    LOG.debug("Closing reader on path:{}", path);

    try {
      reader.close();
    } catch (IOException e) {
      // record the failure before propagating it
      this.state = ReaderWriterState.ERROR;
      throw new DatasetIOException("Unable to close reader path:" + path, e);
    }

    state = ReaderWriterState.CLOSED;
  }
}
 
Example #10
Source File: TestPartitionStrategyParser.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that an IOException raised while reading the input stream is
 * surfaced by the parser as a DatasetIOException.
 */
@Test
public void testInputStreamIOException() {
  // a stream whose every read fails
  final InputStream failingStream = new InputStream() {
    @Override
    public int read() throws IOException {
      throw new IOException("InputStream angry.");
    }
  };

  final Runnable parseAttempt = new Runnable() {
    @Override
    public void run() {
      PartitionStrategyParser.parse(failingStream);
    }
  };

  TestHelpers.assertThrows("Should pass DatasetIOException",
      DatasetIOException.class, parseAttempt);
}
 
Example #11
Source File: TestColumnMappingParser.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that an IOException raised while reading the input stream is
 * surfaced by the parser as a DatasetIOException.
 */
@Test
public void testInputStreamIOException() {
  // a stream whose every read fails
  final InputStream failingStream = new InputStream() {
    @Override
    public int read() throws IOException {
      throw new IOException("InputStream angry.");
    }
  };

  final Runnable parseAttempt = new Runnable() {
    @Override
    public void run() {
      ColumnMappingParser.parse(failingStream);
    }
  };

  TestHelpers.assertThrows("Should pass DatasetIOException",
      DatasetIOException.class, parseAttempt);
}
 
Example #12
Source File: SchemaManager.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the schema manager for the given schema directory if that directory
 * exists. Returns <code>null</code> if it does not.
 *
 * @param conf the configuration used to obtain the file system
 * @param schemaDirectory directory in which the manager stores schemas.
 *
 * @return a schema manager instance, or <code>null</code> if the given
 * directory does not exist.
 *
 * @throws DatasetIOException if the file system cannot be obtained or the
 * existence check fails
 */
public static SchemaManager load(Configuration conf, Path schemaDirectory) {
  try {
    final FileSystem schemaFs = schemaDirectory.getFileSystem(conf);
    return schemaFs.exists(schemaDirectory)
        ? new SchemaManager(schemaDirectory, schemaFs)
        : null;
  } catch (IOException e) {
    throw new DatasetIOException ("Cannot load schema manager at:"
            + schemaDirectory, e);
  }
}
 
Example #13
Source File: TestFileSystemDatasetReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Initializing a reader over an empty file is expected to fail with a
 * DatasetIOException; the file is removed afterwards either way.
 */
@Test(expected = DatasetIOException.class)
public void testEmptyFile() throws IOException {
  final Path target = new Path("/tmp/empty-file.avro");

  // created before entering the try: if creation fails, the cleanup in the
  // finally block must not run
  final boolean created = fileSystem.createNewFile(target);
  Assert.assertTrue("Failed to create a new empty file", created);

  try {
    AbstractDatasetReader<String> emptyReader = new FileSystemDatasetReader<String>(
        fileSystem, target, STRING_SCHEMA, String.class);

    // construction alone must succeed; the failure is expected from initialize()
    Assert.assertNotNull(emptyReader);

    emptyReader.initialize();
  } finally {
    final boolean removed = fileSystem.delete(target, true);
    Assert.assertTrue("Failed to clean up empty file", removed);
  }
}
 
Example #14
Source File: TaskUtil.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Adds each of the given jar paths to the distributed cache. Does nothing
 * when the distributed cache is being skipped or {@code jars} is null.
 *
 * @param jars A list of jar paths
 * @return this for method chaining
 * @throws IllegalArgumentException if a path does not exist, is not a file,
 *         or cannot be read
 * @throws DatasetIOException if a jar cannot be added to the cache
 *
 * @since 0.16.0
 */
public ConfigBuilder addJars(List<String> jars) {
  if (skipDistributedCache || jars == null) {
    return this;
  }

  for (String jarPath : jars) {
    final File jarFile = new File(jarPath);

    // validate the local file before handing it to the distributed cache
    Preconditions.checkArgument(jarFile.exists(),
        "Jar files does not exist: " + jarPath);
    Preconditions.checkArgument(jarFile.isFile(),
        "Not a file: " + jarPath);
    Preconditions.checkArgument(jarFile.canRead(),
        "Cannot read jar file: " + jarPath);

    try {
      DistCache.addJarToDistributedCache(conf, jarFile);
    } catch (IOException e) {
      throw new DatasetIOException(
          "Cannot add jar to distributed cache: " + jarPath, e);
    }
  }
  return this;
}
 
Example #15
Source File: InputFormatReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Marks this reader closed and closes the current record reader, if any.
 * Does nothing unless the reader is currently open.
 *
 * @throws DatasetIOException if the current record reader cannot be closed
 */
@Override
public void close() {
  if (state.equals(ReaderWriterState.OPEN)) {
    // the state is flipped before the close attempt
    this.state = ReaderWriterState.CLOSED;

    if (currentReader != null) {
      try {
        currentReader.close();
      } catch (IOException e) {
        throw new DatasetIOException("Unable to close reader path:" + path, e);
      }
    }

    this.hasNext = false;
  }
}
 
Example #16
Source File: SignalManager.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Looks up when the given constraints were last signaled as ready.
 *
 * @param viewConstraints The constraints to check for a signal.
 *
 * @return the modification time of the signal path for the constraints, or
 *          -1 if no signal path exists.
 *
 * @throws DatasetIOException if the signal path could not be accessed.
 */
public long getReadyTimestamp(Constraints viewConstraints) {
  final Path signalPath =
      new Path(signalDirectory, getNormalizedConstraints(viewConstraints));

  try {
    return rootFileSystem.getFileStatus(signalPath).getModificationTime();
  } catch (final FileNotFoundException missing) {
    // the signal path doesn't exist: the constraints were never signaled
    return -1;
  } catch (IOException e) {
    throw new DatasetIOException("Could not access signal path: " + signalPath, e);
  }
}
 
Example #17
Source File: FileSystemDataset.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Validates the builder state and creates the dataset. If no FileSystem was
 * supplied, one is obtained from the descriptor's location using the
 * configuration.
 *
 * @return a new dataset whose directory is the qualified descriptor location
 * @throws IllegalStateException if a required builder property is missing
 * @throws DatasetIOException if the FileSystem cannot be obtained
 */
public FileSystemDataset<E> build() {
  Preconditions.checkState(this.namespace != null, "No namespace defined");
  Preconditions.checkState(this.name != null, "No dataset name defined");
  Preconditions.checkState(this.descriptor != null,
    "No dataset descriptor defined");
  Preconditions.checkState(!(conf == null && fileSystem == null),
      "Configuration or FileSystem must be set");
  Preconditions.checkState(type != null, "No type specified");

  this.directory = new Path(descriptor.getLocation().toString());

  // fall back to resolving the file system from the dataset location
  if (fileSystem == null) {
    try {
      this.fileSystem = directory.getFileSystem(conf);
    } catch (IOException ex) {
      throw new DatasetIOException("Cannot access FileSystem", ex);
    }
  }

  return new FileSystemDataset<E>(
      fileSystem, fileSystem.makeQualified(directory), namespace, name,
      descriptor, uri, partitionKey, partitionListener, type);
}
 
Example #18
Source File: PartitionStrategyParser.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Serializes the given partition strategy to a JSON string.
 *
 * @param strategy the strategy to serialize
 * @param pretty whether to use the generator's default pretty printer
 * @return the JSON text
 * @throws DatasetIOException if writing to the JSON generator fails
 */
public static String toString(PartitionStrategy strategy, boolean pretty) {
  final StringWriter out = new StringWriter();
  try {
    final JsonGenerator generator = new JsonFactory().createGenerator(out);
    if (pretty) {
      generator.useDefaultPrettyPrinter();
    }
    generator.setCodec(new ObjectMapper());
    generator.writeTree(toJson(strategy));
    generator.close();
  } catch (IOException e) {
    throw new DatasetIOException("Cannot write to JSON generator", e);
  }
  return out.toString();
}
 
Example #19
Source File: FileSystemDataset.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Recursively deletes the directory of the partition identified by the key.
 *
 * @param key the key of the partition to drop; must not be null
 * @throws IllegalStateException if the dataset is not partitioned
 * @throws DatasetIOException if the delete fails or reports that nothing
 *         was deleted
 */
@Override
@SuppressWarnings("deprecation")
public void dropPartition(PartitionKey key) {
  Preconditions.checkState(descriptor.isPartitioned(),
    "Attempt to drop a partition on a non-partitioned dataset (name:%s)",
    name);
  Preconditions.checkNotNull(key, "Partition key may not be null");

  LOG.debug("Dropping partition with key:{} dataset:{}", key, name);

  final Path target = toDirectoryName(directory, key);

  try {
    final boolean deleted = fileSystem.delete(target, true);
    if (!deleted) {
      // reported through the same wrapping path as a failed delete call
      throw new IOException("Partition directory " + target
        + " for key " + key + " does not exist");
    }
  } catch (IOException e) {
    throw new DatasetIOException("Unable to locate or drop dataset partition directory " + target, e);
  }
}
 
Example #20
Source File: ColumnMappingParser.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Serializes the given column mapping to a JSON string.
 *
 * @param mapping the column mapping to serialize
 * @param pretty whether to use the generator's default pretty printer
 * @return the JSON text
 * @throws DatasetIOException if writing to the JSON generator fails
 */
public static String toString(ColumnMapping mapping, boolean pretty) {
  final StringWriter out = new StringWriter();
  try {
    final JsonGenerator generator = new JsonFactory().createGenerator(out);
    if (pretty) {
      generator.useDefaultPrettyPrinter();
    }
    generator.setCodec(new ObjectMapper());
    generator.writeTree(toJson(mapping));
    generator.close();
  } catch (IOException e) {
    throw new DatasetIOException("Cannot write to JSON generator", e);
  }
  return out.toString();
}
 
Example #21
Source File: FileSystemDatasetReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Closes the underlying reader if this reader is open; otherwise a no-op.
 *
 * @throws DatasetIOException if closing the underlying reader fails
 */
@Override
public void close() {
  if (state.equals(ReaderWriterState.OPEN)) {
    LOG.debug("Closing reader on path:{}", path);

    try {
      reader.close();
    } catch (IOException e) {
      throw new DatasetIOException("Unable to close reader path:" + path, e);
    }

    // only reached when the close succeeded
    state = ReaderWriterState.CLOSED;
  }
}
 
Example #22
Source File: FileSystemDatasetReader.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the Avro data file at this reader's path. May only be called while
 * the reader is in the NEW state.
 *
 * <p>If opening fails after the input has been opened, the input is closed
 * before the failure is reported so the stream is not leaked.
 *
 * @throws IllegalStateException if the reader is not in the NEW state
 * @throws DatasetIOException if the file cannot be opened or read
 */
@Override
public void initialize() {
  Preconditions.checkState(state.equals(ReaderWriterState.NEW),
    "A reader may not be opened more than once - current state:%s", state);

  LOG.debug("Opening reader on path:{}", path);

  AvroFSInput input = null;
  try {
    // look up the length first so a failure here cannot strand an open stream
    final long length = fileSystem.getFileStatus(path).getLen();
    input = new AvroFSInput(fileSystem.open(path), length);
    reader = new DataFileReader<E>(input,
        DataModelUtil.getDatumReaderForType(type, schema));
  } catch (IOException e) {
    if (input != null) {
      try {
        input.close();
      } catch (IOException closeError) {
        // the original failure is the one worth reporting
        LOG.debug("Unable to close input after failed open on path:{}",
            path, closeError);
      }
    }
    throw new DatasetIOException("Unable to create reader path:" + path, e);
  }

  state = ReaderWriterState.OPEN;
}
 
Example #23
Source File: SchemaManager.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the path of the schema file that sorts last under
 * FileNameComparator, or null if the schema directory lists no files.
 *
 * @throws DatasetIOException if the schema directory cannot be listed
 */
private Path newestFile() {
  try {
    final FileStatus[] candidates = rootFileSystem.listStatus(schemaDirectory);
    final int count = candidates.length;

    if (count > 0) {
      // after sorting, the newest schema file is the last element
      Arrays.sort(candidates, new FileNameComparator());
      return candidates[count - 1].getPath();
    }

    // no schema files exist
    return null;
  } catch (IOException e) {
    throw new DatasetIOException("Unable to list schema files.", e);
  }
}
 
Example #24
Source File: ColumnMappingParser.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Serializes the given field mapping to a JSON string.
 *
 * @param mapping the field mapping to serialize
 * @return the JSON text
 * @throws DatasetIOException if writing to the JSON generator fails
 */
public static String toString(FieldMapping mapping) {
  final StringWriter out = new StringWriter();
  try {
    final JsonGenerator generator = new JsonFactory().createGenerator(out);
    generator.setCodec(new ObjectMapper());
    generator.writeTree(toJson(mapping));
    generator.close();
  } catch (IOException e) {
    throw new DatasetIOException("Cannot write to JSON generator", e);
  }
  return out.toString();
}
 
Example #25
Source File: BaseEntityBatch.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Flushes pending commits, re-enables auto-flush, and closes the table.
 * Does nothing unless this batch is currently open.
 *
 * @throws DatasetIOException if flushing or closing the table fails
 */
@Override
public void close() {
  if (!state.equals(ReaderWriterState.OPEN)) {
    return;
  }

  try {
    table.flushCommits();
    // restore auto-flush before closing the table
    table.setAutoFlush(true);
    table.close();
  } catch (IOException e) {
    throw new DatasetIOException("Error closing table [" + table + "]", e);
  }

  state = ReaderWriterState.CLOSED;
}
 
Example #26
Source File: HiveUtils.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the FileSystem for the given path, converting any IOException
 * into an unchecked DatasetIOException.
 *
 * @param conf the configuration used to obtain the file system
 * @param path the path whose file system is wanted
 * @return the file system for {@code path}
 * @throws DatasetIOException if the file system cannot be obtained
 */
static FileSystem fsForPath(Configuration conf, Path path) {
  final FileSystem resolved;
  try {
    resolved = path.getFileSystem(conf);
  } catch (IOException ex) {
    throw new DatasetIOException("Cannot access FileSystem for uri:" + path, ex);
  }
  return resolved;
}
 
Example #27
Source File: Loader.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@code HiveManagedDatasetRepository} from matched URI options.
 *
 * <p>When the {@code "absolute"} option is true, the {@code "path"} option
 * is resolved against {@code "/"} (or the root is {@code "/"} if the path is
 * missing or empty). Otherwise the path is used as given, defaulting to
 * {@code "."}.
 *
 * @param match the options matched from the repository URI
 * @return a repository rooted at the qualified root path
 * @throws DatasetIOException if the file system cannot be obtained
 */
@Override
public DatasetRepository getFromOptions(Map<String, String> match) {
  LOG.debug("External URI options: {}", match);
  final Path root;
  String path = match.get("path");
  if (match.containsKey("absolute")
      && Boolean.valueOf(match.get("absolute"))) {
    root = (path == null || path.isEmpty()) ? new Path("/") : new Path("/", path);
  } else {
    root = (path == null || path.isEmpty()) ? new Path(".") : new Path(path);
  }

  // make a modifiable copy (it may be changed)
  Configuration conf = newHiveConf(DefaultConfiguration.get());
  FileSystem fs;
  try {
    fs = FileSystem.get(fileSystemURI(match, conf), conf);
  } catch (IOException e) {
    // "Incomplete HDFS URI, no host" => add a helpful suggestion.
    // An IOException may carry no message, so guard against null to avoid
    // replacing the real failure with a NullPointerException.
    String message = e.getMessage();
    if (message != null && message.startsWith("Incomplete")) {
      throw new DatasetIOException("Could not get a FileSystem: " +
          "make sure the default " + match.get(URIPattern.SCHEME) +
          " URI is configured.", e);
    }
    throw new DatasetIOException("Could not get a FileSystem", e);
  }

  // setup the MetaStore URI
  setMetaStoreURI(conf, match);

  return new HiveManagedDatasetRepository.Builder()
      .configuration(conf)
      .rootDirectory(fs.makeQualified(root))
      .build();
}
 
Example #28
Source File: TaskUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Locates the jar containing the given class and adds it to the distributed
 * cache configuration. Does nothing when the distributed cache is skipped.
 *
 * @param requiredClass a class required for a MR job
 * @return this for method chaining
 * @throws DatasetIOException if the jar cannot be added to the cache
 */
public ConfigBuilder addJarForClass(Class<?> requiredClass) {
  if (skipDistributedCache) {
    return this;
  }

  final File containingJar = findJarForClass(requiredClass);
  try {
    DistCache.addJarToDistributedCache(conf, containingJar);
  } catch (IOException e) {
    throw new DatasetIOException(
        "Cannot add jar to distributed cache: " + containingJar, e);
  }
  return this;
}
 
Example #29
Source File: FileSystemDatasetRepository.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the FileSystem for the given data path, converting any
 * IOException into an unchecked DatasetIOException.
 *
 * @param dataPath the path whose file system is wanted
 * @param conf the configuration used to obtain the file system
 * @return the file system for {@code dataPath}
 * @throws DatasetIOException if the file system cannot be obtained
 */
private static FileSystem fsForPath(Path dataPath, Configuration conf) {
  final FileSystem resolved;
  try {
    resolved = dataPath.getFileSystem(conf);
  } catch (IOException ex) {
    throw new DatasetIOException(
        "Cannot get FileSystem for descriptor", ex);
  }
  return resolved;
}
 
Example #30
Source File: HiveExternalMetadataProvider.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a metadata provider rooted at the given directory.
 *
 * @param conf the configuration passed to the superclass and used to obtain
 *        the root file system
 * @param rootDirectory the root directory; must not be null
 * @throws DatasetIOException if the file system for the root cannot be
 *         obtained
 */
public HiveExternalMetadataProvider(Configuration conf, Path rootDirectory) {
  super(conf);
  Preconditions.checkNotNull(rootDirectory, "Root cannot be null");

  try {
    this.rootFileSystem = rootDirectory.getFileSystem(conf);
  } catch (IOException ex) {
    throw new DatasetIOException("Could not get FileSystem for root path", ex);
  }

  // qualify the root against the file system that was just resolved
  this.rootDirectory = this.rootFileSystem.makeQualified(rootDirectory);
}