org.apache.avro.file.SeekableInput Java Examples

The following examples show how to use org.apache.avro.file.SeekableInput. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroScanner.java    From tajo with Apache License 2.0 6 votes vote down vote up
/**
 * Prepares this scanner for reading: resolves the projection, allocates the output
 * tuple, and opens an Avro data file reader over the fragment's path.
 */
@Override
public void init() throws IOException {
  // No explicit projection requested: use everything schema.toArray() yields.
  if (targets == null) {
    targets = schema.toArray();
  }
  prepareProjection(targets);
  outTuple = new VTuple(projectionMap.length);

  final Schema resolvedSchema = AvroUtil.getAvroSchema(meta, conf);
  avroFields = resolvedSchema.getFields();

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>(resolvedSchema);
  final SeekableInput fragmentInput = new FsInput(fragment.getPath(), conf);
  dataFileReader = new DataFileReader<>(fragmentInput, recordReader);

  // The parent initialisation runs last, after the reader has been opened.
  super.init();
}
 
Example #2
Source File: AvroInputFormat.java    From stratosphere with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the given split and positions an Avro data file reader at the first sync
 * point at or after the split's start offset.
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	// Subclasses of SpecificRecordBase get a SpecificDatumReader; any other type a ReflectDatumReader.
	final boolean isSpecificRecord =
		org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType);
	final DatumReader<E> reader;
	if (isSpecificRecord) {
		reader = new SpecificDatumReader<E>(avroValueType);
	} else {
		reader = new ReflectDatumReader<E>(avroValueType);
	}

	LOG.info("Opening split " + split);

	// NOTE(review): the split length is narrowed to int here; confirm splits never exceed Integer.MAX_VALUE bytes.
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

	dataFileReader = DataFileReader.openReader(seekable, reader);
	dataFileReader.sync(split.getStart());
}
 
Example #3
Source File: AvroInputFormat.java    From stratosphere with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the given split, instantiates the value wrapper, and positions an Avro data
 * file reader at the first sync point at or after the split's start offset.
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	this.wrapper = InstantiationUtil.instantiate(avroWrapperTypeClass, AvroBaseValue.class);

	// Subclasses of SpecificRecordBase get a SpecificDatumReader; any other type a ReflectDatumReader.
	final boolean isSpecificRecord =
		org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType);
	final DatumReader<E> reader;
	if (isSpecificRecord) {
		reader = new SpecificDatumReader<E>(avroValueType);
	} else {
		reader = new ReflectDatumReader<E>(avroValueType);
	}

	LOG.info("Opening split " + split);

	// NOTE(review): the split length is narrowed to int here; confirm splits never exceed Integer.MAX_VALUE bytes.
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

	dataFileReader = DataFileReader.openReader(seekable, reader);
	dataFileReader.sync(split.getStart());

	// Start each split without a previously reused value.
	reuseAvroValue = null;
}
 
Example #4
Source File: SchemaCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Returns the Parquet schema of {@code source} rendered as a string.
 *
 * @param source location of the file to inspect
 * @return the string form of the schema stored in the file's metadata
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if the detected format is not Parquet
 */
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        // Close the file reader as soon as the schema string has been built so the
        // stream it opens is not leaked.
        try (ParquetFileReader parquetReader = new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)) {
          return parquetReader.getFileMetaData().getSchema().toString();
        }
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
 
Example #5
Source File: BaseCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Detects the format of {@code source} and returns an Avro schema derived from it.
 *
 * @param source location of the file to inspect
 * @return the schema obtained from a Parquet file, an Avro file, or a text file
 *     whose name ends with "avsc" or "json"
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if no branch produces a schema
 */
protected Schema getAvroSchema(String source) throws IOException {
  Formats.Format detected;
  try (SeekableInput seekable = openSeekable(source)) {
    detected = Formats.detectFormat((InputStream) seekable);
    seekable.seek(0);

    switch (detected) {
      case PARQUET:
        return Schemas.fromParquet(getConf(), qualifiedURI(source));
      case AVRO:
        return Schemas.fromAvro(open(source));
      case TEXT:
        if (source.endsWith("avsc")) {
          return Schemas.fromAvsc(open(source));
        }
        if (source.endsWith("json")) {
          return Schemas.fromJSON("json", open(source));
        }
        // Text with any other suffix is treated as unrecognised below.
        break;
      default:
        break;
    }

    throw new IllegalArgumentException(String.format(
        "Could not determine file format of %s.", source));
  }
}
 
Example #6
Source File: AvroInputFormat.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Opens an Avro data file reader for the file behind {@code split} and resets the
 * per-split bookkeeping fields ({@code end}, {@code recordsReadSinceLastSync}).
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	// Choose the datum reader from the configured value type.
	final DatumReader<E> reader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		reader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		reader = new SpecificDatumReader<E>(avroValueType);
	} else {
		reader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// The wrapper is handed the length reported for the whole file, not the split length.
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	DataFileReader<E> opened = (DataFileReader) DataFileReader.openReader(seekable, reader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", opened.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return opened;
}
 
Example #7
Source File: AvroRecordWriterTest.java    From data-highway with Apache License 2.0 6 votes vote down vote up
/** Writes one record through the writer under test and reads it back as an Avro data file. */
@Test
public void typical() throws Exception {
  Schema schema = SchemaBuilder.builder()
      .record("record")
      .fields()
      .requiredLong("id")
      .requiredString("name")
      .endRecord();
  Record expected = new GenericRecordBuilder(schema).set("id", 1L).set("name", "hello").build();

  // Write a single record into an in-memory buffer.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  RecordWriter writer = new Factory(CodecFactory.nullCodec()).create(schema, buffer);
  writer.write(expected);
  writer.close();

  // Read the bytes back and expect exactly that one record.
  DataFileReader<Record> reader = new DataFileReader<>(
      new SeekableByteArrayInput(buffer.toByteArray()),
      new GenericDatumReader<Record>(schema));
  assertThat(reader.next(), is(expected));
  assertThat(reader.hasNext(), is(false));
  reader.close();
}
 
Example #8
Source File: Purge.java    From Cubert with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a generic-record Avro reader for {@code filename}.
 *
 * @param filename path of the Avro data file
 * @param localFS when true the file is opened as a {@code java.io.File}; otherwise it
 *        is opened as an {@code FsInput} built from a {@code Path} and {@code conf}
 */
private DataFileReader<GenericRecord> createDataFileReader(String filename,
                                                           boolean localFS) throws IOException
{
    DatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();

    if (localFS)
    {
        return new DataFileReader<GenericRecord>(new File(filename), recordReader);
    }

    SeekableInput fsInput = new FsInput(new Path(filename), conf);
    return new DataFileReader<GenericRecord>(fsInput, recordReader);
}
 
Example #9
Source File: AvroInputFormat.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Opens an Avro data file reader for the file behind {@code split} and resets the
 * per-split bookkeeping fields ({@code end}, {@code recordsReadSinceLastSync}).
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	// Choose the datum reader from the configured value type.
	final DatumReader<E> reader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		reader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		reader = new SpecificDatumReader<E>(avroValueType);
	} else {
		reader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// The wrapper is handed the length reported for the whole file, not the split length.
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	DataFileReader<E> opened = (DataFileReader) DataFileReader.openReader(seekable, reader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", opened.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return opened;
}
 
Example #10
Source File: AvroInputFormat.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Opens an Avro data file reader for the file behind {@code split} and resets the
 * per-split bookkeeping fields ({@code end}, {@code recordsReadSinceLastSync}).
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	// Choose the datum reader from the configured value type.
	final DatumReader<E> reader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		reader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		reader = new SpecificDatumReader<E>(avroValueType);
	} else {
		reader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// The wrapper is handed the length reported for the whole file, not the split length.
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	DataFileReader<E> opened = (DataFileReader) DataFileReader.openReader(seekable, reader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", opened.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return opened;
}
 
Example #11
Source File: AvroToOrcRecordConverter.java    From datacollector with Apache License 2.0 6 votes vote down vote up
/**
 * Reads every record from {@code avroInputFile} and passes it to {@code addAvroRecord},
 * after initialising the writer for {@code orcOutputFile} with the file's Avro schema.
 *
 * @param avroInputFile seekable Avro data file to read
 * @param orcOutputFile path handed to {@code initializeWriter}
 * @throws IOException if reading the input fails
 */
public void convert(SeekableInput avroInputFile, Path orcOutputFile) throws IOException {
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (FileReader<GenericRecord> avroReader = DataFileReader.openReader(avroInputFile, datumReader)) {
    initializeWriter(avroReader.getSchema(), orcOutputFile);

    for (GenericRecord avroRecord : avroReader) {
      addAvroRecord(avroRecord);
    }

    // The writer is closed only after all records were added without an exception.
    closeWriter();
  }
}
 
Example #12
Source File: AvroUtilsTest.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record of the Avro data file at {@code path} into a list.
 *
 * @param path location of the Avro data file, resolved with a default {@code Configuration}
 * @return all records of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path)
    throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> reader1 = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();
  // try-with-resources releases the input and the reader even when opening or
  // iterating throws.
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader1)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
 
Example #13
Source File: AvroRecordInputFormat.java    From stratosphere with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the given split and positions a generic-record Avro reader at the first
 * sync point at or after the split's start offset.
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	final DatumReader<GenericRecord> genericReader = new GenericDatumReader<GenericRecord>();
	// NOTE(review): the split length is narrowed to int here; confirm splits never exceed Integer.MAX_VALUE bytes.
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

	LOG.info("Opening split " + split);

	dataFileReader = DataFileReader.openReader(seekable, genericReader);
	dataFileReader.sync(split.getStart());
}
 
Example #14
Source File: AvroUtil.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 5 votes vote down vote up
/**
 * Get the schema of AVRO files stored in a directory.
 *
 * <p>If {@code path} is a directory, the schema is read from the first listed entry
 * whose name does not start with "_" or "."; otherwise it is read from {@code path}
 * itself.
 *
 * @param path a directory of Avro files, or a single Avro file
 * @param conf configuration used to resolve the file system and open the file
 * @return the schema of the chosen file, or {@code null} if the directory has no
 *     eligible entries
 * @throws IOException if listing or reading fails
 */
public static Schema getAvroSchema(Path path, Configuration conf)
    throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  Path fileToTest;
  if (fs.isDirectory(path)) {
    FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
      }
    });
    if (fileStatuses.length == 0) {
      return null;
    }
    fileToTest = fileStatuses[0].getPath();
  } else {
    fileToTest = path;
  }

  SeekableInput input = new FsInput(fileToTest, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> fileReader = null;
  try {
    fileReader = DataFileReader.openReader(input, reader);
    return fileReader.getSchema();
  } finally {
    // Release the file on every path: through the reader once it exists, otherwise
    // directly through the input that failed to open as an Avro file.
    if (fileReader != null) {
      fileReader.close();
    } else {
      input.close();
    }
  }
}
 
Example #15
Source File: AvroRecordReader.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the Avro file behind the given split, syncs the reader to the split start,
 * and records the start and end positions for this split.
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  final FileSplit fileSplit = (FileSplit) genericSplit;
  final Configuration jobConf = context.getConfiguration();

  final SeekableInput seekable = new FsInput(fileSplit.getPath(), jobConf);
  this.reader = DataFileReader.openReader(seekable, new GenericDatumReader<T>());

  // Sync to the split start, then record where the reader actually landed.
  final long splitStart = fileSplit.getStart();
  reader.sync(splitStart);
  this.start = reader.tell();
  this.end = fileSplit.getStart() + fileSplit.getLength();
}
 
Example #16
Source File: AvroTestTools.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Read all avro records under {@code path} into a map from relative file name to
 * {@link RecordIterator}. Each file's reader is closed when its iterator is exhausted.
 *
 * @return an empty map if {@code path} does not exist
 */
@Override
public TreeMap<String, RecordIterator> readAllRecordsInBinaryDirectory(FileSystem fs, Path path)
    throws IOException {
  TreeMap<String, RecordIterator> recordsByFile = new TreeMap<>();
  if (fs.exists(path)) {
    PathFilter hiddenFilter = new HiddenFilter();
    for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, path, hiddenFilter)) {
      SeekableInput fileInput = new FsInput(fileStatus.getPath(), fs);
      final DataFileReader<GenericRecord> fileReader =
          new DataFileReader<>(fileInput, new GenericDatumReader<>());

      String relativeName = PathUtils.relativizePath(fileStatus.getPath(), path).toString();

      AbstractIterator<GenericRecord> records = new AbstractIterator<GenericRecord>() {
        @Override
        protected GenericRecord computeNext() {
          if (fileReader.hasNext()) {
            return fileReader.next();
          }
          // Exhausted: close the reader, logging (not propagating) a close failure.
          try {
            fileReader.close();
          } catch (IOException ioe) {
            log.error("Failed to close data file reader.", ioe);
          }
          endOfData();
          return null;
        }
      };
      recordsByFile.put(relativeName, new RecordIterator(fileReader.getSchema(), records));
    }
  }
  return recordsByFile;
}
 
Example #17
Source File: TestAvroExtractor.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record of the Avro data file at {@code path} into a list.
 *
 * @param path location of the Avro data file, resolved with a default {@code Configuration}
 * @return all records of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path)
    throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> reader1 = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();
  // try-with-resources releases the input and the reader even when opening or
  // iterating throws.
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader1)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
 
Example #18
Source File: AvroUtils.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Avro schema stored in an Avro data file.
 *
 * @param dataFile path of the Avro data file
 * @param fs file system whose configuration is used to open the file
 * @return the schema reported by the file's reader
 * @throws IOException if the file cannot be opened or read
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  try (SeekableInput fileInput = new FsInput(dataFile, fs.getConf());
      DataFileReader<GenericRecord> fileReader =
          new DataFileReader<>(fileInput, new GenericDatumReader<GenericRecord>())) {
    Schema fileSchema = fileReader.getSchema();
    return fileSchema;
  }
}
 
Example #19
Source File: AvroExternalTable.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Extracts the Avro schema from the first data file found under this table's data
 * location.
 */
private Schema getSchemaFromAvroDataFile() throws IOException {
  final String dataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
  LOG.info("Extracting schema for table " + this.name + " from avro data file " + dataFilePath);

  final SeekableInput fsInput = new HdfsReader(dataFilePath).getFsInput();
  // Only the file header is needed, so the reader is closed right after the schema is read.
  try (DataFileReader<Void> schemaReader =
      new DataFileReader<>(fsInput, new GenericDatumReader<Void>())) {
    return schemaReader.getSchema();
  }
}
 
Example #20
Source File: TestMerge.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 5 votes vote down vote up
/**
 * Scans the Avro file at {@code p} for a record that {@code valueMatches} accepts.
 *
 * @param fs not used by this method; the file is opened with a fresh {@code Configuration}
 * @param p path of the Avro data file to scan
 * @param record expected values passed to {@code valueMatches}
 * @return true as soon as a matching record is found, false if none matches
 * @throws IOException if the file cannot be opened or read
 */
private boolean checkAvroFileForLine(FileSystem fs, Path p, List<Integer> record)
    throws IOException {
  SeekableInput in = new FsInput(p, new Configuration());
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> reader = null;
  try {
    reader = DataFileReader.openReader(in, datumReader);
    reader.sync(0);

    while (reader.hasNext()) {
      if (valueMatches(reader.next(), record)) {
        return true;
      }
    }

    return false;
  } finally {
    // Release the file on every path: through the reader once it exists, otherwise
    // directly through the input that failed to open.
    if (reader != null) {
      reader.close();
    } else {
      in.close();
    }
  }
}
 
Example #21
Source File: AvroFileReader.java    From streamx with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Avro schema of the file at {@code path} and converts it with
 * {@code avroData.toConnectSchema}.
 *
 * @param conf configuration used to open the file
 * @param path location of the Avro data file
 * @return the converted schema
 * @throws IOException if the file cannot be opened or read
 */
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  DatumReader<Object> reader = new GenericDatumReader<>();
  org.apache.avro.Schema schema;
  // try-with-resources releases the input and the reader even when opening or
  // reading the header throws.
  try (SeekableInput input = new FsInput(path, conf);
      FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    schema = fileReader.getSchema();
  }
  return avroData.toConnectSchema(schema);
}
 
Example #22
Source File: AvroUtils.java    From Cubert with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the schema of the Avro data file behind {@code input}.
 *
 * When {@code PadDefaultNullsToSchema} is set, a new record schema is returned in
 * which every field whose schema is a UNION with NULL as its first branch gets a
 * null default; all other fields keep their existing default.
 */
public static Schema getSchema(SeekableInput input) throws IOException
{
    DataFileReader<GenericRecord> fileReader =
            new DataFileReader<GenericRecord>(input, new GenericDatumReader<GenericRecord>());
    Schema fileSchema = fileReader.getSchema();

    if (!PadDefaultNullsToSchema)
    {
        return fileSchema;
    }

    // copies of the original fields, with the default forced to null where applicable
    ArrayList<Field> nullPadded = new ArrayList<Field>();
    for (Field original: fileSchema.getFields())
    {
        Schema fieldSchema = original.schema();
        boolean nullFirstUnion = (fieldSchema != null)
            && (fieldSchema.getType().equals(Type.UNION))
            && (fieldSchema.getTypes().get(0).getType().equals(Type.NULL));

        JsonNode defaultValue;
        if (nullFirstUnion)
        {
            defaultValue = NullNode.getInstance();
        }
        else
        {
            defaultValue = original.defaultValue();
        }

        nullPadded.add(new Field(original.name(), fieldSchema, original.doc(), defaultValue));
    }

    Schema rebuilt = Schema.createRecord(fileSchema.getName(),
                                         fileSchema.getDoc(),
                                         fileSchema.getNamespace(),
                                         fileSchema.isError());
    rebuilt.setFields(nullPadded);
    return rebuilt;
}
 
Example #23
Source File: AvroFileReader.java    From streamx with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every datum of the Avro data file at {@code path} into a collection.
 *
 * @param conf configuration used to open the file
 * @param path location of the Avro data file
 * @return all data of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  ArrayList<Object> collection = new ArrayList<>();
  DatumReader<Object> reader = new GenericDatumReader<>();
  // try-with-resources releases the input and the reader even when opening or
  // iterating throws.
  try (SeekableInput input = new FsInput(path, conf);
      FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    for (Object object : fileReader) {
      collection.add(object);
    }
  }
  return collection;
}
 
Example #24
Source File: ClusterHdfsSource.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a preview batch from an Avro file: each record read is re-serialised as its
 * own in-memory Avro data file and paired with a key of the form
 * {@code <file path>::<record index>}.
 *
 * Reading stops when the file is exhausted, when the batch holds {@code batchSize}
 * entries, or when the running preview count (seeded from {@code previewBuffer.size()})
 * reaches {@code batchSize}.
 */
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize) throws IOException {
  int previewed = previewBuffer.size();
  Path avroPath = fileStatus.getPath();
  SeekableInput seekable = new FsInput(avroPath, hadoopConf);
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<Map.Entry> entries = new ArrayList<>();
  try (FileReader<GenericRecord> avroReader = DataFileReader.openReader(seekable, datumReader)) {
    int index = 0;
    while (avroReader.hasNext() && entries.size() < batchSize && previewed < batchSize) {
      GenericRecord current = avroReader.next();
      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
      DataFileWriter<GenericRecord> singleRecordWriter =
          new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(current.getSchema()));
      try {
        singleRecordWriter.create(current.getSchema(), bytes);
        singleRecordWriter.append(current);
      } finally {
        singleRecordWriter.close();
        bytes.close();
      }
      String key = avroPath.toUri().getPath() + "::" + index;
      entries.add(new Pair(key, bytes.toByteArray()));
      index++;
      previewed++;
    }
  }
  return entries;
}
 
Example #25
Source File: BaseCommand.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Opens {@code filename} as a seekable input.
 *
 * @param filename name of the file to open; qualified via {@code qualifiedPath}
 * @return a seekable stream over the file
 * @throws IOException if the file system or the file cannot be opened
 */
public SeekableInput openSeekable(String filename) throws IOException {
  final Path qualified = qualifiedPath(filename);
  // Qualification used the default FS, but the file may live elsewhere, so the
  // file system is resolved from the path itself.
  final FileSystem owningFs = qualified.getFileSystem(getConf());
  return new SeekableFSDataInputStream(owningFs, qualified);
}
 
Example #26
Source File: AvroConversionBaseMapper.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Converts one Avro input file into the output directory.
 *
 * <p>Records are written to a temporary file in {@code output}, which is then renamed
 * to the final name (the input name without a trailing ".avro", plus
 * {@code getOutputFileSuffix()}). Unless {@code KEEP_INPUT_FILE} is set, the input
 * file is deleted afterwards.
 *
 * @param input path of the Avro file to convert
 * @param output directory that receives the converted file
 * @param context job context supplying the configuration and counters
 * @throws IOException if the temporary file already exists and may not be overwritten,
 *     if reading or converting a record fails, or if the temporary file cannot be
 *     renamed to its final name
 */
@Override
protected void map(String input, String output, Context context) throws IOException, InterruptedException {
  FileSystem fs = FileSystem.get(context.getConfiguration());
  Configuration conf = context.getConfiguration();

  LOG.info("Converting input file: {}", input);
  LOG.info("Output directory: {}", output);
  Path inputPath = new Path(input);
  Path outputDir = new Path(output);
  fs.mkdirs(outputDir);

  Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName());
  if(fs.exists(tempFile)) {
    if(conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) {
      fs.delete(tempFile, true);
    } else {
      throw new IOException("Temporary file " + tempFile + " already exists.");
    }
  }
  LOG.info("Using temp file: {}", tempFile);

  // Output file is the same as input except of dropping .avro extension if it exists and appending .parquet or .orc
  String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix();
  Path finalFile = new Path(outputDir, outputFileName);
  LOG.info("Final path will be: {}", finalFile);

  // Avro reader; try-with-resources closes it (and the input) on every path, and
  // in particular before the input file may be deleted below.
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  try (SeekableInput seekableInput = new FsInput(inputPath, conf);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader)) {
    Schema avroSchema = fileReader.getSchema();

    initializeWriter(tempFile, avroSchema, conf, context);

    LOG.info("Started reading input file");
    long recordCount = 0;
    // This inner try must stay nested: the catch calls fileReader.tell(), which
    // needs the reader to still be open.
    try {
      while (fileReader.hasNext()) {
        GenericRecord record = fileReader.next();
        handleAvroRecord(record);

        context.getCounter(Counters.PROCESSED_RECORDS).increment(1);
        recordCount++;
      }
    } catch (Exception e) {
      // Various random stuff can happen while converting, so we wrap the underlying exception with more details
      String message = String.format(
          "Exception at offset %d (record %d): %s",
          fileReader.tell(),
          recordCount,
          e.toString()
      );
      throw new IOException(message, e);
    }
    LOG.info("Done reading input file");
    closeWriter();
  }

  LOG.info("Moving temporary file {} to final destination {}", tempFile, finalFile);
  // A failed rename must not be followed by deleting the input, or the data is lost.
  if (!fs.rename(tempFile, finalFile)) {
    throw new IOException("Could not move temporary file " + tempFile + " to final destination " + finalFile);
  }

  if(!context.getConfiguration().getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) {
    LOG.info("Removing input file {}", inputPath);
    fs.delete(inputPath, true);
  }

  LOG.info("Done converting input file into output directory {}", output);
}