org.apache.avro.file.SeekableInput Java Examples

Source File: AvroScanner.java From tajo with Apache License 2.0

6 votes

/**
 * Prepares this scanner for reading.
 *
 * <p>Resolves the projection targets (defaulting to {@code schema.toArray()} when none
 * were set), allocates the output tuple, loads the Avro schema and its fields, opens a
 * {@link DataFileReader} over the fragment's path, and finally delegates to
 * {@code super.init()}.
 *
 * @throws IOException if resolving the Avro schema or opening the data file fails
 */
@Override
public void init() throws IOException {
  // No explicit projection requested: use everything schema.toArray() returns.
  if (targets == null) {
    targets = schema.toArray();
  }
  prepareProjection(targets);
  outTuple = new VTuple(projectionMap.length);

  final Schema resolvedSchema = AvroUtil.getAvroSchema(meta, conf);
  avroFields = resolvedSchema.getFields();

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>(resolvedSchema);
  final SeekableInput fragmentInput = new FsInput(fragment.getPath(), conf);
  dataFileReader = new DataFileReader<>(fragmentInput, recordReader);

  super.init();
}

Source File: AvroInputFormat.java From stratosphere with Apache License 2.0

6 votes

/**
 * Opens the given split and creates an Avro data file reader synced to the split start.
 *
 * @param split the file split to read
 * @throws IOException if opening the split or the Avro reader fails
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	// Subtypes of SpecificRecordBase get the specific reader; every other type is read reflectively.
	final boolean specificRecord =
		org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType);
	final DatumReader<E> datumReader = specificRecord
		? new SpecificDatumReader<E>(avroValueType)
		: new ReflectDatumReader<E>(avroValueType);

	LOG.info("Opening split " + split);

	// NOTE(review): the (int) cast would truncate a split length above Integer.MAX_VALUE - confirm.
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

	dataFileReader = DataFileReader.openReader(seekable, datumReader);
	dataFileReader.sync(split.getStart());
}

Source File: AvroInputFormat.java From stratosphere with Apache License 2.0

6 votes

/**
 * Opens the given split, instantiates the Avro value wrapper, and creates an Avro data
 * file reader synced to the split start. The reusable Avro value is reset to {@code null}.
 *
 * @param split the file split to read
 * @throws IOException if opening the split or the Avro reader fails
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	this.wrapper = InstantiationUtil.instantiate(avroWrapperTypeClass, AvroBaseValue.class);

	// Subtypes of SpecificRecordBase get the specific reader; every other type is read reflectively.
	final boolean specificRecord =
		org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType);
	final DatumReader<E> datumReader = specificRecord
		? new SpecificDatumReader<E>(avroValueType)
		: new ReflectDatumReader<E>(avroValueType);

	LOG.info("Opening split " + split);

	// NOTE(review): the (int) cast would truncate a split length above Integer.MAX_VALUE - confirm.
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

	dataFileReader = DataFileReader.openReader(seekable, datumReader);
	dataFileReader.sync(split.getStart());

	reuseAvroValue = null;
}

Source File: SchemaCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Returns the schema of a Parquet file rendered as a string.
 *
 * @param source location of the file to inspect
 * @return the string form of the schema stored in the file's metadata
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if the detected format is not Parquet
 */
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        // The file reader is a closeable resource; release it as soon as the schema
        // string has been produced instead of leaving it open.
        try (ParquetFileReader reader = new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)) {
          return reader.getFileMetaData().getSchema().toString();
        }
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}

Source File: BaseCommand.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Builds an Avro schema for {@code source} based on its detected file format.
 *
 * <p>Parquet and Avro files are handled directly. Text files are handled only when the
 * name ends with {@code avsc} or {@code json}; any other case is rejected.
 *
 * @param source location of the file to inspect
 * @return the Avro schema derived from the file
 * @throws IOException if the file cannot be opened or read
 * @throws IllegalArgumentException if no schema can be derived for the detected format
 */
protected Schema getAvroSchema(String source) throws IOException {
  try (SeekableInput seekable = openSeekable(source)) {
    final Formats.Format detected = Formats.detectFormat((InputStream) seekable);
    seekable.seek(0);

    switch (detected) {
      case PARQUET:
        return Schemas.fromParquet(getConf(), qualifiedURI(source));
      case AVRO:
        return Schemas.fromAvro(open(source));
      case TEXT:
        if (source.endsWith("avsc")) {
          return Schemas.fromAvsc(open(source));
        }
        if (source.endsWith("json")) {
          return Schemas.fromJSON("json", open(source));
        }
        // Unrecognized text file: leave the switch and report the failure below.
        break;
      default:
        break;
    }

    throw new IllegalArgumentException(String.format(
        "Could not determine file format of %s.", source));
  }
}

Source File: AvroInputFormat.java From flink with Apache License 2.0

6 votes

/**
 * Creates the Avro data file reader for the given split and resets the per-split
 * bookkeeping ({@code end} and {@code recordsReadSinceLastSync}).
 *
 * @param split the split about to be read
 * @return a reader opened over the already-open {@code stream}
 * @throws IOException if the file status lookup or opening the reader fails
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	// Pick the datum reader flavour from the configured value type.
	final DatumReader<E> datumReader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		datumReader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		datumReader = new SpecificDatumReader<E>(avroValueType);
	} else {
		datumReader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// The wrapper is given the length of the whole file, not just of this split.
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	final DataFileReader<E> fileReader = (DataFileReader<E>) DataFileReader.openReader(seekable, datumReader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", fileReader.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return fileReader;
}

Source File: AvroRecordWriterTest.java From data-highway with Apache License 2.0

6 votes

/**
 * Writes a single record through a {@code RecordWriter} created by the factory and
 * verifies that reading the produced bytes back as an Avro data file yields exactly
 * that record.
 */
@Test
public void typical() throws Exception {
  Schema schema = SchemaBuilder
      .builder()
      .record("record")
      .fields()
      .requiredLong("id")
      .requiredString("name")
      .endRecord();
  Record value = new GenericRecordBuilder(schema).set("id", 1L).set("name", "hello").build();
  ByteArrayOutputStream output = new ByteArrayOutputStream();

  Factory factory = new Factory(CodecFactory.nullCodec());
  RecordWriter writer = factory.create(schema, output);
  writer.write(value);
  writer.close();

  SeekableInput input = new SeekableByteArrayInput(output.toByteArray());
  DatumReader<Record> datumReader = new GenericDatumReader<>(schema);
  // try-with-resources so the reader is closed even when an assertion fails.
  try (DataFileReader<Record> dataFileReader = new DataFileReader<>(input, datumReader)) {
    assertThat(dataFileReader.next(), is(value));
    assertThat(dataFileReader.hasNext(), is(false));
  }
}

Source File: Purge.java From Cubert with Apache License 2.0

6 votes

/**
 * Opens an Avro data file reader for generic records.
 *
 * @param filename path of the Avro file
 * @param localFS  {@code true} to open {@code filename} as a local {@link File};
 *                 {@code false} to open it through {@link FsInput} using {@code conf}
 * @return a reader over the file; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened
 */
private DataFileReader<GenericRecord> createDataFileReader(String filename,
                                                           boolean localFS) throws IOException
{
    final DatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();

    if (localFS)
    {
        return new DataFileReader<GenericRecord>(new File(filename), recordReader);
    }

    final Path remotePath = new Path(filename);
    final SeekableInput remoteInput = new FsInput(remotePath, conf);
    return new DataFileReader<GenericRecord>(remoteInput, recordReader);
}

Source File: AvroInputFormat.java From Flink-CEPplus with Apache License 2.0

6 votes

/**
 * Creates the Avro data file reader for the given split and resets the per-split
 * bookkeeping ({@code end} and {@code recordsReadSinceLastSync}).
 *
 * @param split the split about to be read
 * @return a reader opened over the already-open {@code stream}
 * @throws IOException if the file status lookup or opening the reader fails
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	// Pick the datum reader flavour from the configured value type.
	final DatumReader<E> datumReader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		datumReader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		datumReader = new SpecificDatumReader<E>(avroValueType);
	} else {
		datumReader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// The wrapper is given the length of the whole file, not just of this split.
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	final DataFileReader<E> fileReader = (DataFileReader<E>) DataFileReader.openReader(seekable, datumReader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", fileReader.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return fileReader;
}

Source File: AvroInputFormat.java From flink with Apache License 2.0

6 votes

/**
 * Creates the Avro data file reader for the given split and resets the per-split
 * bookkeeping ({@code end} and {@code recordsReadSinceLastSync}).
 *
 * @param split the split about to be read
 * @return a reader opened over the already-open {@code stream}
 * @throws IOException if the file status lookup or opening the reader fails
 */
private DataFileReader<E> initReader(FileInputSplit split) throws IOException {
	// Pick the datum reader flavour from the configured value type.
	final DatumReader<E> datumReader;
	if (org.apache.avro.generic.GenericRecord.class == avroValueType) {
		datumReader = new GenericDatumReader<E>();
	} else if (org.apache.avro.specific.SpecificRecordBase.class.isAssignableFrom(avroValueType)) {
		datumReader = new SpecificDatumReader<E>(avroValueType);
	} else {
		datumReader = new ReflectDatumReader<E>(avroValueType);
	}

	if (LOG.isInfoEnabled()) {
		LOG.info("Opening split {}", split);
	}

	// The wrapper is given the length of the whole file, not just of this split.
	final long fileLength = split.getPath().getFileSystem().getFileStatus(split.getPath()).getLen();
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, fileLength);
	final DataFileReader<E> fileReader = (DataFileReader<E>) DataFileReader.openReader(seekable, datumReader);

	if (LOG.isDebugEnabled()) {
		LOG.debug("Loaded SCHEMA: {}", fileReader.getSchema());
	}

	end = split.getStart() + split.getLength();
	recordsReadSinceLastSync = 0;
	return fileReader;
}

Source File: AvroToOrcRecordConverter.java From datacollector with Apache License 2.0

6 votes

/**
 * Reads every record from an Avro data file and feeds it to the writer set up for
 * {@code orcOutputFile}.
 *
 * @param avroInputFile the Avro input; closed together with the reader when this method exits
 * @param orcOutputFile destination passed to {@code initializeWriter}
 * @throws IOException if reading the input or writing the output fails
 */
public void convert(SeekableInput avroInputFile, Path orcOutputFile) throws IOException {
  final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  try (FileReader<GenericRecord> avroRecords = DataFileReader.openReader(avroInputFile, datumReader)) {
    initializeWriter(avroRecords.getSchema(), orcOutputFile);

    for (GenericRecord avroRecord : avroRecords) {
      addAvroRecord(avroRecord);
    }

    // NOTE(review): the writer is only closed on the success path - confirm that is intended.
    closeWriter();
  }
}

Source File: AvroUtilsTest.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Reads every record of an Avro data file into memory.
 *
 * @param path location of the Avro file, resolved with a default {@link Configuration}
 * @return all records of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path)
    throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();
  // try-with-resources releases the input and the reader even if opening or iterating fails.
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}

Source File: AvroRecordInputFormat.java From stratosphere with Apache License 2.0

5 votes

/**
 * Opens the given split and creates a generic-record Avro reader synced to the split start.
 *
 * @param split the file split to read
 * @throws IOException if opening the split or the Avro reader fails
 */
@Override
public void open(FileInputSplit split) throws IOException {
	super.open(split);

	final DatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();
	// NOTE(review): the (int) cast would truncate a split length above Integer.MAX_VALUE - confirm.
	final SeekableInput seekable = new FSDataInputStreamWrapper(stream, (int) split.getLength());

	LOG.info("Opening split " + split);

	dataFileReader = DataFileReader.openReader(seekable, recordReader);
	dataFileReader.sync(split.getStart());
}

Source File: AvroUtil.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Get the schema of AVRO files stored in a directory.
 *
 * <p>If {@code path} is a directory, the schema is read from the first listed entry whose
 * name does not start with {@code _} or {@code .}; otherwise {@code path} itself is read.
 *
 * @param path an Avro file, or a directory containing Avro files
 * @param conf configuration used to resolve the file system and open the file
 * @return the schema of the inspected file, or {@code null} if {@code path} is a
 *         directory with no eligible entries
 * @throws IOException if listing the directory or reading the file fails
 */
public static Schema getAvroSchema(Path path, Configuration conf)
    throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  Path fileToTest;
  if (fs.isDirectory(path)) {
    FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
      }
    });
    if (fileStatuses.length == 0) {
      return null;
    }
    fileToTest = fileStatuses[0].getPath();
  } else {
    fileToTest = path;
  }

  SeekableInput input = new FsInput(fileToTest, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> fileReader = null;
  try {
    fileReader = DataFileReader.openReader(input, reader);
    return fileReader.getSchema();
  } finally {
    // Always release the file handle: through the reader once it exists, otherwise
    // directly on the input that openReader failed to take over.
    if (fileReader != null) {
      fileReader.close();
    } else {
      input.close();
    }
  }
}

Source File: AvroRecordReader.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Opens the Avro file behind the split, syncs the reader to the split start, and records
 * the reader position after syncing as {@code start} and the split's end offset as {@code end}.
 *
 * @param genericSplit the split to read; must be a {@link FileSplit}
 * @param context      task context supplying the configuration
 * @throws IOException if the file cannot be opened or synced
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  final FileSplit fileSplit = (FileSplit) genericSplit;
  final Configuration jobConf = context.getConfiguration();

  final SeekableInput splitInput = new FsInput(fileSplit.getPath(), jobConf);
  final DatumReader<T> recordReader = new GenericDatumReader<T>();
  this.reader = DataFileReader.openReader(splitInput, recordReader);

  // Move to the sync point for the split start before recording the position.
  final long splitStart = fileSplit.getStart();
  reader.sync(splitStart);
  this.start = reader.tell();
  this.end = splitStart + fileSplit.getLength();
}

Source File: AvroTestTools.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Read all avro records in an HDFS location into a map from file name to {@link RecordIterator}.
 *
 * <p>Keys are file paths relative to {@code path}. Each iterator reads lazily from an open
 * data file reader and closes that reader only once it has been iterated to the end.
 *
 * @param fs   file system to read from
 * @param path root location to scan recursively
 * @return a sorted map of relative file name to record iterator; empty if {@code path} does not exist
 * @throws IOException if listing or opening a file fails
 */
@Override
public TreeMap<String, RecordIterator> readAllRecordsInBinaryDirectory(FileSystem fs, Path path)
    throws IOException {
  TreeMap<String, RecordIterator> recordsByFile = new TreeMap<>();
  if (!fs.exists(path)) {
    return recordsByFile;
  }

  PathFilter hiddenFilter = new HiddenFilter();
  for (FileStatus fileStatus : FileListUtils.listFilesRecursively(fs, path, hiddenFilter)) {
    Path filePath = fileStatus.getPath();
    SeekableInput fileInput = new FsInput(filePath, fs);
    final DataFileReader<GenericRecord> fileReader =
        new DataFileReader<>(fileInput, new GenericDatumReader<>());

    String relativeName = PathUtils.relativizePath(filePath, path).toString();

    AbstractIterator<GenericRecord> lazyRecords = new AbstractIterator<GenericRecord>() {
      @Override
      protected GenericRecord computeNext() {
        if (!fileReader.hasNext()) {
          // Exhausted: release the reader (best effort) and signal the end of iteration.
          try {
            fileReader.close();
          } catch (IOException ioe) {
            log.error("Failed to close data file reader.", ioe);
          }
          endOfData();
          return null;
        }
        return fileReader.next();
      }
    };
    recordsByFile.put(relativeName, new RecordIterator(fileReader.getSchema(), lazyRecords));
  }
  return recordsByFile;
}

Source File: TestAvroExtractor.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Reads every record of an Avro data file into memory.
 *
 * @param path location of the Avro file, resolved with a default {@link Configuration}
 * @return all records of the file, in file order
 * @throws IOException if the file cannot be opened or read
 */
public static List<GenericRecord> getRecordFromFile(String path)
    throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();
  // try-with-resources releases the input and the reader even if opening or iterating fails.
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}

Source File: AvroUtils.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Get Avro schema from an Avro data file.
 *
 * @param dataFile path of the Avro data file
 * @param fs       file system whose configuration is used to open the file
 * @return the schema reported by the data file reader
 * @throws IOException if the file cannot be opened or read
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  final GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  try (SeekableInput fileInput = new FsInput(dataFile, fs.getConf());
      DataFileReader<GenericRecord> fileReader = new DataFileReader<>(fileInput, datumReader)) {
    final Schema fileSchema = fileReader.getSchema();
    return fileSchema;
  }
}

Source File: AvroExternalTable.java From incubator-gobblin with Apache License 2.0

5 votes

/**
 * Extracts the Avro schema from the first data file found under {@code dataLocationInHdfs}.
 *
 * @return the schema reported by the data file reader
 * @throws IOException if the file cannot be located, opened or read
 */
private Schema getSchemaFromAvroDataFile() throws IOException {
  String firstDataFilePath = HdfsReader.getFirstDataFilePathInDir(this.dataLocationInHdfs);
  LOG.info("Extracting schema for table " + this.name + " from avro data file " + firstDataFilePath);

  // Manage the input as a resource too, so it is released even if constructing the
  // DataFileReader throws before the reader can take ownership of it.
  try (SeekableInput sin = new HdfsReader(firstDataFilePath).getFsInput();
      DataFileReader<Void> dfr = new DataFileReader<>(sin, new GenericDatumReader<Void>())) {
    return dfr.getSchema();
  }
}

Source File: TestMerge.java From aliyun-maxcompute-data-collectors with Apache License 2.0

5 votes

/**
 * Scans an Avro data file for a record that matches the expected values.
 *
 * @param fs     unused by this method; kept for signature compatibility
 * @param p      path of the Avro file, opened with a default {@link Configuration}
 * @param record expected values, compared against each record via {@code valueMatches}
 * @return {@code true} as soon as a matching record is found, {@code false} if none matches
 * @throws IOException if the file cannot be opened or read
 */
private boolean checkAvroFileForLine(FileSystem fs, Path p, List<Integer> record)
    throws IOException {
  SeekableInput in = new FsInput(p, new Configuration());
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  FileReader<GenericRecord> reader = null;
  try {
    reader = DataFileReader.openReader(in, datumReader);
    reader.sync(0);

    while (reader.hasNext()) {
      if (valueMatches(reader.next(), record)) {
        return true;
      }
    }

    return false;
  } finally {
    // Release the file handle on every exit path: through the reader once it exists,
    // otherwise directly on the input that openReader failed to take over.
    if (reader != null) {
      reader.close();
    } else {
      in.close();
    }
  }
}

Source File: AvroFileReader.java From streamx with Apache License 2.0

5 votes

@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  SeekableInput input = new FsInput(path, conf);
  DatumReader<Object> reader = new GenericDatumReader<>();
  FileReader<Object> fileReader = DataFileReader.openReader(input, reader);
  org.apache.avro.Schema schema = fileReader.getSchema();
  fileReader.close();
  return avroData.toConnectSchema(schema);
}

Source File: AvroUtils.java From Cubert with Apache License 2.0

5 votes

/**
 * Reads the schema of an Avro data file.
 *
 * <p>When {@code PadDefaultNullsToSchema} is set, returns a rebuilt record schema in which
 * every field whose schema is a union with NULL as its first branch gets a null default
 * value; all other fields keep their existing default.
 *
 * @param input the Avro data file to read the schema from
 * @return the file's schema, or the null-padded copy described above
 * @throws IOException if the data file header cannot be read
 */
public static Schema getSchema(SeekableInput input) throws IOException
{
    // NOTE(review): this reader is never closed; closing it would presumably also close
    // the caller's input - confirm which side owns the input.
    DataFileReader<GenericRecord> fileReader =
            new DataFileReader<GenericRecord>(input, new GenericDatumReader<GenericRecord>());
    Schema fileSchema = fileReader.getSchema();

    if (!PadDefaultNullsToSchema)
    {
        return fileSchema;
    }

    // copies of the original fields, with the default forced to null where applicable
    ArrayList<Field> paddedFields = new ArrayList<Field>();

    for (Field original: fileSchema.getFields())
    {
        Schema fieldSchema = original.schema();

        // pad only fields whose schema is a union that lists NULL first
        boolean nullFirstUnion = (fieldSchema != null)
            && (fieldSchema.getType().equals(Type.UNION))
            && (fieldSchema.getTypes().get(0).getType().equals(Type.NULL));

        JsonNode defaultValue;
        if (nullFirstUnion)
        {
            defaultValue = NullNode.getInstance();
        }
        else
        {
            defaultValue = original.defaultValue();
        }

        paddedFields.add(new Field(original.name(), fieldSchema, original.doc(), defaultValue));
    }

    Schema paddedSchema = Schema.createRecord(fileSchema.getName(),
                                              fileSchema.getDoc(),
                                              fileSchema.getNamespace(),
                                              fileSchema.isError());
    paddedSchema.setFields(paddedFields);
    return paddedSchema;
}

Source File: AvroFileReader.java From streamx with Apache License 2.0

5 votes

@Override
public Collection<Object> readData(Configuration conf, Path path) throws IOException {
  ArrayList<Object> collection = new ArrayList<>();
  SeekableInput input = new FsInput(path, conf);
  DatumReader<Object> reader = new GenericDatumReader<>();
  FileReader<Object> fileReader = DataFileReader.openReader(input, reader);
  for (Object object: fileReader) {
    collection.add(object);
  }
  fileReader.close();
  return collection;
}

Source File: ClusterHdfsSource.java From datacollector with Apache License 2.0

5 votes

/**
 * Builds a preview batch from an Avro file.
 *
 * <p>Each record read is re-serialized as its own single-record Avro data file (as bytes)
 * and paired with a key of the form {@code <file path>::<record index>}. Reading stops
 * when the file is exhausted, when the batch holds {@code batchSize} entries, or when the
 * running preview count (starting at {@code previewBuffer.size()}) reaches {@code batchSize}.
 *
 * @param fileStatus status of the Avro file to preview
 * @param batchSize  upper bound on the batch size and on the running preview count
 * @return the key/bytes pairs collected for the preview
 * @throws IOException if the file cannot be opened, read or re-serialized
 */
private List<Map.Entry> previewAvroBatch(FileStatus fileStatus, int batchSize) throws IOException {
  int previewCount = previewBuffer.size();
  Path filePath = fileStatus.getPath();
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  List<Map.Entry> batch = new ArrayList<>();
  // The input is managed as a resource as well, so it is released even if openReader fails.
  try (SeekableInput input = new FsInput(filePath, hadoopConf);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader)) {
    int count = 0;
    while (fileReader.hasNext() && batch.size() < batchSize && previewCount < batchSize) {
      GenericRecord datum = fileReader.next();
      try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
        // Closing the writer completes the data file before the bytes are captured below.
        try (DataFileWriter<GenericRecord> dataFileWriter =
            new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(datum.getSchema()))) {
          dataFileWriter.create(datum.getSchema(), out);
          dataFileWriter.append(datum);
        }
        batch.add(new Pair(filePath.toUri().getPath() + "::" + count, out.toByteArray()));
      }
      count++;
      previewCount++;
    }
  }
  return batch;
}

Source File: BaseCommand.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Opens {@code filename} as a seekable input.
 *
 * @param filename name of the file to open; qualified via {@code qualifiedPath}
 * @return a seekable stream over the file; the caller is responsible for closing it
 * @throws IOException if the file system cannot be resolved or the file cannot be opened
 */
public SeekableInput openSeekable(String filename) throws IOException {
  final Path qualified = qualifiedPath(filename);
  // The path was qualified against the default FS but may not live in it, so the
  // file system is resolved from the path itself.
  final FileSystem owningFs = qualified.getFileSystem(getConf());
  return new SeekableFSDataInputStream(owningFs, qualified);
}

Source File: AvroConversionBaseMapper.java From datacollector with Apache License 2.0

4 votes

@Override
protected void map(String input, String output, Context context) throws IOException, InterruptedException {
  FileSystem fs = FileSystem.get(context.getConfiguration());
  Configuration conf = context.getConfiguration();

  LOG.info("Converting input file: {}", input);
  LOG.info("Output directory: {}", output);
  Path inputPath = new Path(input);
  Path outputDir = new Path(output);
  fs.mkdirs(outputDir);

  Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName());
  if(fs.exists(tempFile)) {
    if(conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) {
      fs.delete(tempFile, true);
    } else {
      throw new IOException("Temporary file " + tempFile + " already exists.");
    }
  }
  LOG.info("Using temp file: {}", tempFile);

  // Output file is the same as input except of dropping .avro extension if it exists and appending .parquet or .orc
  String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix();
  Path finalFile = new Path(outputDir, outputFileName);
  LOG.info("Final path will be: {}", finalFile);

  // Avro reader
  SeekableInput seekableInput = new FsInput(inputPath, conf);
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader);
  Schema avroSchema = fileReader.getSchema();

  initializeWriter(tempFile, avroSchema, conf, context);

  LOG.info("Started reading input file");
  long recordCount = 0;
  try {
    while (fileReader.hasNext()) {
      GenericRecord record = fileReader.next();
      handleAvroRecord(record);

      context.getCounter(Counters.PROCESSED_RECORDS).increment(1);
      recordCount++;
    }
  } catch (Exception e) {
    // Various random stuff can happen while converting, so we wrap the underlying exception with more details
    String message = String.format(
        "Exception at offset %d (record %d): %s",
        fileReader.tell(),
        recordCount,
        e.toString()
    );
    throw new IOException(message, e);
  }
  LOG.info("Done reading input file");
  closeWriter();

  LOG.info("Moving temporary file {} to final destination {}", tempFile, finalFile);
  fs.rename(tempFile, finalFile);

  if(!context.getConfiguration().getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) {
    LOG.info("Removing input file", inputPath);
    fs.delete(inputPath, true);
  }

  LOG.info("Done converting input file into output directory {}", output);
}

org.apache.avro.file.SeekableInput Java Examples