Java Code Examples for org.apache.avro.file.FileReader#getSchema()

The following examples show how to use org.apache.avro.file.FileReader#getSchema(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AvroUtil.java    From aliyun-maxcompute-data-collectors with Apache License 2.0 5 votes vote down vote up
/**
 * Get the schema of AVRO files stored in a directory.
 *
 * <p>If {@code path} is a directory, the schema is read from the first listed
 * entry whose name does not start with {@code "_"} or {@code "."}; otherwise
 * {@code path} itself is opened as the AVRO file.
 *
 * @param path an AVRO file, or a directory containing AVRO files
 * @param conf configuration used to resolve the file system and open the file
 * @return the schema reported by the file reader, or {@code null} if
 *         {@code path} is a directory with no eligible entries
 * @throws IOException if listing, opening, reading or closing the file fails
 */
public static Schema getAvroSchema(Path path, Configuration conf)
    throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  Path fileToTest;
  if (fs.isDirectory(path)) {
    // Skip entries prefixed with "_" or "." when picking a file to sample.
    FileStatus[] fileStatuses = fs.listStatus(path, new PathFilter() {
      @Override
      public boolean accept(Path p) {
        String name = p.getName();
        return !name.startsWith("_") && !name.startsWith(".");
      }
    });
    if (fileStatuses.length == 0) {
      return null;
    }
    fileToTest = fileStatuses[0].getPath();
  } else {
    fileToTest = path;
  }

  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  SeekableInput input = new FsInput(fileToTest, conf);
  try {
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    try {
      return fileReader.getSchema();
    } finally {
      // Release the reader even when getSchema() throws.
      fileReader.close();
    }
  } finally {
    // Covers the case where openReader() fails before a reader exists;
    // otherwise this is a second close of an already-closed input.
    input.close();
  }
}
 
Example 2
Source File: AvroFileReader.java    From streamx with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the Avro schema of the file at {@code path} and converts it with
 * {@code avroData.toConnectSchema}.
 *
 * @param conf configuration used to open the file
 * @param path location of the Avro data file
 * @return the converted schema
 * @throws IOException if the file cannot be opened, read or closed
 */
@Override
public Schema getSchema(Configuration conf, Path path) throws IOException {
  DatumReader<Object> reader = new GenericDatumReader<>();
  org.apache.avro.Schema schema;
  // try-with-resources closes the input and the reader on every exit path,
  // including when openReader() or getSchema() throws.
  try (SeekableInput input = new FsInput(path, conf);
       FileReader<Object> fileReader = DataFileReader.openReader(input, reader)) {
    schema = fileReader.getSchema();
  }
  return avroData.toConnectSchema(schema);
}
 
Example 3
Source File: AvroToJsonConverter.java    From celos with Apache License 2.0 5 votes vote down vote up
/**
 * Re-encodes the AVRO content of a fixture file as JSON.
 *
 * <p>The whole content is read into memory. Empty content is returned
 * unchanged; otherwise every datum of the AVRO container is written through
 * a JSON encoder built from the container's own schema.
 *
 * @param testRun the current test run (not used by this conversion)
 * @param ff the fixture file whose content is AVRO data
 * @return {@code ff} itself when its content is empty, otherwise a new
 *         {@code FixFile} wrapping the JSON bytes
 * @throws IOException if reading, decoding or encoding fails
 */
@Override
public FixFile convert(TestRun testRun, FixFile ff) throws IOException {
    final byte[] avroBytes = IOUtils.toByteArray(ff.getContent());
    if (avroBytes.length == 0) {
        return ff;
    }

    final ByteArrayOutputStream jsonOut = new ByteArrayOutputStream();
    final GenericDatumReader<Object> datumReader = new GenericDatumReader<>();
    final SeekableByteArrayInput avroInput = new SeekableByteArrayInput(avroBytes);
    final FileReader<Object> avroReader = DataFileReader.openReader(avroInput, datumReader);
    try {
        final Schema avroSchema = avroReader.getSchema();
        final DatumWriter<Object> datumWriter = new GenericDatumWriter<>(avroSchema);
        final JsonEncoder jsonEncoder = EncoderFactory.get().jsonEncoder(avroSchema, jsonOut);

        for (Object item : avroReader) {
            datumWriter.write(item, jsonEncoder);
        }
        // Push any buffered JSON into jsonOut before it is read below.
        jsonEncoder.flush();
    } finally {
        avroReader.close();
    }

    final byte[] jsonBytes = jsonOut.toByteArray();
    return new FixFile(new ByteArrayInputStream(jsonBytes));
}
 
Example 4
Source File: AvroConversionBaseMapper.java    From datacollector with Apache License 2.0 4 votes vote down vote up
/**
 * Converts one Avro input file into the output directory.
 *
 * <p>Records are read from {@code input} and handed to
 * {@link #handleAvroRecord} one at a time; the writer set up by
 * {@link #initializeWriter} targets a temporary file inside {@code output},
 * which is renamed to its final name once all records have been processed.
 * Unless {@code KEEP_INPUT_FILE} is set, the input file is deleted afterwards.
 *
 * @param input path of the Avro file to convert
 * @param output directory that receives the converted file
 * @param context task context supplying the configuration and counters
 * @throws IOException if the temporary file already exists and may not be
 *         overwritten, if reading or converting a record fails, or if the
 *         temporary file cannot be moved to its final destination
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected void map(String input, String output, Context context) throws IOException, InterruptedException {
  Configuration conf = context.getConfiguration();
  FileSystem fs = FileSystem.get(conf);

  LOG.info("Converting input file: {}", input);
  LOG.info("Output directory: {}", output);
  Path inputPath = new Path(input);
  Path outputDir = new Path(output);
  fs.mkdirs(outputDir);

  Path tempFile = new Path(outputDir, getTempFilePrefix() + inputPath.getName());
  if(fs.exists(tempFile)) {
    if(conf.getBoolean(AvroConversionCommonConstants.OVERWRITE_TMP_FILE, false)) {
      fs.delete(tempFile, true);
    } else {
      throw new IOException("Temporary file " + tempFile + " already exists.");
    }
  }
  LOG.info("Using temp file: {}", tempFile);

  // Output file name is the input name with any .avro extension dropped and the format suffix (.parquet or .orc) appended
  String outputFileName = inputPath.getName().replaceAll("\\.avro$", "") + getOutputFileSuffix();
  Path finalFile = new Path(outputDir, outputFileName);
  LOG.info("Final path will be: {}", finalFile);

  // Avro reader; try-with-resources guarantees the input and reader are closed on every exit path
  DatumReader<GenericRecord> reader = new GenericDatumReader<>();
  try (SeekableInput seekableInput = new FsInput(inputPath, conf);
       FileReader<GenericRecord> fileReader = DataFileReader.openReader(seekableInput, reader)) {
    Schema avroSchema = fileReader.getSchema();

    initializeWriter(tempFile, avroSchema, conf, context);

    LOG.info("Started reading input file");
    long recordCount = 0;
    // The catch is nested inside the resource block so that fileReader is still open when tell() is called
    try {
      while (fileReader.hasNext()) {
        GenericRecord record = fileReader.next();
        handleAvroRecord(record);

        context.getCounter(Counters.PROCESSED_RECORDS).increment(1);
        recordCount++;
      }
    } catch (Exception e) {
      // Various random stuff can happen while converting, so we wrap the underlying exception with more details
      String message = String.format(
          "Exception at offset %d (record %d): %s",
          fileReader.tell(),
          recordCount,
          e.toString()
      );
      throw new IOException(message, e);
    }
  }
  LOG.info("Done reading input file");
  closeWriter();

  LOG.info("Moving temporary file {} to final destination {}", tempFile, finalFile);
  // rename() reports failure through its return value; stop here so the input is not deleted without a converted copy
  if(!fs.rename(tempFile, finalFile)) {
    throw new IOException("Failed to move temporary file " + tempFile + " to final destination " + finalFile);
  }

  if(!conf.getBoolean(AvroConversionCommonConstants.KEEP_INPUT_FILE, false)) {
    LOG.info("Removing input file {}", inputPath);
    fs.delete(inputPath, true);
  }

  LOG.info("Done converting input file into output directory {}", output);
}