Java Code Examples for org.apache.avro.file.DataFileWriter#setCodec()

The following examples show how to use org.apache.avro.file.DataFileWriter#setCodec(). Each example is taken from an open source project; the source file and project license are noted above it. setCodec() configures the compression codec applied to the data blocks of an Avro container file, and it must be called before the file is created.
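As a quick orientation, here is a minimal, self-contained sketch of the pattern the examples below share: build a DataFileWriter around a DatumWriter, call setCodec() before create(), then append records and close the writer. The schema, class name, and output file name are hypothetical placeholders, not taken from any of the projects below.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class SetCodecExample {

    public static void main(String[] args) throws IOException {
        // Hypothetical schema with a single string field.
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"Example\","
                        + "\"fields\":[{\"name\":\"message\",\"type\":\"string\"}]}");

        GenericRecord record = new GenericData.Record(schema);
        record.put("message", "hello");

        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));

        // The codec must be set before create(); calling setCodec() on an open writer throws.
        writer.setCodec(CodecFactory.deflateCodec(6)); // or snappyCodec(), bzip2Codec(), ...

        writer.create(schema, new File("example.avro")); // hypothetical output path
        writer.append(record);
        writer.close();
    }
}

The project examples that follow use the same sequence and differ mainly in where the codec comes from: a hard-coded factory such as CodecFactory.snappyCodec(), a deflate level read from a job configuration, or a codec name string passed to CodecFactory.fromString().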
Example 1
Source File: StageRunData.java    From geowave with Apache License 2.0
private synchronized DataFileWriter getDataWriterCreateIfNull(
    final String typeName,
    final GeoWaveAvroFormatPlugin plugin) {
  if (!cachedWriters.containsKey(typeName)) {
    FSDataOutputStream out = null;
    final DataFileWriter dfw = new DataFileWriter(new GenericDatumWriter());
    cachedWriters.put(typeName, dfw);
    dfw.setCodec(CodecFactory.snappyCodec());
    try {
      // TODO: we should probably clean up the type name to make it
      // HDFS path safe in case there are invalid characters
      // also, if a file already exists do we want to delete it or
      // append to it?
      out = fs.create(new Path(hdfsBaseDirectory, typeName));
      dfw.create(plugin.getAvroSchema(), out);

    } catch (final IOException e) {
      LOGGER.error("Unable to create output stream", e);
      // cache a null value so we don't continually try to recreate
      cachedWriters.put(typeName, null);
      return null;
    }
  }
  return cachedWriters.get(typeName);
}
 
Example 2
Source File: TestExtractAvroMetadata.java    From localization_nifi with Apache License 2.0
@Test
public void testExtractionWithCodec() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
    runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR); // test dynamic attribute avro.codec

    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));

    final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three"));
    final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema);

    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.setCodec(CodecFactory.deflateCodec(1));
    dataFileWriter.create(schema, out);
    dataFileWriter.append(data);
    dataFileWriter.close();

    runner.enqueue(out.toByteArray());
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);

    final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
    flowFile.assertAttributeEquals("avro.codec", "deflate");
}
 
Example 3
Source File: Hdfs.java    From pxf with Apache License 2.0
@Override
public void writeAvroFile(String pathToFile, String schemaName,
                          String codecName, IAvroSchema[] data)
        throws Exception {
    Path path = getDatapath(pathToFile);
    OutputStream outStream = fs.create(path, true, bufferSize,
            replicationSize, blockSize);
    Schema schema = new Schema.Parser().parse(new FileInputStream(
            schemaName));
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<>(
            schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(
            writer);
    if (!StringUtils.isEmpty(codecName)) {
        dataFileWriter.setCodec(CodecFactory.fromString(codecName));
    }

    dataFileWriter.create(schema, outStream);

    for (IAvroSchema iAvroSchema : data) {
        GenericRecord datum = iAvroSchema.serialize();
        dataFileWriter.append(datum);
    }
    dataFileWriter.close();
}
 
Example 4
Source File: AvroFileAppender.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private static <D> DataFileWriter<D> newAvroWriter(
    Schema schema, OutputFile file, Function<Schema, DatumWriter<?>> createWriterFunc,
    CodecFactory codec, Map<String, String> metadata) throws IOException {
  DataFileWriter<D> writer = new DataFileWriter<>(
      (DatumWriter<D>) createWriterFunc.apply(schema));

  writer.setCodec(codec);

  for (Map.Entry<String, String> entry : metadata.entrySet()) {
    writer.setMeta(entry.getKey(), entry.getValue());
  }

  // TODO: support overwrite
  return writer.create(schema, file.create());
}
 
Example 5
Source File: AvroAsJsonOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
static <K> void configureDataFileWriter(DataFileWriter<K> writer,
    JobConf job) throws UnsupportedEncodingException {

    if (FileOutputFormat.getCompressOutput(job)) {
        int level = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
                org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC) ?
            CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    writer.setSyncInterval(job.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
            DEFAULT_SYNC_INTERVAL));

    // copy metadata from job
    for (Map.Entry<String,String> e : job) {
        if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),e.getValue());
        if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                   URLDecoder.decode(e.getValue(), "ISO-8859-1")
                   .getBytes("ISO-8859-1"));
    }
}
 
Example 6
Source File: AvroIO.java    From beam with Apache License 2.0
@Override
public void open(WritableByteChannel channel) throws IOException {
  this.schema = new Schema.Parser().parse(getJsonSchema());
  DataFileWriter<?> writer;
  if (getRecordFormatter() == null) {
    writer = reflectWriter = new DataFileWriter<>(new ReflectDatumWriter<>(schema));
  } else {
    writer = genericWriter = new DataFileWriter<>(new GenericDatumWriter<>(schema));
  }
  writer.setCodec(getCodec().getCodec());
  for (Map.Entry<String, Object> entry : getMetadata().entrySet()) {
    Object v = entry.getValue();
    if (v instanceof String) {
      writer.setMeta(entry.getKey(), (String) v);
    } else if (v instanceof Long) {
      writer.setMeta(entry.getKey(), (Long) v);
    } else if (v instanceof byte[]) {
      writer.setMeta(entry.getKey(), (byte[]) v);
    } else {
      throw new IllegalStateException(
          "Metadata value type must be one of String, Long, or byte[]. Found "
              + v.getClass().getSimpleName());
    }
  }
  writer.create(schema, Channels.newOutputStream(channel));
}
 
Example 7
Source File: PutHiveStreaming.java    From nifi with Apache License 2.0
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                                     DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
 
Example 8
Source File: AvroRecordWriter.java    From presto with Apache License 2.0
public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties)
        throws IOException
{
    Schema schema;
    try {
        schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
    }
    catch (AvroSerdeException e) {
        throw new IOException(e);
    }
    GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(genericDatumWriter);

    if (isCompressed) {
        int level = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        dataFileWriter.setCodec(factory);
    }

    outputStream = path.getFileSystem(jobConf).create(path);
    dataFileWriter.create(schema, outputStream);
    delegate = new AvroGenericRecordWriter(dataFileWriter);
}
 
Example 9
Source File: AvroStockFileWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File inputFile, OutputStream outputStream)
    throws IOException {

  DataFileWriter<Stock> writer =
      new DataFileWriter<Stock>(
          new SpecificDatumWriter<Stock>());

  writer.setCodec(CodecFactory.snappyCodec());
  writer.create(Stock.SCHEMA$, outputStream);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {
    writer.append(stock);
  }

  IOUtils.closeStream(writer);
  IOUtils.closeStream(outputStream);
}
 
Example 10
Source File: AvroKeyValueFileWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File inputFile, OutputStream outputStream)
    throws IOException {

  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(
          new GenericDatumWriter<GenericRecord>());

  writer.setCodec(CodecFactory.snappyCodec());
  writer.create(SCHEMA, outputStream);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {

    AvroKeyValue<CharSequence, Stock> record
        = new AvroKeyValue<CharSequence, Stock>(new GenericData.Record(SCHEMA));
    record.setKey(stock.getSymbol());
    record.setValue(stock);

    writer.append(record.get());
  }

  IOUtils.closeStream(writer);
  IOUtils.closeStream(outputStream);
}
 
Example 11
Source File: SmallFilesWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File srcPath,
        OutputStream outputStream)
        throws IOException {
  DataFileWriter<Object> writer =
          new DataFileWriter<Object>(
              new GenericDatumWriter<Object>())
              .setSyncInterval(100);                 //<co id="ch02_smallfilewrite_comment2"/>
  writer.setCodec(CodecFactory.snappyCodec());   //<co id="ch02_smallfilewrite_comment3"/>
  writer.create(SCHEMA, outputStream);           //<co id="ch02_smallfilewrite_comment4"/>
  for (Object obj : FileUtils.listFiles(srcPath, null, false)) {
    File file = (File) obj;
    String filename = file.getAbsolutePath();
    byte content[] = FileUtils.readFileToByteArray(file);
    GenericRecord record = new GenericData.Record(SCHEMA);  //<co id="ch02_smallfilewrite_comment5"/>
    record.put(FIELD_FILENAME, filename);                   //<co id="ch02_smallfilewrite_comment6"/>
    record.put(FIELD_CONTENTS, ByteBuffer.wrap(content));   //<co id="ch02_smallfilewrite_comment7"/>
    writer.append(record);                                  //<co id="ch02_smallfilewrite_comment8"/>
    System.out.println(
            file.getAbsolutePath()
            + ": "
            + DigestUtils.md5Hex(content));
  }

  IOUtils.cleanup(null, writer);
  IOUtils.cleanup(null, outputStream);
}
 
Example 12
Source File: AvroHdfsDataWriter.java    From incubator-gobblin with Apache License 2.0
/**
 * Create a new {@link DataFileWriter} for writing Avro records.
 *
 * @param codecFactory a {@link CodecFactory} object for building the compression codec
 * @return a new {@link DataFileWriter} for writing Avro records to the staging file
 * @throws IOException if something goes wrong while creating the {@link DataFileWriter}
 */
private DataFileWriter<GenericRecord> createDataFileWriter(CodecFactory codecFactory) throws IOException {
  @SuppressWarnings("resource")
  DataFileWriter<GenericRecord> writer = new DataFileWriter<>(this.datumWriter);
  writer.setCodec(codecFactory);

  // Open the file and return the DataFileWriter
  return writer.create(this.schema, this.stagingFileOutputStream);
}
 
Example 13
Source File: TestUtil.java    From nifi with Apache License 2.0
private static byte[] bytesFor(List<Record> records) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<Record> writer = new DataFileWriter<>(
            AvroUtil.newDatumWriter(records.get(0).getSchema(), Record.class));
    writer.setCodec(CodecFactory.snappyCodec());
    writer = writer.create(records.get(0).getSchema(), out);

    for (Record record : records) {
        writer.append(record);
    }

    writer.flush();

    return out.toByteArray();
}
 
Example 14
Source File: PigAvroOutputFormat.java    From spork with Apache License 2.0
@Override
public RecordWriter<NullWritable, Object> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

    if (schema == null)
        throw new IOException("Must provide a schema");

    Configuration conf = context.getConfiguration();

    DataFileWriter<Object> writer = new DataFileWriter<Object>(new PigAvroDatumWriter(schema));

    if (FileOutputFormat.getCompressOutput(context)) {
        int level = conf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = conf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
            ? CodecFactory.deflateCodec(level)
            : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Do max as core-default.xml has io.file.buffer.size as 4K
    writer.setSyncInterval(conf.getInt(SYNC_INTERVAL_KEY, Math.max(
            conf.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    Path path = getDefaultWorkFile(context, EXT);
    writer.create(schema, path.getFileSystem(conf).create(path));
    return new PigAvroRecordWriter(writer);
}
 
Example 15
Source File: AvroRecordWriter.java    From spork with Apache License 2.0
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
    JobConf job) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    int level = job.getInt(DEFLATE_LEVEL_KEY,
        DEFAULT_DEFLATE_LEVEL);
    String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory = codecName.equals(DEFLATE_CODEC)
      ? CodecFactory.deflateCodec(level)
      : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  // Do max as core-default.xml has io.file.buffer.size as 4K
  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, Math.max(
          job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

  // copy metadata from job
  for (Map.Entry<String,String> e : job) {
    if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),
                     e.getValue());
    if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                     URLDecoder.decode(e.getValue(), "ISO-8859-1")
                     .getBytes("ISO-8859-1"));
  }
}
 
Example 16
Source File: AvroFileReaderWriterFactory.java    From secor with Apache License 2.0
public AvroFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    file = new File(logFilePath.getLogFilePath());
    file.getParentFile().mkdirs();
    LOG.debug("Creating Brand new Writer for path {}", logFilePath.getLogFilePath());
    topic = logFilePath.getTopic();
    Schema schema = schemaRegistry.getSchema(topic);
    SpecificDatumWriter specificDatumWriter = new SpecificDatumWriter(schema);
    writer = new DataFileWriter(specificDatumWriter);
    writer.setCodec(getCodecFactory(codec));
    writer.create(schema, file);
}
 
Example 17
Source File: PigAvroOutputFormat.java    From Cubert with Apache License 2.0
@Override
public RecordWriter<NullWritable, Object> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

    if (schema == null)
        throw new IOException("Must provide a schema");

    Configuration conf = context.getConfiguration();

    DataFileWriter<Object> writer = new DataFileWriter<Object>(new PigAvroDatumWriter(schema));

    if (FileOutputFormat.getCompressOutput(context)) {
        int level = conf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = conf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
            ? CodecFactory.deflateCodec(level)
            : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Do max as core-default.xml has io.file.buffer.size as 4K
    writer.setSyncInterval(conf.getInt(SYNC_INTERVAL_KEY, Math.max(
            conf.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    Path path = getDefaultWorkFile(context, EXT);
    writer.create(schema, path.getFileSystem(conf).create(path));
    return new PigAvroRecordWriter(writer);
}
 
Example 18
Source File: AvroOutputFormat.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
static <T> void configureDataFileWriter(DataFileWriter<T> writer,
  TaskAttemptContext context) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(context)) {
    int level = context.getConfiguration()
      .getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    String codecName = context.getConfiguration()
      .get(org.apache.avro.mapred.AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory =
      codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level)
        : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  writer.setSyncInterval(context.getConfiguration()
    .getInt(SYNC_INTERVAL_KEY, DEFAULT_SYNC_INTERVAL));

  // copy metadata from job
  for (Map.Entry<String, String> e : context.getConfiguration()) {
    if (e.getKey().startsWith(org.apache.avro.mapred.AvroJob.TEXT_PREFIX)) {
      writer.setMeta(e.getKey()
        .substring(org.apache.avro.mapred.AvroJob.TEXT_PREFIX.length()),
        e.getValue());
    }
    if (e.getKey().startsWith(org.apache.avro.mapred.AvroJob.BINARY_PREFIX)) {
      writer.setMeta(e.getKey()
        .substring(org.apache.avro.mapred.AvroJob.BINARY_PREFIX.length()),
        URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
    }
  }
}
 
Example 19
Source File: TestUtil.java    From localization_nifi with Apache License 2.0
private static byte[] bytesFor(List<Record> records) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<Record> writer = new DataFileWriter<>(
            AvroUtil.newDatumWriter(records.get(0).getSchema(), Record.class));
    writer.setCodec(CodecFactory.snappyCodec());
    writer = writer.create(records.get(0).getSchema(), out);

    for (Record record : records) {
        writer.append(record);
    }

    writer.flush();

    return out.toByteArray();
}
 
Example 20
Source File: AvroFileAppender.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private static <D> DataFileWriter<D> newAvroWriter(
    Schema schema, PositionOutputStream stream, Function<Schema, DatumWriter<?>> createWriterFunc,
    CodecFactory codec, Map<String, String> metadata) throws IOException {
  DataFileWriter<D> writer = new DataFileWriter<>(
      (DatumWriter<D>) createWriterFunc.apply(schema));

  writer.setCodec(codec);

  for (Map.Entry<String, String> entry : metadata.entrySet()) {
    writer.setMeta(entry.getKey(), entry.getValue());
  }

  return writer.create(schema, stream);
}