Java Code Examples for org.apache.avro.file.DataFileWriter#setCodec()

The following examples show how to use org.apache.avro.file.DataFileWriter#setCodec(). Each example is taken from an open source project; the source file and project license are noted above it. setCodec() configures the compression codec applied to the data blocks of an Avro container file, and it must be called before the file is created.
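As a quick orientation, here is a minimal, self-contained sketch of the pattern the examples below share: build a DataFileWriter around a DatumWriter, call setCodec() before create(), then append records and close the writer. The schema, class name, and output file name are hypothetical placeholders, not taken from any of the projects below.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class SetCodecExample {

    public static void main(String[] args) throws IOException {
        // Hypothetical schema with a single string field.
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"Example\","
                        + "\"fields\":[{\"name\":\"message\",\"type\":\"string\"}]}");

        GenericRecord record = new GenericData.Record(schema);
        record.put("message", "hello");

        DataFileWriter<GenericRecord> writer =
                new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema));

        // The codec must be set before create(); calling setCodec() on an open writer throws.
        writer.setCodec(CodecFactory.deflateCodec(6)); // or snappyCodec(), bzip2Codec(), ...

        writer.create(schema, new File("example.avro")); // hypothetical output path
        writer.append(record);
        writer.close();
    }
}

The project examples that follow use the same sequence and differ mainly in where the codec comes from: a hard-coded factory such as CodecFactory.snappyCodec(), a deflate level read from a job configuration, or a codec name string passed to CodecFactory.fromString().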
Example 1
Source File: StageRunData.java    From geowave with Apache License 2.0
private synchronized DataFileWriter getDataWriterCreateIfNull(
    final String typeName,
    final GeoWaveAvroFormatPlugin plugin) {
  if (!cachedWriters.containsKey(typeName)) {
    FSDataOutputStream out = null;
    final DataFileWriter dfw = new DataFileWriter(new GenericDatumWriter());
    cachedWriters.put(typeName, dfw);
    dfw.setCodec(CodecFactory.snappyCodec());
    try {
      // TODO: we should probably clean up the type name to make it
      // HDFS path safe in case there are invalid characters
      // also, if a file already exists do we want to delete it or
      // append to it?
      out = fs.create(new Path(hdfsBaseDirectory, typeName));
      dfw.create(plugin.getAvroSchema(), out);

    } catch (final IOException e) {
      LOGGER.error("Unable to create output stream", e);
      // cache a null value so we don't continually try to recreate
      cachedWriters.put(typeName, null);
      return null;
    }
  }
  return cachedWriters.get(typeName);
}
 
Example 2
Source File: TestExtractAvroMetadata.java    From localization_nifi with Apache License 2.0
@Test
public void testExtractionWithCodec() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
    runner.setProperty(ExtractAvroMetadata.METADATA_KEYS, AVRO_CODEC_ATTR); // test dynamic attribute avro.codec

    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));

    final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three"));
    final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema);

    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.setCodec(CodecFactory.deflateCodec(1));
    dataFileWriter.create(schema, out);
    dataFileWriter.append(data);
    dataFileWriter.close();

    runner.enqueue(out.toByteArray());
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);

    final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
    flowFile.assertAttributeEquals("avro.codec", "deflate");
}
 
Example 3
Source File: Hdfs.java    From pxf with Apache License 2.0
@Override
public void writeAvroFile(String pathToFile, String schemaName,
                          String codecName, IAvroSchema[] data)
        throws Exception {
    Path path = getDatapath(pathToFile);
    OutputStream outStream = fs.create(path, true, bufferSize,
            replicationSize, blockSize);
    Schema schema = new Schema.Parser().parse(new FileInputStream(
            schemaName));
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<>(
            schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(
            writer);
    if (!StringUtils.isEmpty(codecName)) {
        dataFileWriter.setCodec(CodecFactory.fromString(codecName));
    }

    dataFileWriter.create(schema, outStream);

    for (IAvroSchema iAvroSchema : data) {
        GenericRecord datum = iAvroSchema.serialize();
        dataFileWriter.append(datum);
    }
    dataFileWriter.close();
}
 
Example 4
Source File: AvroFileAppender.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private static <D> DataFileWriter<D> newAvroWriter(
    Schema schema, OutputFile file, Function<Schema, DatumWriter<?>> createWriterFunc,
    CodecFactory codec, Map<String, String> metadata) throws IOException {
  DataFileWriter<D> writer = new DataFileWriter<>(
      (DatumWriter<D>) createWriterFunc.apply(schema));

  writer.setCodec(codec);

  for (Map.Entry<String, String> entry : metadata.entrySet()) {
    writer.setMeta(entry.getKey(), entry.getValue());
  }

  // TODO: support overwrite
  return writer.create(schema, file.create());
}
 
Example 5
Source File: AvroAsJsonOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
static <K> void configureDataFileWriter(DataFileWriter<K> writer,
    JobConf job) throws UnsupportedEncodingException {

    if (FileOutputFormat.getCompressOutput(job)) {
        int level = job.getInt(org.apache.avro.mapred.AvroOutputFormat.DEFLATE_LEVEL_KEY,
                org.apache.avro.mapred.AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC) ?
            CodecFactory.deflateCodec(level) : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    writer.setSyncInterval(job.getInt(org.apache.avro.mapred.AvroOutputFormat.SYNC_INTERVAL_KEY,
            DEFAULT_SYNC_INTERVAL));

    // copy metadata from job
    for (Map.Entry<String,String> e : job) {
        if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),e.getValue());
        if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
            writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                   URLDecoder.decode(e.getValue(), "ISO-8859-1")
                   .getBytes("ISO-8859-1"));
    }
}
 
Example 6
Source File: AvroIO.java    From beam with Apache License 2.0
@Override
public void open(WritableByteChannel channel) throws IOException {
  this.schema = new Schema.Parser().parse(getJsonSchema());
  DataFileWriter<?> writer;
  if (getRecordFormatter() == null) {
    writer = reflectWriter = new DataFileWriter<>(new ReflectDatumWriter<>(schema));
  } else {
    writer = genericWriter = new DataFileWriter<>(new GenericDatumWriter<>(schema));
  }
  writer.setCodec(getCodec().getCodec());
  for (Map.Entry<String, Object> entry : getMetadata().entrySet()) {
    Object v = entry.getValue();
    if (v instanceof String) {
      writer.setMeta(entry.getKey(), (String) v);
    } else if (v instanceof Long) {
      writer.setMeta(entry.getKey(), (Long) v);
    } else if (v instanceof byte[]) {
      writer.setMeta(entry.getKey(), (byte[]) v);
    } else {
      throw new IllegalStateException(
          "Metadata value type must be one of String, Long, or byte[]. Found "
              + v.getClass().getSimpleName());
    }
  }
  writer.create(schema, Channels.newOutputStream(channel));
}
 
Example 7
Source File: PutHiveStreaming.java    From nifi with Apache License 2.0
private byte[] initAvroWriter(ProcessSession session, String codec, DataFileStream<GenericRecord> reader,
                                     DataFileWriter<GenericRecord> writer, AtomicReference<FlowFile> flowFileRef) {

    writer.setCodec(CodecFactory.fromString(codec));
    // Transfer metadata (this is a subset of the incoming file)
    for (String metaKey : reader.getMetaKeys()) {
        if (!RESERVED_METADATA.contains(metaKey)) {
            writer.setMeta(metaKey, reader.getMeta(metaKey));
        }
    }

    final ByteArrayOutputStream avroHeader = new ByteArrayOutputStream();
    flowFileRef.set(session.append(flowFileRef.get(), (out) -> {
        // Create writer so that records can be appended later.
        writer.create(reader.getSchema(), avroHeader);
        writer.close();

        final byte[] header = avroHeader.toByteArray();
        out.write(header);
    }));

    // Capture the Avro header byte array that is just written to the FlowFile.
    // This is needed when Avro records are appended to the same FlowFile.
    return avroHeader.toByteArray();
}
 
Example 8
Source File: AvroRecordWriter.java    From presto with Apache License 2.0
public AvroRecordWriter(Path path, JobConf jobConf, boolean isCompressed, Properties properties)
        throws IOException
{
    Schema schema;
    try {
        schema = AvroSerdeUtils.determineSchemaOrThrowException(jobConf, properties);
    }
    catch (AvroSerdeException e) {
        throw new IOException(e);
    }
    GenericDatumWriter<GenericRecord> genericDatumWriter = new GenericDatumWriter<>(schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(genericDatumWriter);

    if (isCompressed) {
        int level = jobConf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = jobConf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        dataFileWriter.setCodec(factory);
    }

    outputStream = path.getFileSystem(jobConf).create(path);
    dataFileWriter.create(schema, outputStream);
    delegate = new AvroGenericRecordWriter(dataFileWriter);
}
 
Example 9
Source File: AvroStockFileWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File inputFile, OutputStream outputStream)
    throws IOException {

  DataFileWriter<Stock> writer =
      new DataFileWriter<Stock>(
          new SpecificDatumWriter<Stock>());

  writer.setCodec(CodecFactory.snappyCodec());
  writer.create(Stock.SCHEMA$, outputStream);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {
    writer.append(stock);
  }

  IOUtils.closeStream(writer);
  IOUtils.closeStream(outputStream);
}
 
Example 10
Source File: AvroKeyValueFileWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File inputFile, OutputStream outputStream)
    throws IOException {

  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(
          new GenericDatumWriter<GenericRecord>());

  writer.setCodec(CodecFactory.snappyCodec());
  writer.create(SCHEMA, outputStream);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {

    AvroKeyValue<CharSequence, Stock> record
        = new AvroKeyValue<CharSequence, Stock>(new GenericData.Record(SCHEMA));
    record.setKey(stock.getSymbol());
    record.setValue(stock);

    writer.append(record.get());
  }

  IOUtils.closeStream(writer);
  IOUtils.closeStream(outputStream);
}
 
Example 11
Source File: SmallFilesWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File srcPath,
        OutputStream outputStream)
        throws IOException {
  DataFileWriter<Object> writer =
          new DataFileWriter<Object>(
              new GenericDatumWriter<Object>())
              .setSyncInterval(100);                 //<co id="ch02_smallfilewrite_comment2"/>
  writer.setCodec(CodecFactory.snappyCodec());   //<co id="ch02_smallfilewrite_comment3"/>
  writer.create(SCHEMA, outputStream);           //<co id="ch02_smallfilewrite_comment4"/>
  for (Object obj : FileUtils.listFiles(srcPath, null, false)) {
    File file = (File) obj;
    String filename = file.getAbsolutePath();
    byte content[] = FileUtils.readFileToByteArray(file);
    GenericRecord record = new GenericData.Record(SCHEMA);  //<co id="ch02_smallfilewrite_comment5"/>
    record.put(FIELD_FILENAME, filename);                   //<co id="ch02_smallfilewrite_comment6"/>
    record.put(FIELD_CONTENTS, ByteBuffer.wrap(content));   //<co id="ch02_smallfilewrite_comment7"/>
    writer.append(record);                                  //<co id="ch02_smallfilewrite_comment8"/>
    System.out.println(
            file.getAbsolutePath()
            + ": "
            + DigestUtils.md5Hex(content));
  }

  IOUtils.cleanup(null, writer);
  IOUtils.cleanup(null, outputStream);
}
 
Example 12
Source File: AvroHdfsDataWriter.java    From incubator-gobblin with Apache License 2.0
/**
 * Create a new {@link DataFileWriter} for writing Avro records.
 *
 * @param codecFactory a {@link CodecFactory} object for building the compression codec
 * @return a new {@link DataFileWriter} for writing Avro records to the staging file
 * @throws IOException if something goes wrong while creating the {@link DataFileWriter}
 */
private DataFileWriter<GenericRecord> createDataFileWriter(CodecFactory codecFactory) throws IOException {
  @SuppressWarnings("resource")
  DataFileWriter<GenericRecord> writer = new DataFileWriter<>(this.datumWriter);
  writer.setCodec(codecFactory);

  // Open the file and return the DataFileWriter
  return writer.create(this.schema, this.stagingFileOutputStream);
}
 
Example 13
Source File: TestUtil.java    From nifi with Apache License 2.0
private static byte[] bytesFor(List<Record> records) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<Record> writer = new DataFileWriter<>(
            AvroUtil.newDatumWriter(records.get(0).getSchema(), Record.class));
    writer.setCodec(CodecFactory.snappyCodec());
    writer = writer.create(records.get(0).getSchema(), out);

    for (Record record : records) {
        writer.append(record);
    }

    writer.flush();

    return out.toByteArray();
}
 
Example 14
Source File: PigAvroOutputFormat.java    From spork with Apache License 2.0
@Override
public RecordWriter<NullWritable, Object> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

    if (schema == null)
        throw new IOException("Must provide a schema");

    Configuration conf = context.getConfiguration();

    DataFileWriter<Object> writer = new DataFileWriter<Object>(new PigAvroDatumWriter(schema));

    if (FileOutputFormat.getCompressOutput(context)) {
        int level = conf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = conf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
            ? CodecFactory.deflateCodec(level)
            : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Do max as core-default.xml has io.file.buffer.size as 4K
    writer.setSyncInterval(conf.getInt(SYNC_INTERVAL_KEY, Math.max(
            conf.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    Path path = getDefaultWorkFile(context, EXT);
    writer.create(schema, path.getFileSystem(conf).create(path));
    return new PigAvroRecordWriter(writer);
}
 
Example 15
Source File: AvroRecordWriter.java    From spork with Apache License 2.0
static void configureDataFileWriter(DataFileWriter<GenericData.Record> writer,
    JobConf job) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(job)) {
    int level = job.getInt(DEFLATE_LEVEL_KEY,
        DEFAULT_DEFLATE_LEVEL);
    String codecName = job.get(AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory = codecName.equals(DEFLATE_CODEC)
      ? CodecFactory.deflateCodec(level)
      : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  // Do max as core-default.xml has io.file.buffer.size as 4K
  writer.setSyncInterval(job.getInt(SYNC_INTERVAL_KEY, Math.max(
          job.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

  // copy metadata from job
  for (Map.Entry<String,String> e : job) {
    if (e.getKey().startsWith(AvroJob.TEXT_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.TEXT_PREFIX.length()),
                     e.getValue());
    if (e.getKey().startsWith(AvroJob.BINARY_PREFIX))
      writer.setMeta(e.getKey().substring(AvroJob.BINARY_PREFIX.length()),
                     URLDecoder.decode(e.getValue(), "ISO-8859-1")
                     .getBytes("ISO-8859-1"));
  }
}
 
Example 16
Source File: AvroFileReaderWriterFactory.java    From secor with Apache License 2.0
public AvroFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    file = new File(logFilePath.getLogFilePath());
    file.getParentFile().mkdirs();
    LOG.debug("Creating Brand new Writer for path {}", logFilePath.getLogFilePath());
    topic = logFilePath.getTopic();
    Schema schema = schemaRegistry.getSchema(topic);
    SpecificDatumWriter specificDatumWriter = new SpecificDatumWriter(schema);
    writer = new DataFileWriter(specificDatumWriter);
    writer.setCodec(getCodecFactory(codec));
    writer.create(schema, file);
}
 
Example 17
Source File: PigAvroOutputFormat.java    From Cubert with Apache License 2.0
@Override
public RecordWriter<NullWritable, Object> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

    if (schema == null)
        throw new IOException("Must provide a schema");

    Configuration conf = context.getConfiguration();

    DataFileWriter<Object> writer = new DataFileWriter<Object>(new PigAvroDatumWriter(schema));

    if (FileOutputFormat.getCompressOutput(context)) {
        int level = conf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = conf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
            ? CodecFactory.deflateCodec(level)
            : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Do max as core-default.xml has io.file.buffer.size as 4K
    writer.setSyncInterval(conf.getInt(SYNC_INTERVAL_KEY, Math.max(
            conf.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    Path path = getDefaultWorkFile(context, EXT);
    writer.create(schema, path.getFileSystem(conf).create(path));
    return new PigAvroRecordWriter(writer);
}
 
Example 18
Source File: AvroOutputFormat.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
static <T> void configureDataFileWriter(DataFileWriter<T> writer,
  TaskAttemptContext context) throws UnsupportedEncodingException {
  if (FileOutputFormat.getCompressOutput(context)) {
    int level = context.getConfiguration()
      .getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
    String codecName = context.getConfiguration()
      .get(org.apache.avro.mapred.AvroJob.OUTPUT_CODEC, DEFLATE_CODEC);
    CodecFactory factory =
      codecName.equals(DEFLATE_CODEC) ? CodecFactory.deflateCodec(level)
        : CodecFactory.fromString(codecName);
    writer.setCodec(factory);
  }

  writer.setSyncInterval(context.getConfiguration()
    .getInt(SYNC_INTERVAL_KEY, DEFAULT_SYNC_INTERVAL));

  // copy metadata from job
  for (Map.Entry<String, String> e : context.getConfiguration()) {
    if (e.getKey().startsWith(org.apache.avro.mapred.AvroJob.TEXT_PREFIX)) {
      writer.setMeta(e.getKey()
        .substring(org.apache.avro.mapred.AvroJob.TEXT_PREFIX.length()),
        e.getValue());
    }
    if (e.getKey().startsWith(org.apache.avro.mapred.AvroJob.BINARY_PREFIX)) {
      writer.setMeta(e.getKey()
        .substring(org.apache.avro.mapred.AvroJob.BINARY_PREFIX.length()),
        URLDecoder.decode(e.getValue(), "ISO-8859-1").getBytes("ISO-8859-1"));
    }
  }
}
 
Example 19
Source File: TestUtil.java    From localization_nifi with Apache License 2.0
private static byte[] bytesFor(List<Record> records) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<Record> writer = new DataFileWriter<>(
            AvroUtil.newDatumWriter(records.get(0).getSchema(), Record.class));
    writer.setCodec(CodecFactory.snappyCodec());
    writer = writer.create(records.get(0).getSchema(), out);

    for (Record record : records) {
        writer.append(record);
    }

    writer.flush();

    return out.toByteArray();
}
 
Example 20
Source File: AvroFileAppender.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private static <D> DataFileWriter<D> newAvroWriter(
    Schema schema, PositionOutputStream stream, Function<Schema, DatumWriter<?>> createWriterFunc,
    CodecFactory codec, Map<String, String> metadata) throws IOException {
  DataFileWriter<D> writer = new DataFileWriter<>(
      (DatumWriter<D>) createWriterFunc.apply(schema));

  writer.setCodec(codec);

  for (Map.Entry<String, String> entry : metadata.entrySet()) {
    writer.setMeta(entry.getKey(), entry.getValue());
  }

  return writer.create(schema, stream);
}