Java Code Examples for org.apache.hadoop.io.compress.CompressionCodec#getDefaultExtension()

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodec#getDefaultExtension(). Each example is taken from an open-source project; the source file and project are noted above each snippet.
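Before the project examples, here is a minimal, self-contained sketch of the call itself. It is not taken from any of the projects below; the class name DefaultExtensionDemo and the output file name are illustrative only. It instantiates a codec and appends its default extension to an output file name, which is the pattern most of the examples follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DefaultExtensionDemo {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Instantiate the codec via ReflectionUtils so Hadoop can inject the Configuration.
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, conf);
        // getDefaultExtension() returns the codec's conventional file suffix, ".gz" for GzipCodec.
        String extension = codec.getDefaultExtension();
        // Typical use: append the extension to an output file name when compression is enabled.
        System.out.println("part-r-00000" + extension); // prints part-r-00000.gz
    }
}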
Example 1
Source File: JSONFileOutputFormat.java    From ojai with Apache License 2.0
@Override
public RecordWriter<LongWritable, Document> getRecordWriter(
    TaskAttemptContext job) throws IOException, InterruptedException {

  Configuration conf = job.getConfiguration();
  boolean isCompressed = getCompressOutput(job);
  CompressionCodec codec = null;
  String extension = "";
  if (isCompressed) {
    Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(
        job, GzipCodec.class);
    codec = ReflectionUtils.newInstance(codecClass, conf);
    extension = codec.getDefaultExtension();
  }
  Path path = getDefaultWorkFile(job, extension);
  FileSystem fs = path.getFileSystem(conf);
  FSDataOutputStream out = fs.create(path, false);
  if (!isCompressed) {
    return new JSONFileOutputRecordWriter(out);
  } else {
    return new JSONFileOutputRecordWriter(new DataOutputStream(
        codec.createOutputStream(out)));
  }
}
 
Example 2
Source File: CommonFileOutputFormat.java    From tinkerpop with Apache License 2.0
protected DataOutputStream getDataOutputStream(final TaskAttemptContext job) throws IOException, InterruptedException {
    final Configuration conf = job.getConfiguration();
    boolean isCompressed = getCompressOutput(job);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
        final Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, DefaultCodec.class);
        codec = ReflectionUtils.newInstance(codecClass, conf);
        extension = codec.getDefaultExtension();
    }
    final Path file = super.getDefaultWorkFile(job, extension);
    final FileSystem fs = file.getFileSystem(conf);
    if (!isCompressed) {
        return new DataOutputStream(fs.create(file, false));
    } else {
        return new DataOutputStream(codec.createOutputStream(fs.create(file, false)));
    }
}
 
Example 3
Source File: KeyIgnoringVCFOutputFormat.java    From Hadoop-BAM with MIT License
/** <code>setHeader</code> or <code>readHeaderFrom</code> must have been
 * called first.
 */
@Override public RecordWriter<K,VariantContextWritable> getRecordWriter(
		TaskAttemptContext ctx)
	throws IOException
{
	Configuration conf = ctx.getConfiguration();
	boolean isCompressed = getCompressOutput(ctx);
	CompressionCodec codec = null;
	String extension = "";
	if (isCompressed) {
		Class<? extends CompressionCodec> codecClass =
				getOutputCompressorClass(ctx, BGZFCodec.class);
		codec = ReflectionUtils.newInstance(codecClass, conf);
		extension = codec.getDefaultExtension();
	}
	Path file = getDefaultWorkFile(ctx, extension);
	if (!isCompressed) {
		return getRecordWriter(ctx, file);
	} else {
		FileSystem fs = file.getFileSystem(conf);
		return getRecordWriter(ctx, codec.createOutputStream(fs.create(file)));
	}
}
 
Example 4
Source File: PartitionFinalizer.java    From secor with Apache License 2.0
public PartitionFinalizer(SecorConfig config) throws Exception {
    mConfig = config;
    Class kafkaClientClass = Class.forName(mConfig.getKafkaClientClass());
    this.mKafkaClient = (KafkaClient) kafkaClientClass.newInstance();
    this.mKafkaClient.init(config);
    mZookeeperConnector = new ZookeeperConnector(mConfig);
    mMessageParser = (TimestampedMessageParser) ReflectionUtil.createMessageParser(
      mConfig.getMessageParserClass(), mConfig);
    mQuboleClient = new QuboleClient(mConfig);
    if (mConfig.getFileExtension() != null && !mConfig.getFileExtension().isEmpty()) {
        mFileExtension = mConfig.getFileExtension();
    } else if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
        CompressionCodec codec = CompressionUtil.createCompressionCodec(mConfig.getCompressionCodec());
        mFileExtension = codec.getDefaultExtension();
    } else {
        mFileExtension = "";
    }
    mLookbackPeriods = config.getFinalizerLookbackPeriods();
    LOG.info("Lookback periods: " + mLookbackPeriods);
}
 
Example 5
Source File: PushdownLargeFieldedListsVisitor.java    From datawave with Apache License 2.0
protected URI createFst(SortedSet<String> values) throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
    FST fst = DatawaveFieldIndexListIteratorJexl.getFST(values);
    
    // now serialize to our file system
    CompressionCodec codec = null;
    String extension = "";
    if (config.getHdfsFileCompressionCodec() != null) {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        if (classLoader == null) {
            classLoader = this.getClass().getClassLoader();
        }
        Class<? extends CompressionCodec> clazz = Class.forName(config.getHdfsFileCompressionCodec(), true, classLoader).asSubclass(CompressionCodec.class);
        codec = clazz.newInstance();
        extension = codec.getDefaultExtension();
    }
    int fstCount = config.getFstCount().incrementAndGet();
    Path fstFile = new Path(fstHdfsUri, "PushdownLargeFileFst." + fstCount + ".fst" + extension);
    
    OutputStream fstFileOut = new BufferedOutputStream(fs.create(fstFile, false));
    if (codec != null) {
        fstFileOut = codec.createOutputStream(fstFileOut);
    }
    
    OutputStreamDataOutput outStream = new OutputStreamDataOutput(fstFileOut);
    fst.save(outStream);
    outStream.close();
    
    return fstFile.toUri();
}
 
Example 6
Source File: TextFileFactory.java    From sylph with Apache License 2.0
private FileChannel createOutputStream(String rowKey, TextTimeParser timeParser, long split)
{
    Configuration hadoopConf = new Configuration();
    CompressionCodec codec = ReflectionUtils.newInstance(Lz4Codec.class, hadoopConf);
    String outputPath = this.writeTableDir + timeParser.getPartitionPath() + "_partition_" + this.partition + "_split" + split + codec.getDefaultExtension();
    logger.info("create {} text file {}", rowKey, outputPath);
    try {
        return new FileChannel(outputPath, split, codec, hadoopConf);
    }
    catch (IOException e) {
        throw new RuntimeException("textFile " + outputPath + " writer create failed", e);
    }
}
 
Example 7
Source File: ExportManifestOutputFormat.java    From emr-dynamodb-connector with Apache License 2.0
@Override
public RecordWriter<K, Text> getRecordWriter(FileSystem ignored, JobConf job, String name,
    Progressable progress) throws IOException {
  String extension = "";
  Path file = FileOutputFormat.getTaskOutputPath(job, MANIFEST_FILENAME);
  FileSystem fs = file.getFileSystem(job);
  FSDataOutputStream fileOut = fs.create(file, progress);
  if (getCompressOutput(job)) {
    Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, job);
    extension = codec.getDefaultExtension();
  }
  return new ExportManifestRecordWriter<>(fileOut, FileOutputFormat.getOutputPath(job),
      extension);
}
 
Example 8
Source File: WARCFileWriter.java    From dkpro-c4corpus with Apache License 2.0
/**
 * Creates a WARC file, and opens it for writing. If a file with the same name already
 * exists, it is *overwritten*. Note that this is different behaviour from the other
 * constructor. Yes, this sucks. It will probably change in a future version.
 *
 * @param conf           The Hadoop configuration.
 * @param codec          If null, the file is uncompressed. If non-null, this compression codec
 *                       will be used. The codec's default file extension is appended to the filename.
 * @param workOutputPath The directory and filename prefix to which the data should be
 *                       written. We append a segment number and filename extensions to it.
 * @param progress       An object used by the mapred API for tracking a task's progress.
 * @throws IOException I/O exception
 */
public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath,
        Progressable progress)
        throws IOException
{
    this.conf = conf;
    this.codec = codec;
    this.workOutputPath = workOutputPath;
    this.progress = progress;
    this.extensionFormat =
            ".seg-%05d.warc" + (codec == null ? "" : codec.getDefaultExtension());
    this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
    createSegment();
}
 
Example 9
Source File: HdfsLoader.java    From sqoop-on-spark with Apache License 2.0
private static String getExtension(ToJobConfiguration toJobConf, CompressionCodec codec) {
  if (toJobConf.toJobConfig.outputFormat == ToFormat.SEQUENCE_FILE)
    return ".seq";
  if (codec == null)
    return ".txt";
  return codec.getDefaultExtension();
}
 
Example 10
Source File: CompressWriterFactory.java    From flink with Apache License 2.0
private String getHadoopCodecExtension(String hadoopCodecName, Configuration conf) throws IOException {
	CompressionCodec codec = new CompressionCodecFactory(conf).getCodecByName(hadoopCodecName);

	if (codec == null) {
		throw new IOException("Unable to load the provided Hadoop codec [" + hadoopCodecName + "]");
	}

	return codec.getDefaultExtension();
}
 
Example 11
Source File: Uploader.java    From secor with Apache License 2.0
private void trim(LogFilePath srcPath, long startOffset) throws Exception {
    final TopicPartition topicPartition = new TopicPartition(srcPath.getTopic(), srcPath.getKafkaPartition());
    if (startOffset == srcPath.getOffset()) {
        // If *all* the files had the right offset already, trimFiles would have returned
        // before resetting the tracker. If just some do, we don't want to rewrite files in place
        // (it's probably safe but let's not stress it), but this shouldn't happen anyway.
        throw new RuntimeException("Some LogFilePath has unchanged offset, but they don't all? " + srcPath);
    }
    FileReader reader = null;
    FileWriter writer = null;
    LogFilePath dstPath = null;
    int copiedMessages = 0;
    // Deleting the writer closes its stream flushing all pending data to the disk.
    mFileRegistry.deleteWriter(srcPath);
    try {
        CompressionCodec codec = null;
        String extension = "";
        if (mConfig.getCompressionCodec() != null && !mConfig.getCompressionCodec().isEmpty()) {
            codec = CompressionUtil.createCompressionCodec(mConfig.getCompressionCodec());
            extension = codec.getDefaultExtension();
        }
        reader = createReader(srcPath, codec);
        KeyValue keyVal;
        while ((keyVal = reader.next()) != null) {
            if (keyVal.getOffset() >= startOffset) {
                if (writer == null) {
                    String localPrefix = mConfig.getLocalPath() + '/' +
                        IdUtil.getLocalMessageDir();
                    dstPath = new LogFilePath(localPrefix, srcPath.getTopic(),
                                              srcPath.getPartitions(), srcPath.getGeneration(),
                                              srcPath.getKafkaPartition(), startOffset,
                                              extension);
                    writer = mFileRegistry.getOrCreateWriter(dstPath, codec);
                }
                writer.write(keyVal);
                if (mDeterministicUploadPolicyTracker != null) {
                    mDeterministicUploadPolicyTracker.track(topicPartition, keyVal);
                }
                copiedMessages++;
            }
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
    mFileRegistry.deletePath(srcPath);
    if (dstPath == null) {
        LOG.info("removed file {}", srcPath.getLogFilePath());
    } else {
        LOG.info("trimmed {} messages from {} to {} with start offset {}",
                copiedMessages, srcPath.getLogFilePath(), dstPath.getLogFilePath(), startOffset);
    }
}
 
Example 12
Source File: WARCFileWriter.java    From flink-crawler with Apache License 2.0
/**
 * Creates a WARC file, and opens it for writing. If a file with the same name already exists,
 * it is *overwritten*. Note that this is different behaviour from the other constructor. Yes,
 * this sucks. It will probably change in a future version.
 *
 * @param conf
 *            The Hadoop configuration.
 * @param codec
 *            If null, the file is uncompressed. If non-null, this compression codec will be
 *            used. The codec's default file extension is appended to the filename.
 * @param workOutputPath
 *            The directory and filename prefix to which the data should be written. We append a
 *            segment number and filename extensions to it.
 * @param progress
 *            An object used by the mapred API for tracking a task's progress.
 * @throws IOException
 */
public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath,
        Progressable progress) throws IOException {
    this._conf = conf;
    this._codec = codec;
    this._workOutputPath = workOutputPath;
    this._progress = progress;
    this._extensionFormat = ".seg-%05d.attempt-%05d.warc"
            + (codec == null ? "" : codec.getDefaultExtension());
    this._maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
    createSegment();
}
 
Example 13
Source File: WARCFileWriter.java    From warc-hadoop with MIT License
/**
 * Creates a WARC file, and opens it for writing. If a file with the same name already
 * exists, it is *overwritten*. Note that this is different behaviour from the other
 * constructor. Yes, this sucks. It will probably change in a future version.
 *
 * @param conf The Hadoop configuration.
 * @param codec If null, the file is uncompressed. If non-null, this compression codec
 *              will be used. The codec's default file extension is appended to the filename.
 * @param workOutputPath The directory and filename prefix to which the data should be
 *                       written. We append a segment number and filename extensions to it.
 * @param progress An object used by the mapred API for tracking a task's progress.
 * @throws IOException
 */
public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress)
        throws IOException {
    this.conf = conf;
    this.codec = codec;
    this.workOutputPath = workOutputPath;
    this.progress = progress;
    this.extensionFormat = ".seg-%05d.attempt-%05d.warc" +
            (codec == null ? "" : codec.getDefaultExtension());
    this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
    createSegment();
}