Java Code Examples for org.apache.hadoop.io.compress.CompressionCodecFactory#getCodec()

The following examples show how to use org.apache.hadoop.io.compress.CompressionCodecFactory#getCodec(). Each example is drawn from an open-source project; the source file and license are noted above each snippet. getCodec() maps a Path's file-name suffix (for example .gz or .deflate) to the registered CompressionCodec that handles it, and returns null when no codec matches.
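
Before the individual examples, here is a minimal sketch of the core pattern as one self-contained helper (the method name openPossiblyCompressed is illustrative, not part of the Hadoop API):

import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

static InputStream openPossiblyCompressed(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(path); // null when no suffix matches
    FSDataInputStream fileIn = fs.open(path);
    // Wrap the raw stream in a decompressor only when the path names a known codec.
    return codec == null ? fileIn : codec.createInputStream(fileIn);
}

Nearly every example below is a variation of this null check, differing in how it treats splits of compressed files (which cannot be split) and whether it pools decompressors via CodecPool.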
Example 1
Source File: WholeTextInputFormat.java    From spliceengine with GNU Affero General Public License v3.0
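A whole-file reader: each call to nextKeyValue() opens the next path in the split and wraps the stream in a codec input stream when the path's suffix matches one.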
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (currentPath >= split.getNumPaths()) {
        return false;
    }

    Path path = split.getPath(currentPath);
    currentPath++;

    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(path);
    key = path.toString();
    FSDataInputStream fileIn = fs.open(path);

    value = codec != null ? codec.createInputStream(fileIn) : fileIn;
    return true;
}
 
Example 2
Source File: PossiblyDecompressedInputStream.java    From big-c with Apache License 2.0
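A stream wrapper that decompresses transparently: when the path matches a codec, a Decompressor is borrowed from CodecPool; otherwise the raw FSDataInputStream is used as-is.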
public PossiblyDecompressedInputStream(Path inputPath, Configuration conf)
    throws IOException {
  CompressionCodecFactory codecs = new CompressionCodecFactory(conf);
  CompressionCodec inputCodec = codecs.getCodec(inputPath);

  FileSystem ifs = inputPath.getFileSystem(conf);
  FSDataInputStream fileIn = ifs.open(inputPath);

  if (inputCodec == null) {
    decompressor = null;
    coreInputStream = fileIn;
  } else {
    decompressor = CodecPool.getDecompressor(inputCodec);
    coreInputStream = inputCodec.createInputStream(fileIn, decompressor);
  }
}
 
Example 3
Source File: XmlInputFormat.java    From Hive-XML-SerDe with Apache License 2.0
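An XML record reader: for compressed input only the split starting at offset 0 reads any data (start is pushed to Long.MAX_VALUE for the rest), preventing splits from re-reading the same compressed file.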
public XmlRecordReader(FileSplit input, JobConf jobConf) throws IOException {
    Configuration conf = jobConf;
    this.startTag = conf.get(START_TAG_KEY).getBytes("utf-8");
    this.endTag = conf.get(END_TAG_KEY).getBytes("utf-8");
    FileSplit split = input; // input is already a FileSplit; no cast needed

    Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(file);
    FileSystem fs = file.getFileSystem(conf);
    if (codec != null) {
        this.fsin = new DataInputStream(codec.createInputStream(fs.open(file)));
        // Data is read only in the first split; all other splits are made invalid.
        // This avoids reading duplicate data, since compressed files cannot be split.
        this.start = (split.getStart() == 0) ? 0 : Long.MAX_VALUE;
        this.end = Long.MAX_VALUE;
    } else {
        this.start = split.getStart();
        this.end = this.start + split.getLength();
        FSDataInputStream fileIn = fs.open(file);
        fileIn.seek(this.start);
        this.fsin = fileIn;
    }
    this.recordStartPos = this.start;
    this.pos = this.start;
}
 
Example 4
Source File: TestExport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
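A test helper that writes a text data file, optionally gzipped; here the codec chosen by getCodec() (from the .gz suffix) is used on the output side, via createOutputStream().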
/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 * @param gzip true if the file should be gzipped.
 */
protected void createTextFile(int fileNum, int numRecords, boolean gzip,
    ColumnGenerator... extraCols) throws IOException {
  int startId = fileNum * numRecords;

  String ext = ".txt";
  if (gzip) {
    ext = ext + ".gz";
  }
  Path tablePath = getTablePath();
  Path filePath = new Path(tablePath, "part" + fileNum + ext);

  Configuration conf = new Configuration();
  if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }
  FileSystem fs = FileSystem.get(conf);
  fs.mkdirs(tablePath);
  OutputStream os = fs.create(filePath);
  if (gzip) {
    CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
    CompressionCodec codec = ccf.getCodec(filePath);
    os = codec.createOutputStream(os);
  }
  BufferedWriter w = new BufferedWriter(new OutputStreamWriter(os));
  for (int i = 0; i < numRecords; i++) {
    w.write(getRecordLine(startId + i, extraCols));
  }
  w.close();
  os.close();

  if (gzip) {
    verifyCompressedFile(filePath, numRecords);
  }
}
 
Example 5
Source File: LineRecordReader.java    From RDFS with Apache License 2.0
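The classic line record reader: compressed files are read in full by the first split (end becomes Long.MAX_VALUE), while uncompressed files seek to the split start and skip the first partial line.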
public LineRecordReader(Configuration job, 
                        FileSplit split) throws IOException {
  this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                                  Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  boolean skipFirstLine = false;
  if (codec != null) {
    in = new LineReader(codec.createInputStream(fileIn), job);
    end = Long.MAX_VALUE;
  } else {
    if (start != 0) {
      skipFirstLine = true;
      --start;
      fileIn.seek(start);
    }
    in = new LineReader(fileIn, job);
  }
  if (skipFirstLine) {  // skip first line and re-establish "start".
    start += in.readLine(new Text(), 0,
                         (int)Math.min((long)Integer.MAX_VALUE, end - start));
  }
  this.pos = start;
}
 
Example 6
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
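A Qseq record reader: uncompressed files are positioned at the first record within the split, while compressed files must begin at offset 0 and are read to end of file.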
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 7
Source File: HadoopLogsAnalyzer.java    From hadoop with Apache License 2.0
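A small helper that returns a LineReader over a path, borrowing a pooled decompressor when the path matches a codec.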
private LineReader maybeUncompressedPath(Path p)
    throws FileNotFoundException, IOException {
  CompressionCodecFactory codecs = new CompressionCodecFactory(getConf());
  inputCodec = codecs.getCodec(p);
  FileSystem fs = p.getFileSystem(getConf());
  FSDataInputStream fileIn = fs.open(p);

  if (inputCodec == null) {
    return new LineReader(fileIn, getConf());
  } else {
    inputDecompressor = CodecPool.getDecompressor(inputCodec);
    return new LineReader(inputCodec.createInputStream(fileIn,
        inputDecompressor), getConf());
  }
}
 
Example 8
Source File: TestExport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
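The counterpart to Example 4: it reopens the gzipped file through its codec and asserts the decompressed line count matches the expected number of records.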
private void verifyCompressedFile(Path f, int expectedNumLines)
    throws IOException {
  Configuration conf = new Configuration();
  if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }
  FileSystem fs = FileSystem.get(conf);
  InputStream is = fs.open(f);
  CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
  CompressionCodec codec = ccf.getCodec(f);
  LOG.info("gzip check codec is " + codec);
  Decompressor decompressor = CodecPool.getDecompressor(codec);
  if (null == decompressor) {
    LOG.info("Verifying gzip sanity with null decompressor");
  } else {
    LOG.info("Verifying gzip sanity with decompressor: "
        + decompressor.toString());
  }
  is = codec.createInputStream(is, decompressor);
  BufferedReader r = new BufferedReader(new InputStreamReader(is));
  int numLines = 0;
  while (true) {
    String ln = r.readLine();
    if (ln == null) {
      break;
    }
    numLines++;
  }

  r.close();
  assertEquals("Did not read back correct number of lines",
      expectedNumLines, numLines);
  LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
}
 
Example 9
Source File: LineRecordReader.java    From hadoop-gpu with Apache License 2.0
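Identical to Example 5; the same constructor as vendored in the hadoop-gpu fork.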
public LineRecordReader(Configuration job, 
                        FileSplit split) throws IOException {
  this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
                                  Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();
  compressionCodecs = new CompressionCodecFactory(job);
  final CompressionCodec codec = compressionCodecs.getCodec(file);

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  boolean skipFirstLine = false;
  if (codec != null) {
    in = new LineReader(codec.createInputStream(fileIn), job);
    end = Long.MAX_VALUE;
  } else {
    if (start != 0) {
      skipFirstLine = true;
      --start;
      fileIn.seek(start);
    }
    in = new LineReader(fileIn, job);
  }
  if (skipFirstLine) {  // skip first line and re-establish "start".
    start += in.readLine(new Text(), 0,
                         (int)Math.min((long)Integer.MAX_VALUE, end - start));
  }
  this.pos = start;
}
 
Example 10
Source File: HadoopLogsAnalyzer.java    From big-c with Apache License 2.0
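Identical to Example 7, as vendored in the big-c fork.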
private LineReader maybeUncompressedPath(Path p)
    throws FileNotFoundException, IOException {
  CompressionCodecFactory codecs = new CompressionCodecFactory(getConf());
  inputCodec = codecs.getCodec(p);
  FileSystem fs = p.getFileSystem(getConf());
  FSDataInputStream fileIn = fs.open(p);

  if (inputCodec == null) {
    return new LineReader(fileIn, getConf());
  } else {
    inputDecompressor = CodecPool.getDecompressor(inputCodec);
    return new LineReader(inputCodec.createInputStream(fileIn,
        inputDecompressor), getConf());
  }
}
 
Example 11
Source File: CodecFactory.java    From pxf with Apache License 2.0
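Here getCodec() is used only for a class lookup: the helper returns the codec's class for a path, or null when the file needs no decompression.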
/**
 * Helper routine to get compression codec class by path (file suffix).
 *
 * @param path path of file to get codec for
 * @return the matching codec class for the path, or null if no codec is needed.
 */
private Class<? extends CompressionCodec> getCodecClassByPath(Configuration config, String path) {
    Class<? extends CompressionCodec> codecClass = null;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    CompressionCodec codec = factory.getCodec(new Path(path));
    if (codec != null) {
        codecClass = codec.getClass();
    }
    if (LOG.isDebugEnabled()) {
        String msg = (codecClass == null ? "No codec" : "Codec " + codecClass);
        LOG.debug("{} was found for file {}", msg, path);
    }
    return codecClass;
}
 
Example 12
Source File: FastaInputFormat.java    From Hadoop-BAM with MIT License
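A FASTA record reader structured like Example 6: splits of compressed files must start at offset 0 and are read to end of file.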
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();
	current_split_pos = 1;

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
 
Example 13
Source File: TestTablePartitions.java    From tajo with Apache License 2.0
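A Tajo test of a compressed, column-partitioned table: after loading data it walks each partition directory and asserts every file's codec resolves to DeflateCodec.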
@Test
public final void testColumnPartitionedTableByOneColumnsWithCompression() throws Exception {
  ResultSet res = null;
  String tableName = IdentifierUtil.normalizeIdentifier("testColumnPartitionedTableByOneColumnsWithCompression");

  if (nodeType == NodeType.INSERT) {
    res = executeString(
      "create table " + tableName + " (col2 int4, col3 float8) USING text " +
        "WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
        "PARTITION BY column(col1 int4)");
    res.close();
    assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

    res = executeString(
      "insert overwrite into " + tableName + " select l_partkey, l_quantity, l_orderkey from lineitem");
  } else {
    res = executeString(
      "create table " + tableName + " (col2 int4, col3 float8) USING text " +
        "WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
        "PARTITION BY column(col1 int4) as select l_partkey, l_quantity, l_orderkey from lineitem");
  }
  res.close();

  TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
  if (!testingCluster.isHiveCatalogStoreRunning()) {
    assertEquals(8, desc.getStats().getNumRows().intValue());
  }

  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.exists(new Path(desc.getUri())));
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);

  Path path = new Path(desc.getUri());
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));

  for (FileStatus partition : fs.listStatus(path)){
    assertTrue(fs.isDirectory(partition.getPath()));
    for (FileStatus file : fs.listStatus(partition.getPath())) {
      CompressionCodec codec = factory.getCodec(file.getPath());
      assertTrue(codec instanceof DeflateCodec);
    }
  }

  verifyPartitionDirectoryFromCatalog(DEFAULT_DATABASE_NAME, tableName, new String[]{"col1"},
    desc.getStats().getNumRows());

  executeString("DROP TABLE " + tableName + " PURGE").close();
}
 
Example 14
Source File: TestTablePartitions.java    From incubator-tajo with Apache License 2.0
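A similar incubator-tajo test with a three-level partition hierarchy; the nested loops check the codec of every leaf file.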
@Test
public final void testColumnPartitionedTableNoMatchedPartition() throws Exception {
  String tableName = "testColumnPartitionedTableNoMatchedPartition";
  ResultSet res = executeString(
      "create table " + tableName + " (col4 text) USING csv " +
          "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
          "partition by column(col1 int4, col2 int4, col3 float8)");
  res.close();

  assertTrue(catalog.existsTable(tableName));

  res = executeString(
      "insert overwrite into " + tableName +
          " select l_returnflag , l_orderkey, l_partkey, l_quantity from lineitem");
  res.close();
  TableDesc desc = catalog.getTableDesc(tableName);
  assertEquals(5, desc.getStats().getNumRows().intValue());

  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.exists(desc.getPath()));
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);

  Path path = desc.getPath();
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1/col3=17.0")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2/col3=38.0")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2/col3=45.0")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3/col3=49.0")));

  for (FileStatus partition1 : fs.listStatus(path)){
    assertTrue(fs.isDirectory(partition1.getPath()));
    for (FileStatus partition2 : fs.listStatus(partition1.getPath())) {
      assertTrue(fs.isDirectory(partition2.getPath()));
      for (FileStatus partition3 : fs.listStatus(partition2.getPath())) {
        assertTrue(fs.isDirectory(partition3.getPath()));
        for (FileStatus file : fs.listStatus(partition3.getPath())) {
          CompressionCodec codec = factory.getCodec(file.getPath());
          assertTrue(codec instanceof DeflateCodec);
        }
      }
    }
  }

  res = executeString("select * from " + tableName + " where col2 = 9");
  assertFalse(res.next());
  res.close();
}
 
Example 15
Source File: TestTablePartitions.java    From tajo with Apache License 2.0
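The two-column variant of Example 13, again asserting DeflateCodec on every partition file.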
@Test
public final void testColumnPartitionedTableByTwoColumnsWithCompression() throws Exception {
  ResultSet res = null;
  String tableName = IdentifierUtil.normalizeIdentifier("testColumnPartitionedTableByTwoColumnsWithCompression");

  if (nodeType == NodeType.INSERT) {
    res = executeString("create table " + tableName + " (col3 float8, col4 text) USING text " +
      "WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
      "PARTITION by column(col1 int4, col2 int4)");
    res.close();

    assertTrue(catalog.existsTable(DEFAULT_DATABASE_NAME, tableName));

    res = executeString(
      "insert overwrite into " + tableName +
        " select  l_quantity, l_returnflag, l_orderkey, l_partkey from lineitem");
  } else {
    res = executeString("create table " + tableName + " (col3 float8, col4 text) USING text " +
        "WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
        "PARTITION by column(col1 int4, col2 int4) as select  l_quantity, l_returnflag, l_orderkey, " +
      "l_partkey from lineitem");
  }
  res.close();

  TableDesc desc = catalog.getTableDesc(DEFAULT_DATABASE_NAME, tableName);
  if (!testingCluster.isHiveCatalogStoreRunning()) {
    assertEquals(8, desc.getStats().getNumRows().intValue());
  }

  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.exists(new Path(desc.getUri())));
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);

  Path path = new Path(desc.getUri());
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3")));

  for (FileStatus partition1 : fs.listStatus(path)){
    assertTrue(fs.isDirectory(partition1.getPath()));
    for (FileStatus partition2 : fs.listStatus(partition1.getPath())) {
      assertTrue(fs.isDirectory(partition2.getPath()));
      for (FileStatus file : fs.listStatus(partition2.getPath())) {
        CompressionCodec codec = factory.getCodec(file.getPath());
        assertTrue(codec instanceof DeflateCodec);
      }
    }
  }

  verifyPartitionDirectoryFromCatalog(DEFAULT_DATABASE_NAME, tableName, new String[]{"col1", "col2"},
      desc.getStats().getNumRows());

  executeString("DROP TABLE " + tableName + " PURGE").close();
}
 
Example 16
Source File: CompressionEmulationUtil.java    From big-c with Apache License 2.0
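Gridmix statistics: getCodec() serves as a compression probe; files whose paths match a codec are counted and their sizes summed to compute a compression ratio.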
/** Publishes compression-related data statistics. The following statistics
 * are published:
 * <ul>
 *   <li>Total compressed input data size</li>
 *   <li>Number of compressed input data files</li>
 *   <li>Compression Ratio</li>
 *   <li>Text data dictionary size</li>
 *   <li>Random text word size</li>
 * </ul>
 */
static DataStatistics publishCompressedDataStatistics(Path inputDir, 
                        Configuration conf, long uncompressedDataSize) 
throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);
  CompressionCodecFactory compressionCodecs = 
    new CompressionCodecFactory(conf);

  // iterate over compressed files and sum up the compressed file sizes
  long compressedDataSize = 0;
  int numCompressedFiles = 0;
  // obtain input data file statuses
  FileStatus[] outFileStatuses = 
    fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
  for (FileStatus status : outFileStatuses) {
    // check if the input file is compressed
    if (compressionCodecs != null) {
      CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
      if (codec != null) {
        ++numCompressedFiles;
        compressedDataSize += status.getLen();
      }
    }
  }

  LOG.info("Gridmix is configured to use compressed input data.");
  // publish the input data size
  LOG.info("Total size of compressed input data : " 
           + StringUtils.humanReadableInt(compressedDataSize));
  LOG.info("Total number of compressed input data files : " 
           + numCompressedFiles);

  if (numCompressedFiles == 0) {
    throw new RuntimeException("No compressed file found in the input" 
        + " directory : " + inputDir.toString() + ". To enable compression"
        + " emulation, run Gridmix either with "
        + " an input directory containing compressed input file(s) or" 
        + " use the -generate option to (re)generate it. If compression"
        + " emulation is not desired, disable it by setting '" 
        + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
  }
  
  // publish the compression ratio only if the data was generated in this gridmix run
  if (uncompressedDataSize > 0) {
    // compute the compression ratio
    double ratio = ((double)compressedDataSize) / uncompressedDataSize;

    // publish the compression ratio
    LOG.info("Input Data Compression Ratio : " + ratio);
  }
  
  return new DataStatistics(compressedDataSize, numCompressedFiles, true);
}
 
Example 17
Source File: InputStriper.java    From big-c with Apache License 2.0
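Gridmix input striping: after taking bytes from the current file, the striper advances to the next file if the current one is exhausted or is compressed, since compressed input cannot be split.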
/**
 * @param inputDir Pool used to resolve block locations.
 * @param bytes Target byte count
 * @param nLocs Number of block locations per split.
 * @return A set of files satisfying the byte count, with locations weighted
 *         to the dominating proportion of input bytes.
 */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs)
    throws IOException {
  final ArrayList<Path> paths = new ArrayList<Path>();
  final ArrayList<Long> start = new ArrayList<Long>();
  final ArrayList<Long> length = new ArrayList<Long>();
  final HashMap<String,Double> sb = new HashMap<String,Double>();
  do {
    paths.add(current.getPath());
    start.add(currentStart);
    final long fromFile = Math.min(bytes, current.getLen() - currentStart);
    length.add(fromFile);
    for (BlockLocation loc :
        inputDir.locationsFor(current, currentStart, fromFile)) {
      final double tedium = loc.getLength() / (1.0 * bytes);
      for (String l : loc.getHosts()) {
        Double j = sb.get(l);
        if (null == j) {
          sb.put(l, tedium);
        } else {
          sb.put(l, j.doubleValue() + tedium);
        }
      }
    }
    currentStart += fromFile;
    bytes -= fromFile;
    // Switch to a new file if
    //  - the current file is uncompressed and completely used
    //  - the current file is compressed
    
    CompressionCodecFactory compressionCodecs = 
      new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
    if (current.getLen() - currentStart == 0
        || codec != null) {
      current = files.get(++idx % files.size());
      currentStart = 0;
    }
  } while (bytes > 0);
  final ArrayList<Entry<String,Double>> sort =
    new ArrayList<Entry<String,Double>>(sb.entrySet());
  Collections.sort(sort, hostRank);
  final String[] hosts = new String[Math.min(nLocs, sort.size())];
  for (int i = 0; i < nLocs && i < sort.size(); ++i) {
    hosts[i] = sort.get(i).getKey();
  }
  return new CombineFileSplit(paths.toArray(new Path[0]),
      toLongArray(start), toLongArray(length), hosts);
}
 
Example 18
Source File: CompressionEmulationUtil.java    From hadoop with Apache License 2.0
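Identical to Example 16, from the hadoop tree.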
/** Publishes compression-related data statistics. The following statistics
 * are published:
 * <ul>
 *   <li>Total compressed input data size</li>
 *   <li>Number of compressed input data files</li>
 *   <li>Compression Ratio</li>
 *   <li>Text data dictionary size</li>
 *   <li>Random text word size</li>
 * </ul>
 */
static DataStatistics publishCompressedDataStatistics(Path inputDir, 
                        Configuration conf, long uncompressedDataSize) 
throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);
  CompressionCodecFactory compressionCodecs = 
    new CompressionCodecFactory(conf);

  // iterate over compressed files and sum up the compressed file sizes
  long compressedDataSize = 0;
  int numCompressedFiles = 0;
  // obtain input data file statuses
  FileStatus[] outFileStatuses = 
    fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
  for (FileStatus status : outFileStatuses) {
    // check if the input file is compressed
    if (compressionCodecs != null) {
      CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
      if (codec != null) {
        ++numCompressedFiles;
        compressedDataSize += status.getLen();
      }
    }
  }

  LOG.info("Gridmix is configured to use compressed input data.");
  // publish the input data size
  LOG.info("Total size of compressed input data : " 
           + StringUtils.humanReadableInt(compressedDataSize));
  LOG.info("Total number of compressed input data files : " 
           + numCompressedFiles);

  if (numCompressedFiles == 0) {
    throw new RuntimeException("No compressed file found in the input" 
        + " directory : " + inputDir.toString() + ". To enable compression"
        + " emulation, run Gridmix either with "
        + " an input directory containing compressed input file(s) or" 
        + " use the -generate option to (re)generate it. If compression"
        + " emulation is not desired, disable it by setting '" 
        + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
  }
  
  // publish the compression ratio only if the data was generated in this gridmix run
  if (uncompressedDataSize > 0) {
    // compute the compression ratio
    double ratio = ((double)compressedDataSize) / uncompressedDataSize;

    // publish the compression ratio
    LOG.info("Input Data Compression Ratio : " + ratio);
  }
  
  return new DataStatistics(compressedDataSize, numCompressedFiles, true);
}
 
Example 19
Source File: InputStriper.java    From hadoop with Apache License 2.0
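Identical to Example 17, from the hadoop tree.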
/**
 * @param inputDir Pool used to resolve block locations.
 * @param bytes Target byte count
 * @param nLocs Number of block locations per split.
 * @return A set of files satisfying the byte count, with locations weighted
 *         to the dominating proportion of input bytes.
 */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs)
    throws IOException {
  final ArrayList<Path> paths = new ArrayList<Path>();
  final ArrayList<Long> start = new ArrayList<Long>();
  final ArrayList<Long> length = new ArrayList<Long>();
  final HashMap<String,Double> sb = new HashMap<String,Double>();
  do {
    paths.add(current.getPath());
    start.add(currentStart);
    final long fromFile = Math.min(bytes, current.getLen() - currentStart);
    length.add(fromFile);
    for (BlockLocation loc :
        inputDir.locationsFor(current, currentStart, fromFile)) {
      final double tedium = loc.getLength() / (1.0 * bytes);
      for (String l : loc.getHosts()) {
        Double j = sb.get(l);
        if (null == j) {
          sb.put(l, tedium);
        } else {
          sb.put(l, j.doubleValue() + tedium);
        }
      }
    }
    currentStart += fromFile;
    bytes -= fromFile;
    // Switch to a new file if
    //  - the current file is uncompressed and completely used
    //  - the current file is compressed
    
    CompressionCodecFactory compressionCodecs = 
      new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
    if (current.getLen() - currentStart == 0
        || codec != null) {
      current = files.get(++idx % files.size());
      currentStart = 0;
    }
  } while (bytes > 0);
  final ArrayList<Entry<String,Double>> sort =
    new ArrayList<Entry<String,Double>>(sb.entrySet());
  Collections.sort(sort, hostRank);
  final String[] hosts = new String[Math.min(nLocs, sort.size())];
  for (int i = 0; i < nLocs && i < sort.size(); ++i) {
    hosts[i] = sort.get(i).getKey();
  }
  return new CombineFileSplit(paths.toArray(new Path[0]),
      toLongArray(start), toLongArray(length), hosts);
}
 
Example 20
Source File: TestTablePartitions.java    From incubator-tajo with Apache License 2.0
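The incubator-tajo predecessor of Example 15, with the same two-level partition walk.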
@Test
public final void testColumnPartitionedTableByTwoColumnsWithCompression() throws Exception {
  String tableName = "testColumnPartitionedTableByTwoColumnsWithCompression";
  ResultSet res = executeString("create table " + tableName + " (col3 float8, col4 text) USING csv " +
      "WITH ('csvfile.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') " +
      "PARTITION by column(col1 int4, col2 int4)");
  res.close();

  assertTrue(catalog.existsTable(tableName));

  res = executeString(
      "insert overwrite into " + tableName +
          " select  l_quantity, l_returnflag, l_orderkey, l_partkey from lineitem");
  res.close();
  TableDesc desc = catalog.getTableDesc(tableName);
  assertEquals(5, desc.getStats().getNumRows().intValue());

  FileSystem fs = FileSystem.get(conf);
  assertTrue(fs.exists(desc.getPath()));
  CompressionCodecFactory factory = new CompressionCodecFactory(conf);

  Path path = desc.getPath();
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=1/col2=1")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=2/col2=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=2")));
  assertTrue(fs.isDirectory(new Path(path.toUri() + "/col1=3/col2=3")));

  for (FileStatus partition1 : fs.listStatus(path)){
    assertTrue(fs.isDirectory(partition1.getPath()));
    for (FileStatus partition2 : fs.listStatus(partition1.getPath())) {
      assertTrue(fs.isDirectory(partition2.getPath()));
      for (FileStatus file : fs.listStatus(partition2.getPath())) {
        CompressionCodec codec = factory.getCodec(file.getPath());
        assertTrue(codec instanceof DeflateCodec);
      }
    }
  }
}