Java Code Examples for org.apache.hadoop.fs.FileSystem#open()

The following examples show how to use org.apache.hadoop.fs.FileSystem#open(). The original project and source file are noted above each example.
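Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: resolve a FileSystem for a Path, call open() to get an FSDataInputStream, read from it, and close it. The class name and the path "/tmp/example.txt" are placeholders for illustration, not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemOpenSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.txt"); // placeholder path
    // Resolve the FileSystem that owns this path (local FS, HDFS, etc.)
    FileSystem fs = path.getFileSystem(conf);
    // open() returns an FSDataInputStream, which adds seek() and positioned reads
    try (FSDataInputStream in = fs.open(path)) {
      byte[] buffer = new byte[4096];
      int bytesRead = in.read(buffer);
      System.out.println("Read " + bytesRead + " bytes from " + path);
    }
  }
}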
Example 1
Source File: TestDirectoryRaidEncoder.java    From RDFS with Apache License 2.0
private void printFileCRC(FileSystem fs, Path file, long bufferSize)
    throws IOException {
  byte[] buffer = new byte[(int)bufferSize];
  FSDataInputStream stm = fs.open(file);
  StringBuilder sb = new StringBuilder();
  sb.append("CRC for file: " + file + " size " +
    fs.getFileStatus(file).getLen() + "\n");
  int bytesRead;
  // compute a CRC per chunk, using only the bytes actually read
  while ((bytesRead = stm.read(buffer)) >= 0) {
    CRC32 crc = new CRC32();
    crc.update(buffer, 0, bytesRead);
    sb.append(" " + crc.getValue());
  }
  sb.append("\n");
  System.out.println(sb.toString());
  stm.close();
}
 
Example 2
Source File: LineDocRecordReader.java    From hadoop-gpu with Apache License 2.0
/**
 * Constructor
 * @param job the job configuration
 * @param split the file split to read
 * @throws IOException if the split's file cannot be opened or read
 */
public LineDocRecordReader(Configuration job, FileSplit split)
    throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  InputStream in = fileIn;
  boolean skipFirstLine = false;
  if (start != 0) {
    skipFirstLine = true; // wait till BufferedInputStream to skip
    --start;
    fileIn.seek(start);
  }

  this.in = new BufferedInputStream(in);
  if (skipFirstLine) { // skip first line and re-establish "start".
    start += LineDocRecordReader.readData(this.in, null, EOL);
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}
 
Example 3
Source File: TestReadWhileWriting.java    From big-c with Apache License 2.0
static void checkFile(Path p, int expectedsize, final Configuration conf
    ) throws IOException, InterruptedException {
  //open the file with another user account
  final String username = UserGroupInformation.getCurrentUser().getShortUserName()
      + "_" + ++userCount;

  UserGroupInformation ugi = UserGroupInformation.createUserForTesting(username, 
                               new String[] {"supergroup"});
  
  final FileSystem fs = DFSTestUtil.getFileSystemAs(ugi, conf);
  
  final HdfsDataInputStream in = (HdfsDataInputStream)fs.open(p);

  //Check visible length
  Assert.assertTrue(in.getVisibleLength() >= expectedsize);

  //Able to read?
  for(int i = 0; i < expectedsize; i++) {
    Assert.assertEquals((byte)i, (byte)in.read());  
  }

  in.close();
}
 
Example 4
Source File: PagesByURLExtractor.java    From dkpro-c4corpus with Apache License 2.0
/**
 * Returns a line-delimited string of URLs from the given file
 * <pre>
 * info url1
 * info url2
 * ...
 * </pre>
 *
 * @param urlFile path of the file containing the URL list
 * @return a newline-delimited string of URLs
 * @throws IOException if the file cannot be opened or read
 */
String loadURLs(String urlFile)
        throws IOException
{
    // the path needs to be handled by Hadoop FS
    Path path = new Path(urlFile);
    FileSystem fileSystem = path.getFileSystem(getConf());

    FSDataInputStream fsDataInputStream = fileSystem.open(path);
    BufferedReader br = new BufferedReader(new InputStreamReader(fsDataInputStream));

    StringBuilder sb = new StringBuilder();
    String line;
    while ((line = br.readLine()) != null) {
        // split line
        sb.append(line.split("\t")[1]);
        sb.append("\n");
    }

    // close
    fsDataInputStream.close();

    return sb.toString();
}
 
Example 5
Source File: AutoInputFormat.java    From big-c with Apache License 2.0
public RecordReader getRecordReader(InputSplit split, JobConf job,
  Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    is.readFully(header);
  } catch (EOFException eof) {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
    reader = seqFileInputFormat.getRecordReader(split, job, reporter);
  } else {
    reader = textInputFormat.getRecordReader(split, job, reporter);
  }
  return reader;
}
 
Example 6
Source File: ContractTestUtils.java    From big-c with Apache License 2.0
/**
 * Read the file and convert to a byte dataset.
 * This uses readFully internally, so the whole file is read
 * without ever having to seek().
 * @param fs filesystem
 * @param path path to read from
 * @param len length of data to read
 * @return the bytes
 * @throws IOException IO problems
 */
public static byte[] readDataset(FileSystem fs, Path path, int len)
    throws IOException {
  FSDataInputStream in = fs.open(path);
  byte[] dest = new byte[len];
  int offset =0;
  int nread = 0;
  try {
    while (nread < len) {
      int nbytes = in.read(dest, offset + nread, len - nread);
      if (nbytes < 0) {
        throw new EOFException("End of file reached before reading fully.");
      }
      nread += nbytes;
    }
  } finally {
    in.close();
  }
  return dest;
}
 
Example 7
Source File: TestExport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
private void verifyCompressedFile(Path f, int expectedNumLines)
    throws IOException {
  Configuration conf = new Configuration();
  if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }
  FileSystem fs = FileSystem.get(conf);
  InputStream is = fs.open(f);
  CompressionCodecFactory ccf = new CompressionCodecFactory(conf);
  CompressionCodec codec = ccf.getCodec(f);
  LOG.info("gzip check codec is " + codec);
  Decompressor decompressor = CodecPool.getDecompressor(codec);
  if (null == decompressor) {
    LOG.info("Verifying gzip sanity with null decompressor");
  } else {
    LOG.info("Verifying gzip sanity with decompressor: "
        + decompressor.toString());
  }
  is = codec.createInputStream(is, decompressor);
  BufferedReader r = new BufferedReader(new InputStreamReader(is));
  int numLines = 0;
  while (true) {
    String ln = r.readLine();
    if (ln == null) {
      break;
    }
    numLines++;
  }

  r.close();
  assertEquals("Did not read back correct number of lines",
      expectedNumLines, numLines);
  LOG.info("gzip sanity check returned " + numLines + " lines; ok.");
}
 
Example 8
Source File: DFSTestUtil.java    From big-c with Apache License 2.0
public static byte[] readFileBuffer(FileSystem fs, Path fileName) 
    throws IOException {
  ByteArrayOutputStream os = new ByteArrayOutputStream();
  try {
    FSDataInputStream in = fs.open(fileName);
    try {
      IOUtils.copyBytes(in, os, 1024, true);
      return os.toByteArray();
    } finally {
      in.close();
    }
  } finally {
    os.close();
  }
}
 
Example 9
Source File: Schemas.java    From kite with Apache License 2.0
public static Schema fromAvro(FileSystem fs, Path location)
    throws IOException {
  InputStream in = null;
  boolean threw = true;

  try {
    in = fs.open(location);
    Schema schema = fromAvro(in);
    threw = false;
    return schema;
  } finally {
    Closeables.close(in, threw);
  }
}
 
Example 10
Source File: ImportInputFormat.java    From emr-dynamodb-connector with Apache License 2.0
/**
 * An example manifest file looks like
 *
 * {"name":"DynamoDB-export","version":3, "entries":[
 * {"url":"s3://path/to/object/92dd1414-a049-4c68-88fb-a23acd44907e","mandatory":true},
 * {"url":"s3://path/to/object/ba3f3535-7aa1-4f97-a530-e72938bf4b76","mandatory":true} ]}
 */
// @formatter:on
private List<InputSplit> parseManifest(FileSystem fs, Path manifestPath, JobConf job) throws
    IOException {
  List<InputSplit> splits = null;

  FSDataInputStream fp = fs.open(manifestPath);
  JsonReader reader = new JsonReader(new InputStreamReader(fp, Charsets.UTF_8));

  reader.beginObject();
  while (reader.hasNext()) {
    String name = reader.nextName();
    switch (name) {
      case VERSION_JSON_KEY:
        job.set(DynamoDBConstants.EXPORT_FORMAT_VERSION, String.valueOf(reader.nextInt()));
        break;
      case ENTRIES_JSON_KEY:
        splits = readEntries(reader, job);
        break;
      default:
        log.info("Skipping a JSON key in the manifest file: " + name);
        reader.skipValue();
        break;
    }
  }
  reader.endObject();

  if (splits == null) {
    return Collections.emptyList();
  }
  return splits;
}
 
Example 11
Source File: AbstractBitcoinRecordReader.java    From hadoopcryptoledger with Apache License 2.0
/**
 * Initializes the reader.
 * @param split split to use (assumed to be a file split)
 * @param context context of the job
 * @throws java.io.IOException in case of errors reading from the file stream provided by Hadoop
 * @throws java.lang.InterruptedException in case of thread interruption
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fSplit = (FileSplit) split;
  // Initialize start and end of split
  start = fSplit.getStart();
  end = start + fSplit.getLength();
  final Path file = fSplit.getPath();
  codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
  final FileSystem fs = file.getFileSystem(context.getConfiguration());
  // open stream
  FSDataInputStream fileIn = fs.open(file);
  if (isCompressedInput()) { // decompress
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
          fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.CONTINUOUS);
      bbr = new BitcoinBlockReader(cIn, this.maxSizeBitcoinBlock, this.bufferSize,
          this.specificMagicByteArray, this.useDirectBuffer, this.readAuxPOW);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn; // take pos from compressed stream
    } else {
      bbr = new BitcoinBlockReader(codec.createInputStream(fileIn, decompressor), this.maxSizeBitcoinBlock,
          this.bufferSize, this.specificMagicByteArray, this.useDirectBuffer, readAuxPOW);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    bbr = new BitcoinBlockReader(fileIn, this.maxSizeBitcoinBlock, this.bufferSize,
        this.specificMagicByteArray, this.useDirectBuffer, readAuxPOW);
    filePosition = fileIn;
  }
  // seek to block start (for the case a block overlaps a split)
  try {
    bbr.seekBlockStart();
  } catch (BitcoinBlockReadException bbre) {
    LOG.error("Error reading Bitcoin blockchain data");
    LOG.error(bbre);
  }
}
 
Example 12
Source File: TestWasbUriAndConfiguration.java    From big-c with Apache License 2.0
/**
 * Reads the file given and makes sure that it's a single-byte file with the
 * given value in it.
 */
private static void assertSingleByteValue(FileSystem fs, Path testFile,
    int expectedValue) throws Exception {
  InputStream inputStream = fs.open(testFile);
  int byteRead = inputStream.read();
  assertTrue("File unexpectedly empty: " + testFile, byteRead >= 0);
  assertTrue("File has more than a single byte: " + testFile,
      inputStream.read() < 0);
  inputStream.close();
  assertEquals("Unxpected content in: " + testFile, expectedValue, byteRead);
}
 
Example 13
Source File: TestAvroStorage.java    From spork with Apache License 2.0
private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration());

    /* read in expected results*/
    Set<GenericData.Record> expected = getExpected (expectedOutpath);

    /* read in output results and compare */
    Path output = new Path(outPath);
    assertTrue("Output dir does not exists!", fs.exists(output)
            && fs.getFileStatus(output).isDir());

    Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
    assertTrue("Split field dirs not found!", paths != null);

    for (Path path : paths) {
      Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
      assertTrue("No files found for path: " + path.toUri().getPath(),
              files != null);
      for (Path filePath : files) {
        assertTrue("This shouldn't be a directory", fs.isFile(filePath));

        GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

        DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(
                                        fs.open(filePath), reader);
        assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
        int count = 0;
        while (in.hasNext()) {
            GenericData.Record obj = in.next();
            assertTrue("Avro result object found that's not expected: Found "
                    + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString()
                    + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n"
                    , expected.contains(obj));
            count++;
        }
        in.close();
        assertEquals(expected.size(), count);
      }
    }
}
 
Example 14
Source File: AvroHdfsFileSink.java    From components with Apache License 2.0
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
    try (DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        Schema schema = null;
        String inputCodec = null;
        OutputStream output = new BufferedOutputStream(fs.create(new Path(targetFile)));
        for (FileStatus sourceStatus : sourceStatuses) {
            try (DataFileStream<GenericRecord> reader = new DataFileStream<GenericRecord>(
                    new BufferedInputStream(fs.open(sourceStatus.getPath())), new GenericDatumReader<GenericRecord>())) {

                if (schema == null) {
                    schema = reader.getSchema();
                    for (String key : reader.getMetaKeys()) {
                        if (!DataFileWriter.isReservedMeta(key)) {
                            writer.setMeta(key, reader.getMeta(key));
                        }
                    }
                    inputCodec = reader.getMetaString(DataFileConstants.CODEC);
                    if (inputCodec == null) {
                        inputCodec = DataFileConstants.NULL_CODEC;
                    }
                    writer.setCodec(CodecFactory.fromString(inputCodec));
                    writer.create(schema, output);
                }
                writer.appendAllFrom(reader, false);
            }
        }
    }
}
 
Example 15
Source File: TestScatterGather.java    From RDFS with Apache License 2.0
private void pReadFile(FileSystem fileSys, Path name) throws IOException {
  FSDataInputStream stm = fileSys.open(name);
  byte[] expected = new byte[(int)(12*blockSize)];
  if (simulatedStorage) {
    for (int i= 0; i < expected.length; i++) {  
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    Random rand = new Random(seed);
    rand.nextBytes(expected);
  }
  // do a sanity check. pread the first 4K bytes
  List<ByteBuffer> rlist = stm.readFullyScatterGather(0, 4096);
  checkAndEraseData(rlist, 4096, 0, expected, "Read Sanity Test");

  // now do a pread for the first 8K bytes
  byte[] actual = new byte[8192];
  doPread(stm, 0L, actual, 0, 8192);
  checkAndEraseData(actual, 0, expected, "Pread Test 1");

  // Now check to see if the normal read returns 0K-8K byte range
  actual = new byte[8192];
  stm.readFully(actual);
  checkAndEraseData(actual, 0, expected, "Pread Test 2");

  // Now see if we can cross a single block boundary successfully
  // read 4K bytes from blockSize - 2K offset
  rlist = stm.readFullyScatterGather(blockSize - 2048, 4096);
  checkAndEraseData(rlist, 4096, (int)(blockSize-2048), expected, "Pread Test 3");

  // now see if we can cross two block boundaries successfully
  // read blockSize + 4K bytes from blockSize - 2K offset
  int size = (int)(blockSize+4096);
  rlist = stm.readFullyScatterGather(blockSize - 2048, size);
  checkAndEraseData(rlist, size, (int)(blockSize-2048), expected, "Pread Test 4");

  // now see if we can cross two block boundaries that are not cached
  // read blockSize + 4K bytes from 10*blockSize - 2K offset
  size = (int)(blockSize+4096);
  rlist = stm.readFullyScatterGather(10*blockSize - 2048, size);
  checkAndEraseData(rlist, size, (int)(10*blockSize-2048), expected, "Pread Test 5");

  // now check that even after all these preads, we can still read
  // bytes 8K-12K
  actual = new byte[4096];
  stm.readFully(actual);
  checkAndEraseData(actual, 8192, expected, "Pread Test 6");

  // pread beyond the end of the file. It should return the last half block.
  size = blockSize/2;
  rlist = stm.readFullyScatterGather(11*blockSize+size, blockSize);
  checkAndEraseData(rlist, size, (int)(11*blockSize+size), expected, "Pread Test 7");

  IOException res = null;
  try { // normal read beyond the end of the file
    stm.readFully(11*blockSize+blockSize/2, actual, 0, blockSize);
  } catch (IOException e) {
    // should throw an exception
    res = e;
  }
  assertTrue("Error reading beyond file boundary.", res != null);
  
  stm.close();
}
 
Example 16
Source File: TestIncrementalImport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Assert that a directory contains a file with exactly one line
 * in it, containing the prescribed number 'val'.
 */
public void assertSpecificNumber(String tableName, int val) {
  try {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path warehouse = new Path(BaseSqoopTestCase.LOCAL_WAREHOUSE_DIR);
    Path tableDir = new Path(warehouse, tableName);
    FileStatus [] stats = fs.listStatus(tableDir);
    String [] filePaths = new String[stats.length];
    for (int i = 0; i < stats.length; i++) {
      filePaths[i] = stats[i].getPath().toString();
    }

    // Read the first file that is not a hidden file.
    boolean foundVal = false;
    for (String filePath : filePaths) {
      String fileName = new Path(filePath).getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }

      if (foundVal) {
        // Make sure we don't have two or more "real" files in the dir.
        fail("Got an extra data-containing file in this directory.");
      }

      BufferedReader r = new BufferedReader(
          new InputStreamReader(fs.open(new Path(filePath))));
      try {
        String s = r.readLine();
        if (val == (int) Integer.valueOf(s.trim().split(",")[0])) {
          if (foundVal) {
            fail("Expected only one result, but got another line: " + s);
          }
          foundVal = true;
        }
      } finally {
        r.close();
      }
    }
  } catch (IOException e) {
    fail("Got unexpected exception: " + StringUtils.stringifyException(e));
  }
}
 
Example 17
Source File: TestShortCircuitLocalRead.java    From big-c with Apache License 2.0
@Test(timeout=10000)
public void testSkipWithVerifyChecksum() throws IOException {
  int size = blockSize;
  Configuration conf = new Configuration();
  conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
  conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
  conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
      "/tmp/testSkipWithVerifyChecksum._PORT");
  DomainSocket.disableBindPathValidation();
  if (simulatedStorage) {
    SimulatedFSDataset.setFactory(conf);
  }
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1)
      .format(true).build();
  FileSystem fs = cluster.getFileSystem();
  try {
    // check that / exists
    Path path = new Path("/");
    assertTrue("/ should be a directory", fs.getFileStatus(path)
        .isDirectory() == true);
    
    byte[] fileData = AppendTestUtil.randomBytes(seed, size*3);
    // create a new file in home directory. Do not close it.
    Path file1 = new Path("filelocal.dat");
    FSDataOutputStream stm = createFile(fs, file1, 1);

    // write to file
    stm.write(fileData);
    stm.close();
    
    // now test the skip function
    FSDataInputStream instm = fs.open(file1);
    byte[] actual = new byte[fileData.length];
    // read something from the block first, otherwise BlockReaderLocal.skip()
    // will not be invoked
    int nread = instm.read(actual, 0, 3);
    long skipped = 2*size+3;
    instm.seek(skipped);
    nread = instm.read(actual, (int)(skipped + nread), 3);
    instm.close();
      
  } finally {
    fs.close();
    cluster.shutdown();
  }
}
 
Example 18
Source File: TestGridmixSubmission.java    From hadoop with Apache License 2.0
/**
 * Tests the reading of traces in GridMix3. These traces are generated by
 * Rumen and are in the JSON format. The traces can optionally be compressed
 * and uncompressed traces can also be passed to GridMix3 via its standard
 * input stream. The testing is effected via JUnit assertions.
 *
 * @throws Exception if there was an error.
 */
@Test (timeout=20000)
public void testTraceReader() throws Exception {
  Configuration conf = new Configuration();
  FileSystem lfs = FileSystem.getLocal(conf);
  Path rootInputDir = new Path(System.getProperty("src.test.data"));
  rootInputDir = rootInputDir.makeQualified(lfs.getUri(),
          lfs.getWorkingDirectory());
  Path rootTempDir = new Path(System.getProperty("test.build.data",
          System.getProperty("java.io.tmpdir")), "testTraceReader");
  rootTempDir = rootTempDir.makeQualified(lfs.getUri(),
          lfs.getWorkingDirectory());
  Path inputFile = new Path(rootInputDir, "wordcount.json.gz");
  Path tempFile = new Path(rootTempDir, "gridmix3-wc.json");

  InputStream origStdIn = System.in;
  InputStream tmpIs = null;
  try {
    DebugGridmix dgm = new DebugGridmix();
    JobStoryProducer jsp = dgm.createJobStoryProducer(inputFile.toString(),
            conf);

    LOG.info("Verifying JobStory from compressed trace...");
    verifyWordCountJobStory(jsp.getNextJob());

    expandGzippedTrace(lfs, inputFile, tempFile);
    jsp = dgm.createJobStoryProducer(tempFile.toString(), conf);
    LOG.info("Verifying JobStory from uncompressed trace...");
    verifyWordCountJobStory(jsp.getNextJob());

    tmpIs = lfs.open(tempFile);
    System.setIn(tmpIs);
    LOG.info("Verifying JobStory from trace in standard input...");
    jsp = dgm.createJobStoryProducer("-", conf);
    verifyWordCountJobStory(jsp.getNextJob());
  } finally {
    System.setIn(origStdIn);
    if (tmpIs != null) {
      tmpIs.close();
    }
    lfs.delete(rootTempDir, true);
  }
}
 
Example 19
Source File: Metadata.java    From Bats with Apache License 2.0
/**
 * Reads the summary from the metadata cache file; if the cache file is stale, recreates the metadata.
 * @param fs the file system containing the metadata cache
 * @param metadataParentDir parent directory of the metadata cache files
 * @param autoRefreshTriggered true if the auto-refresh is already triggered
 * @param readerConfig the Parquet reader configuration
 * @return the metadata summary, or null if it does not exist or cannot be read
 */
public static Metadata_V4.MetadataSummary getSummary(FileSystem fs, Path metadataParentDir, boolean autoRefreshTriggered, ParquetReaderConfig readerConfig) {
  Path summaryFile = getSummaryFileName(metadataParentDir);
  Path metadataDirFile = getDirFileName(metadataParentDir);
  MetadataContext metaContext = new MetadataContext();
  try {
    // If autoRefresh is not triggered and none of the metadata files exist
    if (!autoRefreshTriggered && !metadataExists(fs, metadataParentDir)) {
      logger.debug("Metadata doesn't exist in {}", metadataParentDir);
      return null;
    } else if (autoRefreshTriggered && !fs.exists(summaryFile)) {
      logger.debug("Metadata Summary file {} does not exist", summaryFile);
      return null;
    } else {
      // If the autorefresh is not triggered, check if the cache file is stale and trigger auto-refresh
      if (!autoRefreshTriggered) {
        Metadata metadata = new Metadata(readerConfig);
        if (!fs.exists(metadataDirFile)) {
          return null;
        }
        ParquetTableMetadataDirs metadataDirs  = readMetadataDirs(fs, metadataDirFile, metaContext, readerConfig);
        if (metadata.tableModified(metadataDirs.getDirectories(), summaryFile, metadataParentDir, metaContext, fs)) {
          ParquetTableMetadata_v4 parquetTableMetadata = (metadata.createMetaFilesRecursivelyAsProcessUser(Path.getPathWithoutSchemeAndAuthority(summaryFile.getParent()), fs, true, null, true)).getLeft();
          return parquetTableMetadata.getSummary();
        }
      }
      // Read the existing metadataSummary cache file to get the metadataSummary
      ObjectMapper mapper = new ObjectMapper();
      final SimpleModule serialModule = new SimpleModule();
      serialModule.addDeserializer(SchemaPath.class, new SchemaPath.De());
      serialModule.addKeyDeserializer(ColumnTypeMetadata_v4.Key.class, new ColumnTypeMetadata_v4.Key.DeSerializer());
      AfterburnerModule module = new AfterburnerModule();
      module.setUseOptimizedBeanDeserializer(true);
      mapper.registerModule(serialModule);
      mapper.registerModule(module);
      mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
      InputStream is = fs.open(summaryFile);
      Metadata_V4.MetadataSummary metadataSummary = mapper.readValue(is, Metadata_V4.MetadataSummary.class);
      return metadataSummary;
    }
  } catch (IOException e) {
    logger.debug("Failed to read '{}' summary metadata file", summaryFile, e);
    return null;
  }
}
 
Example 20
Source File: HDFSUtils.java    From metron with Apache License 2.0
/**
 * Reads a provided path as raw bytes.
 *
 * @param inPath The path to be read
 * @return The raw bytes of the contents
 * @throws IOException If an error occurs during reading the path
 */
public static byte[] readBytes(Path inPath) throws IOException {
  FileSystem fs = FileSystem.get(inPath.toUri(), new Configuration());
  try (FSDataInputStream inputStream = fs.open(inPath)) {
    return IOUtils.toByteArray(inputStream);
  }
}