Java Code Examples for org.apache.hadoop.fs.FSDataOutputStream#flush()

The following examples show how to use org.apache.hadoop.fs.FSDataOutputStream#flush(). Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
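
Before the project examples, here is a minimal sketch of the common pattern they follow: create a stream, write, flush(), and close(). The class name and path below are hypothetical. Note that on HDFS, flush() only pushes client-side buffers and does not by itself guarantee durability; FSDataOutputStream also exposes hflush() and hsync() for stronger guarantees, which some of the examples' comments hint at.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FlushSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("/tmp/flush-sketch.txt"); // hypothetical path

    try (FSDataOutputStream out = fs.create(path, true /* overwrite */)) {
      out.writeBytes("hello, flush\n");
      // flush() pushes client-side buffered bytes to the underlying stream;
      // hflush() additionally makes the data visible to new readers on HDFS.
      out.flush();
      out.hflush();
    }
    fs.close();
  }
}
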
Example 1
Source File: HBaseFsck.java    From hbase with Apache License 2.0
@Override
public FSDataOutputStream call() throws IOException {
  try {
    FileSystem fs = CommonFSUtils.getCurrentFileSystem(this.conf);
    FsPermission defaultPerms =
      CommonFSUtils.getFilePermissions(fs, this.conf, HConstants.DATA_FILE_UMASK_KEY);
    Path tmpDir = getTmpDir(conf);
    this.hbckLockPath = new Path(tmpDir, HBCK_LOCK_FILE);
    fs.mkdirs(tmpDir);
    final FSDataOutputStream out = createFileWithRetries(fs, this.hbckLockPath, defaultPerms);
    out.writeBytes(InetAddress.getLocalHost().toString());
    // Add a note to the file explaining why hbase2 is writing out an hbck1 lock file.
    out.writeBytes(" Written by an hbase-2.x Master to block an " +
        "attempt by an hbase-1.x HBCK tool making modification to state. " +
        "See 'HBCK must match HBase server version' in the hbase refguide.");
    out.flush();
    return out;
  } catch(RemoteException e) {
    if(AlreadyBeingCreatedException.class.getName().equals(e.getClassName())){
      return null;
    } else {
      throw e;
    }
  }
}
 
Example 2
Source File: TestS3NInMemoryFileSystem.java    From big-c with Apache License 2.0
public void testBasicReadWriteIO() throws IOException {
  FSDataOutputStream writeData = fs.create(new Path(TEST_PATH));
  writeData.write(TEST_DATA.getBytes());
  writeData.flush();
  writeData.close();
  
  FSDataInputStream readData = fs.open(new Path(TEST_PATH));
  BufferedReader br = new BufferedReader(new InputStreamReader(readData));
  String line = "";
  StringBuffer stringBuffer = new StringBuffer();
  while ((line = br.readLine()) != null) {
      stringBuffer.append(line);
  }
  br.close();
  
  assert(TEST_DATA.equals(stringBuffer.toString()));
}
 
Example 3
Source File: TestS3InMemoryFileSystem.java    From hadoop with Apache License 2.0
public void testBasicReadWriteIO() throws IOException {
  FSDataOutputStream writeStream = fs.create(new Path(TEST_PATH));
  writeStream.write(TEST_DATA.getBytes());
  writeStream.flush();
  writeStream.close();
  
  FSDataInputStream readStream = fs.open(new Path(TEST_PATH));
  BufferedReader br = new BufferedReader(new InputStreamReader(readStream));
  String line = "";
  StringBuffer stringBuffer = new StringBuffer();
  while ((line = br.readLine()) != null) {
      stringBuffer.append(line);
  }
  br.close();
  
  assert(TEST_DATA.equals(stringBuffer.toString()));
}
 
Example 4
Source File: TestStochasticLoadBalancerHeterogeneousCostRules.java    From hbase with Apache License 2.0
@Test
public void testLoadingFomHDFS() throws Exception {
  HTU.startMiniDFSCluster(3);
  try {
    MiniDFSCluster cluster = HTU.getDFSCluster();
    DistributedFileSystem fs = cluster.getFileSystem();
    // Writing file
    Path path = new Path(fs.getHomeDirectory(), DEFAULT_RULES_FILE_NAME);
    FSDataOutputStream stream = fs.create(path);
    stream.write("server1 10".getBytes());
    stream.flush();
    stream.close();

    Configuration configuration = HTU.getConfiguration();

    // start costFunction
    configuration.set(
      HeterogeneousRegionCountCostFunction.HBASE_MASTER_BALANCER_HETEROGENEOUS_RULES_FILE,
      path.toString());
    this.costFunction = new HeterogeneousRegionCountCostFunction(configuration);
    this.costFunction.loadRules();
    Assert.assertEquals(1, this.costFunction.getNumberOfRulesLoaded());
  } finally {
    HTU.shutdownMiniCluster();
  }
}
 
Example 5
Source File: TestS3InMemoryFileSystem.java    From big-c with Apache License 2.0
public void testBasicReadWriteIO() throws IOException {
  FSDataOutputStream writeStream = fs.create(new Path(TEST_PATH));
  writeStream.write(TEST_DATA.getBytes());
  writeStream.flush();
  writeStream.close();
  
  FSDataInputStream readStream = fs.open(new Path(TEST_PATH));
  BufferedReader br = new BufferedReader(new InputStreamReader(readStream));
  String line = "";
  StringBuffer stringBuffer = new StringBuffer();
  while ((line = br.readLine()) != null) {
      stringBuffer.append(line);
  }
  br.close();
  
  assert(TEST_DATA.equals(stringBuffer.toString()));
}
 
Example 6
Source File: TestBlockUnderConstruction.java    From hadoop with Apache License 2.0
void writeFile(Path file, FSDataOutputStream stm, int size)
throws IOException {
  long blocksBefore = stm.getPos() / BLOCK_SIZE;
  
  TestFileCreation.writeFile(stm, BLOCK_SIZE);
  // need to make sure the full block is completely flushed to the DataNodes
  // (see FSOutputSummer#flush)
  stm.flush();
  int blocksAfter = 0;
  // wait until the block is allocated by DataStreamer
  BlockLocation[] locatedBlocks;
  while(blocksAfter <= blocksBefore) {
    locatedBlocks = DFSClientAdapter.getDFSClient(hdfs).getBlockLocations(
        file.toString(), 0L, BLOCK_SIZE*NUM_BLOCKS);
    blocksAfter = locatedBlocks == null ? 0 : locatedBlocks.length;
  }
}
 
Example 7
Source File: TestHDFSIntegration.java    From incubator-sentry with Apache License 2.0
private void loadDataTwoCols(Statement stmt) throws IOException, SQLException {
  FSDataOutputStream f1 = miniDFS.getFileSystem().create(new Path("/tmp/f2.txt"));
  f1.writeChars("m1d1_t1, m1d1_t2\n");
  f1.writeChars("m1d1_t2, m1d1_t2\n");
  f1.writeChars("m1d1_t3, m1d1_t2\n");
  f1.flush();
  f1.close();
  stmt.execute("load data inpath \'/tmp/f2.txt\' overwrite into table p1 partition (month=1, day=1)");
  ResultSet rs = stmt.executeQuery("select * from p1");
  List<String> vals = new ArrayList<String>();
  while (rs.next()) {
    vals.add(rs.getString(1));
  }
  Assert.assertEquals(3, vals.size());
  rs.close();
}
 
Example 8
Source File: TestInLineFileSystem.java    From hudi with Apache License 2.0
private OuterPathInfo generateOuterFileAndGetInfo(int inlineContentSize) throws IOException {
  OuterPathInfo toReturn = new OuterPathInfo();
  Path outerPath = getRandomOuterFSPath();
  listOfGeneratedPaths.add(outerPath);
  toReturn.outerPath = outerPath;
  FSDataOutputStream wrappedOut = outerPath.getFileSystem(conf).create(outerPath, true);
  // append random bytes
  byte[] randomBytes = new byte[RANDOM.nextInt(1000)];
  RANDOM.nextBytes(randomBytes);
  wrappedOut.write(randomBytes);
  toReturn.startOffset = wrappedOut.getPos();
  // add inline content
  byte[] embeddedInlineBytes = new byte[inlineContentSize];
  RANDOM.nextBytes(embeddedInlineBytes);
  wrappedOut.write(embeddedInlineBytes);
  toReturn.expectedBytes = embeddedInlineBytes;
  toReturn.length = embeddedInlineBytes.length;
  // suffix random bytes
  randomBytes = new byte[RANDOM.nextInt(1000)];
  RANDOM.nextBytes(randomBytes);
  wrappedOut.write(randomBytes);
  wrappedOut.flush();
  wrappedOut.close();
  return toReturn;
}
 
Example 9
Source File: TestBlockToken.java    From big-c with Apache License 2.0
/**
 * This test writes a file and gets the block locations without closing the
 * file, and tests the block token in the last block. The block token is verified
 * by ensuring it is of the correct kind.
 * 
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testBlockTokenInLastLocatedBlock() throws IOException,
    InterruptedException {
  Configuration conf = new HdfsConfiguration();
  conf.setBoolean(DFSConfigKeys.DFS_BLOCK_ACCESS_TOKEN_ENABLE_KEY, true);
  conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 512);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
      .numDataNodes(1).build();
  cluster.waitActive();

  try {
    FileSystem fs = cluster.getFileSystem();
    String fileName = "/testBlockTokenInLastLocatedBlock";
    Path filePath = new Path(fileName);
    FSDataOutputStream out = fs.create(filePath, (short) 1);
    out.write(new byte[1000]);
    // ensure that the first block is written out (see FSOutputSummer#flush)
    out.flush();
    LocatedBlocks locatedBlocks = cluster.getNameNodeRpc().getBlockLocations(
        fileName, 0, 1000);
    while (locatedBlocks.getLastLocatedBlock() == null) {
      Thread.sleep(100);
      locatedBlocks = cluster.getNameNodeRpc().getBlockLocations(fileName, 0,
          1000);
    }
    Token<BlockTokenIdentifier> token = locatedBlocks.getLastLocatedBlock()
        .getBlockToken();
    Assert.assertEquals(BlockTokenIdentifier.KIND_NAME, token.getKind());
    out.close();
  } finally {
    cluster.shutdown();
  }
}
 
Example 10
Source File: HdfsIOBenchmark.java    From incubator-crail with Apache License 2.0
public void writeSequentialHeap() throws Exception {
  System.out.println("writing sequential file in heap mode " + path);
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  FSDataOutputStream instream = fs.create(path);
  byte[] buf = new byte[size];
  double sumbytes = 0;
  double ops = 0;
  System.out.println("read size " + size);
  System.out.println("operations " + loop);

  long start = System.currentTimeMillis();
  while (ops < loop) {
    // System.out.println("writing data, len " + buf.length);
    instream.write(buf, 0, buf.length);
    sumbytes = sumbytes + buf.length;
    ops = ops + 1.0;
  }
  instream.flush();
  long end = System.currentTimeMillis();
  double executionTime = ((double) (end - start)) / 1000.0;
  double throughput = 0.0;
  double latency = 0.0;
  double sumbits = sumbytes * 8.0;
  if (executionTime > 0) {
    throughput = sumbits / executionTime / 1024.0 / 1024.0;
    latency = 1000000.0 * executionTime / ops;
  }

  System.out.println("execution time " + executionTime);
  System.out.println("ops " + ops);
  System.out.println("sumbytes " + sumbytes);
  System.out.println("throughput " + throughput);
  System.out.println("latency " + latency);
  System.out.println("closing stream");
  instream.close();
  fs.close();
}
 
Example 11
Source File: JobClasspathHelper.java    From kite with Apache License 2.0
/**
 * This method creates a file that contains a line with an MD5 sum.
 * 
 * @param fs
 *            FileSystem where to create the file.
 * @param md5sum
 *            The string containing the MD5 sum.
 * @param remoteMd5Path
 *            The path where to save the file.
 * @throws IOException
 */
private void createMd5SumFile(FileSystem fs, String md5sum, Path remoteMd5Path) throws IOException {
  FSDataOutputStream os = null;
  try {
    os = fs.create(remoteMd5Path, true);
    os.writeBytes(md5sum);
    os.flush();
  } catch (Exception e) {
    LOG.error("{}", e);
  } finally {
    if (os != null) {
      os.close();
    }
  }
}
 
Example 12
Source File: TestLogalyzer.java    From hadoop with Apache License 2.0
/**
 * Creates a simple log file.
 * 
 * @return the path of the directory containing the created log files
 * @throws IOException
 */

private Path createLogFile() throws IOException {

  FileContext files = FileContext.getLocalFSFileContext();

  Path ws = new Path(workSpace.getAbsoluteFile().getAbsolutePath());

  files.delete(ws, true);
  Path workSpacePath = new Path(workSpace.getAbsolutePath(), "log");
  files.mkdir(workSpacePath, null, true);

  LOG.info("create logfile.log");
  Path logfile1 = new Path(workSpacePath, "logfile.log");

  FSDataOutputStream os = files.create(logfile1,
      EnumSet.of(CreateFlag.CREATE));
  os.writeBytes("4 3" + EL + "1 3" + EL + "4 44" + EL);
  os.writeBytes("2 3" + EL + "1 3" + EL + "0 45" + EL);
  os.writeBytes("4 3" + EL + "1 3" + EL + "1 44" + EL);

  os.flush();
  os.close();
  LOG.info("create logfile1.log");

  Path logfile2 = new Path(workSpacePath, "logfile1.log");

  os = files.create(logfile2, EnumSet.of(CreateFlag.CREATE));
  os.writeBytes("4 3" + EL + "1 3" + EL + "3 44" + EL);
  os.writeBytes("2 3" + EL + "1 3" + EL + "0 45" + EL);
  os.writeBytes("4 3" + EL + "1 3" + EL + "1 44" + EL);

  os.flush();
  os.close();

  return workSpacePath;
}
 
Example 13
Source File: TestWasbFsck.java    From hadoop with Apache License 2.0
/**
 * Tests that we recover files properly
 */
@Test
@Ignore  /* flush() no longer does anything  @@TODO: reinstate an appropriate test of fsck recovery*/
public void testRecover() throws Exception {
  Path danglingFile = new Path("/crashedInTheMiddle");

  // Create a file and leave it dangling and try to recover it.
  FSDataOutputStream stream = fs.create(danglingFile);
  stream.write(new byte[] { 1, 2, 3 });
  stream.flush();

  // Now we should still only see a zero-byte file in this place
  FileStatus fileStatus = fs.getFileStatus(danglingFile);
  assertNotNull(fileStatus);
  assertEquals(0, fileStatus.getLen());
  assertEquals(1, getNumTempBlobs());

  // Run WasbFsck -move to recover the file.
  runFsck("-move");

  // Now we should see the file in lost+found with the data there.
  fileStatus = fs.getFileStatus(new Path("/lost+found",
      danglingFile.getName()));
  assertNotNull(fileStatus);
  assertEquals(3, fileStatus.getLen());
  assertEquals(0, getNumTempBlobs());
  // But not in its original location
  assertFalse(fs.exists(danglingFile));
}
 
Example 14
Source File: TestLineReader.java    From tajo with Apache License 2.0
@Test
public void testCRLFLine() throws IOException {
  TajoConf conf = new TajoConf();
  Path testFile = new Path(CommonTestingUtil.getTestDir(TEST_PATH), "testCRLFLineText.txt");

  FileSystem fs = testFile.getFileSystem(conf);
  FSDataOutputStream outputStream = fs.create(testFile, true);
  outputStream.write("0\r\n1\r\n".getBytes());
  outputStream.flush();
  IOUtils.closeStream(outputStream);

  ByteBufInputChannel channel = new ByteBufInputChannel(fs.open(testFile));
  ByteBufLineReader reader = new ByteBufLineReader(channel, BufferPool.directBuffer(2));
  FileStatus status = fs.getFileStatus(testFile);

  long totalRead = 0;
  int i = 0;
  AtomicInteger bytes = new AtomicInteger();
  for(;;){
    ByteBuf buf = reader.readLineBuf(bytes);
    totalRead += bytes.get();
    if(buf == null) break;
    String row  = buf.toString(Charset.defaultCharset());
    assertEquals(i, Integer.parseInt(row));
    i++;
  }
  IOUtils.cleanup(null, reader);
  assertFalse(channel.isOpen());
  assertEquals(status.getLen(), totalRead);
  assertEquals(status.getLen(), reader.readBytes());
}
 
Example 15
Source File: CubertMD.java    From Cubert with Apache License 2.0
private static void writeMetaFile(String metaFilePath,
                                  HashMap<String, String> metaFileKeyValues) throws IOException
{
    Job tempjob = new Job();
    Configuration tempconf = tempjob.getConfiguration();
    FileSystem fs = FileSystem.get(tempconf);

    FSDataOutputStream outStream = fs.create(new Path(metaFilePath + "/.meta"));
    for (String key : metaFileKeyValues.keySet())
        outStream.write((key + " " + metaFileKeyValues.get(key) + "\n").getBytes());
    outStream.flush();
    outStream.close();
}
 
Example 16
Source File: CrawlDBTestUtil.java    From anthelion with Apache License 2.0
/**
 * Generate seedlist
 * @throws IOException 
 */
public static void generateSeedList(FileSystem fs, Path urlPath, List<String> urls, List<String>metadata) throws IOException{
  FSDataOutputStream out;
  Path file=new Path(urlPath,"urls.txt");
  fs.mkdirs(urlPath);
  out=fs.create(file);
  
  Iterator<String> urls_i=urls.iterator();
  Iterator<String> metadata_i=metadata.iterator();
  
  String url;
  String md;
  while(urls_i.hasNext()){
    url=urls_i.next();

    out.writeBytes(url);
          
    if (metadata_i.hasNext()) {
      md = metadata_i.next();
      out.writeBytes(md);
    }

    out.writeBytes("\n");
  }
  
  out.flush();
  out.close();
}
 
Example 17
Source File: VectorizedHashAggPartitionSpillHandler.java    From dremio-oss with Apache License 2.0
/**
 * Called by {@link VectorizedHashAggOperator} to ensure all spilled partitions are
 * entirely on disk. This is needed before the operator starts pumping out data from
 * in-memory partitions.
 *
 * We do this because, once a partition has been spilled, it is very likely that
 * subsequent incoming data will belong to it. We continue to do in-memory
 * aggregation (contraction) or buffering (non-contraction), and the partition may
 * or may not get spilled again. However, once all the input data has been processed
 * by the operator, spilled partitions could still be holding data in memory, and
 * that data needs to be spilled/flushed too.
 *
 * @throws Exception
 */
public void spillAnyInMemoryDataForSpilledPartitions() throws Exception {
  /* get a local reference for efficiency */
  final List<VectorizedHashAggDiskPartition> activeSpilledPartitions = this.activeSpilledPartitions;

  /* spill the memory portion of each spilled partition */
  for (VectorizedHashAggDiskPartition partitionToSpill : activeSpilledPartitions) {
    /* the in-memory portion of a partition could be empty if, after the partition
     * was spilled, no incoming data ever mapped to that particular partition.
     * the writeToStream() function that spills the partition's data structures
     * is aware of this and is a NOOP if the partition is empty.
     */
    final VectorizedHashAggPartition inmemoryPartition = partitionToSpill.getInmemoryPartitionBackPointer();
    final SpillFile partitionSpillFile = partitionToSpill.getSpillFile();
    final VectorizedHashAggPartitionSerializable partitionSerializable = new VectorizedHashAggPartitionSerializable(inmemoryPartition,
      this.operatorStats, this.warnMaxSpillTime);
    FSDataOutputStream outputStream = partitionToSpill.getSpillStream();
    /* write the partition to disk */
    partitionSerializable.writeToStream(outputStream);
    /* track number of spills */
    spills++;
    /* downsize the partition to the minimum memory (ideally zeroed out for a single batch) we would still like to keep allocated */
    inmemoryPartition.resetToMinimumSize();
    final long batchesSpilled = partitionSerializable.getNumBatchesSpilled();
    final long recordsSpilled = partitionSerializable.getNumRecordsSpilled();
    final long spilledDataSize = partitionSerializable.getSpilledDataSize();
    updateLocalStats(batchesSpilled, recordsSpilled, spilledDataSize);
    partitionToSpill.addNewSpilledBatches(batchesSpilled);
    logger.debug("Flushed in-memory data for partition: {}, batches spilled: {}, spill file path: {}",
      inmemoryPartition.getIdentifier(), batchesSpilled, partitionSpillFile.getPath());
    outputStream.flush();
    /* nothing more to spill for this partition since the operator will start a new iteration, so release the cached handle */
    partitionToSpill.closeSpillStream();
  }
}
 
Example 18
Source File: TestHoodieLogFormat.java    From hudi with Apache License 2.0
@Test
public void testAvroLogRecordReaderWithRollbackPartialBlock()
    throws IOException, URISyntaxException, InterruptedException {
  Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  // Set a small threshold so that every block is a new version
  Writer writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();

  // Write 1
  List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<IndexedRecord> copyOfRecords1 = records1.stream()
      .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  HoodieDataBlock dataBlock = getDataBlock(records1, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // Write 2
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101");
  // Append some arbitrary bytes to the end of the log (mimics a partially written commit)
  fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
  FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
  // manually create a corrupt block: magic header followed by inconsistent lengths and content
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(1000);

  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());

  // Write out some header
  outputStream.write(HoodieLogBlock.getLogMetadataBytes(header));
  outputStream.writeLong("something-random".getBytes().length);
  outputStream.write("something-random".getBytes());
  outputStream.flush();
  outputStream.close();

  // Rollback the last write
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "102");
  header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "101");
  header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE,
      String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal()));
  HoodieCommandBlock commandBlock = new HoodieCommandBlock(header);
  writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  writer = writer.appendBlock(commandBlock);

  // Write 3
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "103");
  List<IndexedRecord> records3 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<IndexedRecord> copyOfRecords3 = records3.stream()
      .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());

  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  dataBlock = getDataBlock(records3, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  List<String> allLogFiles =
      FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
          .map(s -> s.getPath().toString()).collect(Collectors.toList());

  HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "103",
      10240L, true, false, bufferSize, BASE_OUTPUT_PATH);
  assertEquals(200, scanner.getTotalLogRecords(), "We would read 200 records");
  Set<String> readKeys = new HashSet<>(200);
  scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
  assertEquals(200, readKeys.size(), "Stream collect should return all 200 records");
  copyOfRecords1.addAll(copyOfRecords3);
  Set<String> originalKeys =
      copyOfRecords1.stream().map(s -> ((GenericRecord) s).get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString())
          .collect(Collectors.toSet());
  assertEquals(originalKeys, readKeys, "CompositeAvroLogReader should return 200 records from 2 versions");
}
 
Example 19
Source File: TestHoodieLogFormat.java    From hudi with Apache License 2.0
@Test
public void testAppendAndReadOnCorruptedLog() throws IOException, URISyntaxException, InterruptedException {
  Writer writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  HoodieDataBlock dataBlock = getDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // Append some arbitrary bytes to the end of the log (mimics a partially written commit)
  fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
  FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
  // manually create a corrupt block: magic header followed by inconsistent lengths and content
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(474);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  // Write out a length that does not conform to the content
  outputStream.writeLong(400);
  // Write out incomplete content
  outputStream.write("something-random".getBytes());
  outputStream.flush();
  outputStream.close();

  // Append a proper block that is of the missing length of the corrupted block
  writer =
          HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
                  .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  records = SchemaTestUtil.generateTestRecords(0, 10);
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  dataBlock = getDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // First round of reads - we should be able to read the first block and then EOF
  Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
  assertTrue(reader.hasNext(), "First block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should have corrupted block next");
  HoodieLogBlock block = reader.next();
  assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
  assertTrue(reader.hasNext(), "Third block should be available");
  reader.next();
  assertFalse(reader.hasNext(), "There should be no more block left");

  reader.close();

  // Simulate another failure back to back
  outputStream = fs.append(writer.getLogFile().getPath());
  // manually create a corrupt block: magic header followed by inconsistent lengths and content
  outputStream.write(HoodieLogFormat.MAGIC);
  // Write out a length that does not conform to the content
  outputStream.writeLong(1000);
  outputStream.writeInt(HoodieLogBlockType.AVRO_DATA_BLOCK.ordinal());
  outputStream.writeInt(HoodieLogFormat.CURRENT_VERSION);
  // Write out a length that does not conform to the content
  outputStream.writeLong(500);
  // Write out some bytes
  outputStream.write("something-else-random".getBytes());
  outputStream.flush();
  outputStream.close();

  // Should be able to append a new block
  writer =
      HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
          .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
  records = SchemaTestUtil.generateTestRecords(0, 100);
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  dataBlock = getDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  writer.close();

  // Second round of reads - we should be able to read the first and last block
  reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
  assertTrue(reader.hasNext(), "First block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should get the 1st corrupted block next");
  reader.next();
  assertTrue(reader.hasNext(), "Third block should be available");
  reader.next();
  assertTrue(reader.hasNext(), "We should get the 2nd corrupted block next");
  block = reader.next();
  assertEquals(HoodieLogBlockType.CORRUPT_BLOCK, block.getBlockType(), "The read block should be a corrupt block");
  assertTrue(reader.hasNext(), "We should get the last block next");
  reader.next();
  assertFalse(reader.hasNext(), "We should have no more blocks left");
  reader.close();
}
 
Example 20
Source File: TrainingSparkRunner.java    From ambiverse-nlu with Apache License 2.0
private void binaryEvaluation(DataFrame predictions, String output, TrainingSettings trainingSettings) throws IOException {

  FileSystem fs = FileSystem.get(new Configuration());
  Path evalPath = new Path(output + "binary_evaluation_" + trainingSettings.getClassificationMethod() + ".txt");
  fs.delete(evalPath, true);
  FSDataOutputStream fsdos = fs.create(evalPath);

  BinaryClassificationMetrics metrics = new BinaryClassificationMetrics(predictions
      .select("rawPrediction", "label")
      .javaRDD()
      .map((Row row) -> {
        Vector vector = row.getAs("rawPrediction");
        Double label = row.getAs("label");
        return new Tuple2<Object, Object>(vector.apply(1), label);
      }).rdd());

  // Precision by threshold
  JavaRDD<Tuple2<Object, Object>> precision = metrics.precisionByThreshold().toJavaRDD();
  IOUtils.write("\nPrecision by threshold: " + precision.collect(), fsdos);

  // Recall by threshold
  JavaRDD<Tuple2<Object, Object>> recall = metrics.recallByThreshold().toJavaRDD();
  IOUtils.write("\nRecall by threshold: " + recall.collect(), fsdos);

  // F Score by threshold
  JavaRDD<Tuple2<Object, Object>> f1Score = metrics.fMeasureByThreshold().toJavaRDD();
  IOUtils.write("\nF1 Score by threshold: " + f1Score.collect(), fsdos);

  JavaRDD<Tuple2<Object, Object>> f2Score = metrics.fMeasureByThreshold(2.0).toJavaRDD();
  IOUtils.write("\nF2 Score by threshold: " + f2Score.collect(), fsdos);

  // Precision-recall curve
  JavaRDD<Tuple2<Object, Object>> prc = metrics.pr().toJavaRDD();
  IOUtils.write("\nPrecision-recall curve: " + prc.collect(), fsdos);

  // Thresholds
  JavaRDD<Double> thresholds = precision.map(t -> new Double(t._1().toString()));

  // ROC Curve
  JavaRDD<Tuple2<Object, Object>> roc = metrics.roc().toJavaRDD();
  IOUtils.write("\nROC curve: " + roc.collect(), fsdos);

  // AUPRC
  IOUtils.write("\nArea under precision-recall curve = " + metrics.areaUnderPR(), fsdos);

  // AUROC
  IOUtils.write("\nArea under ROC = " + metrics.areaUnderROC(), fsdos);

  fsdos.flush();
  IOUtils.closeQuietly(fsdos);
}