Java Code Examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations()

The following examples show how to use org.apache.hadoop.fs.FileSystem#getFileBlockLocations() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example 1

Source File: TestNativeAzureFileSystemBlockLocations.java From big-c with Apache License 2.0

6 votes

private static BlockLocation[] getBlockLocationsOutput(int fileSize,
    int blockSize, long start, long len, String blockLocationHost)
    throws Exception {
  Configuration conf = new Configuration();
  conf.set(NativeAzureFileSystem.AZURE_BLOCK_SIZE_PROPERTY_NAME, ""
      + blockSize);
  if (blockLocationHost != null) {
    conf.set(NativeAzureFileSystem.AZURE_BLOCK_LOCATION_HOST_PROPERTY_NAME,
        blockLocationHost);
  }
  AzureBlobStorageTestAccount testAccount = AzureBlobStorageTestAccount
      .createMock(conf);
  FileSystem fs = testAccount.getFileSystem();
  Path testFile = createTestFile(fs, fileSize);
  FileStatus stat = fs.getFileStatus(testFile);
  BlockLocation[] locations = fs.getFileBlockLocations(stat, start, len);
  testAccount.cleanup();
  return locations;
}

Example 2

Source File: TestCombineFileInputFormat.java From RDFS with Apache License 2.0

6 votes

@Override
protected LocatedFileStatus[] listLocatedStatus(JobConf job) throws IOException {
  Path[] files = getInputPaths(job);
  LocatedFileStatus[] results = new LocatedFileStatus[files.length];
  for (int i = 0; i < files.length; i++) {
    Path p = files[i];
    FileSystem fs = p.getFileSystem(job);
    FileStatus stat = fs.getFileStatus(p);
    if (stat.isDir()) {
      results[i] = new LocatedFileStatus(stat, null);
    } else {
      results[i] = new LocatedFileStatus(stat,
          fs.getFileBlockLocations(stat, 0, stat.getLen()));
    }
  }
  return results;
}

Example 3

Source File: Util.java From iceberg with Apache License 2.0

6 votes

public static String[] blockLocations(CombinedScanTask task, Configuration conf) {
  Set<String> locationSets = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    Path path = new Path(f.file().path().toString());
    try {
      FileSystem fs = path.getFileSystem(conf);
      for (BlockLocation b : fs.getFileBlockLocations(path, f.start(), f.length())) {
        locationSets.addAll(Arrays.asList(b.getHosts()));
      }
    } catch (IOException ioe) {
      LOG.warn("Failed to get block locations for path {}", path, ioe);
    }
  }

  return locationSets.toArray(new String[0]);
}

Example 4

Source File: BlurBlockPlacementPolicyDefaultTest.java From incubator-retired-blur with Apache License 2.0

6 votes

private void waitForReplication(FileSystem fileSystem, Path p, int replicas) throws IOException, InterruptedException {
  FileStatus fileStatus = fileSystem.getFileStatus(p);
  boolean fail = true;
  while (fail) {
    fail = false;
    BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(p, 0, fileStatus.getLen());
    for (BlockLocation blockLocation : blockLocations) {
      System.out.println(blockLocation);
      String[] hosts = blockLocation.getHosts();
      if (hosts.length != replicas) {
        fail = true;
      }
    }
    Thread.sleep(1000);
  }
}

Example 5

Source File: TestSmallBlock.java From hadoop-gpu with Apache License 2.0

6 votes

private void checkFile(FileSystem fileSys, Path name) throws IOException {
  BlockLocation[] locations = fileSys.getFileBlockLocations(
      fileSys.getFileStatus(name), 0, fileSize);
  assertEquals("Number of blocks", fileSize, locations.length);
  FSDataInputStream stm = fileSys.open(name);
  byte[] expected = new byte[fileSize];
  if (simulatedStorage) {
    for (int i = 0; i < expected.length; ++i) {  
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    Random rand = new Random(seed);
    rand.nextBytes(expected);
  }
  // do a sanity check. Read the file
  byte[] actual = new byte[fileSize];
  stm.readFully(0, actual);
  checkAndEraseData(actual, 0, expected, "Read Sanity Test");
  stm.close();
}

Example 6

Source File: MergeSortRowIdMatcher.java From incubator-retired-blur with Apache License 2.0

5 votes

private static String getFirstBlockId(FileSystem fileSystem, Path realFile) throws IOException {
  FileStatus fileStatus = fileSystem.getFileStatus(realFile);
  BlockLocation[] locations = fileSystem.getFileBlockLocations(fileStatus, 0, 1);
  HdfsBlockLocation location = (HdfsBlockLocation) locations[0];
  LocatedBlock locatedBlock = location.getLocatedBlock();
  ExtendedBlock block = locatedBlock.getBlock();
  return toNiceString(block.getBlockId());
}

Example 7

Source File: BlurBlockPlacementPolicyDefaultTest.java From incubator-retired-blur with Apache License 2.0

5 votes

private void assertBlocksExistOnShardServer(FileSystem fileSystem, Path p, String shardServer) throws IOException {
  FileStatus fileStatus = fileSystem.getFileStatus(p);
  BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(p, 0, fileStatus.getLen());
  for (BlockLocation blockLocation : blockLocations) {
    System.out.println(blockLocation);
    String[] hosts = blockLocation.getHosts();
    assertTrue(Arrays.asList(hosts).contains(shardServer));
  }
}

Example 8

Source File: MultiFileSplit.java From hadoop with Apache License 2.0

5 votes

public String[] getLocations() throws IOException {
  HashSet<String> hostSet = new HashSet<String>();
  for (Path file : getPaths()) {
    FileSystem fs = file.getFileSystem(getJob());
    FileStatus status = fs.getFileStatus(file);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(status,
                                        0, status.getLen());
    if (blkLocations != null && blkLocations.length > 0) {
      addToSet(hostSet, blkLocations[0].getHosts());
    }
  }
  return hostSet.toArray(new String[hostSet.size()]);
}

Example 9

Source File: TestFileLocalRead.java From RDFS with Apache License 2.0

5 votes

private void checkFile(FileSystem fileSys, Path name, int repl)
  throws IOException {
  boolean done = false;

  // wait till all full blocks are confirmed by the datanodes.
  while (!done) {
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {}
    done = true;
    BlockLocation[] locations = fileSys.getFileBlockLocations(
        fileSys.getFileStatus(name), 0, fileSize);
    if (locations.length < numBlocks) {
      done = false;
      continue;
    }
    for (int idx = 0; idx < locations.length; idx++) {
      if (locations[idx].getHosts().length < repl) {
        done = false;
        break;
      }
    }
  }
  FSDataInputStream stm = fileSys.open(name);
  final byte[] expected;
  if (simulatedStorage) {
    expected = new byte[numBlocks * blockSize];
    for (int i= 0; i < expected.length; i++) {  
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    expected = AppendTestUtil.randomBytes(seed, numBlocks*blockSize);
  }
  // do a sanity check. Read the file
  byte[] actual = new byte[numBlocks * blockSize];
  System.out.println("Verifying file ");
  stm.readFully(0, actual);
  stm.close();
  checkData(actual, 0, expected, "Read 1");
}

Example 10

Source File: FileInputFormat.java From hadoop-gpu with Apache License 2.0

5 votes

/** 
 * Generate the list of files and make them into FileSplits.
 */ 
public List<InputSplit> getSplits(JobContext job
                                  ) throws IOException {
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  for (FileStatus file: listStatus(job)) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) { 
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);

      long bytesRemaining = length;
      while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
        splits.add(new FileSplit(path, length-bytesRemaining, splitSize, 
                                 blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitSize;
      }
      
      if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining, 
                   blkLocations[blkLocations.length-1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else { 
      //Create empty hosts array for zero length files
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }
  LOG.debug("Total # of splits: " + splits.size());
  return splits;
}

Example 11

Source File: TestSafeMode.java From big-c with Apache License 2.0

5 votes

void checkGetBlockLocationsWorks(FileSystem fs, Path fileName) throws IOException {
  FileStatus stat = fs.getFileStatus(fileName);
  try {  
    fs.getFileBlockLocations(stat, 0, 1000);
  } catch (SafeModeException e) {
    assertTrue("Should have not got safemode exception", false);
  } catch (RemoteException re) {
    assertTrue("Should have not got safemode exception", false);   
  }    
}

Example 12

Source File: TestSafeMode.java From hadoop with Apache License 2.0

5 votes

void checkGetBlockLocationsWorks(FileSystem fs, Path fileName) throws IOException {
  FileStatus stat = fs.getFileStatus(fileName);
  try {  
    fs.getFileBlockLocations(stat, 0, 1000);
  } catch (SafeModeException e) {
    assertTrue("Should have not got safemode exception", false);
  } catch (RemoteException re) {
    assertTrue("Should have not got safemode exception", false);   
  }    
}

Example 13

Source File: AdmmIterationInputFormat.java From laser with Apache License 2.0

4 votes

public List<InputSplit> getSplits(JobContext job) throws IOException {
	Configuration conf = job.getConfiguration();
	int numMapTasks = conf.getInt("admm.iteration.num.map.tasks", 0);
	if (0 == numMapTasks) {
		return super.getSplits(job);
	}

	// generate splits
	List<InputSplit> splits = new ArrayList<InputSplit>();
	List<FileStatus> files = listStatus(job);

	for (FileStatus file : files) {
		Path path = file.getPath();
		FileSystem fs = path.getFileSystem(job.getConfiguration());
		long length = file.getLen();
		BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0,
				length);
		if ((length != 0) && isSplitable(job, path)) {
			long blockSize = file.getBlockSize();
			long splitSize = Math.max(
					computeSplitSize(JAVA_OPTS, numMapTasks, length),
					blockSize);
			long splitLength = (long) (length / Math.ceil((double) length
					/ splitSize));
			long bytesRemaining = length;

			while (((double) bytesRemaining) / splitLength > SPLIT_SLOP) {
				int blkIndex = getBlockIndex(blkLocations, length
						- bytesRemaining);
				splits.add(new FileSplit(path, length - bytesRemaining,
						splitLength, blkLocations[blkIndex].getHosts()));

				bytesRemaining -= splitLength;
			}

			if (bytesRemaining != 0) {
				splits.add(new FileSplit(path, length - bytesRemaining,
						bytesRemaining,
						blkLocations[blkLocations.length - 1].getHosts()));
			}
		} else if (length != 0) {
			splits.add(new FileSplit(path, 0, length, blkLocations[0]
					.getHosts()));
		} else {
			splits.add(new FileSplit(path, 0, length, new String[0]));
		}
	}

	// Save the number of input files in the job-conf
	job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
	job.getConfiguration().setInt("admm.iteration.num.map.tasks",
			splits.size());
	return splits;
}

Example 14

Source File: TestBlocksWithNotEnoughRacks.java From big-c with Apache License 2.0

4 votes

@Test
public void testNodeDecomissionRespectsRackPolicy() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause 
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
  } finally {
    cluster.shutdown();
  }
}

Example 15

Source File: TestHostsFiles.java From hadoop with Apache License 2.0

4 votes

@Test
public void testHostsExcludeInUI() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause 
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    String names = name + "\n" + "localhost:42\n";
    LOG.info("adding '" + names + "' to exclude file " + excludeFile.toUri().getPath());
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
    
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName =
        new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    String nodes = (String) mbs.getAttribute(mxbeanName, "LiveNodes");
    assertTrue("Live nodes should contain the decommissioned node",
        nodes.contains("Decommissioned"));
  } finally {
    cluster.shutdown();
  }
}

Example 16

Source File: TestDistributedFileSystem.java From hadoop with Apache License 2.0

4 votes

@Test
public void testStatistics() throws Exception {
  int lsLimit = 2;
  final Configuration conf = getTestConfiguration();
  conf.setInt(DFSConfigKeys.DFS_LIST_LIMIT, lsLimit);
  final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
  try {
    final FileSystem fs = cluster.getFileSystem();
    Path dir = new Path("/test");
    Path file = new Path(dir, "file");
    
    int readOps = DFSTestUtil.getStatistics(fs).getReadOps();
    int writeOps = DFSTestUtil.getStatistics(fs).getWriteOps();
    int largeReadOps = DFSTestUtil.getStatistics(fs).getLargeReadOps();
    fs.mkdirs(dir);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    FSDataOutputStream out = fs.create(file, (short)1);
    out.close();
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    FileStatus status = fs.getFileStatus(file);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.getFileBlockLocations(file, 0, 0);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.getFileBlockLocations(status, 0, 0);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    FSDataInputStream in = fs.open(file);
    in.close();
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.setReplication(file, (short)2);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    Path file1 = new Path(dir, "file1");
    fs.rename(file, file1);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    fs.getContentSummary(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    
    // Iterative ls test
    for (int i = 0; i < 10; i++) {
      Path p = new Path(dir, Integer.toString(i));
      fs.mkdirs(p);
      FileStatus[] list = fs.listStatus(dir);
      if (list.length > lsLimit) {
        // if large directory, then count readOps and largeReadOps by 
        // number times listStatus iterates
        int iterations = (int)Math.ceil((double)list.length/lsLimit);
        largeReadOps += iterations;
        readOps += iterations;
      } else {
        // Single iteration in listStatus - no large read operation done
        readOps++;
      }
      
      // writeOps incremented by 1 for mkdirs
      // readOps and largeReadOps incremented by 1 or more
      checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    }
    
    fs.getStatus(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.getFileChecksum(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.setPermission(file1, new FsPermission((short)0777));
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    fs.setTimes(file1, 0L, 0L);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    fs.setOwner(file1, ugi.getUserName(), ugi.getGroupNames()[0]);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    fs.delete(dir, true);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
  } finally {
    if (cluster != null) cluster.shutdown();
  }
  
}

Example 17

Source File: TestDatanodeDeath.java From hadoop with Apache License 2.0

4 votes

static private void checkFile(FileSystem fileSys, Path name, int repl,
                       int numblocks, int filesize, long seed)
  throws IOException {
  boolean done = false;
  int attempt = 0;

  long len = fileSys.getFileStatus(name).getLen();
  assertTrue(name + " should be of size " + filesize +
             " but found to be of size " + len, 
             len == filesize);

  // wait till all full blocks are confirmed by the datanodes.
  while (!done) {
    attempt++;
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {}
    done = true;
    BlockLocation[] locations = fileSys.getFileBlockLocations(
        fileSys.getFileStatus(name), 0, filesize);

    if (locations.length < numblocks) {
      if (attempt > 100) {
        System.out.println("File " + name + " has only " +
                           locations.length + " blocks, " +
                           " but is expected to have " + numblocks +
                           " blocks.");
      }
      done = false;
      continue;
    }
    for (int idx = 0; idx < locations.length; idx++) {
      if (locations[idx].getHosts().length < repl) {
        if (attempt > 100) {
          System.out.println("File " + name + " has " +
                             locations.length + " blocks: " +
                             " The " + idx + " block has only " +
                             locations[idx].getHosts().length + 
                             " replicas but is expected to have " 
                             + repl + " replicas.");
        }
        done = false;
        break;
      }
    }
  }
  FSDataInputStream stm = fileSys.open(name);
  final byte[] expected = AppendTestUtil.randomBytes(seed, fileSize);

  // do a sanity check. Read the file
  byte[] actual = new byte[filesize];
  stm.readFully(0, actual);
  checkData(actual, 0, expected, "Read 1");
}

Example 18

Source File: WikipediaEventInputFormat.java From datawave with Apache License 2.0

4 votes

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final Configuration conf = job.getConfiguration();
    
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    
    // generate splits
    List<InputSplit> splits = new ArrayList<>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    
    // Save the number of input files in the job-conf
    conf.setLong(NUM_INPUT_FILES, files.size());
    
    return splits;
}

Example 19

Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0

4 votes

List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
    long maxSplitSize, long minSplitSize, ReadContext readContext)
    throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);

  long rowGroupsDropped = 0;
  long totalRowGroups = 0;

  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fs = file.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(file);
    ParquetMetadata parquetMetaData = footer.getParquetMetadata();
    List<BlockMetaData> blocks = parquetMetaData.getBlocks();

    List<BlockMetaData> filteredBlocks;

    totalRowGroups += blocks.size();
    filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
    rowGroupsDropped += blocks.size() - filteredBlocks.size();

    if (filteredBlocks.isEmpty()) {
      continue;
    }

    BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    splits.addAll(
        generateSplits(
            filteredBlocks,
            fileBlockLocations,
            fileStatus,
            readContext.getRequestedSchema().toString(),
            readContext.getReadSupportMetadata(),
            minSplitSize,
            maxSplitSize)
        );
  }

  if (rowGroupsDropped > 0 && totalRowGroups > 0) {
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", rowGroupsDropped, percentDropped);
  } else {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  }
  return splits;
}

Example 20

Source File: TestHostsFiles.java From big-c with Apache License 2.0

4 votes

@Test
public void testHostsExcludeInUI() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause 
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    String names = name + "\n" + "localhost:42\n";
    LOG.info("adding '" + names + "' to exclude file " + excludeFile.toUri().getPath());
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
    
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName =
        new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    String nodes = (String) mbs.getAttribute(mxbeanName, "LiveNodes");
    assertTrue("Live nodes should contain the decommissioned node",
        nodes.contains("Decommissioned"));
  } finally {
    cluster.shutdown();
  }
}