Java Code Examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations()

The following examples show how to use org.apache.hadoop.fs.FileSystem#getFileBlockLocations(). Each example is taken from an open-source project; the project, source file, and license are noted above the code.
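Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern most of them share: look up a file's FileStatus, ask the FileSystem for the BlockLocations covering a byte range, and read the hosts that hold each block. The class name and path below are placeholders for illustration only and do not come from any of the projects listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsDemo {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at a file on your cluster.
    Path file = new Path("/tmp/example.txt");
    Configuration conf = new Configuration();
    FileSystem fs = file.getFileSystem(conf);

    // Look up the status first; its length defines the byte range to query.
    // A Path-based overload, getFileBlockLocations(Path, long, long), also exists.
    FileStatus status = fs.getFileStatus(file);
    BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());

    for (BlockLocation location : locations) {
      System.out.println("offset=" + location.getOffset()
          + " length=" + location.getLength()
          + " hosts=" + String.join(",", location.getHosts()));
    }
  }
}

The examples below follow the same shape and differ mainly in how they use the returned hosts: to build locality-aware input splits, to wait for replication, or to assert block placement in tests.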
Example 1
Source File: TestNativeAzureFileSystemBlockLocations.java    From big-c with Apache License 2.0
private static BlockLocation[] getBlockLocationsOutput(int fileSize,
    int blockSize, long start, long len, String blockLocationHost)
    throws Exception {
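  // Configure the block size and, optionally, the host name that block locations should report.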
  Configuration conf = new Configuration();
  conf.set(NativeAzureFileSystem.AZURE_BLOCK_SIZE_PROPERTY_NAME, ""
      + blockSize);
  if (blockLocationHost != null) {
    conf.set(NativeAzureFileSystem.AZURE_BLOCK_LOCATION_HOST_PROPERTY_NAME,
        blockLocationHost);
  }
  AzureBlobStorageTestAccount testAccount = AzureBlobStorageTestAccount
      .createMock(conf);
  FileSystem fs = testAccount.getFileSystem();
  Path testFile = createTestFile(fs, fileSize);
  FileStatus stat = fs.getFileStatus(testFile);
  BlockLocation[] locations = fs.getFileBlockLocations(stat, start, len);
  testAccount.cleanup();
  return locations;
}
 
Example 2
Source File: TestCombineFileInputFormat.java    From RDFS with Apache License 2.0
@Override
protected LocatedFileStatus[] listLocatedStatus(JobConf job) throws IOException {
  Path[] files = getInputPaths(job);
  LocatedFileStatus[] results = new LocatedFileStatus[files.length];
  for (int i = 0; i < files.length; i++) {
    Path p = files[i];
    FileSystem fs = p.getFileSystem(job);
    FileStatus stat = fs.getFileStatus(p);
    if (stat.isDir()) {
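      // Directories have no blocks, so no block locations are attached.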
      results[i] = new LocatedFileStatus(stat, null);
    } else {
      results[i] = new LocatedFileStatus(stat,
          fs.getFileBlockLocations(stat, 0, stat.getLen()));
    }
  }
  return results;
}
 
Example 3
Source File: Util.java    From iceberg with Apache License 2.0
public static String[] blockLocations(CombinedScanTask task, Configuration conf) {
  Set<String> locationSets = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    Path path = new Path(f.file().path().toString());
    try {
      FileSystem fs = path.getFileSystem(conf);
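      // Collect the hosts of every block that overlaps this scan task's byte range.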
      for (BlockLocation b : fs.getFileBlockLocations(path, f.start(), f.length())) {
        locationSets.addAll(Arrays.asList(b.getHosts()));
      }
    } catch (IOException ioe) {
      LOG.warn("Failed to get block locations for path {}", path, ioe);
    }
  }

  return locationSets.toArray(new String[0]);
}
 
Example 4
Source File: BlurBlockPlacementPolicyDefaultTest.java    From incubator-retired-blur with Apache License 2.0
private void waitForReplication(FileSystem fileSystem, Path p, int replicas) throws IOException, InterruptedException {
  FileStatus fileStatus = fileSystem.getFileStatus(p);
  boolean fail = true;
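  // Poll once per second until every block reports the expected number of replica hosts.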
  while (fail) {
    fail = false;
    BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(p, 0, fileStatus.getLen());
    for (BlockLocation blockLocation : blockLocations) {
      System.out.println(blockLocation);
      String[] hosts = blockLocation.getHosts();
      if (hosts.length != replicas) {
        fail = true;
      }
    }
    Thread.sleep(1000);
  }
}
 
Example 5
Source File: TestSmallBlock.java    From hadoop-gpu with Apache License 2.0
private void checkFile(FileSystem fileSys, Path name) throws IOException {
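  // The file is expected to consist of fileSize blocks, each contributing one BlockLocation.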
  BlockLocation[] locations = fileSys.getFileBlockLocations(
      fileSys.getFileStatus(name), 0, fileSize);
  assertEquals("Number of blocks", fileSize, locations.length);
  FSDataInputStream stm = fileSys.open(name);
  byte[] expected = new byte[fileSize];
  if (simulatedStorage) {
    for (int i = 0; i < expected.length; ++i) {  
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    Random rand = new Random(seed);
    rand.nextBytes(expected);
  }
  // do a sanity check. Read the file
  byte[] actual = new byte[fileSize];
  stm.readFully(0, actual);
  checkAndEraseData(actual, 0, expected, "Read Sanity Test");
  stm.close();
}
 
Example 6
Source File: MergeSortRowIdMatcher.java    From incubator-retired-blur with Apache License 2.0
private static String getFirstBlockId(FileSystem fileSystem, Path realFile) throws IOException {
  FileStatus fileStatus = fileSystem.getFileStatus(realFile);
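  // Requesting the byte range [0, 1) returns only the file's first block.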
  BlockLocation[] locations = fileSystem.getFileBlockLocations(fileStatus, 0, 1);
  HdfsBlockLocation location = (HdfsBlockLocation) locations[0];
  LocatedBlock locatedBlock = location.getLocatedBlock();
  ExtendedBlock block = locatedBlock.getBlock();
  return toNiceString(block.getBlockId());
}
 
Example 7
Source File: BlurBlockPlacementPolicyDefaultTest.java    From incubator-retired-blur with Apache License 2.0
private void assertBlocksExistOnShardServer(FileSystem fileSystem, Path p, String shardServer) throws IOException {
  FileStatus fileStatus = fileSystem.getFileStatus(p);
  BlockLocation[] blockLocations = fileSystem.getFileBlockLocations(p, 0, fileStatus.getLen());
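  // Every block of the file must have at least one replica on the given shard server.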
  for (BlockLocation blockLocation : blockLocations) {
    System.out.println(blockLocation);
    String[] hosts = blockLocation.getHosts();
    assertTrue(Arrays.asList(hosts).contains(shardServer));
  }
}
 
Example 8
Source File: MultiFileSplit.java    From hadoop with Apache License 2.0
public String[] getLocations() throws IOException {
  HashSet<String> hostSet = new HashSet<String>();
  for (Path file : getPaths()) {
    FileSystem fs = file.getFileSystem(getJob());
    FileStatus status = fs.getFileStatus(file);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(status,
                                        0, status.getLen());
    if (blkLocations != null && blkLocations.length > 0) {
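      // Only the hosts of the first block contribute to the split's locations.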
      addToSet(hostSet, blkLocations[0].getHosts());
    }
  }
  return hostSet.toArray(new String[hostSet.size()]);
}
 
Example 9
Source File: TestFileLocalRead.java    From RDFS with Apache License 2.0
private void checkFile(FileSystem fileSys, Path name, int repl)
  throws IOException {
  boolean done = false;

  // wait till all full blocks are confirmed by the datanodes.
  while (!done) {
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {}
    done = true;
    BlockLocation[] locations = fileSys.getFileBlockLocations(
        fileSys.getFileStatus(name), 0, fileSize);
    if (locations.length < numBlocks) {
      done = false;
      continue;
    }
    for (int idx = 0; idx < locations.length; idx++) {
      if (locations[idx].getHosts().length < repl) {
        done = false;
        break;
      }
    }
  }
  FSDataInputStream stm = fileSys.open(name);
  final byte[] expected;
  if (simulatedStorage) {
    expected = new byte[numBlocks * blockSize];
    for (int i= 0; i < expected.length; i++) {  
      expected[i] = SimulatedFSDataset.DEFAULT_DATABYTE;
    }
  } else {
    expected = AppendTestUtil.randomBytes(seed, numBlocks*blockSize);
  }
  // do a sanity check. Read the file
  byte[] actual = new byte[numBlocks * blockSize];
  System.out.println("Verifying file ");
  stm.readFully(0, actual);
  stm.close();
  checkData(actual, 0, expected, "Read 1");
}
 
Example 10
Source File: FileInputFormat.java    From hadoop-gpu with Apache License 2.0
/** 
 * Generate the list of files and make them into FileSplits.
 */ 
public List<InputSplit> getSplits(JobContext job
                                  ) throws IOException {
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  for (FileStatus file: listStatus(job)) {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
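    // Block locations for the whole file are fetched once and reused when assigning hosts to each split.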
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(job, path)) { 
      long blockSize = file.getBlockSize();
      long splitSize = computeSplitSize(blockSize, minSize, maxSize);

      long bytesRemaining = length;
      while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
        int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
        splits.add(new FileSplit(path, length-bytesRemaining, splitSize, 
                                 blkLocations[blkIndex].getHosts()));
        bytesRemaining -= splitSize;
      }
      
      if (bytesRemaining != 0) {
        splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining, 
                   blkLocations[blkLocations.length-1].getHosts()));
      }
    } else if (length != 0) {
      splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
    } else { 
      //Create empty hosts array for zero length files
      splits.add(new FileSplit(path, 0, length, new String[0]));
    }
  }
  LOG.debug("Total # of splits: " + splits.size());
  return splits;
}
 
Example 11
Source File: TestSafeMode.java    From big-c with Apache License 2.0
void checkGetBlockLocationsWorks(FileSystem fs, Path fileName) throws IOException {
  FileStatus stat = fs.getFileStatus(fileName);
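  // Fetching block locations should succeed even while the NameNode is in safe mode.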
  try {  
    fs.getFileBlockLocations(stat, 0, 1000);
  } catch (SafeModeException e) {
    assertTrue("Should have not got safemode exception", false);
  } catch (RemoteException re) {
    assertTrue("Should have not got safemode exception", false);   
  }    
}
 
Example 12
Source File: TestSafeMode.java    From hadoop with Apache License 2.0
void checkGetBlockLocationsWorks(FileSystem fs, Path fileName) throws IOException {
  FileStatus stat = fs.getFileStatus(fileName);
  try {  
    fs.getFileBlockLocations(stat, 0, 1000);
  } catch (SafeModeException e) {
    assertTrue("Should have not got safemode exception", false);
  } catch (RemoteException re) {
    assertTrue("Should have not got safemode exception", false);   
  }    
}
 
Example 13
Source File: AdmmIterationInputFormat.java    From laser with Apache License 2.0
public List<InputSplit> getSplits(JobContext job) throws IOException {
	Configuration conf = job.getConfiguration();
	int numMapTasks = conf.getInt("admm.iteration.num.map.tasks", 0);
	if (0 == numMapTasks) {
		return super.getSplits(job);
	}

	// generate splits
	List<InputSplit> splits = new ArrayList<InputSplit>();
	List<FileStatus> files = listStatus(job);

	for (FileStatus file : files) {
		Path path = file.getPath();
		FileSystem fs = path.getFileSystem(job.getConfiguration());
		long length = file.getLen();
		BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0,
				length);
		if ((length != 0) && isSplitable(job, path)) {
			long blockSize = file.getBlockSize();
			long splitSize = Math.max(
					computeSplitSize(JAVA_OPTS, numMapTasks, length),
					blockSize);
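			// Divide the file into nearly equal chunks of at most splitSize bytes.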
			long splitLength = (long) (length / Math.ceil((double) length
					/ splitSize));
			long bytesRemaining = length;

			while (((double) bytesRemaining) / splitLength > SPLIT_SLOP) {
				int blkIndex = getBlockIndex(blkLocations, length
						- bytesRemaining);
				splits.add(new FileSplit(path, length - bytesRemaining,
						splitLength, blkLocations[blkIndex].getHosts()));

				bytesRemaining -= splitLength;
			}

			if (bytesRemaining != 0) {
				splits.add(new FileSplit(path, length - bytesRemaining,
						bytesRemaining,
						blkLocations[blkLocations.length - 1].getHosts()));
			}
		} else if (length != 0) {
			splits.add(new FileSplit(path, 0, length, blkLocations[0]
					.getHosts()));
		} else {
			splits.add(new FileSplit(path, 0, length, new String[0]));
		}
	}

	// Save the number of input files in the job-conf
	job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
	job.getConfiguration().setInt("admm.iteration.num.map.tasks",
			splits.size());
	return splits;
}
 
Example 14
Source File: TestBlocksWithNotEnoughRacks.java    From big-c with Apache License 2.0
@Test
public void testNodeDecomissionRespectsRackPolicy() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause 
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
  } finally {
    cluster.shutdown();
  }
}
 
Example 15
Source File: TestHostsFiles.java    From hadoop with Apache License 2.0
@Test
public void testHostsExcludeInUI() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause 
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    String names = name + "\n" + "localhost:42\n";
    LOG.info("adding '" + names + "' to exclude file " + excludeFile.toUri().getPath());
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
    
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName =
        new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    String nodes = (String) mbs.getAttribute(mxbeanName, "LiveNodes");
    assertTrue("Live nodes should contain the decommissioned node",
        nodes.contains("Decommissioned"));
  } finally {
    cluster.shutdown();
  }
}
 
Example 16
Source File: TestDistributedFileSystem.java    From hadoop with Apache License 2.0
@Test
public void testStatistics() throws Exception {
  int lsLimit = 2;
  final Configuration conf = getTestConfiguration();
  conf.setInt(DFSConfigKeys.DFS_LIST_LIMIT, lsLimit);
  final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).build();
  try {
    final FileSystem fs = cluster.getFileSystem();
    Path dir = new Path("/test");
    Path file = new Path(dir, "file");
    
    int readOps = DFSTestUtil.getStatistics(fs).getReadOps();
    int writeOps = DFSTestUtil.getStatistics(fs).getWriteOps();
    int largeReadOps = DFSTestUtil.getStatistics(fs).getLargeReadOps();
    fs.mkdirs(dir);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    FSDataOutputStream out = fs.create(file, (short)1);
    out.close();
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    FileStatus status = fs.getFileStatus(file);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
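    // Both the Path- and FileStatus-based overloads count as one read operation each.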
    fs.getFileBlockLocations(file, 0, 0);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.getFileBlockLocations(status, 0, 0);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    FSDataInputStream in = fs.open(file);
    in.close();
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.setReplication(file, (short)2);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    Path file1 = new Path(dir, "file1");
    fs.rename(file, file1);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    fs.getContentSummary(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    
    // Iterative ls test
    for (int i = 0; i < 10; i++) {
      Path p = new Path(dir, Integer.toString(i));
      fs.mkdirs(p);
      FileStatus[] list = fs.listStatus(dir);
      if (list.length > lsLimit) {
        // if large directory, then count readOps and largeReadOps by 
        // number times listStatus iterates
        int iterations = (int)Math.ceil((double)list.length/lsLimit);
        largeReadOps += iterations;
        readOps += iterations;
      } else {
        // Single iteration in listStatus - no large read operation done
        readOps++;
      }
      
      // writeOps incremented by 1 for mkdirs
      // readOps and largeReadOps incremented by 1 or more
      checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    }
    
    fs.getStatus(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.getFileChecksum(file1);
    checkStatistics(fs, ++readOps, writeOps, largeReadOps);
    
    fs.setPermission(file1, new FsPermission((short)0777));
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    fs.setTimes(file1, 0L, 0L);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    fs.setOwner(file1, ugi.getUserName(), ugi.getGroupNames()[0]);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
    fs.delete(dir, true);
    checkStatistics(fs, readOps, ++writeOps, largeReadOps);
    
  } finally {
    if (cluster != null) cluster.shutdown();
  }
  
}
 
Example 17
Source File: TestDatanodeDeath.java    From hadoop with Apache License 2.0
static private void checkFile(FileSystem fileSys, Path name, int repl,
                       int numblocks, int filesize, long seed)
  throws IOException {
  boolean done = false;
  int attempt = 0;

  long len = fileSys.getFileStatus(name).getLen();
  assertTrue(name + " should be of size " + filesize +
             " but found to be of size " + len, 
             len == filesize);

  // wait till all full blocks are confirmed by the datanodes.
  while (!done) {
    attempt++;
    try {
      Thread.sleep(1000);
    } catch (InterruptedException e) {}
    done = true;
    BlockLocation[] locations = fileSys.getFileBlockLocations(
        fileSys.getFileStatus(name), 0, filesize);

    if (locations.length < numblocks) {
      if (attempt > 100) {
        System.out.println("File " + name + " has only " +
                           locations.length + " blocks, " +
                           " but is expected to have " + numblocks +
                           " blocks.");
      }
      done = false;
      continue;
    }
    for (int idx = 0; idx < locations.length; idx++) {
      if (locations[idx].getHosts().length < repl) {
        if (attempt > 100) {
          System.out.println("File " + name + " has " +
                             locations.length + " blocks: " +
                             " The " + idx + " block has only " +
                             locations[idx].getHosts().length + 
                             " replicas but is expected to have " 
                             + repl + " replicas.");
        }
        done = false;
        break;
      }
    }
  }
  FSDataInputStream stm = fileSys.open(name);
  final byte[] expected = AppendTestUtil.randomBytes(seed, fileSize);

  // do a sanity check. Read the file
  byte[] actual = new byte[filesize];
  stm.readFully(0, actual);
  checkData(actual, 0, expected, "Read 1");
}
 
Example 18
Source File: WikipediaEventInputFormat.java    From datawave with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    final Configuration conf = job.getConfiguration();
    
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    
    // generate splits
    List<InputSplit> splits = new ArrayList<>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(conf);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }
            
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    
    // Save the number of input files in the job-conf
    conf.setLong(NUM_INPUT_FILES, files.size());
    
    return splits;
}
 
Example 19
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
    long maxSplitSize, long minSplitSize, ReadContext readContext)
    throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);

  long rowGroupsDropped = 0;
  long totalRowGroups = 0;

  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fs = file.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(file);
    ParquetMetadata parquetMetaData = footer.getParquetMetadata();
    List<BlockMetaData> blocks = parquetMetaData.getBlocks();

    List<BlockMetaData> filteredBlocks;

    totalRowGroups += blocks.size();
    filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
    rowGroupsDropped += blocks.size() - filteredBlocks.size();

    if (filteredBlocks.isEmpty()) {
      continue;
    }

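    // Block locations for the whole file are fetched once; generateSplits uses them to place each split.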
    BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    splits.addAll(
        generateSplits(
            filteredBlocks,
            fileBlockLocations,
            fileStatus,
            readContext.getRequestedSchema().toString(),
            readContext.getReadSupportMetadata(),
            minSplitSize,
            maxSplitSize)
        );
  }

  if (rowGroupsDropped > 0 && totalRowGroups > 0) {
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", rowGroupsDropped, percentDropped);
  } else {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  }
  return splits;
}
 
Example 20
Source File: TestHostsFiles.java    From big-c with Apache License 2.0
@Test
public void testHostsExcludeInUI() throws Exception {
  Configuration conf = getConf();
  short REPLICATION_FACTOR = 2;
  final Path filePath = new Path("/testFile");

  // Configure an excludes file
  FileSystem localFileSys = FileSystem.getLocal(conf);
  Path workingDir = localFileSys.getWorkingDirectory();
  Path dir = new Path(workingDir, "build/test/data/temp/decommission");
  Path excludeFile = new Path(dir, "exclude");
  Path includeFile = new Path(dir, "include");
  assertTrue(localFileSys.mkdirs(dir));
  DFSTestUtil.writeFile(localFileSys, excludeFile, "");
  DFSTestUtil.writeFile(localFileSys, includeFile, "");
  conf.set(DFSConfigKeys.DFS_HOSTS_EXCLUDE, excludeFile.toUri().getPath());
  conf.set(DFSConfigKeys.DFS_HOSTS, includeFile.toUri().getPath());

  // Two blocks and four racks
  String racks[] = {"/rack1", "/rack1", "/rack2", "/rack2"};
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
    .numDataNodes(racks.length).racks(racks).build();
  final FSNamesystem ns = cluster.getNameNode().getNamesystem();

  try {
    // Create a file with one block
    final FileSystem fs = cluster.getFileSystem();
    DFSTestUtil.createFile(fs, filePath, 1L, REPLICATION_FACTOR, 1L);
    ExtendedBlock b = DFSTestUtil.getFirstBlock(fs, filePath);
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);

    // Decommission one of the hosts with the block, this should cause 
    // the block to get replicated to another host on the same rack,
    // otherwise the rack policy is violated.
    BlockLocation locs[] = fs.getFileBlockLocations(
        fs.getFileStatus(filePath), 0, Long.MAX_VALUE);
    String name = locs[0].getNames()[0];
    String names = name + "\n" + "localhost:42\n";
    LOG.info("adding '" + names + "' to exclude file " + excludeFile.toUri().getPath());
    DFSTestUtil.writeFile(localFileSys, excludeFile, name);
    ns.getBlockManager().getDatanodeManager().refreshNodes(conf);
    DFSTestUtil.waitForDecommission(fs, name);

    // Check the block still has sufficient # replicas across racks
    DFSTestUtil.waitForReplication(cluster, b, 2, REPLICATION_FACTOR, 0);
    
    MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
    ObjectName mxbeanName =
        new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
    String nodes = (String) mbs.getAttribute(mxbeanName, "LiveNodes");
    assertTrue("Live nodes should contain the decommissioned node",
        nodes.contains("Decommissioned"));
  } finally {
    cluster.shutdown();
  }
}