Java Code Examples for org.apache.hadoop.fs.FileSystem#listFiles()

The following examples show how to use org.apache.hadoop.fs.FileSystem#listFiles(), drawn from a range of open-source projects. The originating project and source file are noted above each example.
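All of the examples share the same call pattern: listFiles(Path, boolean recursive) returns a RemoteIterator<LocatedFileStatus>, which is drained with hasNext()/next(). As a quick orientation, here is a minimal, self-contained sketch of that pattern; the directory "/tmp/data" is only a placeholder and is not taken from any of the projects below.

// Minimal sketch of the listFiles() iteration pattern used throughout the examples below.
// The path "/tmp/data" is a placeholder; substitute any HDFS or local directory.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesDemo {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // The boolean argument controls whether subdirectories are walked recursively.
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/tmp/data"), true);
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
    }
  }
}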
Example 1
Source File: TestV2LsOperations.java    From big-c with Apache License 2.0
/**
 * Assert that {@code listFiles()} finds a given path under a directory.
 * To get this project to compile under Hadoop 1, this code needs to be
 * commented out.
 *
 * @param fs filesystem
 * @param dir dir
 * @param subdir subdir
 * @param recursive recurse?
 * @throws IOException IO problems
 */
public static void assertListFilesFinds(FileSystem fs,
                                        Path dir,
                                        Path subdir,
                                        boolean recursive) throws IOException {
  RemoteIterator<LocatedFileStatus> iterator =
    fs.listFiles(dir, recursive);
  boolean found = false;
  int entries = 0;
  StringBuilder builder = new StringBuilder();
  while (iterator.hasNext()) {
    LocatedFileStatus next = iterator.next();
    entries++;
    builder.append(next.toString()).append('\n');
    if (next.getPath().equals(subdir)) {
      found = true;
    }
  }
  assertTrue("Path " + subdir
             + " not found in directory " + dir + " : "
             + " entries=" + entries
             + " content"
             + builder.toString(),
             found);
}
 
Example 2
Source File: PcapFinalizer.java    From metron with Apache License 2.0
/**
 * Returns a lazily-read Iterable over a set of sequence files.
 */
protected SequenceFileIterable readInterimResults(Path interimResultPath, Configuration config,
    FileSystem fs) throws IOException {
  List<Path> files = new ArrayList<>();
  for (RemoteIterator<LocatedFileStatus> it = fs.listFiles(interimResultPath, false);
      it.hasNext(); ) {
    Path p = it.next().getPath();
    if (p.getName().equals("_SUCCESS")) {
      fs.delete(p, false);
      continue;
    }
    files.add(p);
  }
  if (files.size() == 0) {
    LOG.info("No files to process with specified date range.");
  } else {
    LOG.debug("Interim results path={}", interimResultPath);
    Collections.sort(files, (o1, o2) -> o1.getName().compareTo(o2.getName()));
  }
  return new SequenceFileIterable(files, config);
}
 
Example 3
Source File: BackupUtils.java    From hbase with Apache License 2.0
public static BackupInfo loadBackupInfo(Path backupRootPath, String backupId, FileSystem fs)
    throws IOException {
  Path backupPath = new Path(backupRootPath, backupId);

  RemoteIterator<LocatedFileStatus> it = fs.listFiles(backupPath, true);
  while (it.hasNext()) {
    LocatedFileStatus lfs = it.next();
    if (lfs.getPath().getName().equals(BackupManifest.MANIFEST_FILE_NAME)) {
      // Load BackupManifest
      BackupManifest manifest = new BackupManifest(fs, lfs.getPath().getParent());
      BackupInfo info = manifest.toBackupInfo();
      return info;
    }
  }
  return null;
}
 
Example 4
Source File: SparkUtils.java    From deeplearning4j with Apache License 2.0
/**
 * List of the files in the given directory (path), as a {@code JavaRDD<String>}
 *
 * @param sc                Spark context
 * @param path              Path to list files in
 * @param recursive         Whether to walk the directory tree recursively (i.e., include subdirectories)
 * @param allowedExtensions If null: all files will be accepted. If non-null: only files with the specified extension will be allowed.
 *                          Exclude the extension separator - i.e., use "txt" not ".txt" here.
 * @param config            Hadoop configuration to use. Must not be null.
 * @return Paths in the directory
 * @throws IOException If error occurs getting directory contents
 */
public static JavaRDD<String> listPaths(@NonNull JavaSparkContext sc, String path, boolean recursive,
                                        Set<String> allowedExtensions, @NonNull Configuration config) throws IOException {
    List<String> paths = new ArrayList<>();
    FileSystem hdfs = FileSystem.get(URI.create(path), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(path), recursive);

    while (fileIter.hasNext()) {
        String filePath = fileIter.next().getPath().toString();
        if(allowedExtensions == null){
            paths.add(filePath);
        } else {
            String ext = FilenameUtils.getExtension(filePath);  // check the file's extension, not the input directory's
            if(allowedExtensions.contains(ext)){
                paths.add(filePath);
            }
        }
    }
    return sc.parallelize(paths);
}
 
Example 5
Source File: TestHDFSIntegration.java    From incubator-sentry with Apache License 2.0
private void verifyAccessToPath(String user, String group, String path, boolean hasPermission) throws Exception{
  Path p = new Path(path);
  UserGroupInformation hadoopUser =
      UserGroupInformation.createUserForTesting(user, new String[] {group});
  FileSystem fs = DFSTestUtil.getFileSystemAs(hadoopUser, hadoopConf);
  try {
    fs.listFiles(p, true);
    if(!hasPermission) {
      Assert.fail("Expected listing files to fail");
    }
  } catch (Exception e) {
    if(hasPermission) {
      throw e;
    }
  }
}
 
Example 6
Source File: HDFSSecUtils.java    From bdt with Apache License 2.0
public String listFiles(String content, String hdfsDirPath) throws IOException {
    String files = "";
    Path path = new Path(hdfsDirPath);

    FileSystem fileSystem = FileSystem.get(conf);

    if ("files".equals(content)) {
        RemoteIterator<LocatedFileStatus> iterator = fileSystem.listFiles(path, false);

        while (iterator.hasNext()) {
            files = files + iterator.next().getPath().getName() + "\n";
        }
    } else {
        FileStatus[] status = fileSystem.listStatus(path);
        for (int i = 0; i < status.length; i++) {
            if (status[i].isDirectory()) {
                files = files + status[i].getPath().getName() + "/\n";
            } else {
                files = files + status[i].getPath().getName() + "\n";
            }
        }
    }

    return files.trim();
}
 
Example 7
Source File: JobLibLoader.java    From SpyGlass with Apache License 2.0
public static void loadJars(String libPathStr, Configuration config) {
	
	try {
		Path libPath = new Path(libPathStr);

		FileSystem fs = FileSystem.get(config);

		RemoteIterator<LocatedFileStatus> itr = fs.listFiles(libPath, true);

		while (itr.hasNext()) {
			LocatedFileStatus f = itr.next();

			if (!f.isDirectory() && f.getPath().getName().endsWith("jar")) {
				logger.info("Loading Jar : " + f.getPath().getName());
				DistributedCache.addFileToClassPath(f.getPath(), config);
			}
		}
	} catch (Exception e) {
		e.printStackTrace();
		logger.error(e.toString());
	}
}
 
Example 8
Source File: GenerateData.java    From hadoop with Apache License 2.0
static DataStatistics publishPlainDataStatistics(Configuration conf, 
                                                 Path inputDir) 
throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);

  // obtain input data file statuses
  long dataSize = 0;
  long fileCount = 0;
  RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
  PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
  while (iter.hasNext()) {
    LocatedFileStatus lStatus = iter.next();
    if (filter.accept(lStatus.getPath())) {
      dataSize += lStatus.getLen();
      ++fileCount;
    }
  }

  // publish the plain data statistics
  LOG.info("Total size of input data : " 
           + StringUtils.humanReadableInt(dataSize));
  LOG.info("Total number of input data files : " + fileCount);
  
  return new DataStatistics(dataSize, fileCount, false);
}
 
Example 9
Source File: CommonFSUtils.java    From hbase with Apache License 2.0
/**
 * Calls fs.listFiles() to get FileStatus and BlockLocations together, reducing the number of RPC calls.
 *
 * @param fs file system
 * @param dir directory
 * @return LocatedFileStatus list, or null if the directory does not exist
 */
public static List<LocatedFileStatus> listLocatedStatus(final FileSystem fs,
    final Path dir) throws IOException {
  List<LocatedFileStatus> status = null;
  try {
    RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator = fs
        .listFiles(dir, false);
    while (locatedFileStatusRemoteIterator.hasNext()) {
      if (status == null) {
        status = Lists.newArrayList();
      }
      status.add(locatedFileStatusRemoteIterator.next());
    }
  } catch (FileNotFoundException fnfe) {
    // if directory doesn't exist, return null
    if (LOG.isTraceEnabled()) {
      LOG.trace("{} doesn't exist", dir);
    }
  }
  return status;
}
 
Example 10
Source File: HoodieTestUtils.java    From hudi with Apache License 2.0
public static FileStatus[] listAllLogFilesInPath(FileSystem fs, String basePath, String logfileExtension)
    throws IOException {
  RemoteIterator<LocatedFileStatus> itr = fs.listFiles(new Path(basePath), true);
  List<FileStatus> returns = new ArrayList<>();
  while (itr.hasNext()) {
    LocatedFileStatus status = itr.next();
    if (status.getPath().getName().contains(logfileExtension)) {
      returns.add(status);
    }
  }
  return returns.toArray(new FileStatus[returns.size()]);
}
 
Example 11
Source File: CorruptDataFilesAction.java    From hbase with Apache License 2.0
@Override
public void perform() throws Exception {
  getLogger().info("Start corrupting data files");

  FileSystem fs = CommonFSUtils.getRootDirFileSystem(getConf());
  Path rootDir = CommonFSUtils.getRootDir(getConf());
  Path defaultDir = rootDir.suffix("/data/default");
  RemoteIterator<LocatedFileStatus> iterator =  fs.listFiles(defaultDir, true);
  while (iterator.hasNext()){
    LocatedFileStatus status = iterator.next();
    if(!HFile.isHFileFormat(fs, status.getPath())){
      continue;
    }
    if(RandomUtils.nextFloat(0, 100) > chance){
      continue;
    }

    FSDataOutputStream out = fs.create(status.getPath(), true);
    try {
      out.write(0);
    } finally {
      out.close();
    }
    getLogger().info("Corrupting {}", status.getPath());
  }
  getLogger().info("Done corrupting data files");
}
 
Example 12
Source File: HdfsIOBenchmark.java    From crail with Apache License 2.0
void browseDir() throws Exception {
	System.out.println("reading enumarate dir, path " + path);
	Configuration conf = new Configuration();
	FileSystem fs = FileSystem.get(conf); 
	
	//benchmark
	System.out.println("starting benchmark...");
	RemoteIterator<LocatedFileStatus> iter = fs.listFiles(path, false);
	while (iter.hasNext()) {
		LocatedFileStatus status = iter.next();
		System.out.println(status.getPath());
	}		
	fs.close();
}
 
Example 13
Source File: YarnFileStageTest.java    From flink with Apache License 2.0
/**
 * Verifies that the names and contents of the files in the directory {@code workDir} match {@code expectedFiles}.
 *
 * @param targetFileSystem the filesystem of {@code workDir}
 * @param workDir the directory to verify
 * @param expectedFiles the expected names and contents of the files
 * @throws IOException if an error occurs while visiting {@code workDir}
 * @throws InterruptedException if the sleep is interrupted
 */
private static void verifyDirectoryRecursive(
	FileSystem targetFileSystem,
	Path workDir,
	Map<String, String> expectedFiles)
	throws IOException, InterruptedException {

	final HashMap<String /* (relative) path */, /* contents */ String> targetFiles =
		new HashMap<>();
	final RemoteIterator<LocatedFileStatus> targetFilesIterator =
		targetFileSystem.listFiles(workDir, true);
	final int workDirPrefixLength =
		workDir.toString().length() + 1; // one more for the concluding "/"
	while (targetFilesIterator.hasNext()) {
		final LocatedFileStatus targetFile = targetFilesIterator.next();

		int retries = 5;
		do {
			try (FSDataInputStream in = targetFileSystem.open(targetFile.getPath())) {
				String absolutePathString = targetFile.getPath().toString();
				String relativePath = absolutePathString.substring(workDirPrefixLength);
				targetFiles.put(relativePath, in.readUTF());

				assertEquals("extraneous data in file " + relativePath, -1, in.read());
				break;
			} catch (FileNotFoundException e) {
				// For S3, read-after-write may be eventually consistent, i.e. when trying
				// to access the object before writing it; see
				// https://docs.aws.amazon.com/AmazonS3/latest/dev/Introduction.html#ConsistencyModel
				// -> try again a bit later
				Thread.sleep(50);
			}
		} while ((retries--) > 0);
	}
	assertThat(targetFiles, equalTo(expectedFiles));
}
 
Example 14
Source File: RestoreTablesClient.java    From hbase with Apache License 2.0
private List<Path> getFilesRecursively(String fileBackupDir)
    throws IllegalArgumentException, IOException {
  FileSystem fs = FileSystem.get((new Path(fileBackupDir)).toUri(), new Configuration());
  List<Path> list = new ArrayList<>();
  RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path(fileBackupDir), true);
  while (it.hasNext()) {
    Path p = it.next().getPath();
    if (HFile.isHFileFormat(fs, p)) {
      list.add(p);
    }
  }
  return list;
}
 
Example 15
Source File: HdfsRm.java    From BigData-In-Practice with Apache License 2.0
/**
 * List all files under a directory
 */
public static void listFiles(String remotePathStr, boolean recursive) throws IOException {
    Path remotePath = new Path(remotePathStr);
    FileSystem fileSystem = SysUtil.getFileSystem();

    RemoteIterator<LocatedFileStatus> iterator = fileSystem.listFiles(remotePath, recursive);
    System.out.println(String.format("文件夹《%s》下的所有文件:", remotePathStr));
    while (iterator.hasNext()) {
        System.out.println(iterator.next());
    }
    fileSystem.close();
}
 
Example 16
Source File: TestBackupBase.java    From hbase with Apache License 2.0
protected void dumpBackupDir() throws IOException {
  // Dump Backup Dir
  FileSystem fs = FileSystem.get(conf1);
  RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path(BACKUP_ROOT_DIR), true);
  while (it.hasNext()) {
    LOG.debug(Objects.toString(it.next().getPath()));
  }
}
 
Example 17
Source File: HDFSFileManagerImpl.java    From entrada with GNU General Public License v3.0
@Override
public List<String> expired(String location, int maxAge, String... filter) {
  if (!exists(location)) {
    log.error("Location {} does not exist, cannot continue", location);
    return Collections.emptyList();
  }
  List<String> files = new ArrayList<>();
  FileSystem fs = null;

  try {
    fs = createFS();
    RemoteIterator<LocatedFileStatus> fileStatusListIterator =
        fs.listFiles(new Path(location), true);

    while (fileStatusListIterator.hasNext()) {
      LocatedFileStatus fileStatus = fileStatusListIterator.next();
      files.add(fileStatus.getPath().toString());
    }
  } catch (Exception e) {
    log.error("Error while getting files", e);
    return Collections.emptyList();
  }

  // return found files; can be a partial list in case of an exception
  return files
      .stream()
      .filter(p -> checkFilter(p, Arrays.asList(filter)))
      .collect(Collectors.toList());
}
 
Example 18
Source File: ProtoParquetWriterWithOffsetTest.java    From garmadon with Apache License 2.0
private Set<LocatedFileStatus> listFiles(FileSystem fs, Path p) throws IOException {
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(p, true);
    Set<LocatedFileStatus> s = new HashSet<>();
    while (it.hasNext()) {
        s.add(it.next());
    }
    return s;
}
 
Example 19
Source File: TestPlannerUtil.java    From tajo with Apache License 2.0
@Test
public void testGetNonZeroLengthDataFiles() throws Exception {
  String queryFiles = ClassLoader.getSystemResource("queries").toString() + "/TestSelectQuery";
  Path path = new Path(queryFiles);

  TableDesc tableDesc = new TableDesc();
  tableDesc.setName("Test");
  tableDesc.setUri(path.toUri());

  FileSystem fs = path.getFileSystem(util.getConfiguration());

  List<Path> expectedFiles = new ArrayList<>();
  RemoteIterator<LocatedFileStatus> files = fs.listFiles(path, true);
  while (files.hasNext()) {
    LocatedFileStatus file = files.next();
    if (file.isFile() && file.getLen() > 0) {
      expectedFiles.add(file.getPath());
    }
  }
  int fileNum = expectedFiles.size() / 5;

  int numResultFiles = 0;
  for (int i = 0; i <= 5; i++) {
    int start = i * fileNum;

    FragmentProto[] fragments =
        PhysicalPlanUtil.getNonZeroLengthDataFiles(util.getConfiguration(), tableDesc, start, fileNum);
    assertNotNull(fragments);

    numResultFiles += fragments.length;
    int expectedSize = fileNum;
    if (i == 5) {
      //last
      expectedSize = expectedFiles.size() - (fileNum * 5);
    }

    comparePath(expectedFiles, fragments, start, expectedSize);
  }

  assertEquals(expectedFiles.size(), numResultFiles);
}
 
Example 20
Source File: TestSparkMultiLayerParameterAveraging.java    From deeplearning4j with Apache License 2.0
@Test
    public void testFitViaStringPaths() throws Exception {

        Path tempDir = testDir.newFolder("DL4J-testFitViaStringPaths").toPath();
        File tempDirF = tempDir.toFile();
        tempDirF.deleteOnExit();

        int dataSetObjSize = 5;
        int batchSizePerExecutor = 25;
        DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize, 1000, false);
        int i = 0;
        while (iter.hasNext()) {
            File nextFile = new File(tempDirF, i + ".bin");
            DataSet ds = iter.next();
            ds.save(nextFile);
            i++;
        }

        System.out.println("Saved to: " + tempDirF.getAbsolutePath());



        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(new RmsProp())
                        .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).list()
                        .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder().nIn(28 * 28).nOut(50)
                                        .activation(Activation.TANH).build())
                        .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(
                                        LossFunctions.LossFunction.MCXENT).nIn(50).nOut(10)
                                                        .activation(Activation.SOFTMAX).build())
                        .build();

        SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc, conf,
                        new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                                        .workerPrefetchNumBatches(5).batchSizePerWorker(batchSizePerExecutor)
                                        .averagingFrequency(1).repartionData(Repartition.Always).build());
        sparkNet.setCollectTrainingStats(true);


        //List files:
        Configuration config = new Configuration();
        FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
        RemoteIterator<LocatedFileStatus> fileIter =
                        hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

        List<String> paths = new ArrayList<>();
        while (fileIter.hasNext()) {
            String path = fileIter.next().getPath().toString();
            paths.add(path);
        }

        INDArray paramsBefore = sparkNet.getNetwork().params().dup();
        JavaRDD<String> pathRdd = sc.parallelize(paths);
        sparkNet.fitPaths(pathRdd);

        INDArray paramsAfter = sparkNet.getNetwork().params().dup();
        assertNotEquals(paramsBefore, paramsAfter);

        SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
//        System.out.println(stats.statsAsString());
        stats.statsAsString();

        sparkNet.getTrainingMaster().deleteTempFiles(sc);
    }