Java Code Examples for org.apache.hadoop.fs.FileSystem.listFiles()

The following are Java code examples showing how to use the listFiles() method of the org.apache.hadoop.fs.FileSystem class.
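
Before the project examples, here is a minimal, self-contained sketch of the basic call pattern (a hypothetical sketch: the "/tmp" path is a placeholder and a reachable default filesystem is assumed). listFiles(path, recursive) returns a RemoteIterator of LocatedFileStatus entries for files only; the boolean controls whether subdirectories are descended into.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // listFiles() yields files only (never directories); the second argument
    // controls whether subdirectories are scanned recursively.
    // "/tmp" is a placeholder path for this sketch.
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/tmp"), true);
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
    }
  }
}

Note that the returned iterator may fetch results lazily, so hasNext() and next() can themselves throw IOException; the examples below either propagate or catch it.
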
Example 1
Project: hadoop-oss   File: RollingFileSystemSinkTestBase.java
/**
 * Assert that the number of log files in the target directory is as expected.
 * @param fs the target FileSystem
 * @param dir the target directory path
 * @param expected the expected number of files
 * @throws IOException thrown if listing files fails
 */
public void assertFileCount(FileSystem fs, Path dir, int expected)
    throws IOException {
  RemoteIterator<LocatedFileStatus> i = fs.listFiles(dir, true);
  int count = 0;

  while (i.hasNext()) {
    i.next();
    count++;
  }

  assertTrue("The sink created additional unexpected log files. " + count
      + "files were created", expected >= count);
  assertTrue("The sink created too few log files. " + count + "files were "
      + "created", expected <= count);
}
 
Example 2
Project: monarch   File: AbstractTierStoreReader.java
/**
 * Get all ORC files present in directory for the specified table and partition/bucket. The ORC
 * files returned are in ascending order of the (insertion) time-partition and sequence-id within
 * the time-partition.
 *
 * @param orcDir the ORC store directory
 * @param fileExt the file extension to skip; files ending with it (e.g. checksum files) are excluded
 * @param args the arguments in order: table-name, bucket-id, time-partition-id
 * @return the list of all ORC files
 */
private String[] getOrcFiles(final String orcDir, final String fileExt, final String... args) {
  try {
    FileSystem fileSystem = FileSystem.get(conf);
    Path distributedPath = new Path(Paths.get(orcDir, args).toString());
    ArrayList<String> filePathStrings = new ArrayList<>();
    if (fileSystem.exists(distributedPath)) {
      RemoteIterator<LocatedFileStatus> fileListItr = fileSystem.listFiles(distributedPath, true);
      while (fileListItr != null && fileListItr.hasNext()) {
        LocatedFileStatus file = fileListItr.next();
        if (!file.getPath().getName().endsWith(fileExt)) {
          // exclude CRC files
          filePathStrings.add(file.getPath().toUri().toString());
        }
      }

      Collections.sort(filePathStrings);
    }
    String[] retArray = new String[filePathStrings.size()];
    filePathStrings.toArray(retArray);
    return retArray;
  } catch (IOException e) {
    e.printStackTrace();
  }
  return new String[0];
}
 
Example 3
Project: monarch   File: HDFSQuasiService.java
public int getFilesCount(String storeBaseDir, String tableName) {
  int filesCount = 0;
  try {
    FileSystem fs = FileSystem.get(conf);
    Path storeBasePath = new Path(fs.getHomeDirectory(), storeBaseDir);
    Path tablePath = new Path(storeBasePath, tableName);
    if (fs.exists(tablePath)) {
      RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator =
          fs.listFiles(tablePath, false);
      while (locatedFileStatusRemoteIterator.hasNext()) {
        filesCount++;
        LocatedFileStatus next = locatedFileStatusRemoteIterator.next();
        System.out.println("File name is " + next.getPath());
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return filesCount;
}
 
Example 4
Project: monarch   File: HDFSQuasiService.java
public List<OrcStruct> getORCRecords(String storeBaseDir, String tableName) throws IOException {
  List<OrcStruct> orcrecords = new ArrayList<>();
  try {
    FileSystem fs = FileSystem.get(conf);
    Path storeBasePath = new Path(fs.getHomeDirectory(), storeBaseDir);
    Path tablePath = new Path(storeBasePath, tableName);
    if (fs.exists(tablePath)) {
      RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator =
          fs.listFiles(tablePath, false);
      while (locatedFileStatusRemoteIterator.hasNext()) {
        LocatedFileStatus next = locatedFileStatusRemoteIterator.next();
        final org.apache.hadoop.hive.ql.io.orc.Reader fis =
            OrcFile.createReader(next.getPath(), OrcFile.readerOptions(conf));
        RecordReader rows = fis.rows();
        while (rows.hasNext()) {
          orcrecords.add((OrcStruct) rows.next(null));
        }
        System.out.println("File name is " + next.getPath());
      }
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
  return orcrecords;
}
 
Example 5
Project: hadoop   File: TestV2LsOperations.java
/**
 * Assert that a {@code listFiles()} scan of {@code dir} finds {@code subdir}
 * among its entries. (In the original project, this code had to be commented
 * out to compile under Hadoop 1.)
 *
 * @param fs filesystem
 * @param dir directory to list
 * @param subdir path expected to appear in the listing
 * @param recursive list recursively?
 * @throws IOException IO problems
 */
public static void assertListFilesFinds(FileSystem fs,
                                        Path dir,
                                        Path subdir,
                                        boolean recursive) throws IOException {
  RemoteIterator<LocatedFileStatus> iterator =
    fs.listFiles(dir, recursive);
  boolean found = false;
  int entries = 0;
  StringBuilder builder = new StringBuilder();
  while (iterator.hasNext()) {
    LocatedFileStatus next = iterator.next();
    entries++;
    builder.append(next.toString()).append('\n');
    if (next.getPath().equals(subdir)) {
      found = true;
    }
  }
  assertTrue("Path " + subdir
             + " not found in directory " + dir + " : "
             + " entries=" + entries
             + " content"
             + builder.toString(),
             found);
}
 
Example 6
Project: hadoop   File: GenerateData.java
static DataStatistics publishPlainDataStatistics(Configuration conf, 
                                                 Path inputDir) 
throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);

  // obtain input data file statuses
  long dataSize = 0;
  long fileCount = 0;
  RemoteIterator<LocatedFileStatus> iter = fs.listFiles(inputDir, true);
  PathFilter filter = new Utils.OutputFileUtils.OutputFilesFilter();
  while (iter.hasNext()) {
    LocatedFileStatus lStatus = iter.next();
    if (filter.accept(lStatus.getPath())) {
      dataSize += lStatus.getLen();
      ++fileCount;
    }
  }

  // publish the plain data statistics
  LOG.info("Total size of input data : " 
           + StringUtils.humanReadableInt(dataSize));
  LOG.info("Total number of input data files : " + fileCount);
  
  return new DataStatistics(dataSize, fileCount, false);
}
 
Example 7
Project: ditb   File: IntegrationTestBigLinkedList.java
static SortedSet<byte []> readKeysToSearch(final Configuration conf)
throws IOException, InterruptedException {
  Path keysInputDir = new Path(conf.get(SEARCHER_INPUTDIR_KEY));
  FileSystem fs = FileSystem.get(conf);
  SortedSet<byte []> result = new TreeSet<byte []>(Bytes.BYTES_COMPARATOR);
  if (!fs.exists(keysInputDir)) {
    throw new FileNotFoundException(keysInputDir.toString());
  }
  if (!fs.isDirectory(keysInputDir)) {
    throw new UnsupportedOperationException("TODO");
  } else {
    RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(keysInputDir, false);
    while(iterator.hasNext()) {
      LocatedFileStatus keyFileStatus = iterator.next();
      // Skip "_SUCCESS" file.
      if (keyFileStatus.getPath().getName().startsWith("_")) continue;
      result.addAll(readFileToSearch(conf, fs, keyFileStatus));
    }
  }
  return result;
}
 
Example 8
Project: spydra   File: LifecycleIT.java
private int getFileCount(URI uri) throws IOException {
  FileSystem fs = gcpUtils.fileSystemForUri(uri);
  RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path(uri), true);
  int count = 0;
  while (it.hasNext()) {
    it.next();
    count++;
  }
  return count;
}
 
Example 9
Project: kafka-connect-fs   File: AbstractPolicy.java
public Iterator<FileMetadata> listFiles(FileSystem fs) throws IOException {
    return new Iterator<FileMetadata>() {
        RemoteIterator<LocatedFileStatus> it = fs.listFiles(fs.getWorkingDirectory(), recursive);
        LocatedFileStatus current = null;
        boolean previous = false;

        @Override
        public boolean hasNext() {
            try {
                if (current == null) {
                    if (!it.hasNext()) return false;
                    current = it.next();
                    return hasNext();
                }
                if (current.isFile() &&
                        fileRegexp.matcher(current.getPath().getName()).find()) {
                    return true;
                }
                current = null;
                return hasNext();
            } catch (IOException ioe) {
                throw new ConnectException(ioe);
            }
        }

        @Override
        public FileMetadata next() {
            if (!hasNext() && current == null) {
                throw new NoSuchElementException("There are no more items");
            }
            FileMetadata metadata = toMetadata(current);
            current = null;
            return metadata;
        }
    };
}
 
Example 10
Project: java-learn   File: Ls.java
public static void main(String[] args) throws Exception {
    String uri = "hdfs://hadoop-master:9000/";

    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), config, "root");

    FileStatus[] listStatus = fs.listStatus(new Path("/"));
    for (FileStatus file : listStatus) {
        System.out.println("[" + (file.isFile() ? "file" : "dir") + "] " + file.getPath().getName());
    }

    RemoteIterator<LocatedFileStatus> listFiles = fs.listFiles(new Path("/"), true);
    while (listFiles.hasNext()) {

        LocatedFileStatus fileStatus = listFiles.next();

        log.info("block size:{}",fileStatus.getBlockSize());
        log.info("owner : {}", fileStatus.getOwner());
        log.info("replication : {}" ,fileStatus.getReplication());
        log.info("permission : {}", fileStatus.getPermission());
        log.info("path name : {}",fileStatus.getPath().getName());

        log.info("========block info=========");

        BlockLocation[] blockLocations = fileStatus.getBlockLocations();

        for (BlockLocation blockLocation : blockLocations){

            log.info("block offset : {}",blockLocation.getOffset());
            log.info("block length : {}",blockLocation.getLength());

            String[] dataNodes = blockLocation.getHosts();
            for (String dataNode : dataNodes){
                log.info("dataNode :{}",dataNode);
            }
        }
    }
}
 
Example 11
Project: cloudup   File: S3ADiag.java
public int run(String[] args, PrintStream stream) throws Exception {
  out = stream;
  List<String> paths = parseArgs(args);
  if (paths.size() != 1) {
    errorln(USAGE);
    return E_USAGE;
  }
  println("Hadoop %s", getVersion());
  println("Compiled by %s on %s", getUser(), getDate());
  println("Compiled with protoc %s", getProtocVersion());
  println("From source with checksum %s", getSrcChecksum());
  Configuration conf = getConf();
  Path path = new Path(paths.get(0));
  FileSystem fs = path.getFileSystem(conf);

  println("Filesystem for %s is %s", path, fs);

  // examine the FS
  Configuration fsConf = fs.getConf();
  for (int i = 0; i < props.length; i++) {
    showProp(fsConf, (String) props[i][0], (Boolean) props[i][1]);
  }

  Path root = fs.makeQualified(new Path("/"));
  try (DurationInfo d = new DurationInfo(LOG,
      "Listing  %s", root)) {
    println("%s has %d entries", root, fs.listStatus(root).length);
  }

  String dirName = "dir-" + UUID.randomUUID();
  Path dir = new Path(root, dirName);
  try (DurationInfo d = new DurationInfo(LOG,
      "Creating a directory %s", dir)) {
    fs.mkdirs(dir);
  }
  try {
    Path file = new Path(dir, "file");
    try (DurationInfo d = new DurationInfo(LOG,
        "Creating a file %s", file)) {
      FSDataOutputStream data = fs.create(file, true);
      data.writeUTF(HELLO);
      data.close();
    }
    try (DurationInfo d = new DurationInfo(LOG,
        "Listing  %s", dir)) {
      fs.listFiles(dir, false);
    }

    try (DurationInfo d = new DurationInfo(LOG,
        "Reading a file %s", file)) {
      FSDataInputStream in = fs.open(file);
      String utf = in.readUTF();
      in.close();
      if (!HELLO.equals(utf)) {
        throw new IOException("Expected " + file + " to contain the text "
            + HELLO + " -but it has the text \"" + utf + "\"");
      }
    }
    try (DurationInfo d = new DurationInfo(LOG,
        "Deleting file %s", file)) {
      fs.delete(file, true);
    }
  } finally {
    try (DurationInfo d = new DurationInfo(LOG,
        "Deleting directory %s", dir)) {
      try {
        fs.delete(dir, true);
      } catch (Exception e) {
        LOG.warn("When deleting {}: ", dir, e);
      }
    }

  }

  // Validate parameters.
  return SUCCESS;
}
 
Example 12
Project: monarch   File: HDFSClient.java
public static RemoteIterator<LocatedFileStatus> listHDFSFiles(FileSystem fs, Path path)
    throws IOException {
  return fs.listFiles(path, true);
}
 
Example 13
Project: hadoop   File: TraceBuilder.java
/**
 * Processes the input file/folder argument. If the input is a file,
 * then it is directly considered for further processing by TraceBuilder.
 * If the input is a folder, then all the history logs in the
 * input folder are considered for further processing.
 *
 * If isRecursive is true, then the input path is recursively scanned
 * for job history logs for further processing by TraceBuilder.
 *
 * NOTE: If the input represents a globbed path, then it is first flattened
 *       and then the individual paths represented by the globbed input
 *       path are considered for further processing.
 *
 * @param input        input path, possibly globbed
 * @param conf         configuration
 * @param isRecursive  whether to recursively traverse the input paths to
 *                     find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException
 * @throws IOException
 */
static List<Path> processInputArgument(String input, Configuration conf,
    boolean isRecursive) throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {

      // Find list of files in this path(recursively if -recursive option
      // is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter =
        fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();

        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file names in this path to the
        // inputPaths list
        FileStatus[] sortableNames =
            historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }

  return inputPaths;
}
 
Example 14
Project: Deep-Learning-with-Hadoop   File: TestSparkMultiLayerParameterAveraging.java
@Test
public void testFitViaStringPaths() throws Exception {

    Path tempDir = Files.createTempDirectory("DL4J-testFitViaStringPaths");
    File tempDirF = tempDir.toFile();
    tempDirF.deleteOnExit();

    int dataSetObjSize = 5;
    int batchSizePerExecutor = 25;
    DataSetIterator iter = new MnistDataSetIterator(dataSetObjSize,1000,false);
    int i=0;
    while(iter.hasNext()){
        File nextFile = new File(tempDirF, i + ".bin");
        DataSet ds = iter.next();
        ds.save(nextFile);
        i++;
    }

    System.out.println("Saved to: " + tempDirF.getAbsolutePath());
    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
            .updater(Updater.RMSPROP)
            .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).iterations(1)
            .list()
            .layer(0, new org.deeplearning4j.nn.conf.layers.DenseLayer.Builder()
                    .nIn(28*28).nOut(50)
                    .activation("tanh").build())
            .layer(1, new org.deeplearning4j.nn.conf.layers.OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
                    .nIn(50).nOut(10)
                    .activation("softmax")
                    .build())
            .pretrain(false).backprop(true)
            .build();

    SparkDl4jMultiLayer sparkNet = new SparkDl4jMultiLayer(sc,conf,
            new ParameterAveragingTrainingMaster.Builder(numExecutors(), dataSetObjSize)
                    .workerPrefetchNumBatches(5)
                    .batchSizePerWorker(batchSizePerExecutor)
                    .averagingFrequency(1)
                    .repartionData(Repartition.Always)
                    .build());
    sparkNet.setCollectTrainingStats(true);

    // List all the files:
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(tempDir.toUri(), config);
    RemoteIterator<LocatedFileStatus> fileIter = hdfs.listFiles(new org.apache.hadoop.fs.Path(tempDir.toString()), false);

    List<String> paths = new ArrayList<>();
    while(fileIter.hasNext()){
        String path = fileIter.next().getPath().toString();
        paths.add(path);
    }

    INDArray paramsBefore = sparkNet.getNetwork().params().dup();
    JavaRDD<String> pathRdd = sc.parallelize(paths);
    sparkNet.fitPaths(pathRdd);

    INDArray paramsAfter = sparkNet.getNetwork().params().dup();
    assertNotEquals(paramsBefore, paramsAfter);

    SparkTrainingStats stats = sparkNet.getSparkTrainingStats();
    System.out.println(stats.statsAsString());
}