Java Code Examples for org.apache.hadoop.fs.FileSystem.globStatus()

The following are Java code examples showing how to use the globStatus() method of the org.apache.hadoop.fs.FileSystem class.
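Before the examples, here is a minimal, self-contained sketch (the class name and glob pattern are hypothetical, not taken from any project below) of the contract most of the examples rely on: globStatus() returns null when the argument contains no wildcard and the path does not exist, and an empty array when a wildcard pattern matches nothing, so callers usually check for both.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical pattern: every part file under dated subdirectories.
        Path pattern = new Path("/data/events/2019-*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // null  -> the path has no wildcard and does not exist
        // empty -> the wildcard pattern matched nothing
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("Nothing matches " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}
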
Example 1
Project: hdfs-shell   File: ContextCommands.java
public synchronized String getCurrentDir() {
    if (currentDir == null) {
        try {
            final Path path = new Path(Path.CUR_DIR);
            final FileSystem fs = getFileSystem();
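            // "." (Path.CUR_DIR) expands to a single entry for the working directory,
            // which is the user's home directory by default.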
            final FileStatus[] fileStatuses = fs.globStatus(path);
            if (fileStatuses == null || fileStatuses.length == 0) {
                return "";
            }
            homeDir = currentDir = fileStatuses[0].getPath().toUri().getPath();
        } catch (Exception e) {
            return "";
        }
    }
    return currentDir;
}
 
Example 2
Project: hadoop   File: LocatedFileStatusFetcher.java
@Override
public Result call() throws Exception {
  Result result = new Result();
  FileSystem fs = path.getFileSystem(conf);
  result.fs = fs;
  FileStatus[] matches = fs.globStatus(path, inputFilter);
  if (matches == null) {
    result.addError(new IOException("Input path does not exist: " + path));
  } else if (matches.length == 0) {
    result.addError(new IOException("Input Pattern " + path
        + " matches 0 files"));
  } else {
    result.matchedFileStatuses = matches;
  }
  return result;
}
 
Example 3
Project: circus-train   File: SimpleCopyListing.java
/**
 * Collect the list of <sourceRelativePath, sourceFileStatus> to be copied and write to the sequence file. In essence,
 * any file or directory that needs to be copied or sync-ed is written as an entry to the sequence file, with the
 * possible exception of the source root: when either -update (sync) or -overwrite switch is specified, and if the
 * source root is a directory, then the source root entry is not written to the sequence file, because only the
 * contents of the source directory need to be copied in this case. See
 * {@link com.hotels.bdp.circustrain.s3mapreducecp.util.ConfigurationUtil#getRelativePath} for how relative path is
 * computed. See computeSourceRootPath method for how the root path of the source is computed.
 *
 * @param fileListWriter
 * @param options
 * @throws IOException
 */
@VisibleForTesting
public void doBuildListing(SequenceFile.Writer fileListWriter, S3MapReduceCpOptions options) throws IOException {
  List<Path> globbedPaths = new ArrayList<>(options.getSources().size());

  for (Path sourcePath : options.getSources()) {
    FileSystem fs = sourcePath.getFileSystem(getConf());
    FileStatus sourceFileStatus = fs.getFileStatus(sourcePath);
    if (sourceFileStatus.isFile()) {
      LOG.debug("Adding path {}", sourceFileStatus.getPath());
      globbedPaths.add(sourceFileStatus.getPath());
    } else {
      FileStatus[] inputs = fs.globStatus(sourcePath);
      if (inputs != null && inputs.length > 0) {
        for (FileStatus onePath : inputs) {
          LOG.debug("Adding path {}", onePath.getPath());
          globbedPaths.add(onePath.getPath());
        }
      } else {
        throw new InvalidInputException("Source path " + sourcePath + " doesn't exist");
      }
    }
  }
  doBuildListing(fileListWriter, options, globbedPaths);
}
 
Example 4
Project: oryx2   File: DeleteOldDataFn.java
@Override
public void call(T ignored) throws IOException {
  Path dataDirPath = new Path(dataDirString + "/*");
  FileSystem fs = FileSystem.get(dataDirPath.toUri(), hadoopConf);
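  // Expand the immediate children of the data directory; guard against a null result
  // before streaming over them.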
  FileStatus[] inputPathStatuses = fs.globStatus(dataDirPath);
  if (inputPathStatuses != null) {
    long oldestTimeAllowed =
        System.currentTimeMillis() - TimeUnit.MILLISECONDS.convert(maxAgeHours, TimeUnit.HOURS);
    Arrays.stream(inputPathStatuses).filter(FileStatus::isDirectory).map(FileStatus::getPath).
        filter(subdir -> {
          Matcher m = dirTimestampPattern.matcher(subdir.getName());
          return m.find() && Long.parseLong(m.group(1)) < oldestTimeAllowed;
        }).forEach(subdir -> {
          log.info("Deleting old data at {}", subdir);
          try {
            fs.delete(subdir, true);
          } catch (IOException e) {
            log.warn("Unable to delete {}; continuing", subdir, e);
          }
        });
  }
}
 
Example 5
Project: embulk-input-parquet_hadoop   File: ParquetHadoopInputPlugin.java
private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath) throws IOException
{
    List<FileStatus> fileStatuses = Lists.newArrayList();

    FileStatus[] entries = fs.globStatus(rootPath, HiddenFileFilter.INSTANCE);
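    // entries is null when rootPath has no wildcard and does not exist;
    // HiddenFileFilter drops entries whose names start with '.' or '_'.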
    if (entries == null) {
        return fileStatuses;
    }

    for (FileStatus entry : entries) {
        if (entry.isDirectory()) {
            List<FileStatus> subEntries = listRecursive(fs, entry);
            fileStatuses.addAll(subEntries);
        }
        else {
            fileStatuses.add(entry);
        }
    }

    return fileStatuses;
}
 
Example 6
Project: hadoop-oss   File: PathData.java
/**
 * Expand the given path as a glob pattern.  Non-existent paths do not
 * throw an exception because creation commands like touch and mkdir need
 * to create them.  The "stat" field will be null if the path does not
 * exist.
 * @param pattern the pattern to expand as a glob
 * @param conf the hadoop configuration
 * @return list of {@link PathData} objects.  if the pattern is not a glob,
 * and does not exist, the list will contain a single PathData with a null
 * stat 
 * @throws IOException anything else goes wrong...
 */
public static PathData[] expandAsGlob(String pattern, Configuration conf)
throws IOException {
  Path globPath = new Path(pattern);
  FileSystem fs = globPath.getFileSystem(conf);    
  FileStatus[] stats = fs.globStatus(globPath);
  PathData[] items = null;
  
  if (stats == null) {
    // remove any quoting in the glob pattern
    pattern = pattern.replaceAll("\\\\(.)", "$1");
    // not a glob & file not found, so add the path with a null stat
    items = new PathData[]{ new PathData(fs, pattern, null) };
  } else {
    // figure out what type of glob path was given, will convert globbed
    // paths to match the type to preserve relativity
    PathType globType;
    URI globUri = globPath.toUri();
    if (globUri.getScheme() != null) {
      globType = PathType.HAS_SCHEME;
    } else if (!globUri.getPath().isEmpty() &&
               new Path(globUri.getPath()).isAbsolute()) {
      globType = PathType.SCHEMELESS_ABSOLUTE;
    } else {
      globType = PathType.RELATIVE;
    }

    // convert stats to PathData
    items = new PathData[stats.length];
    int i=0;
    for (FileStatus stat : stats) {
      URI matchUri = stat.getPath().toUri();
      String globMatch = null;
      switch (globType) {
        case HAS_SCHEME: // use as-is, but remove authority if necessary
          if (globUri.getAuthority() == null) {
            matchUri = removeAuthority(matchUri);
          }
          globMatch = uriToString(matchUri, false);
          break;
        case SCHEMELESS_ABSOLUTE: // take just the uri's path
          globMatch = matchUri.getPath();
          break;
        case RELATIVE: // make it relative to the current working dir
          URI cwdUri = fs.getWorkingDirectory().toUri();
          globMatch = relativize(cwdUri, matchUri, stat.isDirectory());
          break;
      }
      items[i++] = new PathData(fs, globMatch, stat);
    }
  }
  Arrays.sort(items);
  return items;
}
 
Example 7
Project: hadoop-oss   File: TestTraceUtils.java
/**
 * Test tracing the globber.  This is a regression test for HDFS-9187.
 */
@Test
public void testTracingGlobber() throws Exception {
  // Bypass the normal FileSystem object creation path by just creating an
  // instance of a subclass.
  FileSystem fs = new LocalFileSystem();
  fs.initialize(new URI("file:///"), new Configuration());
  fs.globStatus(new Path("/"));
  fs.close();
}
 
Example 8
Project: ditb   File: FSUtils.java
public static List<Path> getTableDirs(final FileSystem fs, final Path rootdir)
    throws IOException {
  List<Path> tableDirs = new LinkedList<Path>();

  for(FileStatus status :
      fs.globStatus(new Path(rootdir,
          new Path(HConstants.BASE_NAMESPACE_DIR, "*")))) {
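    // Each match is a namespace directory under the HBase root; collect the table
    // directories found inside it.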
    tableDirs.addAll(FSUtils.getLocalTableDirs(fs, status.getPath()));
  }
  return tableDirs;
}
 
Example 9
Project: aliyun-maxcompute-data-collectors   File: HdfsOdpsImportJob.java
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
 
Example 10
Project: hadoop   File: FileInputFormat.java
private List<FileStatus> singleThreadedListStatus(JobConf job, Path[] dirs,
    PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (Path p: dirs) {
    FileSystem fs = p.getFileSystem(job); 
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat: matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter =
              fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(),
                    inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }
  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
 
Example 11
Project: hadoop   File: FileInputFormat.java
private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs,
    PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (int i=0; i < dirs.length; ++i) {
    Path p = dirs[i];
    FileSystem fs = p.getFileSystem(job.getConfiguration()); 
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat: matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter =
              fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(),
                    inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }

  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
 
Example 12
Project: hadoop   File: DistCpV1.java
/** Sanity check for srcPath */
private static void checkSrcPath(JobConf jobConf, List<Path> srcPaths) 
throws IOException {
  List<IOException> rslt = new ArrayList<IOException>();
  List<Path> unglobbed = new LinkedList<Path>();
  
  Path[] ps = new Path[srcPaths.size()];
  ps = srcPaths.toArray(ps);
  TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), ps, jobConf);
  
  
  for (Path p : srcPaths) {
    FileSystem fs = p.getFileSystem(jobConf);
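    // Expand the source pattern; a null or empty result means the source does not exist.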
    FileStatus[] inputs = fs.globStatus(p);
    
    if(inputs != null && inputs.length > 0) {
      for (FileStatus onePath: inputs) {
        unglobbed.add(onePath.getPath());
      }
    } else {
      rslt.add(new IOException("Input source " + p + " does not exist."));
    }
  }
  if (!rslt.isEmpty()) {
    throw new InvalidInputException(rslt);
  }
  srcPaths.clear();
  srcPaths.addAll(unglobbed);
}
 
Example 13
Project: hadoop   File: GlobbedCopyListing.java
/**
 * Implementation of CopyListing::buildListing().
 * Creates the copy listing by "globbing" all source-paths.
 * @param pathToListingFile The location at which the copy-listing file
 *                           is to be created.
 * @param options Input Options for DistCp (indicating source/target paths.)
 * @throws IOException
 */
@Override
public void doBuildListing(Path pathToListingFile,
                           DistCpOptions options) throws IOException {

  List<Path> globbedPaths = new ArrayList<Path>();
  if (options.getSourcePaths().isEmpty()) {
    throw new InvalidInputException("Nothing to process. Source paths::EMPTY");  
  }

  for (Path p : options.getSourcePaths()) {
    FileSystem fs = p.getFileSystem(getConf());
    FileStatus[] inputs = fs.globStatus(p);

    if(inputs != null && inputs.length > 0) {
      for (FileStatus onePath: inputs) {
        globbedPaths.add(onePath.getPath());
      }
    } else {
      throw new InvalidInputException(p + " doesn't exist");        
    }
  }

  DistCpOptions optionsGlobbed = new DistCpOptions(options);
  optionsGlobbed.setSourcePaths(globbedPaths);
  simpleListing.buildListing(pathToListingFile, optionsGlobbed);
}
 
Example 14
Project: hadoop   File: CopyCommitter.java
private void deleteAttemptTempFiles(Path targetWorkPath,
                                    FileSystem targetFS,
                                    String jobId) throws IOException {

  FileStatus[] tempFiles = targetFS.globStatus(
      new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*"));

  if (tempFiles != null && tempFiles.length > 0) {
    for (FileStatus file : tempFiles) {
      LOG.info("Cleaning up " + file.getPath());
      targetFS.delete(file.getPath(), false);
    }
  }
}
 
Example 15
Project: hadoop   File: PathData.java
/**
 * Expand the given path as a glob pattern.  Non-existent paths do not
 * throw an exception because creation commands like touch and mkdir need
 * to create them.  The "stat" field will be null if the path does not
 * exist.
 * @param pattern the pattern to expand as a glob
 * @param conf the hadoop configuration
 * @return list of {@link PathData} objects.  if the pattern is not a glob,
 * and does not exist, the list will contain a single PathData with a null
 * stat 
 * @throws IOException anything else goes wrong...
 */
public static PathData[] expandAsGlob(String pattern, Configuration conf)
throws IOException {
  Path globPath = new Path(pattern);
  FileSystem fs = globPath.getFileSystem(conf);    
  FileStatus[] stats = fs.globStatus(globPath);
  PathData[] items = null;
  
  if (stats == null) {
    // remove any quoting in the glob pattern
    pattern = pattern.replaceAll("\\\\(.)", "$1");
    // not a glob & file not found, so add the path with a null stat
    items = new PathData[]{ new PathData(fs, pattern, null) };
  } else {
    // figure out what type of glob path was given, will convert globbed
    // paths to match the type to preserve relativity
    PathType globType;
    URI globUri = globPath.toUri();
    if (globUri.getScheme() != null) {
      globType = PathType.HAS_SCHEME;
    } else if (!globUri.getPath().isEmpty() &&
               new Path(globUri.getPath()).isAbsolute()) {
      globType = PathType.SCHEMELESS_ABSOLUTE;
    } else {
      globType = PathType.RELATIVE;
    }

    // convert stats to PathData
    items = new PathData[stats.length];
    int i=0;
    for (FileStatus stat : stats) {
      URI matchUri = stat.getPath().toUri();
      String globMatch = null;
      switch (globType) {
        case HAS_SCHEME: // use as-is, but remove authority if necessary
          if (globUri.getAuthority() == null) {
            matchUri = removeAuthority(matchUri);
          }
          globMatch = uriToString(matchUri, false);
          break;
        case SCHEMELESS_ABSOLUTE: // take just the uri's path
          globMatch = matchUri.getPath();
          break;
        case RELATIVE: // make it relative to the current working dir
          URI cwdUri = fs.getWorkingDirectory().toUri();
          globMatch = relativize(cwdUri, matchUri, stat.isDirectory());
          break;
      }
      items[i++] = new PathData(fs, globMatch, stat);
    }
  }
  Arrays.sort(items);
  return items;
}
 
Example 16
Project: hadoop   File: InMemorySCMStore.java
@VisibleForTesting
Map<String, String> getInitialCachedResources(FileSystem fs,
    Configuration conf) throws IOException {
  // get the root directory for the shared cache
  String location =
      conf.get(YarnConfiguration.SHARED_CACHE_ROOT,
          YarnConfiguration.DEFAULT_SHARED_CACHE_ROOT);
  Path root = new Path(location);
  if (!fs.exists(root)) {
    String message =
        "The shared cache root directory " + location + " was not found";
    LOG.error(message);
    throw new IOException(message);
  }

  int nestedLevel = SharedCacheUtil.getCacheDepth(conf);
  // now traverse individual directories and process them
  // the directory structure is specified by the nested level parameter
  // (e.g. 9/c/d/<checksum>/file)
  String pattern = SharedCacheUtil.getCacheEntryGlobPattern(nestedLevel+1);

  LOG.info("Querying for all individual cached resource files");
  FileStatus[] entries = fs.globStatus(new Path(root, pattern));
  int numEntries = entries == null ? 0 : entries.length;
  LOG.info("Found " + numEntries + " files: processing for one resource per "
      + "key");

  Map<String, String> initialCachedEntries = new HashMap<String, String>();
  if (entries != null) {
    for (FileStatus entry : entries) {
      Path file = entry.getPath();
      String fileName = file.getName();
      if (entry.isFile()) {
        // get the parent to get the checksum
        Path parent = file.getParent();
        if (parent != null) {
          // the name of the immediate parent directory is the checksum
          String key = parent.getName();
          // make sure we insert only one file per checksum whichever comes
          // first
          if (initialCachedEntries.containsKey(key)) {
            LOG.warn("Key " + key + " is already mapped to file "
                + initialCachedEntries.get(key) + "; file " + fileName
                + " will not be added");
          } else {
            initialCachedEntries.put(key, fileName);
          }
        }
      }
    }
  }
  LOG.info("A total of " + initialCachedEntries.size()
      + " files are now mapped");
  return initialCachedEntries;
}
 
Example 17
Project: hadoop   File: ContainerLaunch.java
/**
 * Tries to tail and fetch TAIL_SIZE_IN_BYTES of data from the error log.
 * ErrorLog filename is not fixed and depends upon app, hence file name
 * pattern is used.
 * @param containerID
 * @param ret
 * @param containerLogDir
 * @param diagnosticInfo
 */
@SuppressWarnings("unchecked")
private void handleContainerExitWithFailure(ContainerId containerID, int ret,
    Path containerLogDir, StringBuilder diagnosticInfo) {
  LOG.warn(diagnosticInfo);

  String errorFileNamePattern =
      conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
          YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
  FSDataInputStream errorFileIS = null;
  try {
    FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
    FileStatus[] errorFileStatuses = fileSystem
        .globStatus(new Path(containerLogDir, errorFileNamePattern));
    if (errorFileStatuses != null && errorFileStatuses.length != 0) {
      long tailSizeInBytes =
          conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES,
              YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES);
      Path errorFile = errorFileStatuses[0].getPath();
      long fileSize = errorFileStatuses[0].getLen();

      // if more than one file matches the stderr pattern, take the latest
      // modified file, and also append the file names in the diagnosticInfo
      if (errorFileStatuses.length > 1) {
        String[] errorFileNames = new String[errorFileStatuses.length];
        long latestModifiedTime = errorFileStatuses[0].getModificationTime();
        errorFileNames[0] = errorFileStatuses[0].getPath().getName();
        for (int i = 1; i < errorFileStatuses.length; i++) {
          errorFileNames[i] = errorFileStatuses[i].getPath().getName();
          if (errorFileStatuses[i]
              .getModificationTime() > latestModifiedTime) {
            latestModifiedTime = errorFileStatuses[i].getModificationTime();
            errorFile = errorFileStatuses[i].getPath();
            fileSize = errorFileStatuses[i].getLen();
          }
        }
        diagnosticInfo.append("Error files : ")
            .append(StringUtils.join(", ", errorFileNames)).append(".\n");
      }

      long startPosition =
          (fileSize < tailSizeInBytes) ? 0 : fileSize - tailSizeInBytes;
      int bufferSize =
          (int) ((fileSize < tailSizeInBytes) ? fileSize : tailSizeInBytes);
      byte[] tailBuffer = new byte[bufferSize];
      errorFileIS = fileSystem.open(errorFile);
      errorFileIS.readFully(startPosition, tailBuffer);

      diagnosticInfo.append("Last ").append(tailSizeInBytes)
          .append(" bytes of ").append(errorFile.getName()).append(" :\n")
          .append(new String(tailBuffer, StandardCharsets.UTF_8));
    }
  } catch (IOException e) {
    LOG.error("Failed to get tail of the container's error log file", e);
  } finally {
    IOUtils.cleanup(LOG, errorFileIS);
  }

  this.dispatcher.getEventHandler()
      .handle(new ContainerExitEvent(containerID,
          ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
          diagnosticInfo.toString()));
}
 
Example 18
Project: hadoop   File: TestCopyFiles.java
public void testMapCount() throws Exception {
  String namenode = null;
  MiniDFSCluster dfs = null;
  MiniDFSCluster mr = null;
  try {
    Configuration conf = new Configuration();
    
    dfs= new MiniDFSCluster.Builder(conf).numDataNodes(3).format(true).build();
    
    FileSystem fs = dfs.getFileSystem();
    final FsShell shell = new FsShell(conf);
    namenode = fs.getUri().toString();
    MyFile[] files = createFiles(fs.getUri(), "/srcdat");
    long totsize = 0;
    for (MyFile f : files) {
      totsize += f.getSize();
    }
    
    Configuration job = new JobConf(conf);
    job.setLong("distcp.bytes.per.map", totsize / 3);
    ToolRunner.run(new DistCpV1(job),
        new String[] {"-m", "100",
                      "-log",
                      namenode+"/logs",
                      namenode+"/srcdat",
                      namenode+"/destdat"});
    assertTrue("Source and destination directories do not match.",
               checkFiles(fs, "/destdat", files));

    String logdir = namenode + "/logs";
    System.out.println(execCmd(shell, "-lsr", logdir));
    FileStatus[] logs = fs.listStatus(new Path(logdir));
    // rare case where splits are exact, logs.length can be 4
    assertTrue( logs.length == 2);

    deldir(fs, "/destdat");
    deldir(fs, "/logs");
    ToolRunner.run(new DistCpV1(job),
        new String[] {"-m", "1",
                      "-log",
                      namenode+"/logs",
                      namenode+"/srcdat",
                      namenode+"/destdat"});

    System.out.println(execCmd(shell, "-lsr", logdir));
    logs = fs.globStatus(new Path(namenode+"/logs/part*"));
    assertTrue("Unexpected map count, logs.length=" + logs.length,
        logs.length == 1);
  } finally {
    if (dfs != null) { dfs.shutdown(); }
    if (mr != null) { mr.shutdown(); }
  }
}
 
Example 19
Project: hadoop   File: TraceBuilder.java
/**
 * Processes the input file/folder argument. If the input is a file,
 * then it is directly considered for further processing by TraceBuilder.
 * If the input is a folder, then all the history logs in the
 * input folder are considered for further processing.
 *
 * If isRecursive is true, then the input path is recursively scanned
 * for job history logs for further processing by TraceBuilder.
 *
 * NOTE: If the input represents a globbed path, then it is first flattened
 *       and then the individual paths represented by the globbed input
 *       path are considered for further processing.
 *
 * @param input        input path, possibly globbed
 * @param conf         configuration
 * @param isRecursive  whether to recursively traverse the input paths to
 *                     find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException
 * @throws IOException
 */
static List<Path> processInputArgument(String input, Configuration conf,
    boolean isRecursive) throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {

      // Find list of files in this path(recursively if -recursive option
      // is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter =
        fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();

        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file names in this path to the
        // inputPaths list
        FileStatus[] sortableNames =
            historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }

  return inputPaths;
}
 
Example 20
Project: oryx2   File: BatchUpdateFunction.java
@Override
public void call(JavaPairRDD<K,M> newData, Time timestamp)
    throws IOException, InterruptedException {

  if (newData.isEmpty()) {
    log.info("No data in current generation's RDD; nothing to do");
    return;
  }

  log.info("Beginning update at {}", timestamp);

  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  if (hadoopConf.getResource("core-site.xml") == null) {
    log.warn("Hadoop config like core-site.xml was not found; " +
             "is the Hadoop config directory on the classpath?");
  }

  JavaPairRDD<K,M> pastData;
  Path inputPathPattern = new Path(dataDirString + "/*/part-*");
  FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
  FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
  if (inputPathStatuses == null || inputPathStatuses.length == 0) {

    log.info("No past data at path(s) {}", inputPathPattern);
    pastData = null;

  } else {

    log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
    Configuration updatedConf = new Configuration(hadoopConf);
    updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

    @SuppressWarnings("unchecked")
    JavaPairRDD<Writable,Writable> pastWritableData = (JavaPairRDD<Writable,Writable>)
        sparkContext.newAPIHadoopRDD(updatedConf,
                                     SequenceFileInputFormat.class,
                                     keyWritableClass,
                                     messageWritableClass);

    pastData = pastWritableData.mapToPair(
        new WritableToValueFunction<>(keyClass,
                                      messageClass,
                                      keyWritableClass,
                                      messageWritableClass));
  }

  if (updateTopic == null || updateBroker == null) {
    log.info("Not producing updates to update topic since none was configured");
    updateInstance.runUpdate(sparkContext,
                             timestamp.milliseconds(),
                             newData,
                             pastData,
                             modelDirString,
                             null);
  } else {
    // This TopicProducer should not be async; sends one big model generally and
    // needs to occur before other updates reliably rather than be buffered
    try (TopicProducer<String,U> producer =
             new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               producer);
    }
  }
}