Java Code Examples for org.apache.hadoop.fs.FileSystem.globStatus()

The following are Java code examples showing how to use the globStatus() method of the org.apache.hadoop.fs.FileSystem class.
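Before the examples, here is a minimal, self-contained sketch (the class name and glob pattern are hypothetical, not taken from any project below) of the contract most of the examples rely on: globStatus() returns null when the argument contains no wildcard and the path does not exist, and an empty array when a wildcard pattern matches nothing, so callers usually check for both.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GlobStatusSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical pattern: every part file under dated subdirectories.
        Path pattern = new Path("/data/events/2019-*/part-*");
        FileSystem fs = pattern.getFileSystem(conf);

        // null  -> the path has no wildcard and does not exist
        // empty -> the wildcard pattern matched nothing
        FileStatus[] matches = fs.globStatus(pattern);
        if (matches == null || matches.length == 0) {
            System.out.println("Nothing matches " + pattern);
            return;
        }
        for (FileStatus status : matches) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}
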
Example 1
Project: hdfs-shell   File: ContextCommands.java
public synchronized String getCurrentDir() {
    if (currentDir == null) {
        try {
            final Path path = new Path(Path.CUR_DIR);
            final FileSystem fs = getFileSystem();
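            // "." (Path.CUR_DIR) expands to a single entry for the working directory,
            // which is the user's home directory by default.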
            final FileStatus[] fileStatuses = fs.globStatus(path);
            if (fileStatuses == null || fileStatuses.length == 0) {
                return "";
            }
            homeDir = currentDir = fileStatuses[0].getPath().toUri().getPath();
        } catch (Exception e) {
            return "";
        }
    }
    return currentDir;
}
 
Example 2
Project: hadoop   File: LocatedFileStatusFetcher.java
@Override
public Result call() throws Exception {
  Result result = new Result();
  FileSystem fs = path.getFileSystem(conf);
  result.fs = fs;
  FileStatus[] matches = fs.globStatus(path, inputFilter);
  if (matches == null) {
    result.addError(new IOException("Input path does not exist: " + path));
  } else if (matches.length == 0) {
    result.addError(new IOException("Input Pattern " + path
        + " matches 0 files"));
  } else {
    result.matchedFileStatuses = matches;
  }
  return result;
}
 
Example 3
Project: circus-train   File: SimpleCopyListing.java
/**
 * Collect the list of <sourceRelativePath, sourceFileStatus> to be copied and write to the sequence file. In essence,
 * any file or directory that needs to be copied or sync-ed is written as an entry to the sequence file, with the
 * possible exception of the source root: when either -update (sync) or -overwrite switch is specified, and if the
 * source root is a directory, then the source root entry is not written to the sequence file, because only the
 * contents of the source directory need to be copied in this case. See
 * {@link com.hotels.bdp.circustrain.s3mapreducecp.util.ConfigurationUtil#getRelativePath} for how relative path is
 * computed. See computeSourceRootPath method for how the root path of the source is computed.
 *
 * @param fileListWriter
 * @param options
 * @throws IOException
 */
@VisibleForTesting
public void doBuildListing(SequenceFile.Writer fileListWriter, S3MapReduceCpOptions options) throws IOException {
  List<Path> globbedPaths = new ArrayList<>(options.getSources().size());

  for (Path sourcePath : options.getSources()) {
    FileSystem fs = sourcePath.getFileSystem(getConf());
    FileStatus sourceFileStatus = fs.getFileStatus(sourcePath);
    if (sourceFileStatus.isFile()) {
      LOG.debug("Adding path {}", sourceFileStatus.getPath());
      globbedPaths.add(sourceFileStatus.getPath());
    } else {
      FileStatus[] inputs = fs.globStatus(sourcePath);
      if (inputs != null && inputs.length > 0) {
        for (FileStatus onePath : inputs) {
          LOG.debug("Adding path {}", onePath.getPath());
          globbedPaths.add(onePath.getPath());
        }
      } else {
        throw new InvalidInputException("Source path " + sourcePath + " doesn't exist");
      }
    }
  }
  doBuildListing(fileListWriter, options, globbedPaths);
}
 
Example 4
Project: oryx2   File: DeleteOldDataFn.java
@Override
public void call(T ignored) throws IOException {
  Path dataDirPath = new Path(dataDirString + "/*");
  FileSystem fs = FileSystem.get(dataDirPath.toUri(), hadoopConf);
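  // Expand the immediate children of the data directory; guard against a null result
  // before streaming over them.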
  FileStatus[] inputPathStatuses = fs.globStatus(dataDirPath);
  if (inputPathStatuses != null) {
    long oldestTimeAllowed =
        System.currentTimeMillis() - TimeUnit.MILLISECONDS.convert(maxAgeHours, TimeUnit.HOURS);
    Arrays.stream(inputPathStatuses).filter(FileStatus::isDirectory).map(FileStatus::getPath).
        filter(subdir -> {
          Matcher m = dirTimestampPattern.matcher(subdir.getName());
          return m.find() && Long.parseLong(m.group(1)) < oldestTimeAllowed;
        }).forEach(subdir -> {
          log.info("Deleting old data at {}", subdir);
          try {
            fs.delete(subdir, true);
          } catch (IOException e) {
            log.warn("Unable to delete {}; continuing", subdir, e);
          }
        });
  }
}
 
Example 5
Project: embulk-input-parquet_hadoop   File: ParquetHadoopInputPlugin.java
private List<FileStatus> listFileStatuses(FileSystem fs, Path rootPath) throws IOException
{
    List<FileStatus> fileStatuses = Lists.newArrayList();

    FileStatus[] entries = fs.globStatus(rootPath, HiddenFileFilter.INSTANCE);
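    // entries is null when rootPath has no wildcard and does not exist;
    // HiddenFileFilter drops entries whose names start with '.' or '_'.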
    if (entries == null) {
        return fileStatuses;
    }

    for (FileStatus entry : entries) {
        if (entry.isDirectory()) {
            List<FileStatus> subEntries = listRecursive(fs, entry);
            fileStatuses.addAll(subEntries);
        }
        else {
            fileStatuses.add(entry);
        }
    }

    return fileStatuses;
}
 
Example 6
Project: hadoop-oss   File: PathData.java
/**
 * Expand the given path as a glob pattern.  Non-existent paths do not
 * throw an exception because creation commands like touch and mkdir need
 * to create them.  The "stat" field will be null if the path does not
 * exist.
 * @param pattern the pattern to expand as a glob
 * @param conf the hadoop configuration
 * @return list of {@link PathData} objects.  if the pattern is not a glob,
 * and does not exist, the list will contain a single PathData with a null
 * stat 
 * @throws IOException anything else goes wrong...
 */
public static PathData[] expandAsGlob(String pattern, Configuration conf)
throws IOException {
  Path globPath = new Path(pattern);
  FileSystem fs = globPath.getFileSystem(conf);    
  FileStatus[] stats = fs.globStatus(globPath);
  PathData[] items = null;
  
  if (stats == null) {
    // remove any quoting in the glob pattern
    pattern = pattern.replaceAll("\\\\(.)", "$1");
    // not a glob & file not found, so add the path with a null stat
    items = new PathData[]{ new PathData(fs, pattern, null) };
  } else {
    // figure out what type of glob path was given, will convert globbed
    // paths to match the type to preserve relativity
    PathType globType;
    URI globUri = globPath.toUri();
    if (globUri.getScheme() != null) {
      globType = PathType.HAS_SCHEME;
    } else if (!globUri.getPath().isEmpty() &&
               new Path(globUri.getPath()).isAbsolute()) {
      globType = PathType.SCHEMELESS_ABSOLUTE;
    } else {
      globType = PathType.RELATIVE;
    }

    // convert stats to PathData
    items = new PathData[stats.length];
    int i=0;
    for (FileStatus stat : stats) {
      URI matchUri = stat.getPath().toUri();
      String globMatch = null;
      switch (globType) {
        case HAS_SCHEME: // use as-is, but remove authority if necessary
          if (globUri.getAuthority() == null) {
            matchUri = removeAuthority(matchUri);
          }
          globMatch = uriToString(matchUri, false);
          break;
        case SCHEMELESS_ABSOLUTE: // take just the uri's path
          globMatch = matchUri.getPath();
          break;
        case RELATIVE: // make it relative to the current working dir
          URI cwdUri = fs.getWorkingDirectory().toUri();
          globMatch = relativize(cwdUri, matchUri, stat.isDirectory());
          break;
      }
      items[i++] = new PathData(fs, globMatch, stat);
    }
  }
  Arrays.sort(items);
  return items;
}
 
Example 7
Project: hadoop-oss   File: TestTraceUtils.java
/**
 * Test tracing the globber.  This is a regression test for HDFS-9187.
 */
@Test
public void testTracingGlobber() throws Exception {
  // Bypass the normal FileSystem object creation path by just creating an
  // instance of a subclass.
  FileSystem fs = new LocalFileSystem();
  fs.initialize(new URI("file:///"), new Configuration());
  fs.globStatus(new Path("/"));
  fs.close();
}
 
Example 8
Project: ditb   File: FSUtils.java
public static List<Path> getTableDirs(final FileSystem fs, final Path rootdir)
    throws IOException {
  List<Path> tableDirs = new LinkedList<Path>();

  for(FileStatus status :
      fs.globStatus(new Path(rootdir,
          new Path(HConstants.BASE_NAMESPACE_DIR, "*")))) {
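    // Each match is a namespace directory under the HBase root; collect the table
    // directories found inside it.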
    tableDirs.addAll(FSUtils.getLocalTableDirs(fs, status.getPath()));
  }
  return tableDirs;
}
 
Example 9
Project: aliyun-maxcompute-data-collectors   File: HdfsOdpsImportJob.java
private DatasetDescriptor getDatasetDescriptorFromParquetFile(Job job, FileSystem fs, String uri)
    throws IOException {

  ArrayList<FileStatus> files = new ArrayList<FileStatus>();
  FileStatus[] dirs;
  dirs = fs.globStatus(fs.makeQualified(getInputPath()));
  for (int i = 0; (dirs != null && i < dirs.length); i++) {
    files.addAll(Arrays.asList(fs.listStatus(dirs[i].getPath(), HIDDEN_FILES_PATH_FILTER)));
    // We only check one file, so exit the loop when we have at least
    // one.
    if (files.size() > 0) {
      break;
    }
  }

  ParquetMetadata parquetMetadata;
  try {
    parquetMetadata =
        ParquetFileReader.readFooter(job.getConfiguration(),
            fs.makeQualified(files.get(0).getPath()));
  } catch (IOException e) {
    LOG.error("Wrong file format. Please check the export file's format.", e);
    throw e;
  }
  MessageType schema = parquetMetadata.getFileMetaData().getSchema();
  Schema avroSchema = new AvroSchemaConverter().convert(schema);
  DatasetDescriptor descriptor =
      new DatasetDescriptor.Builder().schema(avroSchema).format(Formats.PARQUET)
          .compressionType(ParquetJob.getCompressionType(job.getConfiguration())).build();
  return descriptor;
}
 
Example 10
Project: hadoop   File: FileInputFormat.java
private List<FileStatus> singleThreadedListStatus(JobConf job, Path[] dirs,
    PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (Path p: dirs) {
    FileSystem fs = p.getFileSystem(job); 
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat: matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter =
              fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(),
                    inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }
  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
 
Example 11
Project: hadoop   File: FileInputFormat.java
private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs,
    PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (int i=0; i < dirs.length; ++i) {
    Path p = dirs[i];
    FileSystem fs = p.getFileSystem(job.getConfiguration()); 
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat: matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter =
              fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(),
                    inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }

  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
 
Example 12
Project: hadoop   File: DistCpV1.java
/** Sanity check for srcPath */
private static void checkSrcPath(JobConf jobConf, List<Path> srcPaths) 
throws IOException {
  List<IOException> rslt = new ArrayList<IOException>();
  List<Path> unglobbed = new LinkedList<Path>();
  
  Path[] ps = new Path[srcPaths.size()];
  ps = srcPaths.toArray(ps);
  TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), ps, jobConf);
  
  
  for (Path p : srcPaths) {
    FileSystem fs = p.getFileSystem(jobConf);
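    // Expand the source pattern; a null or empty result means the source does not exist.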
    FileStatus[] inputs = fs.globStatus(p);
    
    if(inputs != null && inputs.length > 0) {
      for (FileStatus onePath: inputs) {
        unglobbed.add(onePath.getPath());
      }
    } else {
      rslt.add(new IOException("Input source " + p + " does not exist."));
    }
  }
  if (!rslt.isEmpty()) {
    throw new InvalidInputException(rslt);
  }
  srcPaths.clear();
  srcPaths.addAll(unglobbed);
}
 
Example 13
Project: hadoop   File: GlobbedCopyListing.java
/**
 * Implementation of CopyListing::buildListing().
 * Creates the copy listing by "globbing" all source-paths.
 * @param pathToListingFile The location at which the copy-listing file
 *                           is to be created.
 * @param options Input Options for DistCp (indicating source/target paths.)
 * @throws IOException
 */
@Override
public void doBuildListing(Path pathToListingFile,
                           DistCpOptions options) throws IOException {

  List<Path> globbedPaths = new ArrayList<Path>();
  if (options.getSourcePaths().isEmpty()) {
    throw new InvalidInputException("Nothing to process. Source paths::EMPTY");  
  }

  for (Path p : options.getSourcePaths()) {
    FileSystem fs = p.getFileSystem(getConf());
    FileStatus[] inputs = fs.globStatus(p);

    if(inputs != null && inputs.length > 0) {
      for (FileStatus onePath: inputs) {
        globbedPaths.add(onePath.getPath());
      }
    } else {
      throw new InvalidInputException(p + " doesn't exist");        
    }
  }

  DistCpOptions optionsGlobbed = new DistCpOptions(options);
  optionsGlobbed.setSourcePaths(globbedPaths);
  simpleListing.buildListing(pathToListingFile, optionsGlobbed);
}
 
Example 14
Project: hadoop   File: CopyCommitter.java
private void deleteAttemptTempFiles(Path targetWorkPath,
                                    FileSystem targetFS,
                                    String jobId) throws IOException {

  FileStatus[] tempFiles = targetFS.globStatus(
      new Path(targetWorkPath, ".distcp.tmp." + jobId.replaceAll("job","attempt") + "*"));

  if (tempFiles != null && tempFiles.length > 0) {
    for (FileStatus file : tempFiles) {
      LOG.info("Cleaning up " + file.getPath());
      targetFS.delete(file.getPath(), false);
    }
  }
}
 
Example 15
Project: hadoop   File: PathData.java
/**
 * Expand the given path as a glob pattern.  Non-existent paths do not
 * throw an exception because creation commands like touch and mkdir need
 * to create them.  The "stat" field will be null if the path does not
 * exist.
 * @param pattern the pattern to expand as a glob
 * @param conf the hadoop configuration
 * @return list of {@link PathData} objects.  if the pattern is not a glob,
 * and does not exist, the list will contain a single PathData with a null
 * stat 
 * @throws IOException anything else goes wrong...
 */
public static PathData[] expandAsGlob(String pattern, Configuration conf)
throws IOException {
  Path globPath = new Path(pattern);
  FileSystem fs = globPath.getFileSystem(conf);    
  FileStatus[] stats = fs.globStatus(globPath);
  PathData[] items = null;
  
  if (stats == null) {
    // remove any quoting in the glob pattern
    pattern = pattern.replaceAll("\\\\(.)", "$1");
    // not a glob & file not found, so add the path with a null stat
    items = new PathData[]{ new PathData(fs, pattern, null) };
  } else {
    // figure out what type of glob path was given, will convert globbed
    // paths to match the type to preserve relativity
    PathType globType;
    URI globUri = globPath.toUri();
    if (globUri.getScheme() != null) {
      globType = PathType.HAS_SCHEME;
    } else if (!globUri.getPath().isEmpty() &&
               new Path(globUri.getPath()).isAbsolute()) {
      globType = PathType.SCHEMELESS_ABSOLUTE;
    } else {
      globType = PathType.RELATIVE;
    }

    // convert stats to PathData
    items = new PathData[stats.length];
    int i=0;
    for (FileStatus stat : stats) {
      URI matchUri = stat.getPath().toUri();
      String globMatch = null;
      switch (globType) {
        case HAS_SCHEME: // use as-is, but remove authority if necessary
          if (globUri.getAuthority() == null) {
            matchUri = removeAuthority(matchUri);
          }
          globMatch = uriToString(matchUri, false);
          break;
        case SCHEMELESS_ABSOLUTE: // take just the uri's path
          globMatch = matchUri.getPath();
          break;
        case RELATIVE: // make it relative to the current working dir
          URI cwdUri = fs.getWorkingDirectory().toUri();
          globMatch = relativize(cwdUri, matchUri, stat.isDirectory());
          break;
      }
      items[i++] = new PathData(fs, globMatch, stat);
    }
  }
  Arrays.sort(items);
  return items;
}
 
Example 16
Project: hadoop   File: InMemorySCMStore.java
@VisibleForTesting
Map<String, String> getInitialCachedResources(FileSystem fs,
    Configuration conf) throws IOException {
  // get the root directory for the shared cache
  String location =
      conf.get(YarnConfiguration.SHARED_CACHE_ROOT,
          YarnConfiguration.DEFAULT_SHARED_CACHE_ROOT);
  Path root = new Path(location);
  if (!fs.exists(root)) {
    String message =
        "The shared cache root directory " + location + " was not found";
    LOG.error(message);
    throw new IOException(message);
  }

  int nestedLevel = SharedCacheUtil.getCacheDepth(conf);
  // now traverse individual directories and process them
  // the directory structure is specified by the nested level parameter
  // (e.g. 9/c/d/<checksum>/file)
  String pattern = SharedCacheUtil.getCacheEntryGlobPattern(nestedLevel+1);

  LOG.info("Querying for all individual cached resource files");
  FileStatus[] entries = fs.globStatus(new Path(root, pattern));
  int numEntries = entries == null ? 0 : entries.length;
  LOG.info("Found " + numEntries + " files: processing for one resource per "
      + "key");

  Map<String, String> initialCachedEntries = new HashMap<String, String>();
  if (entries != null) {
    for (FileStatus entry : entries) {
      Path file = entry.getPath();
      String fileName = file.getName();
      if (entry.isFile()) {
        // get the parent to get the checksum
        Path parent = file.getParent();
        if (parent != null) {
          // the name of the immediate parent directory is the checksum
          String key = parent.getName();
          // make sure we insert only one file per checksum whichever comes
          // first
          if (initialCachedEntries.containsKey(key)) {
            LOG.warn("Key " + key + " is already mapped to file "
                + initialCachedEntries.get(key) + "; file " + fileName
                + " will not be added");
          } else {
            initialCachedEntries.put(key, fileName);
          }
        }
      }
    }
  }
  LOG.info("A total of " + initialCachedEntries.size()
      + " files are now mapped");
  return initialCachedEntries;
}
 
Example 17
Project: hadoop   File: ContainerLaunch.java
/**
 * Tries to tail and fetch TAIL_SIZE_IN_BYTES of data from the error log.
 * ErrorLog filename is not fixed and depends upon app, hence file name
 * pattern is used.
 * @param containerID
 * @param ret
 * @param containerLogDir
 * @param diagnosticInfo
 */
@SuppressWarnings("unchecked")
private void handleContainerExitWithFailure(ContainerId containerID, int ret,
    Path containerLogDir, StringBuilder diagnosticInfo) {
  LOG.warn(diagnosticInfo);

  String errorFileNamePattern =
      conf.get(YarnConfiguration.NM_CONTAINER_STDERR_PATTERN,
          YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_PATTERN);
  FSDataInputStream errorFileIS = null;
  try {
    FileSystem fileSystem = FileSystem.getLocal(conf).getRaw();
    FileStatus[] errorFileStatuses = fileSystem
        .globStatus(new Path(containerLogDir, errorFileNamePattern));
    if (errorFileStatuses != null && errorFileStatuses.length != 0) {
      long tailSizeInBytes =
          conf.getLong(YarnConfiguration.NM_CONTAINER_STDERR_BYTES,
              YarnConfiguration.DEFAULT_NM_CONTAINER_STDERR_BYTES);
      Path errorFile = errorFileStatuses[0].getPath();
      long fileSize = errorFileStatuses[0].getLen();

      // if more than one file matches the stderr pattern, take the latest
      // modified file, and also append the file names in the diagnosticInfo
      if (errorFileStatuses.length > 1) {
        String[] errorFileNames = new String[errorFileStatuses.length];
        long latestModifiedTime = errorFileStatuses[0].getModificationTime();
        errorFileNames[0] = errorFileStatuses[0].getPath().getName();
        for (int i = 1; i < errorFileStatuses.length; i++) {
          errorFileNames[i] = errorFileStatuses[i].getPath().getName();
          if (errorFileStatuses[i]
              .getModificationTime() > latestModifiedTime) {
            latestModifiedTime = errorFileStatuses[i].getModificationTime();
            errorFile = errorFileStatuses[i].getPath();
            fileSize = errorFileStatuses[i].getLen();
          }
        }
        diagnosticInfo.append("Error files : ")
            .append(StringUtils.join(", ", errorFileNames)).append(".\n");
      }

      long startPosition =
          (fileSize < tailSizeInBytes) ? 0 : fileSize - tailSizeInBytes;
      int bufferSize =
          (int) ((fileSize < tailSizeInBytes) ? fileSize : tailSizeInBytes);
      byte[] tailBuffer = new byte[bufferSize];
      errorFileIS = fileSystem.open(errorFile);
      errorFileIS.readFully(startPosition, tailBuffer);

      diagnosticInfo.append("Last ").append(tailSizeInBytes)
          .append(" bytes of ").append(errorFile.getName()).append(" :\n")
          .append(new String(tailBuffer, StandardCharsets.UTF_8));
    }
  } catch (IOException e) {
    LOG.error("Failed to get tail of the container's error log file", e);
  } finally {
    IOUtils.cleanup(LOG, errorFileIS);
  }

  this.dispatcher.getEventHandler()
      .handle(new ContainerExitEvent(containerID,
          ContainerEventType.CONTAINER_EXITED_WITH_FAILURE, ret,
          diagnosticInfo.toString()));
}
 
Example 18
Project: hadoop   File: TestCopyFiles.java
public void testMapCount() throws Exception {
  String namenode = null;
  MiniDFSCluster dfs = null;
  MiniDFSCluster mr = null;
  try {
    Configuration conf = new Configuration();
    
    dfs= new MiniDFSCluster.Builder(conf).numDataNodes(3).format(true).build();
    
    FileSystem fs = dfs.getFileSystem();
    final FsShell shell = new FsShell(conf);
    namenode = fs.getUri().toString();
    MyFile[] files = createFiles(fs.getUri(), "/srcdat");
    long totsize = 0;
    for (MyFile f : files) {
      totsize += f.getSize();
    }
    
    Configuration job = new JobConf(conf);
    job.setLong("distcp.bytes.per.map", totsize / 3);
    ToolRunner.run(new DistCpV1(job),
        new String[] {"-m", "100",
                      "-log",
                      namenode+"/logs",
                      namenode+"/srcdat",
                      namenode+"/destdat"});
    assertTrue("Source and destination directories do not match.",
               checkFiles(fs, "/destdat", files));

    String logdir = namenode + "/logs";
    System.out.println(execCmd(shell, "-lsr", logdir));
    FileStatus[] logs = fs.listStatus(new Path(logdir));
    // rare case where splits are exact, logs.length can be 4
    assertTrue( logs.length == 2);

    deldir(fs, "/destdat");
    deldir(fs, "/logs");
    ToolRunner.run(new DistCpV1(job),
        new String[] {"-m", "1",
                      "-log",
                      namenode+"/logs",
                      namenode+"/srcdat",
                      namenode+"/destdat"});

    System.out.println(execCmd(shell, "-lsr", logdir));
    logs = fs.globStatus(new Path(namenode+"/logs/part*"));
    assertTrue("Unexpected map count, logs.length=" + logs.length,
        logs.length == 1);
  } finally {
    if (dfs != null) { dfs.shutdown(); }
    if (mr != null) { mr.shutdown(); }
  }
}
 
Example 19
Project: hadoop   File: TraceBuilder.java
/**
 * Processes the input file/folder argument. If the input is a file,
 * then it is directly considered for further processing by TraceBuilder.
 * If the input is a folder, then all the history logs in the
 * input folder are considered for further processing.
 *
 * If isRecursive is true, then the input path is recursively scanned
 * for job history logs for further processing by TraceBuilder.
 *
 * NOTE: If the input represents a globbed path, then it is first flattened
 *       and then the individual paths represented by the globbed input
 *       path are considered for further processing.
 *
 * @param input        input path, possibly globbed
 * @param conf         configuration
 * @param isRecursive  whether to recursively traverse the input paths to
 *                     find history logs
 * @return the input history log files' paths
 * @throws FileNotFoundException
 * @throws IOException
 */
static List<Path> processInputArgument(String input, Configuration conf,
    boolean isRecursive) throws FileNotFoundException, IOException {
  Path inPath = new Path(input);
  FileSystem fs = inPath.getFileSystem(conf);
  FileStatus[] inStatuses = fs.globStatus(inPath);

  List<Path> inputPaths = new LinkedList<Path>();
  if (inStatuses == null || inStatuses.length == 0) {
    return inputPaths;
  }

  for (FileStatus inStatus : inStatuses) {
    Path thisPath = inStatus.getPath();
    if (inStatus.isDirectory()) {

      // Find list of files in this path(recursively if -recursive option
      // is specified).
      List<FileStatus> historyLogs = new ArrayList<FileStatus>();

      RemoteIterator<LocatedFileStatus> iter =
        fs.listFiles(thisPath, isRecursive);
      while (iter.hasNext()) {
        LocatedFileStatus child = iter.next();
        String fileName = child.getPath().getName();

        if (!(fileName.endsWith(".crc") || fileName.startsWith("."))) {
          historyLogs.add(child);
        }
      }

      if (historyLogs.size() > 0) {
        // Add the sorted history log file names in this path to the
        // inputPaths list
        FileStatus[] sortableNames =
            historyLogs.toArray(new FileStatus[historyLogs.size()]);
        Arrays.sort(sortableNames, new HistoryLogsComparator());

        for (FileStatus historyLog : sortableNames) {
          inputPaths.add(historyLog.getPath());
        }
      }
    } else {
      inputPaths.add(thisPath);
    }
  }

  return inputPaths;
}
 
Example 20
Project: oryx2   File: BatchUpdateFunction.java
@Override
public void call(JavaPairRDD<K,M> newData, Time timestamp)
    throws IOException, InterruptedException {

  if (newData.isEmpty()) {
    log.info("No data in current generation's RDD; nothing to do");
    return;
  }

  log.info("Beginning update at {}", timestamp);

  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  if (hadoopConf.getResource("core-site.xml") == null) {
    log.warn("Hadoop config like core-site.xml was not found; " +
             "is the Hadoop config directory on the classpath?");
  }

  JavaPairRDD<K,M> pastData;
  Path inputPathPattern = new Path(dataDirString + "/*/part-*");
  FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
  FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
  if (inputPathStatuses == null || inputPathStatuses.length == 0) {

    log.info("No past data at path(s) {}", inputPathPattern);
    pastData = null;

  } else {

    log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
    Configuration updatedConf = new Configuration(hadoopConf);
    updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

    @SuppressWarnings("unchecked")
    JavaPairRDD<Writable,Writable> pastWritableData = (JavaPairRDD<Writable,Writable>)
        sparkContext.newAPIHadoopRDD(updatedConf,
                                     SequenceFileInputFormat.class,
                                     keyWritableClass,
                                     messageWritableClass);

    pastData = pastWritableData.mapToPair(
        new WritableToValueFunction<>(keyClass,
                                      messageClass,
                                      keyWritableClass,
                                      messageWritableClass));
  }

  if (updateTopic == null || updateBroker == null) {
    log.info("Not producing updates to update topic since none was configured");
    updateInstance.runUpdate(sparkContext,
                             timestamp.milliseconds(),
                             newData,
                             pastData,
                             modelDirString,
                             null);
  } else {
    // This TopicProducer should not be async; sends one big model generally and
    // needs to occur before other updates reliably rather than be buffered
    try (TopicProducer<String,U> producer =
             new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
      updateInstance.runUpdate(sparkContext,
                               timestamp.milliseconds(),
                               newData,
                               pastData,
                               modelDirString,
                               producer);
    }
  }
}