Java Code Examples for org.apache.hadoop.mapred.FileInputFormat#getInputPaths()
The following examples show how to use org.apache.hadoop.mapred.FileInputFormat#getInputPaths(). Each example notes the source file it was taken from, the project it belongs to, and that project's license.
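Before the project examples, here is a minimal, self-contained sketch of the call pattern they all rely on: input directories are registered on a JobConf with FileInputFormat.setInputPaths() and read back with FileInputFormat.getInputPaths(). The class name and path strings below are placeholders, not taken from any of the projects listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();

    // Register input directories; this stores them under "mapred.input.dir".
    // The paths are placeholders.
    FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

    // Read the configured paths back. If nothing has been set, an empty array
    // is returned (see the Pentaho test in Example 7 below).
    Path[] inputPaths = FileInputFormat.getInputPaths(conf);
    for (Path p : inputPaths) {
      System.out.println(p);
    }
  }
}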
Example 1
Source File: Job.java From hadoop-gpu with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission
 * is successful, FAILED otherwise.
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
Example 2
Source File: Job.java From RDFS with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission
 * is successful, FAILED otherwise.
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
Example 3
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
/**
 * MOD - Just added this for visibility.
 */
Path[] getInputPaths(JobConf job) throws IOException {
  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    // on tez we're avoiding to duplicate the file info in FileInputFormat.
    if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
      try {
        List<Path> paths = Utilities.getInputPathsTez(job, mrwork);
        dirs = paths.toArray(new Path[paths.size()]);
      } catch (Exception e) {
        throw new IOException("Could not create input files", e);
      }
    } else {
      throw new IOException("No input paths specified in job");
    }
  }
  return dirs;
}
Example 4
Source File: BaseAllocator.java From HiveKa with Apache License 2.0
@Override
public InputSplit[] allocateWork(List<CamusRequest> requests, JobConf conf) throws IOException {
  int numTasks = conf.getInt("mapred.map.tasks", 30);
  reverseSortRequests(requests);

  List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();
  Path[] tablePaths = FileInputFormat.getInputPaths(conf);

  for (int i = 0; i < numTasks; i++) {
    if (requests.size() > 0) {
      kafkaETLSplits.add(new KafkaSplit(tablePaths[0]));
    }
  }

  for (CamusRequest r : requests) {
    getSmallestMultiSplit(kafkaETLSplits).addRequest(r);
  }

  InputSplit[] inputSplits = new InputSplit[kafkaETLSplits.size()];
  return kafkaETLSplits.toArray(inputSplits);
}
Example 5
Source File: TableInputFormat.java From hbase with Apache License 2.0
public void validateInput(JobConf job) throws IOException {
  // expecting exactly one path
  Path[] tableNames = FileInputFormat.getInputPaths(job);
  if (tableNames == null || tableNames.length > 1) {
    throw new IOException("expecting one table name");
  }

  // connected to table?
  if (getTable() == null) {
    throw new IOException("could not connect to table '" +
        tableNames[0].getName() + "'");
  }

  // expecting at least one column
  String colArg = job.get(COLUMN_LIST);
  if (colArg == null || colArg.length() == 0) {
    throw new IOException("expecting at least one column");
  }
}
Example 6
Source File: HadoopInputFormatBase.java From flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
      (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 7
Source File: ConfigurationProxyTest.java From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testSetInputPaths() throws Exception {
  configurationProxy.setInputPaths( null );
  Path[] inputPaths = FileInputFormat.getInputPaths( configurationProxy );
  assertEquals( 0, inputPaths.length );

  PathProxy path1 = new PathProxy( "file://path1" );
  PathProxy path2 = new PathProxy( "file://path2" );
  configurationProxy.setInputPaths( path1, path2 );

  inputPaths = FileInputFormat.getInputPaths( configurationProxy );
  assertEquals( 2, inputPaths.length );
  assertArrayEquals( new Path[] { path1, path2 }, inputPaths );
}
Example 8
Source File: TableInputFormat.java From hbase with Apache License 2.0
@Override
protected void initialize(JobConf job) throws IOException {
  Path[] tableNames = FileInputFormat.getInputPaths(job);
  String colArg = job.get(COLUMN_LIST);
  String[] colNames = colArg.split(" ");
  byte[][] m_cols = new byte[colNames.length][];
  for (int i = 0; i < m_cols.length; i++) {
    m_cols[i] = Bytes.toBytes(colNames[i]);
  }
  setInputColumns(m_cols);
  Connection connection = ConnectionFactory.createConnection(job);
  initializeTable(connection, TableName.valueOf(tableNames[0].getName()));
}
Example 9
Source File: HadoopInputFormatBase.java From flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
      (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 10
Source File: AvroUtils.java From ml-ease with Apache License 2.0
/**
 * Obtain the avro input schema from data
 * @param conf
 * @return
 * @throws IOException
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException {
  Path[] paths = FileInputFormat.getInputPaths(conf);
  if (paths == null) {
    throw new IllegalStateException("input paths do not exist in jobConf!");
  }
  Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  return inputSchema;
}
Example 11
Source File: TapInputFormat.java From cascading-flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats =
      (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
          (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 12
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
@Override
public Path[] getInputPathsShim(JobConf conf) {
  try {
    return FileInputFormat.getInputPaths(conf);
  } catch (Exception var3) {
    throw new RuntimeException(var3);
  }
}
Example 13
Source File: HiveDynamoDBSplitGenerator.java From emr-dynamodb-connector with Apache License 2.0
@Override
protected Path getInputPath(JobConf conf) {
  Path path = null;

  Path[] paths = FileInputFormat.getInputPaths(conf);
  if ((paths != null) && (paths.length > 0)) {
    path = paths[0];
  }

  return path;
}
Example 14
Source File: EmoInputFormat.java From emodb with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  Path[] paths = FileInputFormat.getInputPaths(job);

  return FluentIterable.from(BaseInputFormat.getSplits(job, paths))
      .transform(_fromSplit)
      .toArray(InputSplit.class);
}
Example 15
Source File: HadoopInputFormatBase.java From Flink-CEPplus with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
      (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 16
Source File: IndexUpdater.java From RDFS with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we half-en "io.sort.mb" because we use the other half memory to
  // build an intermediate form/index in Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_" + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
Example 17
Source File: IndexUpdater.java From hadoop-gpu with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we half-en "io.sort.mb" because we use the other half memory to
  // build an intermediate form/index in Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_" + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
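Several of the examples above (3, 10, and 13) first check that the array returned by getInputPaths() actually contains something before touching its first element. Below is a standalone sketch of that guard; the class and method names are hypothetical, not taken from any of the projects listed.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputPathGuard {

  // Hypothetical helper: return the first configured input path,
  // or fail loudly when the job has no input paths set.
  public static Path firstInputPath(JobConf conf) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(conf);
    if (paths == null || paths.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    return paths[0];
  }
}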