Java Code Examples for org.apache.hadoop.mapred.FileInputFormat#getInputPaths()

The following examples show how to use org.apache.hadoop.mapred.FileInputFormat#getInputPaths(). Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
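Before the project examples, here is a minimal, self-contained sketch of the API itself (not taken from any of the projects below; the class name and paths are placeholders). It shows how getInputPaths() pairs with FileInputFormat.setInputPaths(): paths registered on a JobConf are read back as a Path array, and an empty array is returned when no input paths have been set.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsDemo {
  public static void main(String[] args) {
    // Placeholder job configuration; a real job would also set the
    // input format, mapper, reducer, and output settings.
    JobConf conf = new JobConf(GetInputPathsDemo.class);

    // Register two (hypothetical) input directories on the configuration.
    FileInputFormat.setInputPaths(conf,
        new Path("/data/input-a"), new Path("/data/input-b"));

    // Read the registered paths back; this is the call used throughout
    // the examples below.
    Path[] inputPaths = FileInputFormat.getInputPaths(conf);
    for (Path p : inputPaths) {
      System.out.println("input path: " + p);
    }
  }
}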
Example 1
Source File: Job.java    From hadoop-gpu with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission 
 * is successful, FAILED otherwise.  
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
            // ignore: creating the missing input directory is best-effort
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
 
Example 2
Source File: Job.java    From RDFS with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission 
 * is successful, FAILED otherwise.  
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
            // ignore: creating the missing input directory is best-effort
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
 
Example 3
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
/**
 * MOD - Just added this for visibility.
 */
Path[] getInputPaths(JobConf job) throws IOException {
  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    // on Tez we avoid duplicating the file info in FileInputFormat.
    if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
      try {
        List<Path> paths = Utilities.getInputPathsTez(job, mrwork);
        dirs = paths.toArray(new Path[paths.size()]);
      } catch (Exception e) {
        throw new IOException("Could not create input files", e);
      }
    } else {
      throw new IOException("No input paths specified in job");
    }
  }
  return dirs;
}
 
Example 4
Source File: BaseAllocator.java    From HiveKa with Apache License 2.0
@Override
public InputSplit[] allocateWork(List<CamusRequest> requests,
    JobConf conf) throws IOException {
  int numTasks = conf.getInt("mapred.map.tasks", 30);
  
  reverseSortRequests(requests);

  List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();
  Path[] tablePaths = FileInputFormat.getInputPaths(conf);

  for (int i = 0; i < numTasks; i++) {
    if (requests.size() > 0) {
      kafkaETLSplits.add(new KafkaSplit(tablePaths[0]));
    }
  }

  for (CamusRequest r : requests) {
    getSmallestMultiSplit(kafkaETLSplits).addRequest(r);
  }

  InputSplit[] inputSplits = new InputSplit[kafkaETLSplits.size()];

  return kafkaETLSplits.toArray(inputSplits);
}
 
Example 5
Source File: TableInputFormat.java    From hbase with Apache License 2.0
public void validateInput(JobConf job) throws IOException {
  // expecting exactly one path
  Path [] tableNames = FileInputFormat.getInputPaths(job);
  if (tableNames == null || tableNames.length != 1) {
    throw new IOException("expecting one table name");
  }

  // connected to table?
  if (getTable() == null) {
    throw new IOException("could not connect to table '" +
      tableNames[0].getName() + "'");
  }

  // expecting at least one column
  String colArg = job.get(COLUMN_LIST);
  if (colArg == null || colArg.length() == 0) {
    throw new IOException("expecting at least one column");
  }
}
 
Example 6
Source File: HadoopInputFormatBase.java    From flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	// only gather base statistics for FileInputFormats
	if (!(mapredInputFormat instanceof FileInputFormat)) {
		return null;
	}

	final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
			(FileBaseStatistics) cachedStats : null;

	try {
		final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);

		return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: "
					+ ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: "
					+ t.getMessage(), t);
		}
	}

	// no statistics available
	return null;
}
 
Example 7
Source File: ConfigurationProxyTest.java    From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testSetInputPaths() throws Exception {
  configurationProxy.setInputPaths( null );
  Path[] inputPaths = FileInputFormat.getInputPaths( configurationProxy );
  assertEquals( 0, inputPaths.length );

  PathProxy path1 = new PathProxy( "file://path1" );
  PathProxy path2 = new PathProxy( "file://path2" );
  configurationProxy.setInputPaths( path1, path2 );

  inputPaths = FileInputFormat.getInputPaths( configurationProxy );
  assertEquals( 2, inputPaths.length );
  assertArrayEquals( new Path[] { path1, path2 }, inputPaths );
}
 
Example 8
Source File: TableInputFormat.java    From hbase with Apache License 2.0
@Override
protected void initialize(JobConf job) throws IOException {
  Path[] tableNames = FileInputFormat.getInputPaths(job);
  String colArg = job.get(COLUMN_LIST);
  String[] colNames = colArg.split(" ");
  byte [][] m_cols = new byte[colNames.length][];
  for (int i = 0; i < m_cols.length; i++) {
    m_cols[i] = Bytes.toBytes(colNames[i]);
  }
  setInputColumns(m_cols);
  Connection connection = ConnectionFactory.createConnection(job);
  initializeTable(connection, TableName.valueOf(tableNames[0].getName()));
}
 
Example 9
Source File: HadoopInputFormatBase.java    From flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	// only gather base statistics for FileInputFormats
	if (!(mapredInputFormat instanceof FileInputFormat)) {
		return null;
	}

	final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
			(FileBaseStatistics) cachedStats : null;

	try {
		final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);

		return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: "
					+ ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: "
					+ t.getMessage(), t);
		}
	}

	// no statistics available
	return null;
}
 
Example 10
Source File: AvroUtils.java    From ml-ease with Apache License 2.0
/**
 * Obtain the Avro input schema from the data in the first input path.
 * @param conf the job configuration holding the input paths
 * @return the Avro schema of the input data
 * @throws IOException if the schema cannot be read from the input
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException
{
  Path[] paths = FileInputFormat.getInputPaths(conf);
  if (paths == null)
  {
    throw new IllegalStateException("input paths do not exist in jobConf!");
  }
  Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
  if (inputSchema == null)
  {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  return inputSchema;
}
 
Example 11
Source File: TapInputFormat.java    From cascading-flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	// only gather base statistics for FileInputFormats
	if (!(mapredInputFormat instanceof FileInputFormat)) {
		return null;
	}

	final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
			(FileBaseStatistics) cachedStats : null;

	try {
		final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);

		return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: "
					+ ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: "
					+ t.getMessage(), t);
		}
	}

	// no statistics available
	return null;
}
 
Example 12
Source File: HoodieCombineHiveInputFormat.java    From hudi with Apache License 2.0
@Override
public Path[] getInputPathsShim(JobConf conf) {
  try {
    return FileInputFormat.getInputPaths(conf);
  } catch (Exception var3) {
    throw new RuntimeException(var3);
  }
}
 
Example 13
Source File: HiveDynamoDBSplitGenerator.java    From emr-dynamodb-connector with Apache License 2.0
@Override
protected Path getInputPath(JobConf conf) {
  Path path = null;

  Path[] paths = FileInputFormat.getInputPaths(conf);
  if ((paths != null) && (paths.length > 0)) {
    path = paths[0];
  }

  return path;
}
 
Example 14
Source File: EmoInputFormat.java    From emodb with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits)
        throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(job);

    return FluentIterable.from(BaseInputFormat.getSplits(job, paths))
            .transform(_fromSplit)
            .toArray(InputSplit.class);
}
 
Example 15
Source File: HadoopInputFormatBase.java    From Flink-CEPplus with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
	// only gather base statistics for FileInputFormats
	if (!(mapredInputFormat instanceof FileInputFormat)) {
		return null;
	}

	final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
			(FileBaseStatistics) cachedStats : null;

	try {
		final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);

		return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
	} catch (IOException ioex) {
		if (LOG.isWarnEnabled()) {
			LOG.warn("Could not determine statistics due to an io error: "
					+ ioex.getMessage());
		}
	} catch (Throwable t) {
		if (LOG.isErrorEnabled()) {
			LOG.error("Unexpected problem while getting the file statistics: "
					+ t.getMessage(), t);
		}
	}

	// no statistics available
	return null;
}
 
Example 16
Source File: IndexUpdater.java    From RDFS with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we halve "io.sort.mb" because the other half of the memory is used
  // to build an intermediate form/index in the Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_"
      + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + 
           FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = "
      + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
 
Example 17
Source File: IndexUpdater.java    From hadoop-gpu with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we halve "io.sort.mb" because the other half of the memory is used
  // to build an intermediate form/index in the Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_"
      + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + 
           FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = "
      + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}