Java Code Examples for org.apache.hadoop.mapred.FileInputFormat#getInputPaths()
The following examples show how to use org.apache.hadoop.mapred.FileInputFormat#getInputPaths(). Each example notes the source file it was taken from, the project it belongs to, and that project's license.
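Before the project examples, here is a minimal, self-contained sketch of the call pattern they all rely on: input directories are registered on a JobConf with FileInputFormat.setInputPaths() and read back with FileInputFormat.getInputPaths(). The class name and path strings below are placeholders, not taken from any of the projects listed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class GetInputPathsSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf();

    // Register input directories; this stores them under "mapred.input.dir".
    // The paths are placeholders.
    FileInputFormat.setInputPaths(conf, new Path("/data/in1"), new Path("/data/in2"));

    // Read the configured paths back. If nothing has been set, an empty array
    // is returned (see the Pentaho test in Example 7 below).
    Path[] inputPaths = FileInputFormat.getInputPaths(conf);
    for (Path p : inputPaths) {
      System.out.println(p);
    }
  }
}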
Example 1
Source File: Job.java From hadoop-gpu with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission
 * is successful, FAILED otherwise.
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
Example 2
Source File: Job.java From RDFS with Apache License 2.0
/**
 * Submit this job to mapred. The state becomes RUNNING if submission
 * is successful, FAILED otherwise.
 */
protected synchronized void submit() {
  try {
    if (theJobConf.getBoolean("create.empty.dir.if.nonexist", false)) {
      FileSystem fs = FileSystem.get(theJobConf);
      Path inputPaths[] = FileInputFormat.getInputPaths(theJobConf);
      for (int i = 0; i < inputPaths.length; i++) {
        if (!fs.exists(inputPaths[i])) {
          try {
            fs.mkdirs(inputPaths[i]);
          } catch (IOException e) {
          }
        }
      }
    }
    RunningJob running = jc.submitJob(theJobConf);
    this.mapredJobID = running.getID();
    this.state = Job.RUNNING;
  } catch (IOException ioe) {
    this.state = Job.FAILED;
    this.message = StringUtils.stringifyException(ioe);
  }
}
Example 3
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
/**
 * MOD - Just added this for visibility.
 */
Path[] getInputPaths(JobConf job) throws IOException {
  Path[] dirs = FileInputFormat.getInputPaths(job);
  if (dirs.length == 0) {
    // on tez we're avoiding to duplicate the file info in FileInputFormat.
    if (HiveConf.getVar(job, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
      try {
        List<Path> paths = Utilities.getInputPathsTez(job, mrwork);
        dirs = paths.toArray(new Path[paths.size()]);
      } catch (Exception e) {
        throw new IOException("Could not create input files", e);
      }
    } else {
      throw new IOException("No input paths specified in job");
    }
  }
  return dirs;
}
Example 4
Source File: BaseAllocator.java From HiveKa with Apache License 2.0
@Override
public InputSplit[] allocateWork(List<CamusRequest> requests, JobConf conf) throws IOException {
  int numTasks = conf.getInt("mapred.map.tasks", 30);
  reverseSortRequests(requests);

  List<InputSplit> kafkaETLSplits = new ArrayList<InputSplit>();
  Path[] tablePaths = FileInputFormat.getInputPaths(conf);

  for (int i = 0; i < numTasks; i++) {
    if (requests.size() > 0) {
      kafkaETLSplits.add(new KafkaSplit(tablePaths[0]));
    }
  }

  for (CamusRequest r : requests) {
    getSmallestMultiSplit(kafkaETLSplits).addRequest(r);
  }

  InputSplit[] inputSplits = new InputSplit[kafkaETLSplits.size()];
  return kafkaETLSplits.toArray(inputSplits);
}
Example 5
Source File: TableInputFormat.java From hbase with Apache License 2.0
public void validateInput(JobConf job) throws IOException {
  // expecting exactly one path
  Path[] tableNames = FileInputFormat.getInputPaths(job);
  if (tableNames == null || tableNames.length > 1) {
    throw new IOException("expecting one table name");
  }

  // connected to table?
  if (getTable() == null) {
    throw new IOException("could not connect to table '" +
        tableNames[0].getName() + "'");
  }

  // expecting at least one column
  String colArg = job.get(COLUMN_LIST);
  if (colArg == null || colArg.length() == 0) {
    throw new IOException("expecting at least one column");
  }
}
Example 6
Source File: HadoopInputFormatBase.java From flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
      (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 7
Source File: ConfigurationProxyTest.java From pentaho-hadoop-shims with Apache License 2.0
@Test
public void testSetInputPaths() throws Exception {
  configurationProxy.setInputPaths( null );
  Path[] inputPaths = FileInputFormat.getInputPaths( configurationProxy );
  assertEquals( 0, inputPaths.length );

  PathProxy path1 = new PathProxy( "file://path1" );
  PathProxy path2 = new PathProxy( "file://path2" );
  configurationProxy.setInputPaths( path1, path2 );

  inputPaths = FileInputFormat.getInputPaths( configurationProxy );
  assertEquals( 2, inputPaths.length );
  assertArrayEquals( new Path[] { path1, path2 }, inputPaths );
}
Example 8
Source File: TableInputFormat.java From hbase with Apache License 2.0
@Override
protected void initialize(JobConf job) throws IOException {
  Path[] tableNames = FileInputFormat.getInputPaths(job);
  String colArg = job.get(COLUMN_LIST);
  String[] colNames = colArg.split(" ");
  byte[][] m_cols = new byte[colNames.length][];
  for (int i = 0; i < m_cols.length; i++) {
    m_cols[i] = Bytes.toBytes(colNames[i]);
  }
  setInputColumns(m_cols);
  Connection connection = ConnectionFactory.createConnection(job);
  initializeTable(connection, TableName.valueOf(tableNames[0].getName()));
}
Example 9
Source File: HadoopInputFormatBase.java From flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
      (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 10
Source File: AvroUtils.java From ml-ease with Apache License 2.0
/**
 * Obtain the avro input schema from data
 * @param conf
 * @return
 * @throws IOException
 */
public static Schema getAvroInputSchema(JobConf conf) throws IOException {
  Path[] paths = FileInputFormat.getInputPaths(conf);
  if (paths == null) {
    throw new IllegalStateException("input paths do not exist in jobConf!");
  }
  Schema inputSchema = AvroUtils.getSchemaFromFile(conf, paths[0]);
  if (inputSchema == null) {
    throw new IllegalStateException("Input does not have schema info and/or input is missing.");
  }
  return inputSchema;
}
Example 11
Source File: TapInputFormat.java From cascading-flink with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats =
      (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
          (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 12
Source File: HoodieCombineHiveInputFormat.java From hudi with Apache License 2.0
@Override
public Path[] getInputPathsShim(JobConf conf) {
  try {
    return FileInputFormat.getInputPaths(conf);
  } catch (Exception var3) {
    throw new RuntimeException(var3);
  }
}
Example 13
Source File: HiveDynamoDBSplitGenerator.java From emr-dynamodb-connector with Apache License 2.0
@Override
protected Path getInputPath(JobConf conf) {
  Path path = null;

  Path[] paths = FileInputFormat.getInputPaths(conf);
  if ((paths != null) && (paths.length > 0)) {
    path = paths[0];
  }

  return path;
}
Example 14
Source File: EmoInputFormat.java From emodb with Apache License 2.0
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  Path[] paths = FileInputFormat.getInputPaths(job);

  return FluentIterable.from(BaseInputFormat.getSplits(job, paths))
      .transform(_fromSplit)
      .toArray(InputSplit.class);
}
Example 15
Source File: HadoopInputFormatBase.java From Flink-CEPplus with Apache License 2.0
@Override
public BaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {
  // only gather base statistics for FileInputFormats
  if (!(mapredInputFormat instanceof FileInputFormat)) {
    return null;
  }

  final FileBaseStatistics cachedFileStats = (cachedStats instanceof FileBaseStatistics) ?
      (FileBaseStatistics) cachedStats : null;

  try {
    final org.apache.hadoop.fs.Path[] paths = FileInputFormat.getInputPaths(this.jobConf);
    return getFileStats(cachedFileStats, paths, new ArrayList<FileStatus>(1));
  } catch (IOException ioex) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Could not determine statistics due to an io error: " + ioex.getMessage());
    }
  } catch (Throwable t) {
    if (LOG.isErrorEnabled()) {
      LOG.error("Unexpected problem while getting the file statistics: " + t.getMessage(), t);
    }
  }

  // no statistics available
  return null;
}
Example 16
Source File: IndexUpdater.java From RDFS with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we half-en "io.sort.mb" because we use the other half memory to
  // build an intermediate form/index in Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_" + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
Example 17
Source File: IndexUpdater.java From hadoop-gpu with Apache License 2.0
JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
    int numMapTasks, Shard[] shards) throws IOException {
  // set the starting generation for each shard
  // when a reduce task fails, a new reduce task
  // has to know where to re-start
  setShardGeneration(conf, shards);

  // iconf.set sets properties in conf
  IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
  Shard.setIndexShards(iconf, shards);

  // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
  // (max buffer size = 1/2 * "io.sort.mb").
  // Here we half-en "io.sort.mb" because we use the other half memory to
  // build an intermediate form/index in Combiner.
  iconf.setIOSortMB(iconf.getIOSortMB() / 2);

  // create the job configuration
  JobConf jobConf = new JobConf(conf, IndexUpdater.class);
  jobConf.setJobName(this.getClass().getName() + "_" + System.currentTimeMillis());

  // provided by application
  FileInputFormat.setInputPaths(jobConf, inputPaths);
  FileOutputFormat.setOutputPath(jobConf, outputPath);

  jobConf.setNumMapTasks(numMapTasks);

  // already set shards
  jobConf.setNumReduceTasks(shards.length);

  jobConf.setInputFormat(iconf.getIndexInputFormatClass());

  Path[] inputs = FileInputFormat.getInputPaths(jobConf);
  StringBuilder buffer = new StringBuilder(inputs[0].toString());
  for (int i = 1; i < inputs.length; i++) {
    buffer.append(",");
    buffer.append(inputs[i].toString());
  }
  LOG.info("mapred.input.dir = " + buffer.toString());
  LOG.info("mapred.output.dir = " + FileOutputFormat.getOutputPath(jobConf).toString());
  LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
  LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
  LOG.info(shards.length + " shards = " + iconf.getIndexShards());
  // better if we don't create the input format instance
  LOG.info("mapred.input.format.class = " + jobConf.getInputFormat().getClass().getName());

  // set by the system
  jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
  jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
  jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
  jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());

  jobConf.setMapperClass(IndexUpdateMapper.class);
  jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
  jobConf.setCombinerClass(IndexUpdateCombiner.class);
  jobConf.setReducerClass(IndexUpdateReducer.class);

  jobConf.setOutputFormat(IndexUpdateOutputFormat.class);

  return jobConf;
}
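Several of the examples above (3, 10, and 13) first check that the array returned by getInputPaths() actually contains something before touching its first element. Below is a standalone sketch of that guard; the class and method names are hypothetical, not taken from any of the projects listed.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputPathGuard {

  // Hypothetical helper: return the first configured input path,
  // or fail loudly when the job has no input paths set.
  public static Path firstInputPath(JobConf conf) throws IOException {
    Path[] paths = FileInputFormat.getInputPaths(conf);
    if (paths == null || paths.length == 0) {
      throw new IOException("No input paths specified in job");
    }
    return paths[0];
  }
}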