Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#setInputPathFilter()
The following examples show how to use
org.apache.hadoop.mapreduce.lib.input.FileInputFormat#setInputPathFilter().
These examples are extracted from open source projects.
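In brief: setInputPathFilter() registers an org.apache.hadoop.fs.PathFilter class that FileInputFormat applies when listing input files, on top of its built-in filter that skips hidden files (names starting with "_" or "."). Here is a minimal sketch of the pattern; SequenceFileOnlyFilter, the "/data/input" path, and the ".seq" rule are illustrative inventions, not part of any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SequenceFileOnlyFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        // FileInputFormat applies the filter to directories as well as files,
        // so use a name-based heuristic that lets extension-less paths
        // (typically directories) through.
        String name = path.getName();
        return !name.contains(".") || name.endsWith(".seq");
    }

    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration(), "filtered-input");
        FileInputFormat.addInputPath(job, new Path("/data/input"));
        FileInputFormat.setInputPathFilter(job, SequenceFileOnlyFilter.class);
        // ...set mapper, reducer, and output, then submit as usual...
    }
}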
Example 1
Source Project: mrgeo File: HdfsMrsPyramidInputFormat.java License: Apache License 2.0
public static void setInputInfo(Job job, String inputWithZoom) throws IOException
{
    // job.setInputFormatClass(HdfsMrsPyramidInputFormat.class);

    //final String scannedInput = inputs.get(0);
    //FileInputFormat.addInputPath(job, new Path(scannedInput));

    FileInputFormat.addInputPath(job, new Path(inputWithZoom));
    FileInputFormat.setInputPathFilter(job, MapFileFilter.class);
}
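mrgeo's MapFileFilter itself is not reproduced on this page. For orientation only: a Hadoop MapFile is a directory containing a "data" file and an "index" file, so a filter in this spirit might look like the sketch below. This is an assumption about the general shape, not mrgeo's actual code; MapFilePartFilter is an invented name.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.MapFile;

public class MapFilePartFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        // MapFile.DATA_FILE_NAME is "data" and MapFile.INDEX_FILE_NAME is
        // "index"; accept those part files, and heuristically let
        // extension-less paths (typically directories) through so the
        // listing can descend into MapFile directories.
        return name.equals(MapFile.DATA_FILE_NAME)
                || name.equals(MapFile.INDEX_FILE_NAME)
                || !name.contains(".");
    }
}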
Example 2
Source Project: kylin-on-parquet-v2 File: UHCDictionaryJob.java License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_INPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

        //add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        attachCubeMetadata(cube, job.getConfiguration());

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();

        //Note: handle the case where there is no UHC column input at all.
        boolean hasUHCValue = false;
        for (TblColRef tblColRef : uhcColumns) {
            Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
            if (HadoopUtil.getFileSystem(path).exists(path)) {
                FileInputFormat.addInputPath(job, path);
                FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
                hasUHCValue = true;
            }
        }

        if (!hasUHCValue) {
            isSkipped = true;
            return 0;
        }

        setJobClasspath(job, cube.getConfig());
        setupMapper();
        setupReducer(output, reducerCount);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
        job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR, KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
        job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");

        //8G of memory is enough for every global dict, because the input is sequential and we handle the global dict slice by slice
        job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
        job.getConfiguration().set("mapred.reduce.child.java.opts", "-Xmx8g");
        //Copying the global dict to the working dir in GlobalDictHDFSStore may take a long time (maybe we could improve it),
        //and waiting for the global dict lock may also take a long time,
        //so we set the timeout to 8 hours here
        job.getConfiguration().set("mapreduce.task.timeout", "28800000");

        //allow the user to override config specifically for the UHC step
        for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
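Kylin's UHCDictPathFilter is not shown here either. One detail worth knowing when writing such filters: FileInputFormat instantiates the filter class through ReflectionUtils.newInstance(filterClass, conf), so a filter that also implements org.apache.hadoop.conf.Configurable receives the job Configuration and can base its decisions on job settings. A sketch of that pattern follows; ConfiguredSuffixFilter and the "example.input.suffix" property are invented for illustration.

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ConfiguredSuffixFilter implements PathFilter, Configurable {
    private Configuration conf;
    private String suffix;

    @Override
    public void setConf(Configuration conf) {
        // Called by ReflectionUtils.newInstance() because this class
        // implements Configurable; "example.input.suffix" is a
        // hypothetical property name.
        this.conf = conf;
        this.suffix = (conf != null) ? conf.get("example.input.suffix", "") : "";
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public boolean accept(Path path) {
        return suffix.isEmpty() || path.getName().endsWith(suffix);
    }
}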
Example 3
Source Project: kylin File: UHCDictionaryJob.java License: Apache License 2.0
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();

    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_INPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

        //add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        attachCubeMetadata(cube, job.getConfiguration());

        List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();
        int reducerCount = uhcColumns.size();

        //Note: handle the case where there is no UHC column input at all.
        boolean hasUHCValue = false;
        for (TblColRef tblColRef : uhcColumns) {
            Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
            if (HadoopUtil.getFileSystem(path).exists(path)) {
                FileInputFormat.addInputPath(job, path);
                FileInputFormat.setInputPathFilter(job, UHCDictPathFilter.class);
                hasUHCValue = true;
            }
        }

        if (!hasUHCValue) {
            isSkipped = true;
            return 0;
        }

        setJobClasspath(job, cube.getConfig());
        setupMapper();
        setupReducer(output, reducerCount);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
        job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR, KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
        job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");

        //8G of memory is enough for every global dict, because the input is sequential and we handle the global dict slice by slice
        job.getConfiguration().set("mapreduce.reduce.memory.mb", "8500");
        job.getConfiguration().set("mapred.reduce.child.java.opts", "-Xmx8g");
        //Copying the global dict to the working dir in GlobalDictHDFSStore may take a long time (maybe we could improve it),
        //and waiting for the global dict lock may also take a long time,
        //so we set the timeout to 8 hours here
        job.getConfiguration().set("mapreduce.task.timeout", "28800000");

        //allow the user to override config specifically for the UHC step
        for (Map.Entry<String, String> entry : cube.getConfig().getUHCMRConfigOverride().entrySet()) {
            job.getConfiguration().set(entry.getKey(), entry.getValue());
        }

        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
Example 4
Source Project: spork File: PigTestLoader.java License: Apache License 2.0
@Override
public void setLocation(String location, Job job) throws IOException {
    super.setLocation(location, job);
    FileInputFormat.setInputPathFilter(job, TestPathFilter.class);
    test = true;
}
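In Pig, a LoadFunc's setLocation() is invoked during front-end planning and again in the back-end tasks, which makes it a natural place to install the filter alongside the input paths, as the example above does. A hypothetical PigStorage subclass following the same pattern is sketched below; FilteredStorage is an invented name, and it reuses the SequenceFileOnlyFilter sketch from the introduction.

import java.io.IOException;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.builtin.PigStorage;

public class FilteredStorage extends PigStorage {
    @Override
    public void setLocation(String location, Job job) throws IOException {
        super.setLocation(location, job);  // registers the input paths
        FileInputFormat.setInputPathFilter(job, SequenceFileOnlyFilter.class);
    }
}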
Example 5
Source Project: spork File: HadoopJobHistoryLoader.java License: Apache License 2.0
@Override
public void setLocation(String location, Job job) throws IOException {
    FileInputFormat.setInputPaths(job, location);
    FileInputFormat.setInputPathFilter(job, JobHistoryPathFilter.class);
}
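Pig's JobHistoryPathFilter is likewise not reproduced here. For a sense of the shape, a filter that keeps only MapReduce job history files might look like the sketch below; this is an assumption, and the ".jhist" suffix (used by the YARN-era history server) may not match what this loader actually targets.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class JhistOnlyFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        String name = path.getName();
        // Keep ".jhist" history files, and heuristically let
        // extension-less paths (typically directories) through.
        return name.endsWith(".jhist") || !name.contains(".");
    }
}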