org.apache.hadoop.mapred.TextInputFormat Java Examples
The following examples show how to use org.apache.hadoop.mapred.TextInputFormat, the line-oriented input format of Hadoop's classic mapred API. Each example is taken from an open-source project; the source file, originating project, and license are noted above the code.
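Before the project-specific examples, here is a minimal, self-contained sketch of the usage pattern that recurs throughout: configure a JobConf, register an input path, let TextInputFormat compute splits, and read each split as (byte offset, line) pairs. The class name and the /tmp/input path below are illustrative assumptions, not taken from any of the projects.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatSketch {
  public static void main(String[] args) throws IOException {
    JobConf job = new JobConf(TextInputFormatSketch.class);

    // Point the job at an input path (hypothetical location).
    FileInputFormat.addInputPath(job, new Path("/tmp/input"));

    // TextInputFormat must be configured before splits are requested.
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    // Ask for at least one split and read each split line by line.
    InputSplit[] splits = informat.getSplits(job, 1);
    for (InputSplit split : splits) {
      RecordReader<LongWritable, Text> reader =
          informat.getRecordReader(split, job, Reporter.NULL);
      LongWritable key = reader.createKey(); // byte offset of the line
      Text value = reader.createValue();     // the line itself
      try {
        while (reader.next(key, value)) {
          System.out.println(key.get() + "\t" + value);
        }
      }
      finally {
        reader.close();
      }
    }
  }
}

The same configure-then-getSplits sequence appears in the SystemDS reader examples below, while the MapReduce driver examples instead hand the format to the framework via conf.setInputFormat(TextInputFormat.class).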
Example #1
Source File: S3SelectPushdown.java From presto with Apache License 2.0

private static boolean isInputFormatSupported(Properties schema)
{
    String inputFormat = getInputFormatName(schema);

    if (TextInputFormat.class.getName().equals(inputFormat)) {
        if (!Objects.equals(schema.getProperty(SKIP_HEADER_COUNT_KEY, "0"), "0")) {
            // S3 Select supports skipping one line of headers, but it was returning incorrect results for
            // presto-hive-hadoop2/conf/files/test_table_with_header.csv.gz
            // TODO https://github.com/prestosql/presto/issues/2349
            return false;
        }
        if (!Objects.equals(schema.getProperty(SKIP_FOOTER_COUNT_KEY, "0"), "0")) {
            // S3 Select does not support skipping footers
            return false;
        }
        return true;
    }
    return false;
}
Example #2
Source File: TestMultipleInputs.java From hadoop with Apache License 2.0

public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Example #3
Source File: ReaderTextCSVParallel.java From systemds with Apache License 2.0

public CSVReadTask(InputSplit split, SplitOffsetInfos offsets,
    TextInputFormat informat, JobConf job, MatrixBlock dest,
    long rlen, long clen, boolean hasHeader, String delim,
    boolean fill, double fillValue, int splitCount)
{
  _split = split;
  _splitoffsets = offsets; // new SplitOffsetInfos(offsets);
  _sparse = dest.isInSparseFormat();
  _informat = informat;
  _job = job;
  _dest = dest;
  _rlen = rlen;
  _clen = clen;
  _isFirstSplit = (splitCount == 0);
  _hasHeader = hasHeader;
  _fill = fill;
  _fillValue = fillValue;
  _delim = delim;
  _rc = true;
  _splitCount = splitCount;
}
Example #4
Source File: WordCount.java From attic-apex-malhar with Apache License 2.0

public void run(String[] args) throws Exception
{
  JobConf conf = new JobConf(this.getClass());
  conf.setJobName("wordcount");

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(Map.class);
  conf.setCombinerClass(Reduce.class);
  conf.setReducerClass(Reduce.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, new Path(args[0]));
  FileOutputFormat.setOutputPath(conf, new Path(args[1]));

  JobClient.runJob(conf);
}
Example #5
Source File: TestMultipleInputs.java From big-c with Apache License 2.0

public void testAddInputPathWithMapper() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      MapClass.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"),
      KeyValueTextInputFormat.class, MapClass2.class);
  final Map<Path, InputFormat> inputs = MultipleInputs
      .getInputFormatMap(conf);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs
      .getMapperTypeMap(conf);

  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar"))
      .getClass());
  assertEquals(MapClass.class, maps.get(new Path("/foo")));
  assertEquals(MapClass2.class, maps.get(new Path("/bar")));
}
Example #6
Source File: PipeMapper.java From hadoop-gpu with Apache License 2.0

public void configure(JobConf job) {
  super.configure(job);
  // disable the auto increment of the counter. For streaming, no of
  // processed records could be different (equal or less) than the no of
  // records input.
  SkipBadRecords.setAutoIncrMapperProcCount(job, false);
  skipping = job.getBoolean("mapred.skip.on", false);

  String inputFormatClassName =
      job.getClass("mapred.input.format.class", TextInputFormat.class).getCanonicalName();
  ignoreKey = inputFormatClassName.equals(TextInputFormat.class.getCanonicalName());

  try {
    mapOutputFieldSeparator = job.get("stream.map.output.field.separator", "\t").getBytes("UTF-8");
    mapInputFieldSeparator = job.get("stream.map.input.field.separator", "\t").getBytes("UTF-8");
    numOfMapOutputKeyFields = job.getInt("stream.num.map.output.key.fields", 1);
  } catch (UnsupportedEncodingException e) {
    throw new RuntimeException("The current system does not support UTF-8 encoding!", e);
  }
}
Example #7
Source File: PersonVersion.java From blog with MIT License

private static void runJobPv(String inputDir, String outputDir, String jobName,
    Class<? extends Mapper> mapClass, Class<? extends Reducer> reduceClass) throws Exception {
  JobConf conf = new JobConf(PersonVersion.class);
  conf.setJobName(jobName);

  conf.setMapOutputKeyClass(Text.class);
  conf.setMapOutputValueClass(IntWritable.class);
  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(IntWritable.class);

  conf.setMapperClass(mapClass);
  conf.setCombinerClass(reduceClass);
  conf.setReducerClass(reduceClass);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  FileInputFormat.setInputPaths(conf, inputDir);
  FileOutputFormat.setOutputPath(conf, new Path(outputDir));

  JobClient.runJob(conf);
}
Example #8
Source File: FrameReaderTextCell.java From systemds with Apache License 2.0

protected void readTextCellFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest,
    ValueType[] schema, String[] names, long rlen, long clen)
  throws IOException
{
  if( fs.isDirectory(path) ) {
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);
    for(InputSplit split : splits)
      readTextCellFrameFromInputSplit(split, informat, job, dest);
  }
  else {
    readRawTextCellFrameFromHDFS(path, job, fs, dest, schema, names, rlen, clen);
  }
}
Example #9
Source File: ReadExistingDataJob.java From tracing-framework with BSD 3-Clause "New" or "Revised" License

public void configure(JobConf job) {
  // Set the mapper and reducers
  job.setMapperClass(ReadDataJob.TestMapper.class);

  // Make sure this jar is included
  job.setJarByClass(ReadDataJob.TestMapper.class);

  // Specify the input and output data formats
  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(NullOutputFormat.class);

  // Turn off speculative execution
  job.setMapSpeculativeExecution(false);
  job.setReduceSpeculativeExecution(false);

  // Add the job input path
  FileInputFormat.addInputPath(job, new Path(this.input_path));
}
Example #10
Source File: TestUtils.java From circus-train with Apache License 2.0

public static Table createUnpartitionedTable(
    HiveMetaStoreClient metaStoreClient,
    String database,
    String table,
    URI location)
  throws TException {
  Table hiveTable = new Table();
  hiveTable.setDbName(database);
  hiveTable.setTableName(table);
  hiveTable.setTableType(TableType.EXTERNAL_TABLE.name());
  hiveTable.putToParameters("EXTERNAL", "TRUE");

  StorageDescriptor sd = new StorageDescriptor();
  sd.setCols(DATA_COLUMNS);
  sd.setLocation(location.toString());
  sd.setParameters(new HashMap<String, String>());
  sd.setInputFormat(TextInputFormat.class.getName());
  sd.setOutputFormat(TextOutputFormat.class.getName());
  sd.setSerdeInfo(new SerDeInfo());
  sd.getSerdeInfo().setSerializationLib("org.apache.hadoop.hive.serde2.OpenCSVSerde");

  hiveTable.setSd(sd);
  metaStoreClient.createTable(hiveTable);

  ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, database, table);
  ColumnStatisticsData statsData = new ColumnStatisticsData(_Fields.LONG_STATS, new LongColumnStatsData(1L, 2L));
  ColumnStatisticsObj cso1 = new ColumnStatisticsObj("id", "bigint", statsData);
  List<ColumnStatisticsObj> statsObj = Collections.singletonList(cso1);
  metaStoreClient.updateTableColumnStatistics(new ColumnStatistics(statsDesc, statsObj));

  return hiveTable;
}
Example #11
Source File: FrameReaderJSONLParallel.java From systemds with Apache License 2.0

public ReadRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf,
    FrameBlock dest, Map<String, Integer> schemaMap, int offset)
{
  _split = split;
  _inputFormat = inputFormat;
  _jobConf = jobConf;
  _dest = dest;
  _schemaMap = schemaMap;
  _offset = offset;
}
Example #12
Source File: ReaderTextLIBSVMParallel.java From systemds with Apache License 2.0

@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen,
    int blen, long estnnz)
  throws IOException, DMLRuntimeException
{
  // prepare file access
  JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
  Path path = new Path(fname);
  FileSystem fs = IOUtilFunctions.getFileSystem(path, job);

  FileInputFormat.addInputPath(job, path);
  TextInputFormat informat = new TextInputFormat();
  informat.configure(job);

  InputSplit[] splits = informat.getSplits(job, _numThreads);
  splits = IOUtilFunctions.sortInputSplits(splits);

  // check existence and non-empty file
  checkValidInputFile(fs, path);

  // allocate output matrix block
  // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
  MatrixBlock ret = computeLIBSVMSizeAndCreateOutputMatrixBlock(splits, path,
      job, rlen, clen, estnnz);
  rlen = ret.getNumRows();
  clen = ret.getNumColumns();

  // Second Read Pass (read, parse strings, append to matrix block)
  readLIBSVMMatrixFromHDFS(splits, path, job, ret, rlen, clen, blen);

  // post-processing (representation-specific, change of sparse/dense block representation)
  // - nnz explicitly maintained in parallel for the individual splits
  ret.examSparsity();

  // sanity check for parallel row count (since determined internally)
  if (rlen >= 0 && rlen != ret.getNumRows())
    throw new DMLRuntimeException("Read matrix inconsistent with given meta data: "
        + "expected nrow=" + rlen + ", real nrow=" + ret.getNumRows());

  return ret;
}
Example #13
Source File: TestMROldApiJobs.java From hadoop with Apache License 2.0

static boolean runJob(JobConf conf, Path inDir, Path outDir, int numMaps,
    int numReds) throws IOException, InterruptedException {

  FileSystem fs = FileSystem.get(conf);
  if (fs.exists(outDir)) {
    fs.delete(outDir, true);
  }
  if (!fs.exists(inDir)) {
    fs.mkdirs(inDir);
  }
  String input = "The quick brown fox\n" + "has many silly\n"
      + "red fox sox\n";
  for (int i = 0; i < numMaps; ++i) {
    DataOutputStream file = fs.create(new Path(inDir, "part-" + i));
    file.writeBytes(input);
    file.close();
  }

  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf, fs);
  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, inDir);
  FileOutputFormat.setOutputPath(conf, outDir);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  JobClient jobClient = new JobClient(conf);
  RunningJob job = jobClient.submitJob(conf);
  return jobClient.monitorAndPrintJob(conf, job);
}
Example #14
Source File: NewWordCountApplication.java From attic-apex-malhar with Apache License 2.0

public void NewWordCountApplication() {
  setMapClass(WordCount.Map.class);
  setReduceClass(WordCount.Reduce.class);
  setCombineClass(WordCount.Reduce.class);
  setInputFormat(TextInputFormat.class);
}
Example #15
Source File: HiveMetaStoreBridgeTest.java From incubator-atlas with Apache License 2.0

private Table createTestTable(String databaseName, String tableName) throws HiveException {
  Table table = new Table(databaseName, tableName);
  table.setInputFormatClass(TextInputFormat.class);
  table.setFields(new ArrayList<FieldSchema>() {{
    add(new FieldSchema("col1", "string", "comment1"));
  }});
  table.setTableType(TableType.EXTERNAL_TABLE);
  table.setDataLocation(new Path("somehdfspath"));
  return table;
}
Example #16
Source File: TestMRAppWithCombiner.java From hadoop with Apache License 2.0

@Test
public void testCombinerShouldUpdateTheReporter() throws Exception {
  JobConf conf = new JobConf(mrCluster.getConfig());
  int numMaps = 5;
  int numReds = 2;
  Path in = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-in");
  Path out = new Path(mrCluster.getTestWorkDir().getAbsolutePath(),
      "testCombinerShouldUpdateTheReporter-out");
  createInputOutPutFolder(in, out, numMaps);
  conf.setJobName("test-job-with-combiner");
  conf.setMapperClass(IdentityMapper.class);
  conf.setCombinerClass(MyCombinerToCheckReporter.class);
  //conf.setJarByClass(MyCombinerToCheckReporter.class);
  conf.setReducerClass(IdentityReducer.class);
  DistributedCache.addFileToClassPath(TestMRJobs.APP_JAR, conf);

  conf.setOutputCommitter(CustomOutputCommitter.class);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputKeyClass(LongWritable.class);
  conf.setOutputValueClass(Text.class);

  FileInputFormat.setInputPaths(conf, in);
  FileOutputFormat.setOutputPath(conf, out);
  conf.setNumMapTasks(numMaps);
  conf.setNumReduceTasks(numReds);

  runJob(conf);
}
Example #17
Source File: PipesNonJavaInputFormat.java From big-c with Apache License 2.0

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Delegate the generation of input splits to the 'original' InputFormat
  return ReflectionUtils.newInstance(
      job.getClass(Submitter.INPUT_FORMAT,
                   TextInputFormat.class,
                   InputFormat.class), job).getSplits(job, numSplits);
}
Example #18
Source File: FieldSelectionMapReduce.java From big-c with Apache License 2.0

public void configure(JobConf job) {
  this.fieldSeparator = job.get(FieldSelectionHelper.DATA_FIELD_SEPERATOR, "\t");
  this.mapOutputKeyValueSpec = job.get(
      FieldSelectionHelper.MAP_OUTPUT_KEY_VALUE_SPEC, "0-:");
  this.ignoreInputKey = TextInputFormat.class.getCanonicalName().equals(
      job.getInputFormat().getClass().getCanonicalName());
  this.reduceOutputKeyValueSpec = job.get(
      FieldSelectionHelper.REDUCE_OUTPUT_KEY_VALUE_SPEC, "0-:");
  parseOutputKeyValueSpec();
  LOG.info(specToString());
}
Example #19
Source File: ReaderTextLIBSVMParallel.java From systemds with Apache License 2.0

private void readLIBSVMMatrixFromHDFS(InputSplit[] splits, Path path,
    JobConf job, MatrixBlock dest, long rlen, long clen, int blen)
  throws IOException
{
  FileInputFormat.addInputPath(job, path);
  TextInputFormat informat = new TextInputFormat();
  informat.configure(job);

  ExecutorService pool = CommonThreadPool.get(_numThreads);
  try {
    // create read tasks for all splits
    ArrayList<LIBSVMReadTask> tasks = new ArrayList<>();
    int splitCount = 0;
    for (InputSplit split : splits) {
      tasks.add( new LIBSVMReadTask(split, _offsets, informat, job, dest, rlen, clen, splitCount++) );
    }
    pool.invokeAll(tasks);
    pool.shutdown();

    // check return codes and aggregate nnz
    long lnnz = 0;
    for (LIBSVMReadTask rt : tasks) {
      lnnz += rt.getPartialNnz();
      if (!rt.getReturnCode()) {
        Exception err = rt.getException();
        throw new IOException("Read task for libsvm input failed: " + err.toString(), err);
      }
    }
    dest.setNonZeros(lnnz);
  }
  catch (Exception e) {
    throw new IOException("Threadpool issue, while parallel read.", e);
  }
}
Example #20
Source File: PipesNonJavaInputFormat.java From hadoop with Apache License 2.0

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  // Delegate the generation of input splits to the 'original' InputFormat
  return ReflectionUtils.newInstance(
      job.getClass(Submitter.INPUT_FORMAT,
                   TextInputFormat.class,
                   InputFormat.class), job).getSplits(job, numSplits);
}
Example #21
Source File: ProfileFactoryTest.java From pxf with Apache License 2.0

@Test
public void get() throws Exception {

  // For TextInputFormat when table has no complex types, HiveText profile should be used
  String profileName = ProfileFactory.get(new TextInputFormat(), false);
  assertEquals("HiveText", profileName);

  // For TextInputFormat when table has complex types, Hive profile should be used, HiveText doesn't support complex types yet
  profileName = ProfileFactory.get(new TextInputFormat(), true);
  assertEquals("Hive", profileName);

  // For RCFileInputFormat when table has complex types, HiveRC profile should be used
  profileName = ProfileFactory.get(new RCFileInputFormat(), true);
  assertEquals("HiveRC", profileName);

  // For RCFileInputFormat when table has no complex types, HiveRC profile should be used
  profileName = ProfileFactory.get(new RCFileInputFormat(), false);
  assertEquals("HiveRC", profileName);

  // For OrcInputFormat when table has complex types, HiveORC profile should be used
  profileName = ProfileFactory.get(new OrcInputFormat(), true);
  assertEquals("HiveORC", profileName);

  // For OrcInputFormat when table has no complex types, HiveORC profile should be used
  profileName = ProfileFactory.get(new OrcInputFormat(), false);
  assertEquals("HiveORC", profileName);

  // For other formats Hive profile should be used
  profileName = ProfileFactory.get(new SequenceFileInputFilter(), false);
  assertEquals("Hive", profileName);
}
Example #22
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0

public CountNnzTask( InputSplit split, TextInputFormat informat, JobConf job,
    int[] rNnz, boolean isSymmetric ) {
  _split = split;
  _informat = informat;
  _job = job;
  _rNnz = rNnz;
  _isSymmetric = isSymmetric;
}
Example #23
Source File: CSVIngestMapper.java From hadoop-solr with Apache License 2.0

@Override
public void init(JobConf conf) throws IOException {
  boolean override = conf.getBoolean(IngestJob.INPUT_FORMAT_OVERRIDE, false);
  if (!override) {
    conf.setInputFormat(TextInputFormat.class);
  } // else the user has overridden the input format and we assume it is OK.
  byte[] delimiterBase64 = Base64.encodeBase64(conf.get(CSV_DELIMITER, "").getBytes());
  conf.set(CSV_DELIMITER, new String(delimiterBase64));
}
Example #24
Source File: LogsCountApplication.java From attic-apex-malhar with Apache License 2.0

public void LogsCountApplication() {
  setMapClass(LogCountsPerHour.LogMapClass.class);
  // setCombineClass(LogCountsPerHour.LogReduce.class);
  setReduceClass(LogCountsPerHour.LogReduce.class);
  setInputFormat(TextInputFormat.class);
}
Example #25
Source File: ReaderTextCellParallel.java From systemds with Apache License 2.0

public ReadTask( InputSplit split, TextInputFormat informat, JobConf job,
    MatrixBlock dest, long rlen, long clen, boolean mm, FileFormatPropertiesMM mmProps ) {
  _split = split;
  _sparse = dest.isInSparseFormat();
  _informat = informat;
  _job = job;
  _dest = dest;
  _rlen = rlen;
  _clen = clen;
  _matrixMarket = mm;
  _mmProps = mmProps;
}
Example #26
Source File: HadoopMapredCompatWordCount.java From flink with Apache License 2.0

public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: WordCount <input path> <result path>");
    return;
  }

  final String inputPath = args[0];
  final String outputPath = args[1];

  final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

  // Set up the Hadoop Input Format
  HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
      new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
  TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

  // Create a Flink job with it
  DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);
  DataSet<Tuple2<Text, LongWritable>> words =
      text.flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer()))
          .groupBy(0)
          .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter()));

  // Set up Hadoop Output Format
  HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
      new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
  hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
  TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

  // Output & Execute
  words.output(hadoopOutputFormat).setParallelism(1);
  env.execute("Hadoop Compat WordCount");
}
Example #27
Source File: WordCountWithHadoopOutputFormat.java From stratosphere with Apache License 2.0

@Override
public Plan getPlan(String... args) {
  // parse job parameters
  int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
  String dataInput = (args.length > 1 ? args[1] : "");
  String output = (args.length > 2 ? args[2] : "");

  HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
      new TextInputFormat(), new JobConf(), "Input Lines");
  TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

  MapOperator mapper = MapOperator.builder(new TokenizeLine())
      .input(source)
      .name("Tokenize Lines")
      .build();
  ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0)
      .input(mapper)
      .name("Count Words")
      .build();
  HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
      new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat",
      reducer, Text.class, IntWritable.class);
  TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

  Plan plan = new Plan(out, "Hadoop OutputFormat Example");
  plan.setDefaultParallelism(numSubTasks);
  return plan;
}
Example #28
Source File: S3SelectPushdown.java From presto with Apache License 2.0

public static boolean isCompressionCodecSupported(InputFormat<?, ?> inputFormat, Path path)
{
    if (inputFormat instanceof TextInputFormat) {
        // TODO (https://github.com/prestosql/presto/issues/2475) fix S3 Select when file not compressed
        return getCompressionCodec((TextInputFormat) inputFormat, path)
                .map(codec -> (codec instanceof GzipCodec) || (codec instanceof BZip2Codec))
                .orElse(false);
    }
    return false;
}