Java Code Examples for org.apache.hadoop.mapred.FileSplit

The following examples show how to use org.apache.hadoop.mapred.FileSplit. They are extracted from open source projects; the originating project, source file, and license are listed above each example.
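Most of the examples below follow the same pattern: cast the incoming InputSplit to a FileSplit, read its path, start offset, and length, and then open the backing file at the split boundary. As a quick orientation, here is a minimal, hypothetical sketch of that pattern (the class and method names are illustrative and are not taken from any of the projects listed below):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

public class FileSplitUsageSketch {

    /** Opens the file backing a split and positions the stream at the split's first byte. */
    public static FSDataInputStream openSplit(Configuration conf, FileSplit split) throws IOException {
        Path file = split.getPath();           // file that this split belongs to
        long start = split.getStart();         // byte offset where the split begins
        long end = start + split.getLength();  // first byte after the split

        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream in = fs.open(file);
        in.seek(start);                        // jump to the split boundary
        // a RecordReader would now read records until its position reaches 'end'
        return in;
    }
}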
Example 1
Source Project: presto   Source File: TestHiveFileFormats.java    License: Apache License 2.0
@Test(dataProvider = "rowCount")
public void testAvroFileInSymlinkTable(int rowCount)
        throws Exception
{
    File file = File.createTempFile("presto_test", AVRO.name());
    //noinspection ResultOfMethodCallIgnored
    file.delete();
    try {
        FileSplit split = createTestFile(file.getAbsolutePath(), AVRO, HiveCompressionCodec.NONE, getTestColumnsSupportedByAvro(), rowCount);
        Properties splitProperties = new Properties();
        splitProperties.setProperty(FILE_INPUT_FORMAT, SymlinkTextInputFormat.class.getName());
        splitProperties.setProperty(SERIALIZATION_LIB, AVRO.getSerDe());
        testCursorProvider(createGenericHiveRecordCursorProvider(HDFS_ENVIRONMENT), split, splitProperties, getTestColumnsSupportedByAvro(), SESSION, rowCount);
    }
    finally {
        //noinspection ResultOfMethodCallIgnored
        file.delete();
    }
}
 
Example 2
Source Project: elasticsearch-hadoop   Source File: EsHiveInputFormat.java    License: Apache License 2.0
@Override
public FileSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    // first, merge input table properties (since there's no access to them ...)
    Settings settings = HadoopSettingsManager.loadFrom(job);
    //settings.merge(IOUtils.propsFromString(settings.getProperty(HiveConstants.INPUT_TBL_PROPERTIES)));

    Log log = LogFactory.getLog(getClass());
    // move on to initialization
    InitializationUtils.setValueReaderIfNotSet(settings, HiveValueReader.class, log);
    InitializationUtils.setUserProviderIfNotSet(settings, HadoopUserProvider.class, log);
    if (settings.getOutputAsJson() == false) {
        // Only set the fields if we aren't asking for raw JSON
        settings.setProperty(InternalConfigurationOptions.INTERNAL_ES_TARGET_FIELDS, StringUtils.concatenate(HiveUtils.columnToAlias(settings), ","));
    }

    HiveUtils.init(settings, log);

    // decorate original splits as FileSplit
    InputSplit[] shardSplits = super.getSplits(job, numSplits);
    FileSplit[] wrappers = new FileSplit[shardSplits.length];
    Path path = new Path(job.get(HiveConstants.TABLE_LOCATION));
    for (int i = 0; i < wrappers.length; i++) {
        wrappers[i] = new EsHiveSplit(shardSplits[i], path);
    }
    return wrappers;
}
 
Example 3
public ParsedRecordReader ( FileSplit split,
    Configuration conf,
    Class<? extends Parser> parser_class,
    Trees args ) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    fsin = fs.open(split.getPath());
    try {
        parser = parser_class.newInstance();
    } catch (Exception ex) {
        throw new Error("Unrecognized parser:"+parser_class);
    }
    parser.initialize(args);
    parser.open(fsin,start,end);
    result = null;
}
 
Example 4
Source Project: systemds   Source File: IOUtilFunctions.java    License: Apache License 2.0
public static InputSplit[] sortInputSplits(InputSplit[] splits) {
	if (splits[0] instanceof FileSplit) {
		// The splits do not always arrive in order by file name.
		// Sort the splits lexicographically by path so that the header will
		// be in the first split.
		// Note that we're assuming that the splits come in order by offset
		Arrays.sort(splits, new Comparator<InputSplit>() {
			@Override
			public int compare(InputSplit o1, InputSplit o2) {
				Path p1 = ((FileSplit) o1).getPath();
				Path p2 = ((FileSplit) o2).getPath();
				return p1.toString().compareTo(p2.toString());
			}
		});
	}		
	return splits;
}
 
Example 5
Source Project: hadoopoffice   Source File: ExcelCellFileInputFormat.java    License: Apache License 2.0
@Override
public RecordReader<Text, SpreadSheetCellDAO> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
		throws IOException {
	/** Create reader **/
	try {
		// send configuration option to MS Excel. The format of the Excel file (old vs new) is detected automatically
		job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
		return new ExcelCellRecordReader((FileSplit) split, job, reporter);
	} catch (FormatNotUnderstoodException e) {
		// log
		LOGIF.error(e);
	} catch (GeneralSecurityException gse) {
		LOGIF.error(gse);
	}
	return null;
}
 
Example 6
static RecordReader<NullWritable, DynamoDBItemWritable> getRecordReader(
    InputSplit inputSplit, JobConf job, Reporter reporter) throws IOException {
  // CombineFileSplit indicates the new export format which includes a manifest file
  if (inputSplit instanceof CombineFileSplit) {
    int version = job.getInt(DynamoDBConstants.EXPORT_FORMAT_VERSION, -1);
    if (version != ExportManifestRecordWriter.FORMAT_VERSION) {
      throw new IOException("Unknown version: " + job.get(DynamoDBConstants
          .EXPORT_FORMAT_VERSION));
    }
    return new ImportCombineFileRecordReader((CombineFileSplit) inputSplit, job, reporter);
  } else if (inputSplit instanceof FileSplit) {
    // FileSplit indicates the old data pipeline format which doesn't include a manifest file
    Path path = ((FileSplit) inputSplit).getPath();
    return new ImportRecordReader(job, path);
  } else {
    throw new IOException("Expecting CombineFileSplit or FileSplit but the input split type is:"
        + " " + inputSplit.getClass());
  }
}
 
Example 7
@Override
public RecordReader<NullWritable,ColumnAndIndex> getRecordReader( final InputSplit split, final JobConf job, final Reporter reporter ) throws IOException {
  FileSplit fileSplit = (FileSplit)split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( job );
  long fileLength = fs.getLength( path );
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );
  IJobReporter jobReporter = new HadoopJobReporter( reporter );
  jobReporter.setStatus( String.format( "Read file : %s" , path.toString() ) );
  HiveReaderSetting hiveConfig = new HiveReaderSetting( fileSplit , job );
  if ( hiveConfig.isVectorMode() ){
    IVectorizedReaderSetting vectorizedSetting = new HiveVectorizedReaderSetting( fileSplit , job , hiveConfig );
    return (RecordReader)new MDSHiveDirectVectorizedReader( in , fileLength , start , length , vectorizedSetting , jobReporter );
  }
  else{
    return new MDSHiveLineReader( in , fileLength , start , length , hiveConfig , jobReporter , spreadCounter );
  }
}
 
Example 8
Source Project: RDFS   Source File: LineDocRecordReader.java    License: Apache License 2.0
/**
 * Constructor
 * @param job
 * @param split  
 * @throws IOException
 */
public LineDocRecordReader(Configuration job, FileSplit split)
    throws IOException {
  long start = split.getStart();
  long end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  FileSystem fs = file.getFileSystem(job);
  FSDataInputStream fileIn = fs.open(split.getPath());
  InputStream in = fileIn;
  boolean skipFirstLine = false;
  if (start != 0) {
    skipFirstLine = true; // wait till BufferedInputStream to skip
    --start;
    fileIn.seek(start);
  }

  this.in = new BufferedInputStream(in);
  if (skipFirstLine) { // skip first line and re-establish "start".
    start += LineDocRecordReader.readData(this.in, null, EOL);
  }
  this.start = start;
  this.pos = start;
  this.end = end;
}
 
Example 9
Source Project: hive-dwrf   Source File: OrcInputFormat.java    License: Apache License 2.0
@Override
public RecordReader<NullWritable, OrcLazyRow>
    getRecordReader(InputSplit inputSplit, JobConf conf,
                    Reporter reporter) throws IOException {
  ReaderWriterProfiler.setProfilerOptions(conf);
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  reporter.setStatus(fileSplit.toString());

  return new OrcRecordReader(
      OrcFile.createReader(fs, path, conf),
      conf,
      fileSplit.getStart(),
      fileSplit.getLength()
  );
}
 
Example 10
Source Project: RDFS   Source File: StreamXmlRecordReader.java    License: Apache License 2.0
public StreamXmlRecordReader(FSDataInputStream in, FileSplit split, Reporter reporter,
                             JobConf job, FileSystem fs) throws IOException {
  super(in, split, reporter, job, fs);

  beginMark_ = checkJobGet(CONF_NS + "begin");
  endMark_ = checkJobGet(CONF_NS + "end");

  maxRecSize_ = job_.getInt(CONF_NS + "maxrec", 50 * 1000);
  lookAhead_ = job_.getInt(CONF_NS + "lookahead", 2 * maxRecSize_);
  synched_ = false;

  slowMatch_ = job_.getBoolean(CONF_NS + "slowmatch", false);
  if (slowMatch_) {
    beginPat_ = makePatternCDataOrMark(beginMark_);
    endPat_ = makePatternCDataOrMark(endMark_);
  }
  init();
}
 
Example 11
Source Project: Bats   Source File: FileScanFramework.java    License: Apache License 2.0
@Override
public ManagedReader<? extends SchemaNegotiator> next() {
  FileSplit split = fileFramework.nextSplit();
  if (split == null) {
    return null;
  }
  return newReader(split);
}
 
Example 12
Source Project: Bats   Source File: FileScanFramework.java    License: Apache License 2.0
@Override
protected void configure() {
  super.configure();
  FileScanBuilder options = options();

  // Create the Drill file system.

  try {
    dfs = context.newFileSystem(options.fsConf);
  } catch (IOException e) {
    throw UserException.dataReadError(e)
      .addContext("Failed to create FileSystem")
      .build(logger);
  }

  // Prepare the list of files. We need the list of paths up
  // front to compute the maximum partition. Then, we need to
  // iterate over the splits to create readers on demand.

  List<Path> paths = new ArrayList<>();
  for (FileWork work : options.files) {
    Path path = dfs.makeQualified(work.getPath());
    paths.add(path);
    FileSplit split = new FileSplit(path, work.getStart(), work.getLength(), new String[]{""});
    spilts.add(split);
  }
  splitIter = spilts.iterator();

  // Create the metadata manager to handle file metadata columns
  // (so-called implicit columns and partition columns.)

  options.metadataOptions().setFiles(paths);
  metadataManager = new FileMetadataManager(
      context.getFragmentContext().getOptions(),
      options.metadataOptions());
  builder.withMetadata(metadataManager);
}
 
Example 13
Source Project: attic-apex-malhar   Source File: LineIndexer.java    License: Apache License 2.0
public void map(LongWritable key, Text val,
    OutputCollector<Text, Text> output, Reporter reporter) throws IOException
{
  FileSplit fileSplit = (FileSplit)reporter.getInputSplit();
  String fileName = fileSplit.getPath().getName();
  location.set(fileName);

  String line = val.toString();
  StringTokenizer itr = new StringTokenizer(line.toLowerCase());
  while (itr.hasMoreTokens()) {
    word.set(itr.nextToken());
    output.collect(word, location);
  }
}
 
Example 14
Source Project: presto   Source File: BackgroundHiveSplitLoader.java    License: Apache License 2.0
private ListenableFuture<?> addSplitsToSource(InputSplit[] targetSplits, InternalHiveSplitFactory splitFactory)
        throws IOException
{
    ListenableFuture<?> lastResult = COMPLETED_FUTURE;
    for (InputSplit inputSplit : targetSplits) {
        Optional<InternalHiveSplit> internalHiveSplit = splitFactory.createInternalHiveSplit((FileSplit) inputSplit);
        if (internalHiveSplit.isPresent()) {
            lastResult = hiveSplitSource.addToQueue(internalHiveSplit.get());
        }
        if (stopped) {
            return COMPLETED_FUTURE;
        }
    }
    return lastResult;
}
 
Example 15
Source Project: Cobol-to-Hive   Source File: MainframeVBInputFormat.java    License: Apache License 2.0
@Override
public RecordReader<LongWritable, BytesWritable> getRecordReader(
		InputSplit genericSplit, JobConf job, Reporter reporter)
		throws IOException {
	reporter.setStatus(genericSplit.toString());
	return new MainframeVBRecordReader(job, (FileSplit) genericSplit);
}
 
Example 16
Source Project: flink   Source File: HadoopInputFormatTest.java    License: Apache License 2.0
@Test
public void testCreateInputSplits() throws Exception {

	FileSplit[] result = new FileSplit[1];
	result[0] = getFileSplit();
	DummyInputFormat inputFormat = mock(DummyInputFormat.class);
	when(inputFormat.getSplits(any(JobConf.class), anyInt())).thenReturn(result);

	HadoopInputFormat<String, Long> hadoopInputFormat = new HadoopInputFormat<>(inputFormat, String.class, Long.class, new JobConf());
	hadoopInputFormat.createInputSplits(2);

	verify(inputFormat, times(1)).getSplits(any(JobConf.class), anyInt());
}
 
Example 17
Source Project: wikireverse   Source File: WarcFileRecordReader.java    License: MIT License
public WarcFileRecordReader(Configuration conf, InputSplit split) throws IOException {
  if (split instanceof FileSplit) {
    this.filePathList=new Path[1];
    this.filePathList[0]=((FileSplit)split).getPath();
  } else if (split instanceof MultiFileSplit) {
    this.filePathList=((MultiFileSplit)split).getPaths();
  } else {
    throw new IOException("InputSplit is not a file split or a multi-file split - aborting");
  }

  // Use FileSystem.get to open Common Crawl URIs using the S3 protocol.
  URI uri = filePathList[0].toUri();
  this.fs = FileSystem.get(uri, conf);
  
  // get the total file sizes
  for (int i=0; i < filePathList.length; i++) {
    totalFileSize += fs.getFileStatus(filePathList[i]).getLen();
  }

  Class<? extends CompressionCodec> codecClass=null;

  try {
    codecClass=conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class);
    compressionCodec=(CompressionCodec)ReflectionUtils.newInstance(codecClass, conf);
  } catch (ClassNotFoundException cnfEx) {
    compressionCodec=null;
    LOG.info("!!! ClassNotFoun Exception thrown setting Gzip codec");
  }

  openNextFile();
}
 
Example 18
Source Project: hudi   Source File: TestHoodieRealtimeFileSplit.java    License: Apache License 2.0
@BeforeEach
public void setUp(@TempDir java.nio.file.Path tempDir) throws Exception {
  basePath = tempDir.toAbsolutePath().toString();
  deltaLogPaths = Collections.singletonList(basePath + "/1.log");
  fileSplitName = basePath + "/test.file";
  baseFileSplit = new FileSplit(new Path(fileSplitName), 0, 100, new String[] {});
  maxCommitTime = "10001";

  split = new HoodieRealtimeFileSplit(baseFileSplit, basePath, deltaLogPaths, maxCommitTime);
}
 
Example 19
Source Project: hadoop-solr   Source File: RegexIngestMapper.java    License: Apache License 2.0
@Override
public LWDocument[] toDocuments(Writable key, Writable value, Reporter reporter,
    Configuration conf) throws IOException {
  if (key != null && value != null) {
    LWDocument doc = createDocument(key.toString() + "-" + System.currentTimeMillis(), null);
    Matcher matcher = regex.matcher(value.toString());
    if (matcher != null) {
      if (match) {
        if (matcher.matches()) {
          processMatch(doc, matcher);
        }
      } else {//
        while (matcher.find()) {
          processMatch(doc, matcher);
          reporter.progress();//do we really even need this?
        }
      }
    }
    // Adding the file path where this record was taken
    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    String originalLogFilePath = fileSplit.getPath().toUri().getPath();
    doc.addField(FIELD_PATH, originalLogFilePath);
    String docId = originalLogFilePath + "-" + doc.getId();
    doc.setId(docId);
    return new LWDocument[] {doc};
  }
  return null;
}
 
Example 20
Source Project: RDFS   Source File: DistCp.java    License: Apache License 2.0
/**
 * Produce splits such that each is no greater than the quotient of the
 * total size and the number of splits requested.
 * @param job The handle to the JobConf object
 * @param numSplits Number of splits requested
 */
public InputSplit[] getSplits(JobConf job, int numSplits)
    throws IOException {
  int cnfiles = job.getInt(SRC_COUNT_LABEL, -1);
  long cbsize = job.getLong(TOTAL_SIZE_LABEL, -1);
  String srcFileList = job.get(SRC_LIST_LABEL, "");
  Path srcFileListPath = new Path(srcFileList);
  if (cnfiles < 0 || cbsize < 0 || "".equals(srcFileList)) {
    throw new RuntimeException("Invalid metadata: #files(" + cnfiles +
        ") total_size(" + cbsize + ") src_chunk_file_list_uri(" +
        srcFileList + ")");
  }
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  SequenceFile.Reader sl = null;
  String splitList = job.get(SPLIT_LIST_LABEL, "");
  if("".equals(splitList)) {
    throw new RuntimeException("Invalid metadata: split_list_uri(" +
        srcFileList + ")");
  }
  //split file list which contains start pos and split length pairs
  //they are used to split srcChunkFileList
  Path splitListPath = new Path(splitList);        
  FileSystem splitListFs = splitListPath.getFileSystem(job);
  try{
    sl = new SequenceFile.Reader(splitListFs, splitListPath, job);
    LongWritable startpos = new LongWritable();
    LongWritable length = new LongWritable();
    while (sl.next(startpos, length)) {
      splits.add(new FileSplit(srcFileListPath, startpos.get(), 
          length.get(), (String[])null));
    }
  }
  finally{
    checkAndClose(sl);
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
 
Example 21
Source Project: presto   Source File: TestHiveFileFormats.java    License: Apache License 2.0
private void testCursorProvider(
        HiveRecordCursorProvider cursorProvider,
        FileSplit split,
        Properties splitProperties,
        List<TestColumn> testReadColumns,
        ConnectorSession session,
        int rowCount)
{
    ConnectorPageSource pageSource = createPageSourceFromCursorProvider(cursorProvider, split, splitProperties, testReadColumns, session);
    RecordCursor cursor = ((RecordPageSource) pageSource).getCursor();
    checkCursor(cursor, testReadColumns, rowCount);
}
 
Example 22
Source Project: bigdata-tutorial   Source File: MyDemoInputFormat.java    License: Apache License 2.0
@Override
public RecordReader<LongWritable, Text> getRecordReader(
		InputSplit genericSplit, JobConf job, Reporter reporter)
		throws IOException {
	reporter.setStatus(genericSplit.toString());
	MyDemoRecordReader reader = new MyDemoRecordReader(
			new LineRecordReader(job, (FileSplit) genericSplit));
	return reader;
}
 
Example 23
Source Project: RDFS   Source File: RandomWriter.java    License: Apache License 2.0
/** 
 * Generate the requested number of file splits, with the filename
 * set to the filename of the output file.
 */
public InputSplit[] getSplits(JobConf job, 
                              int numSplits) throws IOException {
  InputSplit[] result = new InputSplit[numSplits];
  Path outDir = FileOutputFormat.getOutputPath(job);
  for(int i=0; i < result.length; ++i) {
    result[i] = new FileSplit(new Path(outDir, "dummy-split-" + i), 0, 1, 
                              (String[])null);
  }
  return result;
}
 
Example 24
Source Project: big-c   Source File: NLineInputFormat.java    License: Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(
                                          InputSplit genericSplit,
                                          JobConf job,
                                          Reporter reporter) 
throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
 
Example 25
Source Project: RDFS   Source File: NLineInputFormat.java    License: Apache License 2.0
public RecordReader<LongWritable, Text> getRecordReader(
                                          InputSplit genericSplit,
                                          JobConf job,
                                          Reporter reporter) 
throws IOException {
  reporter.setStatus(genericSplit.toString());
  return new LineRecordReader(job, (FileSplit) genericSplit);
}
 
Example 26
@Override
public RecordReader<MRContainer, MRContainer> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    StormEvaluator.load_source_dir();  // load the parsed source parameters from a file
    String path = ((FileSplit)split).getPath().toString();
    ParsedDataSource ds = (ParsedDataSource)DataSource.get(path,Plan.conf);
    return new ParsedRecordReader((FileSplit)split,job,ds.parser,(Trees)ds.args);
}
 
Example 27
Source Project: hadoopoffice   Source File: ExcelFileInputFormat.java    License: Apache License 2.0
@Override
public  RecordReader<Text,ArrayWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
	/** Create reader **/
	try {
		// send configuration option to MS Excel. The format of the Excel file (old vs new) is detected automatically
		job.set(HadoopOfficeReadConfiguration.CONF_MIMETYPE, "ms-excel");
		return new ExcelRecordReader((FileSplit) split, job, reporter);
	} catch (FormatNotUnderstoodException e) {
		// log
		LOGIF.error(e);
	} catch (GeneralSecurityException gse) {
		LOGIF.error(gse);
	}
	return null;
}
 
Example 28
Source Project: pxf   Source File: JsonAccessor.java    License: Apache License 2.0
@Override
protected Object getReader(JobConf conf, InputSplit split) throws IOException {
    if (!isEmpty(identifier)) {
        conf.set(JsonRecordReader.RECORD_MEMBER_IDENTIFIER, identifier);
        conf.setInt(JsonRecordReader.RECORD_MAX_LENGTH, maxRecordLength);
        return new JsonRecordReader(conf, (FileSplit) split);
    } else {
        return new LineRecordReader(conf, (FileSplit) split);
    }
}
 
Example 29
Source Project: flink   Source File: HiveTableFileInputFormat.java    License: Apache License 2.0
@VisibleForTesting
static FileSplit toHadoopFileSplit(FileInputSplit fileSplit) throws IOException {
	URI uri = fileSplit.getPath().toUri();
	long length = fileSplit.getLength();
	// Hadoop FileSplit should not have -1 length.
	if (length == -1) {
		length = fileSplit.getPath().getFileSystem().getFileStatus(fileSplit.getPath()).getLen() -
				fileSplit.getStart();
	}
	return new FileSplit(new Path(uri), fileSplit.getStart(), length, (String[]) null);
}
 
Example 30
Source Project: hadoop-book   Source File: InvertedIndex.java    License: Apache License 2.0
public void map(LongWritable key, Text val,
        OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {

    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    String fileName = fileSplit.getPath().getName();
    location.set(fileName);

    String line = val.toString();
    StringTokenizer itr = new StringTokenizer(line.toLowerCase());
    while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        output.collect(word, location);
    }
}