Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getPath()

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getPath(). They are taken from open-source projects; the source file and originating project are listed above each example.
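Every example below follows the same basic pattern: cast the generic InputSplit to a FileSplit, call getPath() to obtain the Path of the file the split belongs to, then resolve the FileSystem from that path before opening a stream and seeking to the split's start offset. The following minimal sketch distills that pattern; the class name and fields are illustrative, not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class FileSplitPathSketch {
    private long start;
    private long end;
    private FSDataInputStream in;

    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        // The new-API (mapreduce, not mapred) FileSplit is what record readers receive.
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();

        Path file = split.getPath();              // the file this split covers
        start = split.getStart();                 // byte offset of the split within the file
        end = start + split.getLength();          // first byte past the split

        // Resolve the FileSystem from the path itself rather than FileSystem.get(conf),
        // so paths on non-default filesystems (e.g. s3a://) still work.
        FileSystem fs = file.getFileSystem(conf);
        in = fs.open(file);
        in.seek(start);                           // position at the start of the split
        // ... read records in [start, end) ...
    }
}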
Example 1
Source File: MapReduceParsedInputFormat.java    From incubator-retired-mrql with Apache License 2.0
public ParsedRecordReader ( FileSplit split,
                            TaskAttemptContext context,
                            Class<? extends Parser> parser_class,
                            Trees args ) throws IOException {
    Configuration conf = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    fsin = fs.open(file);
    try {
        parser = parser_class.newInstance();
    } catch (Exception ex) {
        throw new Error("Unrecognized parser: "+parser_class);
    }
    parser.initialize(args);
    parser.open(fsin,start,end);
    result = null;
}
 
Example 2
Source File: GryoRecordReader.java    From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
            GryoMapper.build().addRegistries(IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0) this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
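Two details are worth noting here: the reader rejects compressed input outright, since a (non-splittable) compressed stream cannot be seeked to an arbitrary split offset, and it shrinks splitLength by however many bytes seekToHeader() skipped past start, so the remaining length reflects what the reader will actually consume.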
 
Example 3
Source File: WikipediaRecordReader.java    From datawave with Apache License 2.0
private void initializeSuperClass(InputSplit split, TaskAttemptContext context) throws IOException {
    super.initialize(split, context);
    if (split instanceof FileSplit) {
        FileSplit fs = (FileSplit) split;
        Path p = fs.getPath();
        rawFileName = p.getName();
        
        if (log.isDebugEnabled()) {
            log.debug("FileSplit Info: ");
            log.debug("Start: " + fs.getStart());
            log.debug("Length: " + fs.getLength());
            log.debug("Locations: " + Arrays.toString(fs.getLocations()));
            log.debug("Path: " + fs.getPath());
        }
    } else {
        throw new IOException("Input Split unhandled.");
    }
}
 
Example 4
Source File: Bzip2TextInputFormat.java    From spork with Apache License 2.0
public BZip2LineRecordReader(Configuration job, FileSplit split)
throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    fileIn.seek(start);

    in = new CBZip2InputStream(fileIn, 9, end);
    if (start != 0) {
        // skip first line and re-establish "start".
        // LineRecordReader.readLine(this.in, null);
        readLine(this.in, null);
        start = in.getPos();
    }
    pos = in.getPos();
}
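The block guarded by start != 0 is the standard convention for line-oriented splits: a split that begins mid-file may start in the middle of a line, and that partial line belongs to the previous split's reader (which reads one line past its nominal end). Discarding the fragment and re-establishing start at the next line boundary keeps every line assigned to exactly one reader.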
 
Example 5
Source File: TestLocalRunner.java    From big-c with Apache License 2.0
protected void setup(Context context) {
  // Get the thread num from the file number.
  FileSplit split = (FileSplit) context.getInputSplit();
  Path filePath = split.getPath();
  String name = filePath.getName();
  this.threadId = Integer.valueOf(name);

  LOG.info("Thread " + threadId + " : "
      + context.getInputSplit());
}
 
Example 6
Source File: MapReduceGeneratorInputFormat.java    From incubator-retired-mrql with Apache License 2.0
public GeneratorRecordReader ( FileSplit split,
                               TaskAttemptContext context ) throws IOException {
    Configuration conf = context.getConfiguration();
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    reader = new SequenceFile.Reader(fs,path,conf);
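    // the first record of the split encodes the generator's (offset, size) pair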
    MRContainer key = new MRContainer();
    MRContainer value = new MRContainer();
    reader.next(key,value);
    offset = ((MR_long)((Tuple)(value.data())).first()).get();
    size = ((MR_long)((Tuple)(value.data())).second()).get();
    index = -1;
}
 
Example 7
Source File: MultiLineInputFormat.java    From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
        throws IOException, InterruptedException
{
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf
            .getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

    if (start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
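Note the --start before the seek: by backing up one byte, a split that begins exactly after a newline makes readLine() consume only that newline (returning start to its original value), while a split that begins mid-line discards the fragment the previous reader already emitted. Either way, no line is lost or read twice.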
 
Example 8
Source File: UnenclosedBaseJsonRecordReader.java    From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = fileSplit.getLength() + start;
    Path filePath = fileSplit.getPath();
    commonInit(filePath, taskContext.getConfiguration());
}
 
Example 9
Source File: MainframeVBRecordReader.java    From Cobol-to-Hive with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
		throws IOException {
	FileSplit split = (FileSplit) genericSplit;
	Configuration job = context.getConfiguration();
	final Path file = split.getPath();
	initialize(job, split.getStart(), split.getLength(), file);
}
 
Example 10
Source File: QseqInputFormat.java    From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
	CompressionCodec        codec        = codecFactory.getCodec(file);

	if (codec == null) // no codec.  Uncompressed file.
	{
		positionAtFirstRecord(fileIn);
		inputStream = fileIn;
	}
	else
	{ // compressed file
		if (start != 0)
			throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

		inputStream = codec.createInputStream(fileIn);
		end = Long.MAX_VALUE; // read until the end of the file
	}

	lineReader = new LineReader(inputStream);
}
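For compressed files the reader insists on start == 0 and sets end to Long.MAX_VALUE: generic compression codecs are not splittable, so byte offsets in the compressed file do not correspond to offsets in the decompressed stream, and the whole file must be handled by a single reader.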
 
Example 11
Source File: ColumnarSplitDataReader.java    From kylin-on-parquet-v2 with Apache License 2.0
public void init(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    baseCuboid = Cuboid.getBaseCuboid(cubeDesc);
    rowKeyEncoder = AbstractRowKeyEncoder.createInstance(cubeSegment, baseCuboid);

    FileSystem fs = FileSystem.get(context.getConfiguration());
    FileSplit fSplit = (FileSplit) split;
    Path path = fSplit.getPath();
    rowRecordReader = new RowRecordReader(cubeDesc, path, fs);
    metricsValuesBuffer = ByteBuffer.allocate(BufferedMeasureCodec.DEFAULT_BUFFER_SIZE);

    rowCount = new AtomicInteger(0);
}
 
Example 12
Source File: Excel2007FileRecordReader.java    From components with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();

  final Path file = split.getPath();

  try {
    InputStream in = createInputStream(job, file);
    
    init4Excel2007(in, job, file);
  } catch (Exception e) {
    closeResource();
    throw e;
  }
}
 
Example 13
Source File: XmlInputFormat.java    From hiped2 with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf)
    throws IOException {
  startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
  endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");

  // open the file and seek to the start of the split
  start = split.getStart();
  end = start + split.getLength();
  Path file = split.getPath();
  FileSystem fs = file.getFileSystem(conf);
  fsin = fs.open(file);
  fsin.seek(start);
}
 
Example 14
Source File: CsvBlurMapper.java    From incubator-retired-blur with Apache License 2.0
protected Path getCurrentFile(Context context) throws IOException {
  InputSplit split = context.getInputSplit();
  if (split instanceof FileSplit) {
    FileSplit inputSplit = (FileSplit) split;
    Path path = inputSplit.getPath();
    FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
    return path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
  }
  return null;
}
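makeQualified(URI, Path) fills in any missing scheme and authority from the filesystem's URI, so the returned path stays unambiguous if it is later serialized or compared outside the context of this FileSystem.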
 
Example 15
Source File: MDSSpreadReader.java    From multiple-dimension-spread with Apache License 2.0
@Override
public void initialize( final InputSplit inputSplit, final TaskAttemptContext context ) throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit)inputSplit;
  Configuration config = context.getConfiguration();
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem( config );
  long fileLength = fs.getFileStatus( path ).getLen();
  long start = fileSplit.getStart();
  long length = fileSplit.getLength();
  InputStream in = fs.open( path );
  setStream( in , fileLength , start , length );
}
 
Example 16
Source File: MapperForNewData.java    From incubator-retired-blur with Apache License 2.0
@Override
protected void setup(Context context) throws IOException, InterruptedException {
  InputSplit inputSplit = context.getInputSplit();
  FileSplit fileSplit = getFileSplit(inputSplit);
  Path path = fileSplit.getPath();
  FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
  FileStatus fileStatus = fileSystem.getFileStatus(path);
  _timestamp = fileStatus.getModificationTime();
  _newRecords = context.getCounter("Blur", "New Records Read");
}
 
Example 17
Source File: IndexRRecordReader.java    From indexr with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException, UnsupportedOperationException {
    Configuration configuration = taskAttemptContext.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Preconditions.checkState(split.getStart() == 0, "Segment should not be split");

    Path filePath = split.getPath();

    String sparkRequestedSchemaString = configuration.get(Config.SPARK_PROJECT_SCHEMA);
    this.sparkSchema = StructType$.MODULE$.fromString(sparkRequestedSchemaString);

    FileSystem fileSystem = filePath.getFileSystem(configuration);
    if (!SegmentHelper.checkSegmentByPath(filePath)) {
        logger.info("ignore: " + filePath);
        return;
    }

    ByteBufferReader.Opener opener = ByteBufferReader.Opener.create(fileSystem, filePath);
    IntegratedSegment.Fd fd = IntegratedSegment.Fd.create(filePath.toString(), opener);
    if (fd == null) {
        logger.warn("illegal segment: " + filePath);
        return;
    }
    segment = fd.open();
    if (segment == null) {
        logger.warn("illegal segment: " + filePath);
        return;
    }
    this.totalRowCount = segment.rowCount();
    this.packCount = segment.packCount();

    SegmentSchema segmentSchema = segment.schema();
    this.projectColumns = IndexRUtil.sparkSchemaToIndexRSchema(
            JavaConversions.seqAsJavaList(sparkSchema),
            a -> false)
            .getColumns().toArray(new ColumnSchema[0]);
    this.projectColIdsInSegment = new int[projectColumns.length];
    this.projectExists = new boolean[projectColumns.length];
    for (int i = 0; i < projectColumns.length; i++) {
        String columnName = projectColumns[i].getName();
        int colId = Trick.indexWhere(segmentSchema.getColumns(), cs -> cs.getName().equalsIgnoreCase(columnName));
        projectColIdsInSegment[i] = colId;
        projectExists[i] = colId >= 0;
    }
}
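The Preconditions.checkState at the top documents the format's contract: an IndexR segment is read as a whole, so a split with a nonzero start (i.e. a file that was divided across splits) is rejected immediately.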
 
Example 18
Source File: CopybookRecordReader.java    From CopybookInputFormat with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
    throws IOException, InterruptedException {

  String cblPath = context.getConfiguration().get(
      Const.COPYBOOK_INPUTFORMAT_CBL_HDFS_PATH_CONF);

  FileSystem fs = FileSystem.get(context.getConfiguration());

  BufferedInputStream inputStream = new BufferedInputStream(fs.open(new Path(
      cblPath)));

  CobolCopybookLoader copybookInt = new CobolCopybookLoader();
  try {
    externalRecord = copybookInt
        .loadCopyBook(inputStream, "RR", CopybookLoader.SPLIT_NONE, 0,
            "cp037", Convert.FMT_MAINFRAME, 0, null);

    int fileStructure = Constants.IO_FIXED_LENGTH;

    for (ExternalField field : externalRecord.getRecordFields()) {
      recordByteLength += field.getLen();
    }

    // jump to the point in the split that the first whole record of split
    // starts at
    FileSplit fileSplit = (FileSplit) split;

    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    final Path file = fileSplit.getPath();

    BufferedInputStream fileIn = new BufferedInputStream(fs.open(file));

    if (start != 0) {
      pos = start - (start % recordByteLength) + recordByteLength;

      fileIn.skip(pos);
    }

    ret = LineIOProvider.getInstance().getLineReader(
        fileStructure,
        LineIOProvider.getInstance().getLineProvider(fileStructure));

    ret.open(fileIn, externalRecord);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
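The alignment arithmetic start - (start % recordByteLength) + recordByteLength advances pos to a record boundary past start; for example, with recordByteLength = 100 and start = 250, pos becomes 250 - 50 + 100 = 300. (When start already falls exactly on a record boundary the expression still skips one full record, which is only correct if the previous split's reader reads through that record.)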
 
Example 19
Source File: LongLineEventRecordReader.java    From datawave with Apache License 2.0
/**
 * Initializes the line reader for the given split.
 *
 * @param genericSplit the file split to read
 * @param context the task attempt context supplying the job configuration
 * @throws IOException if the split's file cannot be opened
 */
public void initializeLineReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        in.setNewLineIncluded(newLineIncluded);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        
        // Hadoop CodecFactory only checks the file suffix, let's double check for gzip since some data producers
        // may not append .gz to their files.
        InputStream iStream = GzipDetectionUtil.decompressTream(fileIn);
        Class<?> streamClass = iStream.getClass();
        if (GZIPInputStream.class == streamClass) {
            end = Long.MAX_VALUE;
        }
        
        in = new LfLineReader(iStream, job);
        in.setNewLineIncluded(newLineIncluded);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
 
Example 20
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
/**
 * Builds a {@code ParquetInputSplit} from a mapreduce {@link FileSplit}.
 *
 * @param split a mapreduce FileSplit
 * @return a ParquetInputSplit
 * @throws IOException if there is an error while creating the Parquet split
 */
static ParquetInputSplit from(FileSplit split) throws IOException {
  return new ParquetInputSplit(split.getPath(),
      split.getStart(), split.getStart() + split.getLength(),
      split.getLength(), split.getLocations(), null);
}