org.apache.hadoop.mapreduce.lib.input.FileSplit Java Examples
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
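Most of the examples below follow the same two-sided pattern: an InputFormat builds FileSplit instances (path, start offset, length, preferred hosts), and a RecordReader casts the generic InputSplit it receives back to FileSplit so it can open the file and seek to the split's start. The following sketch is illustrative only; the class name SimpleSplitReader and its fields are hypothetical and do not come from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Hypothetical reader skeleton that only demonstrates how a FileSplit is typically consumed.
public abstract class SimpleSplitReader<K, V> extends RecordReader<K, V> {
    private FSDataInputStream in;
    private long start;
    private long end;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        // File-based input formats hand the reader a FileSplit, so the cast is safe here.
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();

        Path file = split.getPath();      // which file this split belongs to
        start = split.getStart();         // byte offset where the split begins
        end = start + split.getLength();  // byte offset where the split ends

        // Open the file and position the stream at the start of the split.
        FileSystem fs = file.getFileSystem(conf);
        in = fs.open(file);
        in.seek(start);
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}

Test code, such as Examples #4, #10, and #26, goes the other way and constructs a split directly with new FileSplit(path, start, length, hosts) in order to drive a record reader outside of a running job.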
Example #1
Source File: MsgpackInputFormat.java From laser with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    long goalSize = goalsize(conf);
    String[] urlList = conf.get("com.b5m.laser.msgpack.host").split(",");
    int numNode = urlList.length;

    List<InputSplit> splits = new ArrayList<InputSplit>();
    long splitLength = (long) Math.ceil((double) goalSize / numNode);
    long bytesRemaining = goalSize;
    for (int i = 0; i < numNode; i++) {
        if (bytesRemaining > splitLength) {
            splits.add(new FileSplit(new Path(urlList[i]), i * splitLength, splitLength, null));
        } else {
            splits.add(new FileSplit(new Path(urlList[i]), i * splitLength, bytesRemaining, null));
        }
        bytesRemaining -= splitLength;
    }
    return splits;
}
Example #2
Source File: SSTableSplitRecordReader.java From hadoop-sstable with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException {
    this.context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    this.sstableFile = fileSplit.getPath();
    // The SSTableSplitInputFormat is not splittable, so the split length is the whole file.
    this.totalFileSize = fileSplit.getLength();

    Configuration conf = context.getConfiguration();
    FileSystem fs = sstableFile.getFileSystem(conf);
    this.rawInputStream = fs.open(sstableFile);

    this.splitSize = conf.getLong(HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB) * 1024 * 1024;

    this.index = new IndexOffsetScanner(sstableFile, fs);
}
Example #3
Source File: IndexedStorage.java From spork with Apache License 2.0
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
    IndexManager im = null;
    try {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path indexFile = IndexManager.getIndexFileName(((FileSplit) split).getPath());
        im = new IndexManager(fs.getFileStatus(indexFile));
        im.openIndexFile(fs);
        im.ReadIndexHeader();
        im.ReadIndexFooter();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    return new IndexedStorageRecordReader(im);
}
Example #4
Source File: CSVTestFileLoader.java From datawave with Apache License 2.0
@Override
public void loadTestData(SequenceFile.Writer seqFile) throws IOException {
    TypeRegistry.reset();
    TypeRegistry.getInstance(this.conf);
    Path path = new Path(this.uri);
    File file = new File(this.uri);
    FileSplit split = new FileSplit(path, 0, file.length(), null);
    TaskAttemptContext ctx = new TaskAttemptContextImpl(this.conf, new TaskAttemptID());
    try (CSVRecordReader reader = new CSVRecordReader()) {
        reader.initialize(split, ctx);
        while (reader.nextKeyValue()) {
            RawRecordContainer raw = reader.getEvent();
            seqFile.append(new Text(), raw);
        }
    }
}
Example #5
Source File: VCFInputFormat.java From Hadoop-BAM with MIT License
private void fixBCFSplits(List<FileSplit> splits, List<InputSplit> newSplits) throws IOException {
    // addGuessedSplits() requires the given splits to be sorted by file
    // path, so do so. Although FileInputFormat.getSplits() does, at the time
    // of writing this, generate them in that order, we shouldn't rely on it.
    Collections.sort(splits, new Comparator<FileSplit>() {
        public int compare(FileSplit a, FileSplit b) {
            return a.getPath().compareTo(b.getPath());
        }
    });

    for (int i = 0; i < splits.size();)
        i = addGuessedSplits(splits, i, newSplits);
}
Example #6
Source File: GryoRecordReader.java From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
            GryoMapper.build().addRegistries(IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0)
        this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
Example #7
Source File: NYCTLCReader.java From datawave with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    super.initialize(genericSplit, context);
    Configuration job = context.getConfiguration();

    // open the file and seek to the start
    final Path file = ((FileSplit) genericSplit).getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);

    // read the header from the first line
    Text header = new Text();
    LfLineReader in = new LfLineReader(fileIn);
    in.readLine(header);
    in.close();
    rawHeader = header.toString();
    ((NYCTLCHelper) helper).parseHeader(rawHeader);
}
Example #8
Source File: CRAMRecordReader.java From Hadoop-BAM with MIT License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    if (isInitialized) {
        close();
    }
    isInitialized = true;

    final Configuration conf = context.getConfiguration();
    final FileSplit fileSplit = (FileSplit) split;
    final Path file = fileSplit.getPath();

    String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
    ReferenceSource refSource = new ReferenceSource(refSourcePath == null ? null :
            NIOFileUtil.asPath(refSourcePath));

    seekableStream = WrapSeekable.openPath(conf, file);
    start = fileSplit.getStart();
    length = fileSplit.getLength();
    long end = start + length;
    // CRAMIterator right shifts boundaries by 16 so we do the reverse here
    // also subtract one from end since CRAMIterator's boundaries are inclusive
    long[] boundaries = new long[] {start << 16, (end - 1) << 16};
    ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf);
    cramIterator = new CRAMIterator(seekableStream, refSource, boundaries, stringency);
}
Example #9
Source File: TestEnclosedEsriJsonRecordReader.java From spatial-framework-for-hadoop with Apache License 2.0
@Test
public void TestMrv1() throws Exception {
    //long [] recordBreaks = new long[] { 1872, 11284, 0, 0, 0, };
    //assertArrayEquals(new long[] { 94L }, getRecordIndexesInFile("sample-study-area.json", 0, 208));
    Path path = new Path(this.getClass().getResource("sample-study-area.json").getFile());
    org.apache.hadoop.mapred.JobConf conf = new org.apache.hadoop.mapred.JobConf();
    org.apache.hadoop.mapred.FileSplit split =
        new org.apache.hadoop.mapred.FileSplit(path, 0, 208, new String[0]);
    EnclosedEsriJsonRecordReader reader = new EnclosedEsriJsonRecordReader(split, conf);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    assertTrue(reader.next(key, value));
    //System.out.println(key.get() + " - " + value.toString());
    assertFalse(reader.next(key, value));
    reader.close();
}
Example #10
Source File: TestFastqInputFormat.java From Hadoop-BAM with MIT License
@Test
public void testReadFastqWithAmpersandQuality() throws IOException {
    writeToTempFastq(fastqWithAmpersandQuality);
    // split doesn't start at 0, forcing reader to advance looking for first complete record
    split = new FileSplit(new Path(tempFastq.toURI().toString()), 3, fastqWithAmpersandQuality.length(), null);

    FastqRecordReader reader = new FastqRecordReader(conf, split);

    boolean retval = reader.next(key, fragment);
    assertTrue(retval);
    assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString());
    assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());
    assertEquals("###########################################################################################", fragment.getQuality().toString());

    retval = reader.next(key, fragment);
    assertFalse(retval);
}
Example #11
Source File: Decompress.java From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach(splits -> {
        Path path = splits._1.getPath();
        FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
        FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

        writeInterleavedSplits(fqreader, fqreader2, new Configuration(),
                splitDir + "/" + path.getParent().getName() + "_" + splits._1.getStart() + ".fq");
    });
}
Example #12
Source File: MsgpackRecordReader.java From laser with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    String host = fileSplit.getPath().toString();
    Configuration conf = context.getConfiguration();
    String collection = conf.get("com.b5m.laser.collection");
    int port = conf.getInt("com.b5m.laser.msgpack.port", 0);
    method = conf.get("com.b5m.laser.msgpack.input.method");
    client = new MsgpackClient(host, port, collection);
    start = fileSplit.getStart();
    splitLenth = fileSplit.getLength();
    readLength = 0;
    vClass = conf.getClass("com.b5m.laser.msgpack.input.value.class", null);
    try {
        Object[] req = new Object[1];
        req[0] = start;
        client.writeIgnoreRetValue(req, method + "|start");
    } catch (Exception e) {
        throw new IOException(e.getLocalizedMessage());
    }
}
Example #13
Source File: WarpScriptRecordReader.java From warp10-platform with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    // Initialize wrapped reader
    reader.initialize(split, context);

    Configuration conf = context.getConfiguration();

    String code = Warp10InputFormat.getProperty(conf, this.suffix, WarpScriptInputFormat.WARPSCRIPT_INPUTFORMAT_SCRIPT, null);

    // Record the current path in the configuration if the split is a FileSplit
    if (split instanceof FileSplit) {
        conf.set(WarpScriptInputFormat.PATH_CONFIG_KEY, ((FileSplit) split).getPath().toString());
    }

    // Initialize WarpScriptExecutor
    try {
        this.executor = inputFormat.getWarpScriptExecutor(conf, code);
    } catch (WarpScriptException wse) {
        throw new IOException("Error while instatiating WarpScript executor", wse);
    }

    done = false;
}
Example #14
Source File: XmlCollectionWithTagInputFormat.java From vxquery with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    endTag = ENDING_TAG.getBytes(Charsets.UTF_8);
    startTag = STARTING_TAG.getBytes(Charsets.UTF_8);

    // open the file and seek to the start of the split
    start = split.getStart();
    // set the end of the file
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FileStatus fStatus = fs.getFileStatus(file);
    blocks = fs.getFileBlockLocations(fStatus, 0, fStatus.getLen());

    // seek the start of file
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
Example #15
Source File: JSONInputFormat.java From kite with Apache License 2.0
@Override
public RecordReader<E, Void> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Configuration conf = Hadoop.TaskAttemptContext.getConfiguration.invoke(context);
    Path path;
    if (split instanceof FileSplit) {
        path = ((FileSplit) split).getPath();
    } else {
        throw new DatasetOperationException(
                "Split is not a FileSplit: %s:%s",
                split.getClass().getCanonicalName(), split);
    }
    JSONFileReader<E> reader = new JSONFileReader<E>(
            path.getFileSystem(conf), path, accessor);
    reader.initialize();
    return reader.asRecordReader();
}
Example #16
Source File: HadoopFileInputSource.java From incubator-gobblin with Apache License 2.0
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
        throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }

    Configuration configuration = new Configuration();
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);

    String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
    TaskAttemptContext taskAttemptContext =
        getTaskAttemptContext(configuration, DummyTaskAttemptIDFactory.newTaskAttemptID());
    try {
        RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
        recordReader.initialize(fileSplit, taskAttemptContext);
        boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
        return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
}
Example #17
Source File: IntegrationTestBigLinkedList.java From hbase with Apache License 2.0
private static SortedSet<byte[]> readFileToSearch(final Configuration conf, final FileSystem fs,
        final LocatedFileStatus keyFileStatus) throws IOException, InterruptedException {
    SortedSet<byte[]> result = new TreeSet<>(Bytes.BYTES_COMPARATOR);
    // Return entries that are flagged Counts.UNDEFINED in the value. Return the row. This is
    // what is missing.
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    try (SequenceFileAsBinaryInputFormat.SequenceFileAsBinaryRecordReader rr =
            new SequenceFileAsBinaryInputFormat.SequenceFileAsBinaryRecordReader()) {
        InputSplit is = new FileSplit(keyFileStatus.getPath(), 0, keyFileStatus.getLen(), new String[] {});
        rr.initialize(is, context);
        while (rr.nextKeyValue()) {
            rr.getCurrentKey();
            BytesWritable bw = rr.getCurrentValue();
            if (Verify.VerifyReducer.whichType(bw.getBytes()) == Verify.Counts.UNDEFINED) {
                byte[] key = new byte[rr.getCurrentKey().getLength()];
                System.arraycopy(rr.getCurrentKey().getBytes(), 0, key, 0, rr.getCurrentKey().getLength());
                result.add(key);
            }
        }
    }
    return result;
}
Example #18
Source File: AggregateXMLReader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    initConfig(context);
    initAggConf(context);

    f = XMLInputFactory.newInstance();
    setFile(((FileSplit) inSplit).getPath());
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        inSplit = iterator.next();
    }
    initStreamReader(inSplit);
}
Example #19
Source File: TeraScheduler.java From incubator-tez with Apache License 2.0
/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. It will move placed splits to front and unplacable splits
 * to the end.
 * @return a new list of FileSplits that are modified to have the
 *         best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
    solve();
    FileSplit[] result = new FileSplit[realSplits.length];
    int left = 0;
    int right = realSplits.length - 1;
    for (int i = 0; i < splits.length; ++i) {
        if (splits[i].isAssigned) {
            // copy the split and fix up the locations
            ((TeraFileSplit) realSplits[i]).setLocations(new String[]{splits[i].locations.get(0).hostname});
            result[left++] = realSplits[i];
        } else {
            result[right--] = realSplits[i];
        }
    }
    List<InputSplit> ret = new ArrayList<InputSplit>();
    for (FileSplit fs : result) {
        ret.add(fs);
    }
    return ret;
}
Example #20
Source File: ArchiveRecordReader.java From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    initConfig(context);
    allowEmptyMeta = conf.getBoolean(CONF_INPUT_ARCHIVE_METADATA_OPTIONAL, false);

    setFile(((FileSplit) inSplit).getPath());
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        inSplit = iterator.next();
    }
    initStream(inSplit);
}
Example #21
Source File: DelegatingAvroKeyInputFormat.java From incubator-pinot with Apache License 2.0
public org.apache.hadoop.mapreduce.RecordReader<org.apache.avro.mapred.AvroKey<T>, NullWritable> createRecordReader(
        InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    LOGGER.info("DelegatingAvroKeyInputFormat.createRecordReader() for split:{}", split);
    FileSplit fileSplit = (FileSplit) split;
    Configuration configuration = context.getConfiguration();
    String sourceName = getSourceNameFromPath(fileSplit, configuration);
    LOGGER.info("Source Name for path {} : {}", fileSplit.getPath(), sourceName);
    Map<String, String> schemaJSONMapping = new ObjectMapper()
        .readValue(configuration.get("schema.json.mapping"), MAP_STRING_STRING_TYPE);

    LOGGER.info("Schema JSON Mapping: {}", schemaJSONMapping);

    String sourceSchemaJSON = schemaJSONMapping.get(sourceName);

    Schema schema = new Schema.Parser().parse(sourceSchemaJSON);
    return new AvroKeyRecordReader<T>(schema);
}
Example #22
Source File: JMatrixMultiplicationStep1.java From RecommendationEngine with MIT License
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String parentName = fileSplit.getPath().getParent().getName();

    String[] tokens = null;

    if (parentName.equals("step6")) {
        tokens = DELIMITER.split(value.toString());
        k.set("A");
        v.set(tokens[0] + "," + tokens[1] + "," + tokens[2]);
        context.write(k, v);
    } else if (parentName.equals("step1")) {
        tokens = value.toString().split(" ");
        k.set("B");
        v.set(tokens[0] + "," + tokens[1] + "," + tokens[2]);
        context.write(k, v);
    }
}
Example #23
Source File: TestJobSplitWriter.java From big-c with Apache License 2.0
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
    TEST_DIR.mkdirs();
    try {
        Configuration conf = new Configuration();
        conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
        Path submitDir = new Path(TEST_DIR.getAbsolutePath());
        FileSystem fs = FileSystem.getLocal(conf);
        org.apache.hadoop.mapred.FileSplit split =
            new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1,
                new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" });
        JobSplitWriter.createSplitFiles(submitDir, conf, fs,
            new org.apache.hadoop.mapred.InputSplit[] { split });
        JobSplit.TaskSplitMetaInfo[] infos =
            SplitMetaInfoReader.readSplitMetaInfo(new JobID(), fs, conf, submitDir);
        assertEquals("unexpected number of splits", 1, infos.length);
        assertEquals("unexpected number of split locations", 4, infos[0].getLocations().length);
    } finally {
        FileUtil.fullyDelete(TEST_DIR);
    }
}
Example #24
Source File: CombineDocumentSplit.java From marklogic-contentpump with Apache License 2.0
public void readFields(DataInput in) throws IOException {
    // splits
    int splitSize = in.readInt();
    splits = new ArrayList<FileSplit>();
    for (int i = 0; i < splitSize; i++) {
        Path path = new Path(Text.readString(in));
        long start = in.readLong();
        long len = in.readLong();
        FileSplit split = new FileSplit(path, start, len, null);
        splits.add(split);
    }
    // length
    length = in.readLong();
    // locations
    locations = new HashSet<String>();
}
Example #25
Source File: TeraScheduler.java From pravega-samples with Apache License 2.0
public TeraScheduler(FileSplit[] realSplits, Configuration conf) throws IOException {
    this.realSplits = realSplits;
    this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
    Map<String, Host> hostTable = new HashMap<String, Host>();
    splits = new Split[realSplits.length];
    for (FileSplit realSplit : realSplits) {
        Split split = new Split(realSplit.getPath().toString());
        splits[remainingSplits++] = split;
        for (String hostname : realSplit.getLocations()) {
            Host host = hostTable.get(hostname);
            if (host == null) {
                host = new Host(hostname);
                hostTable.put(hostname, host);
                hosts.add(host);
            }
            host.splits.add(split);
            split.locations.add(host);
        }
    }
}
Example #26
Source File: TestQseqInputFormat.java From Hadoop-BAM with MIT License
@Test
public void testGzCompressedInput() throws IOException {
    // write gzip-compressed data
    GzipCodec codec = new GzipCodec();
    PrintWriter qseqOut = new PrintWriter(
        new BufferedOutputStream(codec.createOutputStream(new FileOutputStream(tempGz))));
    qseqOut.write(twoQseq);
    qseqOut.close();

    // now try to read it
    split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null);
    QseqRecordReader reader = new QseqRecordReader(conf, split);

    boolean retval = reader.next(key, fragment);
    assertTrue(retval);
    assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString());
    assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());

    retval = reader.next(key, fragment);
    assertTrue(retval);
    assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString());
    assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
}
Example #27
Source File: TestFastqInputFormat.java From Hadoop-BAM with MIT License
@Test
public void testReadStartInMiddle() throws IOException {
    writeToTempFastq(twoFastq);
    split = new FileSplit(new Path(tempFastq.toURI().toString()), 10, twoFastq.length() - 10, null);

    FastqRecordReader reader = new FastqRecordReader(conf, split);
    assertEquals(oneFastq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneFastq
    assertEquals(0.0, reader.getProgress(), 0.01);

    boolean retval = reader.next(key, fragment);
    assertTrue(retval);
    assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString());
    assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
    assertEquals("BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", fragment.getQuality().toString());

    assertEquals(twoFastq.length(), reader.getPos()); // now should be at the end of the data
    assertEquals(1.0, reader.getProgress(), 0.01);

    retval = reader.next(key, fragment);
    assertFalse(retval);
}
Example #28
Source File: TestFastqInputFormat.java From Hadoop-BAM with MIT License
private FastqRecordReader createReaderForOneFastq() throws IOException {
    writeToTempFastq(oneFastq);
    split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, oneFastq.length(), null);
    return new FastqRecordReader(conf, split);
}
Example #29
Source File: HadoopFileInputSource.java From incubator-gobblin with Apache License 2.0
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        Job job = Job.getInstance(new Configuration());

        if (state.contains(FILE_INPUT_PATHS_KEY)) {
            for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
                FileInputFormat.addInputPath(job, new Path(inputPath));
            }
        }

        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
        List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
        if (fileSplits == null || fileSplits.isEmpty()) {
            return ImmutableList.of();
        }

        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
            ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
            : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }

        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}
Example #30
Source File: TestQseqInputFormat.java From Hadoop-BAM with MIT License
private void verifySkipFailedQC() throws IOException {
    writeToTempQseq(twoQseq);
    split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, twoQseq.length(), null);
    QseqRecordReader reader = new QseqRecordReader(conf, split);

    boolean found = reader.next(key, fragment);
    assertTrue(found);
    assertEquals(2, (int) fragment.getRead());

    found = reader.next(key, fragment);
    assertFalse(found);
}