org.apache.hadoop.mapreduce.lib.input.FileSplit Java Examples

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit. Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
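Most of the examples below share a single pattern: a RecordReader's initialize() method casts the generic InputSplit it receives to a FileSplit, then uses getPath(), getStart(), and getLength() to open the underlying file and seek to the beginning of the split. The minimal sketch below shows that pattern in isolation; the class name MinimalFileSplitReader and its stubbed-out reader methods are illustrative only and do not come from any of the projects listed here.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MinimalFileSplitReader extends RecordReader<LongWritable, Text> {
    private FSDataInputStream in;
    private long start;
    private long end;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        // File-based input formats hand each task a FileSplit describing a byte range of one file.
        FileSplit split = (FileSplit) genericSplit;
        Configuration conf = context.getConfiguration();

        Path file = split.getPath();      // file backing this split
        start = split.getStart();         // byte offset where this split begins
        end = start + split.getLength();  // byte offset where this split ends

        FileSystem fs = file.getFileSystem(conf);
        in = fs.open(file);
        in.seek(start);                   // position the stream at the start of the split
    }

    // Record-parsing logic omitted; a real reader would scan from 'start' to 'end'.
    @Override public boolean nextKeyValue() { return false; }
    @Override public LongWritable getCurrentKey() { return null; }
    @Override public Text getCurrentValue() { return null; }
    @Override public float getProgress() { return 0.0f; }
    @Override public void close() throws IOException { if (in != null) in.close(); }
}

Examples #6 and #14 below follow this same open-and-seek sequence, adding format-specific parsing on top of it.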
Example #1
Source File: MsgpackInputFormat.java    From laser with Apache License 2.0
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
		InterruptedException {
	Configuration conf = context.getConfiguration();
	long goalSize = goalsize(conf);
	String[] urlList = conf.get("com.b5m.laser.msgpack.host").split(",");
	int numNode = urlList.length;
	List<InputSplit> splits = new ArrayList<InputSplit>();
	long splitLength = (long) Math.ceil((double) goalSize / numNode);

	long bytesRemaining = goalSize;
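	// Create one split per msgpack host; each covers splitLength bytes of the goal size, and the last takes whatever remains.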
	for (int i = 0; i < numNode; i++) {
		if (bytesRemaining > splitLength) {
			splits.add(new FileSplit(new Path(urlList[i]), i * splitLength,
					splitLength, null));
		} else {
			splits.add(new FileSplit(new Path(urlList[i]), i * splitLength,
					bytesRemaining, null));
		}
		bytesRemaining -= splitLength;
	}
	return splits;
}
 
Example #2
Source File: SSTableSplitRecordReader.java    From hadoop-sstable with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException {

    this.context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    this.sstableFile = fileSplit.getPath();
    // The SSTableSplitInputFormat is not splittable, so the split length is the whole file.
    this.totalFileSize = fileSplit.getLength();


    Configuration conf = context.getConfiguration();
    FileSystem fs = sstableFile.getFileSystem(conf);
    this.rawInputStream = fs.open(sstableFile);

    this.splitSize = conf.getLong(HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB) * 1024 * 1024;

    this.index = new IndexOffsetScanner(sstableFile, fs);
}
 
Example #3
Source File: IndexedStorage.java    From spork with Apache License 2.0
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
    IndexManager im = null;
    try {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path indexFile = IndexManager.getIndexFileName(((FileSplit)split).getPath());
        im = new IndexManager(fs.getFileStatus(indexFile));
        im.openIndexFile(fs);
        im.ReadIndexHeader();
        im.ReadIndexFooter();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return new IndexedStorageRecordReader(im);
}
 
Example #4
Source File: CSVTestFileLoader.java    From datawave with Apache License 2.0
@Override
public void loadTestData(SequenceFile.Writer seqFile) throws IOException {
    TypeRegistry.reset();
    TypeRegistry.getInstance(this.conf);
    Path path = new Path(this.uri);
    File file = new File(this.uri);
    FileSplit split = new FileSplit(path, 0, file.length(), null);
    TaskAttemptContext ctx = new TaskAttemptContextImpl(this.conf, new TaskAttemptID());
    
    try (CSVRecordReader reader = new CSVRecordReader()) {
        reader.initialize(split, ctx);
        while (reader.nextKeyValue()) {
            RawRecordContainer raw = reader.getEvent();
            seqFile.append(new Text(), raw);
        }
    }
}
 
Example #5
Source File: VCFInputFormat.java    From Hadoop-BAM with MIT License
private void fixBCFSplits(
		List<FileSplit> splits, List<InputSplit> newSplits)
	throws IOException
{
	// addGuessedSplits() requires the given splits to be sorted by file
	// path, so do so. Although FileInputFormat.getSplits() does, at the time
	// of writing this, generate them in that order, we shouldn't rely on it.
	Collections.sort(splits, new Comparator<FileSplit>() {
		public int compare(FileSplit a, FileSplit b) {
			return a.getPath().compareTo(b.getPath());
		}
	});

	for (int i = 0; i < splits.size();)
		i = addGuessedSplits(splits, i, newSplits);
}
 
Example #6
Source File: GryoRecordReader.java    From tinkerpop with Apache License 2.0
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit split = (FileSplit) genericSplit;
    final Configuration configuration = context.getConfiguration();
    if (configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration), Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.gryoReader = GryoReader.build().mapper(
            GryoMapper.build().addRegistries(IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(configuration))).create()).create();
    long start = split.getStart();
    final Path file = split.getPath();
    if (null != new CompressionCodecFactory(configuration).getCodec(file)) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }
    // open the file and seek to the start of the split
    this.inputStream = file.getFileSystem(configuration).open(split.getPath());
    this.splitLength = split.getLength();
    if (this.splitLength > 0) this.splitLength -= (seekToHeader(this.inputStream, start) - start);
}
 
Example #7
Source File: NYCTLCReader.java    From datawave with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    super.initialize(genericSplit, context);
    Configuration job = context.getConfiguration();
    
    // open the file and seek to the start
    final Path file = ((FileSplit) genericSplit).getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    
    // read the header from the first line
    Text header = new Text();
    LfLineReader in = new LfLineReader(fileIn);
    in.readLine(header);
    in.close();
    
    rawHeader = header.toString();
    ((NYCTLCHelper) helper).parseHeader(rawHeader);
}
 
Example #8
Source File: CRAMRecordReader.java    From Hadoop-BAM with MIT License
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
  if(isInitialized) {
    close();
  }
  isInitialized = true;

  final Configuration conf = context.getConfiguration();
  final FileSplit fileSplit = (FileSplit) split;
  final Path file  = fileSplit.getPath();

  String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
  ReferenceSource refSource = new ReferenceSource(refSourcePath == null ? null :
      NIOFileUtil.asPath(refSourcePath));

  seekableStream = WrapSeekable.openPath(conf, file);
  start = fileSplit.getStart();
  length = fileSplit.getLength();
  long end = start + length;
  // CRAMIterator right shifts boundaries by 16 so we do the reverse here
  // also subtract one from end since CRAMIterator's boundaries are inclusive
  long[] boundaries = new long[] {start << 16, (end - 1) << 16};
  ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf);
  cramIterator = new CRAMIterator(seekableStream, refSource, boundaries, stringency);
}
 
Example #9
Source File: TestEnclosedEsriJsonRecordReader.java    From spatial-framework-for-hadoop with Apache License 2.0
@Test
public void TestMrv1() throws Exception {
	//long [] recordBreaks = new long[] { 1872, 11284, 0, 0, 0,  };
	//assertArrayEquals(new long[] { 94L }, getRecordIndexesInFile("sample-study-area.json", 0, 208));
	Path path = new Path(this.getClass().getResource("sample-study-area.json").getFile());
	org.apache.hadoop.mapred.JobConf conf = new org.apache.hadoop.mapred.JobConf();
	org.apache.hadoop.mapred.FileSplit split =
		new org.apache.hadoop.mapred.FileSplit(path, 0, 208, new String[0]);
	EnclosedEsriJsonRecordReader reader = new EnclosedEsriJsonRecordReader(split, conf);
	LongWritable key = reader.createKey();
	Text value = reader.createValue();
	assertTrue (reader.next(key, value));
	//System.out.println(key.get() + " - " + value.toString());
	assertFalse (reader.next(key, value));
	reader.close();
}
 
Example #10
Source File: TestFastqInputFormat.java    From Hadoop-BAM with MIT License
@Test
public void testReadFastqWithAmpersandQuality() throws IOException
{
	writeToTempFastq(fastqWithAmpersandQuality);
	// split doesn't start at 0, forcing reader to advance looking for first complete record
	split = new FileSplit(new Path(tempFastq.toURI().toString()), 3, fastqWithAmpersandQuality.length(), null);

	FastqRecordReader reader = new FastqRecordReader(conf, split);

	boolean retval = reader.next(key, fragment);
	assertTrue(retval);
	assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString());
	assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());
	assertEquals("###########################################################################################", fragment.getQuality().toString());

	retval = reader.next(key, fragment);
	assertFalse(retval);
}
 
Example #11
Source File: Decompress.java    From ViraPipe with MIT License
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {

    List<FileSplit> nlif = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    List<FileSplit> nlif2 = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    JavaRDD<FileSplit> splitRDD = sc.parallelize(nlif);
    JavaRDD<FileSplit> splitRDD2 = sc.parallelize(nlif2);
    JavaPairRDD<FileSplit, FileSplit> zips = splitRDD.zip(splitRDD2);

    zips.foreach( splits ->  {
      Path path = splits._1.getPath();
      FastqRecordReader fqreader = new FastqRecordReader(new Configuration(), splits._1);
      FastqRecordReader fqreader2 = new FastqRecordReader(new Configuration(), splits._2);

      writeInterleavedSplits(fqreader, fqreader2, new Configuration(), splitDir+"/"+path.getParent().getName()+"_"+splits._1.getStart()+".fq");
    });
  }
 
Example #12
Source File: MsgpackRecordReader.java    From laser with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
		throws IOException, InterruptedException {
	FileSplit fileSplit = (FileSplit) split;
	String host = fileSplit.getPath().toString();
	Configuration conf = context.getConfiguration();
	String collection = conf.get("com.b5m.laser.collection");
	int port = conf.getInt("com.b5m.laser.msgpack.port", 0);
	method = conf.get("com.b5m.laser.msgpack.input.method");
	client = new MsgpackClient(host, port, collection);
	start = fileSplit.getStart();
	splitLenth = fileSplit.getLength();
	readLength = 0;
	vClass = conf.getClass("com.b5m.laser.msgpack.input.value.class", null);
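	// Notify the remote msgpack service of this split's start offset via the method's "|start" call.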
	try {
		Object[] req = new Object[1];
		req[0] = start;
		client.writeIgnoreRetValue(req, method + "|start");
	} catch (Exception e) {
		throw new IOException(e.getLocalizedMessage());
	}
}
 
Example #13
Source File: WarpScriptRecordReader.java    From warp10-platform with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  // Initialize wrapped reader
  reader.initialize(split, context);
  
  Configuration conf = context.getConfiguration();
  
  String code = Warp10InputFormat.getProperty(conf, this.suffix, WarpScriptInputFormat.WARPSCRIPT_INPUTFORMAT_SCRIPT, null);

  // Record the current path in the configuration if the split is a FileSplit
  if (split instanceof FileSplit) {
    conf.set(WarpScriptInputFormat.PATH_CONFIG_KEY, ((FileSplit) split).getPath().toString());      
  }
  
  // Initialize WarpScriptExecutor
  try {
    this.executor = inputFormat.getWarpScriptExecutor(conf, code);
  } catch (WarpScriptException wse) {
    throw new IOException("Error while instantiating WarpScript executor", wse);
  }
  
  done = false;
}
 
Example #14
Source File: XmlCollectionWithTagInputFormat.java    From vxquery with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    endTag = ENDING_TAG.getBytes(Charsets.UTF_8);
    startTag = STARTING_TAG.getBytes(Charsets.UTF_8);

    // open the file and seek to the start of the split
    start = split.getStart();
    // set the end of the file
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FileStatus fStatus = fs.getFileStatus(file);
    blocks = fs.getFileBlockLocations(fStatus, 0, fStatus.getLen());
    // seek the start of file
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
 
Example #15
Source File: JSONInputFormat.java    From kite with Apache License 2.0
@Override
public RecordReader<E, Void> createRecordReader(InputSplit split,
                                                TaskAttemptContext context)
    throws IOException, InterruptedException {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(context);
  Path path;
  if (split instanceof FileSplit) {
    path = ((FileSplit) split).getPath();
  } else {
    throw new DatasetOperationException(
        "Split is not a FileSplit: %s:%s",
        split.getClass().getCanonicalName(), split);
  }
  JSONFileReader<E> reader = new JSONFileReader<E>(
      path.getFileSystem(conf), path, accessor);
  reader.initialize();
  return reader.asRecordReader();
}
 
Example #16
Source File: HadoopFileInputSource.java    From incubator-gobblin with Apache License 2.0
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }

  Configuration configuration = new Configuration();
  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);

  String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
  TaskAttemptContext taskAttemptContext =
      getTaskAttemptContext(configuration, DummyTaskAttemptIDFactory.newTaskAttemptID());
  try {
    RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
  } catch (InterruptedException ie) {
    throw new IOException(ie);
  }
}
 
Example #17
Source File: IntegrationTestBigLinkedList.java    From hbase with Apache License 2.0
private static SortedSet<byte[]> readFileToSearch(final Configuration conf,
    final FileSystem fs, final LocatedFileStatus keyFileStatus) throws IOException,
    InterruptedException {
  SortedSet<byte []> result = new TreeSet<>(Bytes.BYTES_COMPARATOR);
  // Return entries that are flagged Counts.UNDEFINED in the value. Return the row. This is
  // what is missing.
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  try (SequenceFileAsBinaryInputFormat.SequenceFileAsBinaryRecordReader rr =
      new SequenceFileAsBinaryInputFormat.SequenceFileAsBinaryRecordReader()) {
    InputSplit is =
      new FileSplit(keyFileStatus.getPath(), 0, keyFileStatus.getLen(), new String [] {});
    rr.initialize(is, context);
    while (rr.nextKeyValue()) {
      rr.getCurrentKey();
      BytesWritable bw = rr.getCurrentValue();
      if (Verify.VerifyReducer.whichType(bw.getBytes()) == Verify.Counts.UNDEFINED) {
        byte[] key = new byte[rr.getCurrentKey().getLength()];
        System.arraycopy(rr.getCurrentKey().getBytes(), 0, key, 0, rr.getCurrentKey()
            .getLength());
        result.add(key);
      }
    }
  }
  return result;
}
 
Example #18
Source File: AggregateXMLReader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
    initConfig(context);
    initAggConf(context);
    
    f = XMLInputFactory.newInstance();
    setFile(((FileSplit) inSplit).getPath());
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if(status.isDirectory()) {
        iterator = new FileIterator((FileSplit)inSplit, context);
        inSplit = iterator.next();
    }
    initStreamReader(inSplit);
}
 
Example #19
Source File: TeraScheduler.java    From incubator-tez with Apache License 2.0
/**
 * Solve the schedule and modify the FileSplit array to reflect the new
 * schedule. It will move placed splits to the front and unplaceable splits
 * to the end.
 * @return a new list of FileSplits that are modified to have the
 *    best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
  solve();
  FileSplit[] result = new FileSplit[realSplits.length];
  int left = 0;
  int right = realSplits.length - 1;
  for(int i=0; i < splits.length; ++i) {
    if (splits[i].isAssigned) {
      // copy the split and fix up the locations
      ((TeraFileSplit) realSplits[i]).setLocations
         (new String[]{splits[i].locations.get(0).hostname});
      result[left++] = realSplits[i];
    } else {
      result[right--] = realSplits[i];
    }
  }
  List<InputSplit> ret = new ArrayList<InputSplit>();
  for (FileSplit fs : result) {
    ret.add(fs);
  }
  return ret;
}
 
Example #20
Source File: ArchiveRecordReader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
    initConfig(context);
    allowEmptyMeta = conf.getBoolean(
        CONF_INPUT_ARCHIVE_METADATA_OPTIONAL, false);
     
    setFile(((FileSplit) inSplit).getPath());
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if(status.isDirectory()) {
        iterator = new FileIterator((FileSplit)inSplit, context);
        inSplit = iterator.next();
    }
    initStream(inSplit);
}
 
Example #21
Source File: DelegatingAvroKeyInputFormat.java    From incubator-pinot with Apache License 2.0
public org.apache.hadoop.mapreduce.RecordReader<org.apache.avro.mapred.AvroKey<T>, NullWritable> createRecordReader(
    InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  LOGGER.info("DelegatingAvroKeyInputFormat.createRecordReader()  for split:{}", split);
  FileSplit fileSplit = (FileSplit) split;
  Configuration configuration = context.getConfiguration();
  String sourceName = getSourceNameFromPath(fileSplit, configuration);
  LOGGER.info("Source Name for path {} : {}", fileSplit.getPath(), sourceName);
  Map<String, String> schemaJSONMapping = new ObjectMapper()
      .readValue(configuration.get("schema.json.mapping"), MAP_STRING_STRING_TYPE);

  LOGGER.info("Schema JSON Mapping: {}", schemaJSONMapping);

  String sourceSchemaJSON = schemaJSONMapping.get(sourceName);

  Schema schema = new Schema.Parser().parse(sourceSchemaJSON);
  return new AvroKeyRecordReader<T>(schema);
}
 
Example #22
Source File: JMatrixMultiplicationStep1.java    From RecommendationEngine with MIT License
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    String parentName = fileSplit.getPath().getParent().getName();

    String[] tokens = null;

    if (parentName.equals("step6")) {
        tokens = DELIMITER.split(value.toString());
        k.set("A");
        v.set(tokens[0] + "," + tokens[1] + "," + tokens[2]);
        context.write(k, v);
    } else if (parentName.equals("step1")) {
        tokens = value.toString().split(" ");
        k.set("B");
        v.set(tokens[0] + "," + tokens[1] + "," + tokens[2]);
        context.write(k, v);
    }
}
 
Example #23
Source File: TestJobSplitWriter.java    From big-c with Apache License 2.0
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
  TEST_DIR.mkdirs();
  try {
    Configuration conf = new Configuration();
    conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
    Path submitDir = new Path(TEST_DIR.getAbsolutePath());
    FileSystem fs = FileSystem.getLocal(conf);
    org.apache.hadoop.mapred.FileSplit split =
        new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1,
            new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" });
    JobSplitWriter.createSplitFiles(submitDir, conf, fs,
        new org.apache.hadoop.mapred.InputSplit[] { split });
    JobSplit.TaskSplitMetaInfo[] infos =
        SplitMetaInfoReader.readSplitMetaInfo(new JobID(), fs, conf,
            submitDir);
    assertEquals("unexpected number of splits", 1, infos.length);
    assertEquals("unexpected number of split locations",
        4, infos[0].getLocations().length);
  } finally {
    FileUtil.fullyDelete(TEST_DIR);
  }
}
 
Example #24
Source File: CombineDocumentSplit.java    From marklogic-contentpump with Apache License 2.0
public void readFields(DataInput in) throws IOException {
    // splits
    int splitSize = in.readInt();
    splits = new ArrayList<FileSplit>();
    for (int i = 0; i < splitSize; i++) {
        Path path = new Path(Text.readString(in));
        long start = in.readLong();
        long len = in.readLong();
        FileSplit split = new FileSplit(path, start, len, null);
        splits.add(split);
    }
    // length
    length = in.readLong();
    // locations
    locations = new HashSet<String>();
}
 
Example #25
Source File: TeraScheduler.java    From pravega-samples with Apache License 2.0
public TeraScheduler(FileSplit[] realSplits,
                     Configuration conf) throws IOException {
  this.realSplits = realSplits;
  this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
  Map<String, Host> hostTable = new HashMap<String, Host>();
  splits = new Split[realSplits.length];
  for(FileSplit realSplit: realSplits) {
    Split split = new Split(realSplit.getPath().toString());
    splits[remainingSplits++] = split;
    for(String hostname: realSplit.getLocations()) {
      Host host = hostTable.get(hostname);
      if (host == null) {
        host = new Host(hostname);
        hostTable.put(hostname, host);
        hosts.add(host);
      }
      host.splits.add(split);
      split.locations.add(host);
    }
  }
}
 
Example #26
Source File: TestQseqInputFormat.java    From Hadoop-BAM with MIT License
@Test
public void testGzCompressedInput() throws IOException
{
	// write gzip-compressed data
	GzipCodec codec = new GzipCodec();
	PrintWriter qseqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) );
	qseqOut.write(twoQseq);
	qseqOut.close();

	// now try to read it
	split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null);
	QseqRecordReader reader = new QseqRecordReader(conf, split);

	boolean retval = reader.next(key, fragment);
	assertTrue(retval);
	assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString());
	assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());

	retval = reader.next(key, fragment);
	assertTrue(retval);
	assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString());
	assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
}
 
Example #27
Source File: TestFastqInputFormat.java    From Hadoop-BAM with MIT License
@Test
public void testReadStartInMiddle() throws IOException
{
	writeToTempFastq(twoFastq);
	split = new FileSplit(new Path(tempFastq.toURI().toString()), 10, twoFastq.length() - 10, null);

	FastqRecordReader reader = new FastqRecordReader(conf, split);

	assertEquals(oneFastq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneFastq
	assertEquals(0.0, reader.getProgress(), 0.01);

	boolean retval = reader.next(key, fragment);
	assertTrue(retval);
	assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString());
	assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
	assertEquals("BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", fragment.getQuality().toString());

	assertEquals(twoFastq.length(), reader.getPos()); // now should be at the end of the data
	assertEquals(1.0, reader.getProgress(), 0.01);

	retval = reader.next(key, fragment);
	assertFalse(retval);
}
 
Example #28
Source File: TestFastqInputFormat.java    From Hadoop-BAM with MIT License
private FastqRecordReader createReaderForOneFastq() throws IOException
{
	writeToTempFastq(oneFastq);
	split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, oneFastq.length(), null);

	return new FastqRecordReader(conf, split);
}
 
Example #29
Source File: HadoopFileInputSource.java    From incubator-gobblin with Apache License 2.0
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    Job job = Job.getInstance(new Configuration());

    if (state.contains(FILE_INPUT_PATHS_KEY)) {
      for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
        FileInputFormat.addInputPath(job, new Path(inputPath));
      }
    }

    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
    List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
    if (fileSplits == null || fileSplits.isEmpty()) {
      return ImmutableList.of();
    }

    Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
        ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()) : null;
    String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

    List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
    for (InputSplit inputSplit : fileSplits) {
      // Create one WorkUnit per InputSplit
      FileSplit fileSplit = (FileSplit) inputSplit;
      Extract extract = createExtract(tableType, tableNamespace, tableName);
      WorkUnit workUnit = WorkUnit.create(extract);
      workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
      workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
      workUnits.add(workUnit);
    }

    return workUnits;
  } catch (IOException ioe) {
    throw new RuntimeException("Failed to get workunits", ioe);
  }
}
 
Example #30
Source File: TestQseqInputFormat.java    From Hadoop-BAM with MIT License
private void verifySkipFailedQC() throws IOException
{
	writeToTempQseq(twoQseq);
	split = new FileSplit(new Path(tempQseq.toURI().toString()), 0, twoQseq.length(), null);
	QseqRecordReader reader = new QseqRecordReader(conf, split);

	boolean found = reader.next(key, fragment);
	assertTrue(found);
	assertEquals(2, (int)fragment.getRead());

	found = reader.next(key, fragment);
	assertFalse(found);
}