Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit

The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: tinkerpop   Source File: GryoRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares this reader for one file split.
 *
 * <p>Loads the optional graph filter from the job configuration, builds the Gryo
 * reader, rejects compressed input, opens the split's file and adjusts the
 * remaining split length by the distance {@code seekToHeader} advanced.
 *
 * @param genericSplit the split to read; must be a {@link FileSplit}
 * @param context      task context supplying the Hadoop configuration
 * @throws IOException           if the file cannot be opened or positioned
 * @throws IllegalStateException if a compression codec is registered for the file
 */
@Override
public void initialize(final InputSplit genericSplit, final TaskAttemptContext context) throws IOException {
    final FileSplit fileSplit = (FileSplit) genericSplit;
    final Configuration hadoopConf = context.getConfiguration();

    // The graph filter is optional: only deserialize it when the key is present.
    final boolean filterConfigured = hadoopConf.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null;
    if (filterConfigured) {
        this.graphFilter = VertexProgramHelper.deserialize(
                ConfUtil.makeApacheConfiguration(hadoopConf),
                Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }

    this.gryoReader = GryoReader.build()
            .mapper(GryoMapper.build()
                    .addRegistries(IoRegistryHelper.createRegistries(ConfUtil.makeApacheConfiguration(hadoopConf)))
                    .create())
            .create();

    final long splitStart = fileSplit.getStart();
    final Path file = fileSplit.getPath();

    final boolean compressed = new CompressionCodecFactory(hadoopConf).getCodec(file) != null;
    if (compressed) {
        throw new IllegalStateException("Compression is not supported for the (binary) Gryo format");
    }

    // Open the file; positioning is delegated to seekToHeader below.
    this.inputStream = file.getFileSystem(hadoopConf).open(fileSplit.getPath());
    this.splitLength = fileSplit.getLength();
    if (this.splitLength > 0) {
        // Bytes skipped before the position returned by seekToHeader no longer belong to this split.
        this.splitLength -= (seekToHeader(this.inputStream, splitStart) - splitStart);
    }
}
 
Example 2
Source Project: Hadoop-BAM   Source File: TestFastqInputFormat.java    License: MIT License 6 votes vote down vote up
/**
 * Reads a two-record FASTQ file through a split that begins 10 bytes into the
 * file and checks that only the second record is returned, with position and
 * progress reported at the start and at the end of the data.
 */
@Test
public void testReadStartInMiddle() throws IOException
{
	writeToTempFastq(twoFastq);

	final Path fastqPath = new Path(tempFastq.toURI().toString());
	split = new FileSplit(fastqPath, 10, twoFastq.length() - 10, null);

	FastqRecordReader reader = new FastqRecordReader(conf, split);

	// The reader should already sit at the start of the second record.
	// The +1 accounts for the \n that is not part of oneFastq.
	assertEquals(oneFastq.length() + 1, reader.getPos());
	assertEquals(0.0, reader.getProgress(), 0.01);

	assertTrue(reader.next(key, fragment));
	assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString());
	assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
	assertEquals("BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", fragment.getQuality().toString());

	// After the single record the reader must be at the end of the data.
	assertEquals(twoFastq.length(), reader.getPos());
	assertEquals(1.0, reader.getProgress(), 0.01);

	assertFalse(reader.next(key, fragment));
}
 
Example 3
Source Project: hadoop-sstable   Source File: SSTableSplitRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Records the task context, opens the SSTable file named by the split and
 * prepares the index scanner.
 *
 * @param genericSplit       the split to read; must be a {@link FileSplit}
 * @param taskAttemptContext task context supplying the Hadoop configuration
 * @throws IOException if the file system or the file cannot be opened
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException {
    this.context = taskAttemptContext;

    final FileSplit sstableSplit = (FileSplit) genericSplit;
    this.sstableFile = sstableSplit.getPath();
    // The SSTableSplitInputFormat is not splittable, so the split length is the whole file.
    this.totalFileSize = sstableSplit.getLength();

    final Configuration hadoopConf = context.getConfiguration();
    final FileSystem fileSystem = sstableFile.getFileSystem(hadoopConf);
    this.rawInputStream = fileSystem.open(sstableFile);

    // The configured value is in megabytes; convert it to bytes.
    final long splitSizeMb = hadoopConf.getLong(
            HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB);
    this.splitSize = splitSizeMb * 1024 * 1024;

    this.index = new IndexOffsetScanner(sstableFile, fileSystem);
}
 
Example 4
Source Project: spork   Source File: IndexedStorage.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a record reader backed by the index file that accompanies the split's data file.
 *
 * <p>The index file is located via {@code IndexManager.getIndexFileName}, opened, and its
 * header and footer are read before the reader is constructed.
 *
 * @param split   the split to read; must be a {@link FileSplit}
 * @param context task context supplying the Hadoop configuration
 * @return a reader wrapping the fully initialised index manager
 * @throws IllegalStateException if the index cannot be located or read; the underlying
 *         {@link IOException} is attached as the cause. The method signature declares no
 *         checked exceptions, so the failure is reported unchecked rather than being
 *         swallowed and surfacing later as a null or half-initialised index manager.
 */
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
    try {
        FileSystem fs = FileSystem.get(context.getConfiguration());
        Path indexFile = IndexManager.getIndexFileName(((FileSplit)split).getPath());
        IndexManager im = new IndexManager(fs.getFileStatus(indexFile));
        im.openIndexFile(fs);
        im.ReadIndexHeader();
        im.ReadIndexFooter();
        return new IndexedStorageRecordReader(im);
    } catch (IOException e) {
        throw new IllegalStateException("Unable to load the index for split " + split, e);
    }
}
 
Example 5
Source Project: Hadoop-BAM   Source File: VCFInputFormat.java    License: MIT License 6 votes vote down vote up
/**
 * Sorts the given splits by path and feeds them, group by group, to
 * {@code addGuessedSplits}, which appends its results to {@code newSplits}.
 *
 * @param splits    the splits to process; sorted in place by path
 * @param newSplits receives the splits produced by {@code addGuessedSplits}
 * @throws IOException propagated from {@code addGuessedSplits}
 */
private void fixBCFSplits(
		List<FileSplit> splits, List<InputSplit> newSplits)
	throws IOException
{
	// addGuessedSplits() needs its input ordered by file path. FileInputFormat.getSplits()
	// happened to produce that order when this was written, but it is not relied upon here.
	final Comparator<FileSplit> byPath = new Comparator<FileSplit>() {
		@Override
		public int compare(FileSplit left, FileSplit right) {
			return left.getPath().compareTo(right.getPath());
		}
	};
	Collections.sort(splits, byPath);

	// Each call returns the index from which the next call should continue.
	int next = 0;
	while (next < splits.size()) {
		next = addGuessedSplits(splits, next, newSplits);
	}
}
 
Example 6
Source Project: datawave   Source File: CSVTestFileLoader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Reads every record of the CSV file at {@code this.uri} and appends it to the
 * given sequence file under an empty {@link Text} key.
 *
 * @param seqFile destination writer
 * @throws IOException if reading the CSV or appending to the sequence file fails
 */
@Override
public void loadTestData(SequenceFile.Writer seqFile) throws IOException {
    // Reset the type registry and initialise it from this loader's configuration.
    TypeRegistry.reset();
    TypeRegistry.getInstance(this.conf);

    final Path csvPath = new Path(this.uri);
    final File csvFile = new File(this.uri);
    // A single split covering the whole file.
    final FileSplit wholeFile = new FileSplit(csvPath, 0, csvFile.length(), null);
    final TaskAttemptContext taskContext = new TaskAttemptContextImpl(this.conf, new TaskAttemptID());

    try (CSVRecordReader csvReader = new CSVRecordReader()) {
        csvReader.initialize(wholeFile, taskContext);
        while (csvReader.nextKeyValue()) {
            final RawRecordContainer event = csvReader.getEvent();
            seqFile.append(new Text(), event);
        }
    }
}
 
Example 7
Source Project: datawave   Source File: NYCTLCReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Initialises the parent reader, then reads the first line of the split's file
 * and passes it to the helper as the header.
 *
 * @param genericSplit the split to read; must be a {@link FileSplit}
 * @param context      task context supplying the Hadoop configuration
 * @throws IOException if the file cannot be opened or its first line cannot be read
 */
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    super.initialize(genericSplit, context);
    Configuration job = context.getConfiguration();
    
    // open the file; the header is read from its beginning
    final Path file = ((FileSplit) genericSplit).getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    
    // read the header from the first line
    Text header = new Text();
    LfLineReader in = new LfLineReader(fileIn);
    try {
        in.readLine(header);
    } finally {
        // Close even when readLine fails so the reader is not leaked.
        in.close();
    }
    
    rawHeader = header.toString();
    ((NYCTLCHelper) helper).parseHeader(rawHeader);
}
 
Example 8
Source Project: Hadoop-BAM   Source File: CRAMRecordReader.java    License: MIT License 6 votes vote down vote up
/**
 * Opens the CRAM file named by the split and creates an iterator restricted to
 * the split's byte range. If this reader was initialised before, it is closed first.
 *
 * @param split   the split to read; must be a {@link FileSplit}
 * @param context task context supplying the Hadoop configuration
 * @throws IOException if the file cannot be opened
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
  // Re-initialisation: release whatever the previous run opened.
  if (isInitialized) {
    close();
  }
  isInitialized = true;

  final Configuration conf = context.getConfiguration();
  final FileSplit fileSplit = (FileSplit) split;
  final Path file = fileSplit.getPath();

  // The reference source path is optional; a null path is passed through when it is unset.
  final String refSourcePath = conf.get(CRAMInputFormat.REFERENCE_SOURCE_PATH_PROPERTY);
  final ReferenceSource refSource = new ReferenceSource(
      refSourcePath != null ? NIOFileUtil.asPath(refSourcePath) : null);

  seekableStream = WrapSeekable.openPath(conf, file);
  start = fileSplit.getStart();
  length = fileSplit.getLength();
  final long end = start + length;

  // CRAMIterator right shifts boundaries by 16, so shift left here to compensate.
  // Its boundaries are inclusive, hence end - 1 for the upper bound.
  final long lowerBoundary = start << 16;
  final long upperBoundary = (end - 1) << 16;
  final long[] boundaries = new long[] {lowerBoundary, upperBoundary};

  final ValidationStringency stringency = SAMHeaderReader.getValidationStringency(conf);
  cramIterator = new CRAMIterator(seekableStream, refSource, boundaries, stringency);
}
 
Example 9
/**
 * Reads bytes 0-208 of {@code sample-study-area.json} through the old
 * {@code mapred} API and expects exactly one record.
 */
@Test
public void TestMrv1() throws Exception {
	final String resourceFile = this.getClass().getResource("sample-study-area.json").getFile();
	final Path path = new Path(resourceFile);
	final org.apache.hadoop.mapred.JobConf conf = new org.apache.hadoop.mapred.JobConf();
	final org.apache.hadoop.mapred.FileSplit split =
		new org.apache.hadoop.mapred.FileSplit(path, 0, 208, new String[0]);

	final EnclosedEsriJsonRecordReader reader = new EnclosedEsriJsonRecordReader(split, conf);
	final LongWritable key = reader.createKey();
	final Text value = reader.createValue();

	// First call yields the record, the second reports end of input.
	assertTrue (reader.next(key, value));
	assertFalse (reader.next(key, value));
	reader.close();
}
 
Example 10
Source Project: Hadoop-BAM   Source File: TestFastqInputFormat.java    License: MIT License 6 votes vote down vote up
/**
 * Reads a FASTQ record through a split that starts at byte 3 and checks that
 * the record is still returned intact, followed by end of input.
 */
@Test
public void testReadFastqWithAmpersandQuality() throws IOException
{
	writeToTempFastq(fastqWithAmpersandQuality);

	// A non-zero start forces the reader to advance looking for the first complete record.
	final Path fastqPath = new Path(tempFastq.toURI().toString());
	split = new FileSplit(fastqPath, 3, fastqWithAmpersandQuality.length(), null);

	FastqRecordReader reader = new FastqRecordReader(conf, split);

	assertTrue(reader.next(key, fragment));
	assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString());
	assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());
	assertEquals("###########################################################################################", fragment.getQuality().toString());

	assertFalse(reader.next(key, fragment));
}
 
Example 11
Source Project: ViraPipe   Source File: Decompress.java    License: MIT License 6 votes vote down vote up
/**
 * Splits two FASTQ files into line-count based splits, pairs the splits
 * positionally and writes each pair through {@code writeInterleavedSplits}.
 *
 * <p>Each output file is named after the parent directory of the first file and
 * the start offset of the first split, with an {@code .fq} extension, under
 * {@code splitDir}.
 *
 * @param fst      status of the first FASTQ file
 * @param fst2     status of the second FASTQ file
 * @param splitDir directory prefix for the output files
 * @param splitlen number of lines per split
 * @param sc       Spark context used to distribute the work
 * @throws IOException if the splits cannot be computed
 */
public static void interleaveSplitFastq(FileStatus fst, FileStatus fst2, String splitDir, int splitlen, JavaSparkContext sc) throws IOException {
    final List<FileSplit> firstSplits = NLineInputFormat.getSplitsForFile(fst, sc.hadoopConfiguration(), splitlen);
    final List<FileSplit> secondSplits = NLineInputFormat.getSplitsForFile(fst2, sc.hadoopConfiguration(), splitlen);

    // Pair the i-th split of the first file with the i-th split of the second.
    final JavaRDD<FileSplit> firstRdd = sc.parallelize(firstSplits);
    final JavaRDD<FileSplit> secondRdd = sc.parallelize(secondSplits);
    final JavaPairRDD<FileSplit, FileSplit> pairedSplits = firstRdd.zip(secondRdd);

    pairedSplits.foreach(pair -> {
        final Path firstPath = pair._1.getPath();
        final FastqRecordReader firstReader = new FastqRecordReader(new Configuration(), pair._1);
        final FastqRecordReader secondReader = new FastqRecordReader(new Configuration(), pair._2);

        writeInterleavedSplits(
                firstReader,
                secondReader,
                new Configuration(),
                splitDir+"/"+firstPath.getParent().getName()+"_"+pair._1.getStart()+".fq");
    });
}
 
Example 12
Source Project: laser   Source File: MsgpackRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Connects to the msgpack server described by the split and the job
 * configuration, and sends it the split's start offset.
 *
 * <p>The split's path string is used as the host name passed to the client.
 *
 * @param split   the split to read; must be a {@link FileSplit}
 * @param context task context supplying the Hadoop configuration
 * @throws IOException if the start request fails; the original exception is kept as the cause
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
		throws IOException, InterruptedException {
	FileSplit fileSplit = (FileSplit) split;
	String host = fileSplit.getPath().toString();
	Configuration conf = context.getConfiguration();
	String collection = conf.get("com.b5m.laser.collection");
	int port = conf.getInt("com.b5m.laser.msgpack.port", 0);
	method = conf.get("com.b5m.laser.msgpack.input.method");
	client = new MsgpackClient(host, port, collection);
	start = fileSplit.getStart();
	splitLenth = fileSplit.getLength();
	readLength = 0;
	vClass = conf.getClass("com.b5m.laser.msgpack.input.value.class", null);
	try {
		Object[] req = new Object[1];
		req[0] = start;
		client.writeIgnoreRetValue(req, method + "|start");
	} catch (Exception e) {
		// Chain the cause so the original stack trace is not lost.
		throw new IOException(e.getLocalizedMessage(), e);
	}
}
 
Example 13
Source Project: warp10-platform   Source File: WarpScriptRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Initialises the wrapped reader and creates the WarpScript executor for the
 * configured script. When the split is a {@link FileSplit}, its path is stored
 * in the configuration before the executor is created.
 *
 * @param split   the split to read
 * @param context task context supplying the Hadoop configuration
 * @throws IOException          if the wrapped reader fails or the executor cannot be created
 * @throws InterruptedException propagated from the wrapped reader
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  // The wrapped reader goes first.
  reader.initialize(split, context);

  final Configuration configuration = context.getConfiguration();
  final String script = Warp10InputFormat.getProperty(
      configuration, this.suffix, WarpScriptInputFormat.WARPSCRIPT_INPUTFORMAT_SCRIPT, null);

  // Only file-based splits have a path to record in the configuration.
  if (split instanceof FileSplit) {
    final String splitPath = ((FileSplit) split).getPath().toString();
    configuration.set(WarpScriptInputFormat.PATH_CONFIG_KEY, splitPath);
  }

  try {
    this.executor = inputFormat.getWarpScriptExecutor(configuration, script);
  } catch (WarpScriptException wse) {
    throw new IOException("Error while instatiating WarpScript executor", wse);
  }

  done = false;
}
 
Example 14
Source Project: vxquery   Source File: XmlCollectionWithTagInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Opens the split's file and positions the stream at the split's start offset.
 *
 * <p>Also records the split's end offset and the block locations of the whole file.
 *
 * @param split the split to read
 * @param conf  Hadoop configuration used to resolve the file system
 * @throws IOException if the file cannot be inspected, opened or positioned
 */
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    endTag = ENDING_TAG.getBytes(Charsets.UTF_8);
    startTag = STARTING_TAG.getBytes(Charsets.UTF_8);

    // byte range covered by this split
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FileStatus fStatus = fs.getFileStatus(file);
    // block locations are fetched for the entire file, not just this split
    blocks = fs.getFileBlockLocations(fStatus, 0, fStatus.getLen());

    // open the file and seek to the start of the split
    fsin = fs.open(split.getPath());
    boolean positioned = false;
    try {
        fsin.seek(start);
        positioned = true;
    } finally {
        if (!positioned) {
            // The constructor is about to fail, so the caller never gets a chance
            // to close the stream; close it here instead of leaking it.
            try {
                fsin.close();
            } catch (IOException ignored) {
                // keep the seek failure as the reported error
            }
        }
    }
}
 
Example 15
Source Project: kite   Source File: JSONInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Creates a record reader over the JSON file named by the split.
 *
 * @param split   the split to read; must be a {@link FileSplit}
 * @param context task context from which the configuration is obtained
 * @return the initialised JSON file reader exposed as a record reader
 * @throws DatasetOperationException if the split is not a {@link FileSplit}
 * @throws IOException               declared by the overridden method
 * @throws InterruptedException      declared by the overridden method
 */
@Override
public RecordReader<E, Void> createRecordReader(InputSplit split,
                                                TaskAttemptContext context)
    throws IOException, InterruptedException {
  final Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(context);

  // Only file-backed splits carry a path to read from.
  if (!(split instanceof FileSplit)) {
    throw new DatasetOperationException(
        "Split is not a FileSplit: %s:%s",
        split.getClass().getCanonicalName(), split);
  }
  final Path path = ((FileSplit) split).getPath();

  final JSONFileReader<E> jsonReader = new JSONFileReader<E>(
      path.getFileSystem(conf), path, accessor);
  jsonReader.initialize();
  return jsonReader.asRecordReader();
}
 
Example 16
Source Project: incubator-gobblin   Source File: HadoopFileInputSource.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds an extractor that reads the {@link FileSplit} serialized in the work unit state.
 *
 * <p>The split is deserialized from the {@code FILE_SPLIT_BYTES_STRING_KEY} property, a
 * record reader is created and initialised for it, and both are handed to the
 * four-argument {@code getExtractor} overload.
 *
 * @param workUnitState state carrying the serialized split and reader options
 * @return the extractor for the deserialized split
 * @throws IOException if no serialized split is present, if reading fails, or if the
 *         thread is interrupted while the record reader is being set up (in which case
 *         the thread's interrupt flag is restored before the exception is thrown)
 */
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
  if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
    throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
  }

  Configuration configuration = new Configuration();
  FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);

  String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
  FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
  TaskAttemptContext taskAttemptContext =
      getTaskAttemptContext(configuration, DummyTaskAttemptIDFactory.newTaskAttemptID());
  try {
    RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
  } catch (InterruptedException ie) {
    // Restore the interrupt flag so code further up the stack can still observe it.
    Thread.currentThread().interrupt();
    throw new IOException(ie);
  }
}
 
Example 17
Source Project: hbase   Source File: IntegrationTestBigLinkedList.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Collects the keys of all entries in a sequence file whose value is classified as
 * {@code Verify.Counts.UNDEFINED} by {@code Verify.VerifyReducer.whichType}.
 *
 * @param conf          configuration used to build the task context
 * @param fs            not used by this method
 * @param keyFileStatus status of the sequence file to scan; read as a single split
 * @return a sorted set holding a private copy of each matching key
 * @throws IOException          if the file cannot be read
 * @throws InterruptedException propagated from the record reader
 */
private static SortedSet<byte[]> readFileToSearch(final Configuration conf,
    final FileSystem fs, final LocatedFileStatus keyFileStatus) throws IOException,
    InterruptedException {
  SortedSet<byte []> result = new TreeSet<>(Bytes.BYTES_COMPARATOR);
  // Return entries that are flagged Counts.UNDEFINED in the value. Return the row. This is
  // what is missing.
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  try (SequenceFileAsBinaryInputFormat.SequenceFileAsBinaryRecordReader rr =
      new SequenceFileAsBinaryInputFormat.SequenceFileAsBinaryRecordReader()) {
    InputSplit is =
      new FileSplit(keyFileStatus.getPath(), 0, keyFileStatus.getLen(), new String [] {});
    rr.initialize(is, context);
    while (rr.nextKeyValue()) {
      BytesWritable bw = rr.getCurrentValue();
      if (Verify.VerifyReducer.whichType(bw.getBytes()) == Verify.Counts.UNDEFINED) {
        // Fetch the key once and copy only its valid length into a fresh array.
        BytesWritable currentKey = rr.getCurrentKey();
        byte[] key = new byte[currentKey.getLength()];
        System.arraycopy(currentKey.getBytes(), 0, key, 0, key.length);
        result.add(key);
      }
    }
  }
  return result;
}
 
Example 18
Source Project: marklogic-contentpump   Source File: AggregateXMLReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares this reader for the given split. When the split's path is a
 * directory, a file iterator is created and the stream reader is opened on the
 * first split it yields instead.
 *
 * @param inSplit the split to read; must be a {@link FileSplit}
 * @param context task context supplying the Hadoop configuration
 * @throws IOException          if the file status cannot be read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
    initConfig(context);
    initAggConf(context);

    f = XMLInputFactory.newInstance();

    final FileSplit fileSplit = (FileSplit) inSplit;
    setFile(fileSplit.getPath());
    fs = file.getFileSystem(context.getConfiguration());

    InputSplit splitToOpen = inSplit;
    final FileStatus fileStatus = fs.getFileStatus(file);
    if (fileStatus.isDirectory()) {
        // A directory split is expanded; reading starts with the iterator's first entry.
        iterator = new FileIterator(fileSplit, context);
        splitToOpen = iterator.next();
    }
    initStreamReader(splitToOpen);
}
 
Example 19
Source Project: incubator-tez   Source File: TeraScheduler.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Solves the schedule and returns the splits reordered to reflect it:
 * assigned splits are placed at the front, unplaceable splits at the end.
 * Each assigned split has its locations replaced by its first scheduled host.
 *
 * @return a new list of the FileSplits, with assigned ones modified to have
 *    the best host as the only host.
 * @throws IOException
 */
public List<InputSplit> getNewFileSplits() throws IOException {
  solve();

  final FileSplit[] ordered = new FileSplit[realSplits.length];
  int front = 0;
  int back = realSplits.length - 1;
  for (int i = 0; i < splits.length; ++i) {
    if (!splits[i].isAssigned) {
      // unplaced splits fill the array from the back
      ordered[back--] = realSplits[i];
      continue;
    }
    // narrow the locations to the single chosen host, then place at the front
    ((TeraFileSplit) realSplits[i]).setLocations
       (new String[]{splits[i].locations.get(0).hostname});
    ordered[front++] = realSplits[i];
  }

  final List<InputSplit> reordered = new ArrayList<InputSplit>();
  for (int i = 0; i < ordered.length; ++i) {
    reordered.add(ordered[i]);
  }
  return reordered;
}
 
Example 20
Source Project: marklogic-contentpump   Source File: ArchiveRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares this reader for the given split. When the split's path is a
 * directory, a file iterator is created and the stream is opened on the first
 * split it yields instead.
 *
 * @param inSplit the split to read; must be a {@link FileSplit}
 * @param context task context supplying the Hadoop configuration
 * @throws IOException          if the file status cannot be read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
    initConfig(context);
    allowEmptyMeta = conf.getBoolean(
        CONF_INPUT_ARCHIVE_METADATA_OPTIONAL, false);

    final FileSplit fileSplit = (FileSplit) inSplit;
    setFile(fileSplit.getPath());
    fs = file.getFileSystem(context.getConfiguration());

    InputSplit splitToOpen = inSplit;
    final FileStatus fileStatus = fs.getFileStatus(file);
    if (fileStatus.isDirectory()) {
        // A directory split is expanded; reading starts with the iterator's first entry.
        iterator = new FileIterator(fileSplit, context);
        splitToOpen = iterator.next();
    }
    initStream(splitToOpen);
}
 
Example 21
/**
 * Creates an Avro record reader whose schema is chosen by the source the split belongs to.
 *
 * <p>The source name is derived from the split via {@code getSourceNameFromPath}; the
 * schema JSON for that source is looked up in the map stored under the
 * {@code schema.json.mapping} configuration key.
 *
 * @param split   the split to read; must be a {@link FileSplit}
 * @param context task context supplying the Hadoop configuration
 * @return a reader using the schema registered for the split's source
 * @throws IOException          if the schema mapping cannot be parsed
 * @throws InterruptedException declared by the signature
 */
public org.apache.hadoop.mapreduce.RecordReader<org.apache.avro.mapred.AvroKey<T>, NullWritable> createRecordReader(
    InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  LOGGER.info("DelegatingAvroKeyInputFormat.createRecordReader()  for split:{}", split);

  final FileSplit fileSplit = (FileSplit) split;
  final Configuration configuration = context.getConfiguration();

  final String sourceName = getSourceNameFromPath(fileSplit, configuration);
  LOGGER.info("Source Name for path {} : {}", fileSplit.getPath(), sourceName);

  // Source name -> schema JSON, as stored in the job configuration.
  final ObjectMapper mapper = new ObjectMapper();
  final Map<String, String> schemaJsonBySource =
      mapper.readValue(configuration.get("schema.json.mapping"), MAP_STRING_STRING_TYPE);
  LOGGER.info("Schema JSON Mapping: {}", schemaJsonBySource);

  final String schemaJson = schemaJsonBySource.get(sourceName);
  final Schema schema = new Schema.Parser().parse(schemaJson);
  return new AvroKeyRecordReader<T>(schema);
}
 
Example 22
Source Project: RecommendationEngine   Source File: JMatrixMultiplicationStep1.java    License: MIT License 6 votes vote down vote up
/**
 * Tags each input line with the matrix it belongs to, decided by the name of
 * the input file's parent directory, and emits its first three fields joined by commas.
 *
 * <p>Lines under {@code step6} are split with {@code DELIMITER} and keyed {@code "A"};
 * lines under {@code step1} are split on a single space and keyed {@code "B"}.
 * Lines from any other directory are ignored.
 *
 * @param key     not used
 * @param value   the input line
 * @param context task context providing the input split and the output collector
 * @throws IOException          propagated from {@code context.write}
 * @throws InterruptedException propagated from {@code context.write}
 */
public void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    final FileSplit inputSplit = (FileSplit) context.getInputSplit();
    final String sourceDir = inputSplit.getPath().getParent().getName();

    final String matrixTag;
    final String[] fields;
    if (sourceDir.equals("step6")) {
        fields = DELIMITER.split(value.toString());
        matrixTag = "A";
    } else if (sourceDir.equals("step1")) {
        fields = value.toString().split(" ");
        matrixTag = "B";
    } else {
        // Input from any other directory produces no output.
        return;
    }

    k.set(matrixTag);
    v.set(fields[0] + "," + fields[1] + "," + fields[2]);
    context.write(k, v);
}
 
Example 23
Source Project: big-c   Source File: TestJobSplitWriter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Writes an old-API split with five locations while the maximum number of
 * block locations is configured as four, and checks that the split meta info
 * read back reports one split with four locations.
 */
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
  TEST_DIR.mkdirs();
  try {
    final Configuration conf = new Configuration();
    conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);

    final Path submitDir = new Path(TEST_DIR.getAbsolutePath());
    final FileSystem localFs = FileSystem.getLocal(conf);

    // One more location than the configured maximum.
    final String[] fiveLocations = new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" };
    final org.apache.hadoop.mapred.FileSplit split =
        new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1, fiveLocations);

    JobSplitWriter.createSplitFiles(submitDir, conf, localFs,
        new org.apache.hadoop.mapred.InputSplit[] { split });

    final JobSplit.TaskSplitMetaInfo[] metaInfos =
        SplitMetaInfoReader.readSplitMetaInfo(new JobID(), localFs, conf, submitDir);

    assertEquals("unexpected number of splits", 1, metaInfos.length);
    assertEquals("unexpected number of split locations",
        4, metaInfos[0].getLocations().length);
  } finally {
    FileUtil.fullyDelete(TEST_DIR);
  }
}
 
Example 24
/**
 * Restores this object's state from a binary stream.
 *
 * <p>The stream is read as: an int split count, then for each split a path
 * string, a start offset and a length, and finally a long total length.
 * Split locations are not read; the location set is reset to empty.
 *
 * @param in the stream to read from
 * @throws IOException if the stream cannot be read
 */
public void readFields(DataInput in) throws IOException {
    // splits
    final int splitCount = in.readInt();
    splits = new ArrayList<FileSplit>();
    for (int remaining = splitCount; remaining > 0; remaining--) {
        final Path splitPath = new Path(Text.readString(in));
        final long splitStart = in.readLong();
        final long splitLength = in.readLong();
        splits.add(new FileSplit(splitPath, splitStart, splitLength, null));
    }

    // length
    length = in.readLong();

    // locations
    locations = new HashSet<String>();
}
 
Example 25
Source Project: pravega-samples   Source File: TeraScheduler.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the scheduling model from the real splits: one {@code Split} per file
 * split and one {@code Host} per distinct location, linked in both directions.
 *
 * @param realSplits the file splits to schedule
 * @param conf       configuration supplying the map slots per host (default 4)
 * @throws IOException if a split's locations cannot be read
 */
public TeraScheduler(FileSplit[] realSplits,
                     Configuration conf) throws IOException {
  this.realSplits = realSplits;
  this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);

  // Hosts seen so far, keyed by hostname, so each host is created only once.
  final Map<String, Host> knownHosts = new HashMap<String, Host>();
  splits = new Split[realSplits.length];

  for (FileSplit fileSplit : realSplits) {
    final Split modelSplit = new Split(fileSplit.getPath().toString());
    splits[remainingSplits++] = modelSplit;

    for (String hostname : fileSplit.getLocations()) {
      Host location = knownHosts.get(hostname);
      if (location == null) {
        location = new Host(hostname);
        knownHosts.put(hostname, location);
        hosts.add(location);
      }
      // Link the host and the split to each other.
      location.splits.add(modelSplit);
      modelSplit.locations.add(location);
    }
  }
}
 
Example 26
Source Project: Hadoop-BAM   Source File: TestQseqInputFormat.java    License: MIT License 6 votes vote down vote up
/**
 * Writes two QSEQ records through a gzip codec and checks that the reader
 * returns both records from the compressed file.
 */
@Test
public void testGzCompressedInput() throws IOException
{
	// write gzip-compressed data
	final GzipCodec gzip = new GzipCodec();
	final PrintWriter compressedOut = new PrintWriter(
		new BufferedOutputStream(
			gzip.createOutputStream(
				new FileOutputStream(tempGz))));
	compressedOut.write(twoQseq);
	compressedOut.close();

	// now try to read it; the split length passed here is the length of the uncompressed text
	final Path gzPath = new Path(tempGz.toURI().toString());
	split = new FileSplit(gzPath, 0, twoQseq.length(), null);
	QseqRecordReader reader = new QseqRecordReader(conf, split);

	assertTrue(reader.next(key, fragment));
	assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString());
	assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString());

	assertTrue(reader.next(key, fragment));
	assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString());
	assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString());
}
 
Example 27
Source Project: laser   Source File: MsgpackInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Produces one split per configured msgpack host, dividing the goal size evenly among them.
 *
 * <p>Hosts come from the comma-separated {@code com.b5m.laser.msgpack.host} property. Every
 * host gets a split starting at {@code i * splitLength}, where {@code splitLength} is the
 * goal size divided by the host count, rounded up. Because of the rounding, the trailing
 * hosts may have less than a full share left, or nothing at all; their length is clamped
 * at zero so that no split ever carries a negative length.
 *
 * @param context job context supplying the Hadoop configuration
 * @return one split per host, in the order the hosts are listed
 * @throws IOException if the host property is not set
 * @throws InterruptedException declared by the overridden method
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException,
		InterruptedException {
	Configuration conf = context.getConfiguration();
	long goalSize = goalsize(conf);

	String hostList = conf.get("com.b5m.laser.msgpack.host");
	if (hostList == null) {
		throw new IOException("Missing required configuration: com.b5m.laser.msgpack.host");
	}
	String[] urlList = hostList.split(",");
	int numNode = urlList.length;

	List<InputSplit> splits = new ArrayList<InputSplit>(numNode);
	long splitLength = (long) Math.ceil((double) goalSize / numNode);

	for (int i = 0; i < numNode; i++) {
		long offset = i * splitLength;
		// Full share while enough remains, the remainder for the last partial share,
		// and zero once the goal size is already exhausted.
		long length = Math.max(0L, Math.min(splitLength, goalSize - offset));
		splits.add(new FileSplit(new Path(urlList[i]), offset, length, null));
	}
	return splits;
}
 
Example 28
Source Project: kylin-on-parquet-v2   Source File: UHCDictionaryMapper.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Determines which UHC column this mapper handles by matching the name of the
 * input file's parent directory against the cube's UHC column identities, then
 * records that column's index and type.
 *
 * <p>If no column matches, {@code index} is left at whatever value it already had.
 *
 * @param context task context providing the configuration and the input split
 * @throws IOException if the Kylin configuration or metadata cannot be loaded
 */
@Override
protected void doSetup(Context context) throws IOException {
    tmpBuf = ByteBuffer.allocate(4096);

    final Configuration conf = context.getConfiguration();
    bindCurrentConfiguration(conf);
    final KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    final String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME);
    final CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    final List<TblColRef> uhcColumns = cube.getDescriptor().getAllUHCColumns();

    // The parent directory name of the input file is compared against column identities.
    final FileSplit fileSplit = (FileSplit) context.getInputSplit();
    final String colName = fileSplit.getPath().getParent().getName();

    for (int candidate = 0; candidate < uhcColumns.size(); candidate++) {
        if (!uhcColumns.get(candidate).getIdentity().equalsIgnoreCase(colName)) {
            continue;
        }
        index = candidate;
        break;
    }
    type = uhcColumns.get(index).getType();

    //for debug
    logger.info("column name: " + colName);
    logger.info("index: " + index);
    logger.info("type: " + type);
}
 
Example 29
/**
 * Wraps a stub input format that yields two file splits ("Foo" and "Bar") in an
 * {@code UnshardedExportToCloudStorage} and checks that the export returns them,
 * in order, as {@code UnshardedInputSplit}s with the same path names.
 */
@Test
public void testGetSplits() throws Exception {
  final Configuration conf = new Configuration();

  // Stub delegate: fixed splits, no record reading.
  final InputFormat<LongWritable, Text> twoSplitFormat =
      new InputFormat<LongWritable, Text>() {
        @Override
        public List<InputSplit> getSplits(JobContext jobContext) {
          return ImmutableList.<InputSplit>builder()
              .add(new FileSplit(new Path("Foo"), 0L, 1L, new String[0]))
              .add(new FileSplit(new Path("Bar"), 0L, 1L, new String[0]))
              .build();
        }

        @Override
        public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit inputSplit, TaskAttemptContext taskAttemptContext) {
          throw new UnsupportedOperationException("Not implemented.");
        }
      };

  final UnshardedExportToCloudStorage export =
      new UnshardedExportToCloudStorage(
          conf,
          "path",
          ExportFileFormat.AVRO,
          new BigQueryHelper(null),
          "project-id",
          null, /* table */
          twoSplitFormat);

  final List<InputSplit> splits = export.getSplits(null);

  final UnshardedInputSplit fooSplit = (UnshardedInputSplit) splits.get(0);
  assertThat(fooSplit.getPath().getName()).isEqualTo("Foo");

  final UnshardedInputSplit barSplit = (UnshardedInputSplit) splits.get(1);
  assertThat(barSplit.getPath().getName()).isEqualTo("Bar");
}
 
Example 30
Source Project: webarchive-commons   Source File: ResourceRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Opens the gzip archive named by the split and builds the resource producer chain for it.
 *
 * <p>Files ending in {@code .warc.gz} or {@code .wat.gz} are transformed with {@code wf};
 * files ending in {@code .arc.gz} with {@code af}. The split type and the file extension
 * are both checked before the file is opened, so a rejected input never leaves an open
 * stream behind.
 *
 * @param inputSplit the split to read; must be a {@link FileSplit}
 * @param context    task context supplying the Hadoop configuration
 * @throws IOException if the split is not a {@link FileSplit}, the file extension is not
 *         supported, or the file cannot be opened or positioned
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
		throws IOException, InterruptedException {
	if(!(inputSplit instanceof FileSplit)) {
		throw new IOException("Need FileSplit input...");
	}
	FileSplit fs = (FileSplit) inputSplit;
	Path fsPath = fs.getPath();
	String path = fsPath.getName();

	// Decide the archive flavour up front; nothing has been opened yet if this fails.
	boolean isWarc = path.endsWith(".warc.gz") || path.endsWith(".wat.gz");
	boolean isArc = !isWarc && path.endsWith(".arc.gz");
	if(!isWarc && !isArc) {
		throw new IOException("arguments must be arc.gz or warc.gz");
	}

	FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
	FSDataInputStream fsdis = fSys.open(fsPath);
	name = path;
	stream = new HDFSStream(fsdis);
	startOffset = fs.getStart();
	length = fs.getLength();
	long endOffset = startOffset + length;
	stream.setOffset(startOffset);
	series = new GZIPMemberSeries(stream, name, startOffset);
	GZIPResourceContainer prod =
		new GZIPResourceContainer(series,endOffset);

	ResourceProducer envelope;
	if(isWarc) {
		envelope = new TransformingResourceProducer(prod,wf);
	} else {
		envelope = new TransformingResourceProducer(prod,af);
	}
	ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
	producer = new ExtractingResourceProducer(envelope, mapper);
}