org.apache.hadoop.mapred.Merger.Segment Java Examples
The following examples show how to use
org.apache.hadoop.mapred.Merger.Segment.
Each example is taken from the project and source file noted in the header above it.
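Before the examples, a brief orientation: a Segment wraps either an existing IFile.Reader (typically over an in-memory buffer) or an on-disk IFile, and Merger.merge consumes a list of Segments to produce a single sorted RawKeyValueIterator. The fragment below is not from any of the listed projects; it is a minimal sketch that restates the two Segment constructor shapes and the Merger.merge parameter order seen in the examples (Hadoop 2.x), with all inputs assumed to be supplied by the caller.

// Minimal sketch (assumed helper, not from the examples). Relevant classes:
// org.apache.hadoop.mapred.IFile, org.apache.hadoop.mapred.Merger,
// org.apache.hadoop.mapred.Merger.Segment, org.apache.hadoop.mapred.Counters,
// org.apache.hadoop.util.Progress, org.apache.hadoop.util.Progressable.
static RawKeyValueIterator mergeTwoSegments(
    JobConf conf, FileSystem fs,
    IFile.Reader<Text, Text> inMemReader, Path onDiskFile,
    RawComparator<Text> comparator, Progressable reporter,
    Counters.Counter readsCounter, Counters.Counter writesCounter)
    throws IOException {
  // Segment over an existing reader, as in BlockMapOutputBuffer.flush()
  // and BackupStore.createInMemorySegment(); the boolean is the 'preserve' flag.
  Segment<Text, Text> memSegment =
      new Segment<Text, Text>(inMemReader, false);

  // Segment over an on-disk IFile, as in BackupStore.createInDiskSegment();
  // null is the compression codec, true preserves the backing file.
  Segment<Text, Text> diskSegment =
      new Segment<Text, Text>(conf, fs, onDiskFile, null, true);

  List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
  segments.add(memSegment);
  segments.add(diskSegment);

  // Same parameter order as the TestMerger examples further down:
  // conf, fs, key/value classes, segments, merge factor, tmp dir,
  // comparator, reporter, reads/writes counters, merge-phase progress.
  return Merger.merge(conf, fs, Text.class, Text.class, segments,
      segments.size(), new Path("localpath"), comparator, reporter,
      readsCounter, writesCounter, new Progress());
}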
Example #1
Source File: BlockMapOutputBuffer.java From RDFS with Apache License 2.0 | 6 votes |
public synchronized void flush() throws IOException,
    ClassNotFoundException, InterruptedException {
  if (numSpills > 0 && lastSpillInMem) {
    // if there is already one spills, we can try to hold this last spill in
    // memory.
    sortReduceParts();
    for (int i = 0; i < partitions; i++) {
      this.inMemorySegments[i] =
          new Segment<K, V>(this.reducePartitions[i].getIReader(), true);
    }
    hasInMemorySpill = true;
  } else {
    sortAndSpill();
  }
  long mergeStartMilli = System.currentTimeMillis();
  ProcResourceValues mergeStartProcVals = task.getCurrentProcResourceValues();
  mergeParts();
  long mergeEndMilli = System.currentTimeMillis();
  ProcResourceValues mergeEndProcVals = task.getCurrentProcResourceValues();
  mapSpillSortCounter.incMergeCounters(mergeStartProcVals, mergeEndProcVals,
      mergeEndMilli - mergeStartMilli);
}
Example #2
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
public void reset() throws IOException {

  // Create a new segment for the previously written records only if we
  // are not already in the reset mode
  if (!inReset) {
    if (fileCache.isActive) {
      fileCache.createInDiskSegment();
    } else {
      memCache.createInMemorySegment();
    }
  }

  inReset = true;

  // Reset the segments to the correct position from where the next read
  // should begin.
  for (int i = 0; i < segmentList.size(); i++) {
    Segment<K,V> s = segmentList.get(i);
    if (s.inMemory()) {
      int offset = (i == 0) ? firstSegmentOffset : 0;
      s.getReader().reset(offset);
    } else {
      s.closeReader();
      if (i == 0) {
        s.reinitReader(firstSegmentOffset);
        s.getReader().disableChecksumValidation();
      }
    }
  }

  currentKVOffset = firstSegmentOffset;
  nextKVOffset = -1;
  readSegmentIndex = 0;
  hasMore = false;
  lastSegmentEOF = false;

  LOG.debug("Reset - First segment offset is " + firstSegmentOffset +
      " Segment List Size is " + segmentList.size());
}
Example #3
Source File: TestMerger.java From big-c with Apache License 2.0 | 5 votes |
private List<Segment<Text, Text>> getUncompressedSegments() throws IOException {
  List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
  for (int i = 0; i < 2; i++) {
    segments.add(getUncompressedSegment(i));
  }
  return segments;
}
Example #4
Source File: TestMerger.java From big-c with Apache License 2.0 | 5 votes |
private List<Segment<Text, Text>> getCompressedSegments() throws IOException {
  List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
  for (int i = 0; i < 2; i++) {
    segments.add(getCompressedSegment(i));
  }
  return segments;
}
Example #5
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 5 votes |
@Override
public void merge(List<InMemoryMapOutput<K, V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  TaskAttemptID dummyMapId = inputs.get(0).getMapId();
  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
      createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  InMemoryMapOutput<K, V> mergedMapOutputs =
      unconditionalReserve(dummyMapId, mergeOutputSize, false);

  Writer<K, V> writer =
      new InMemoryWriter<K, V>(mergedMapOutputs.getArrayStream());

  LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments +
      " segments of total-size: " + mergeOutputSize);

  RawKeyValueIterator rIter =
      Merger.merge(jobConf, rfs,
          (Class<K>)jobConf.getMapOutputKeyClass(),
          (Class<V>)jobConf.getMapOutputValueClass(),
          inMemorySegments, inMemorySegments.size(),
          new Path(reduceId.toString()),
          (RawComparator<K>)jobConf.getOutputKeyComparator(),
          reporter, null, null, null);
  Merger.writeFile(rIter, writer, reporter, jobConf);
  writer.close();

  LOG.info(reduceId +
      " Memory-to-Memory merge of the " + noInMemorySegments +
      " files in-memory complete.");

  // Note the output of the merge
  closeInMemoryMergedFile(mergedMapOutputs);
}
Example #6
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
void createInDiskSegment() throws IOException {
  assert (writer != null);
  writer.close();
  Segment<K,V> s = new Segment<K, V>(conf, fs, file, null, true);
  writer = null;
  segmentList.add(s);
  LOG.debug("Disk Segment added to List. Size is " + segmentList.size());
}
Example #7
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
/**
 * This method creates a memory segment from the existing buffer
 * @throws IOException
 */
void createInMemorySegment() throws IOException {

  // If nothing was written in this block because the record size
  // was greater than the allocated block size, just return.
  if (usedSize == 0) {
    ramManager.unreserve(blockSize);
    return;
  }

  // spaceAvailable would have ensured that there is enough space
  // left for the EOF markers.
  assert ((blockSize - usedSize) >= EOF_MARKER_SIZE);

  WritableUtils.writeVInt(dataOut, IFile.EOF_MARKER);
  WritableUtils.writeVInt(dataOut, IFile.EOF_MARKER);

  usedSize += EOF_MARKER_SIZE;
  ramManager.unreserve(blockSize - usedSize);

  Reader<K, V> reader =
      new org.apache.hadoop.mapreduce.task.reduce.InMemoryReader<K, V>(null,
          (org.apache.hadoop.mapred.TaskAttemptID) tid,
          dataOut.getData(), 0, usedSize, conf);
  Segment<K, V> segment = new Segment<K, V>(reader, false);
  segmentList.add(segment);
  LOG.debug("Added Memory Segment to List. List Size is " +
      segmentList.size());
}
Example #8
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
private void clearSegmentList() throws IOException {
  for (Segment<K,V> segment: segmentList) {
    long len = segment.getLength();
    segment.close();
    if (segment.inMemory()) {
      memCache.unreserve(len);
    }
  }
  segmentList.clear();
}
Example #9
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
public void reset() throws IOException {

  // Create a new segment for the previously written records only if we
  // are not already in the reset mode
  if (!inReset) {
    if (fileCache.isActive) {
      fileCache.createInDiskSegment();
    } else {
      memCache.createInMemorySegment();
    }
  }

  inReset = true;

  // Reset the segments to the correct position from where the next read
  // should begin.
  for (int i = 0; i < segmentList.size(); i++) {
    Segment<K,V> s = segmentList.get(i);
    if (s.inMemory()) {
      int offset = (i == 0) ? firstSegmentOffset : 0;
      s.getReader().reset(offset);
    } else {
      s.closeReader();
      if (i == 0) {
        s.reinitReader(firstSegmentOffset);
        s.getReader().disableChecksumValidation();
      }
    }
  }

  currentKVOffset = firstSegmentOffset;
  nextKVOffset = -1;
  readSegmentIndex = 0;
  hasMore = false;
  lastSegmentEOF = false;

  LOG.debug("Reset - First segment offset is " + firstSegmentOffset +
      " Segment List Size is " + segmentList.size());
}
Example #10
Source File: BackupStore.java From big-c with Apache License 2.0 | 5 votes |
public void mark() throws IOException {

  // We read one KV pair in advance in hasNext.
  // If hasNext has read the next KV pair from a new segment, but the
  // user has not called next() for that KV, then reset the readSegmentIndex
  // to the previous segment
  if (nextKVOffset == 0) {
    assert (readSegmentIndex != 0);
    assert (currentKVOffset != 0);
    readSegmentIndex --;
  }

  // just drop segments before the current active segment
  int i = 0;
  Iterator<Segment<K,V>> itr = segmentList.iterator();
  while (itr.hasNext()) {
    Segment<K,V> s = itr.next();
    if (i == readSegmentIndex) {
      break;
    }
    s.close();
    itr.remove();
    i++;
    LOG.debug("Dropping a segment");
  }

  // FirstSegmentOffset is the offset in the current segment from where we
  // need to start reading on the next reset
  firstSegmentOffset = currentKVOffset;
  readSegmentIndex = 0;

  LOG.debug("Setting the FirsSegmentOffset to " + currentKVOffset);
}
Example #11
Source File: TestMerger.java From hadoop with Apache License 2.0 | 5 votes |
private List<Segment<Text, Text>> getCompressedSegments() throws IOException {
  List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
  for (int i = 0; i < 2; i++) {
    segments.add(getCompressedSegment(i));
  }
  return segments;
}
Example #12
Source File: TestMerger.java From hadoop with Apache License 2.0 | 5 votes |
private List<Segment<Text, Text>> getUncompressedSegments() throws IOException {
  List<Segment<Text, Text>> segments = new ArrayList<Segment<Text, Text>>();
  for (int i = 0; i < 2; i++) {
    segments.add(getUncompressedSegment(i));
  }
  return segments;
}
Example #13
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 5 votes |
@Override
public void merge(List<InMemoryMapOutput<K, V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  TaskAttemptID dummyMapId = inputs.get(0).getMapId();
  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
      createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  InMemoryMapOutput<K, V> mergedMapOutputs =
      unconditionalReserve(dummyMapId, mergeOutputSize, false);

  Writer<K, V> writer =
      new InMemoryWriter<K, V>(mergedMapOutputs.getArrayStream());

  LOG.info("Initiating Memory-to-Memory merge with " + noInMemorySegments +
      " segments of total-size: " + mergeOutputSize);

  RawKeyValueIterator rIter =
      Merger.merge(jobConf, rfs,
          (Class<K>)jobConf.getMapOutputKeyClass(),
          (Class<V>)jobConf.getMapOutputValueClass(),
          inMemorySegments, inMemorySegments.size(),
          new Path(reduceId.toString()),
          (RawComparator<K>)jobConf.getOutputKeyComparator(),
          reporter, null, null, null);
  Merger.writeFile(rIter, writer, reporter, jobConf);
  writer.close();

  LOG.info(reduceId +
      " Memory-to-Memory merge of the " + noInMemorySegments +
      " files in-memory complete.");

  // Note the output of the merge
  closeInMemoryMergedFile(mergedMapOutputs);
}
Example #14
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
void createInDiskSegment() throws IOException {
  assert (writer != null);
  writer.close();
  Segment<K,V> s = new Segment<K, V>(conf, fs, file, null, true);
  writer = null;
  segmentList.add(s);
  LOG.debug("Disk Segment added to List. Size is " + segmentList.size());
}
Example #15
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
/**
 * This method creates a memory segment from the existing buffer
 * @throws IOException
 */
void createInMemorySegment() throws IOException {

  // If nothing was written in this block because the record size
  // was greater than the allocated block size, just return.
  if (usedSize == 0) {
    ramManager.unreserve(blockSize);
    return;
  }

  // spaceAvailable would have ensured that there is enough space
  // left for the EOF markers.
  assert ((blockSize - usedSize) >= EOF_MARKER_SIZE);

  WritableUtils.writeVInt(dataOut, IFile.EOF_MARKER);
  WritableUtils.writeVInt(dataOut, IFile.EOF_MARKER);

  usedSize += EOF_MARKER_SIZE;
  ramManager.unreserve(blockSize - usedSize);

  Reader<K, V> reader =
      new org.apache.hadoop.mapreduce.task.reduce.InMemoryReader<K, V>(null,
          (org.apache.hadoop.mapred.TaskAttemptID) tid,
          dataOut.getData(), 0, usedSize, conf);
  Segment<K, V> segment = new Segment<K, V>(reader, false);
  segmentList.add(segment);
  LOG.debug("Added Memory Segment to List. List Size is " +
      segmentList.size());
}
Example #16
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
private void clearSegmentList() throws IOException {
  for (Segment<K,V> segment: segmentList) {
    long len = segment.getLength();
    segment.close();
    if (segment.inMemory()) {
      memCache.unreserve(len);
    }
  }
  segmentList.clear();
}
Example #17
Source File: BackupStore.java From hadoop with Apache License 2.0 | 5 votes |
public void mark() throws IOException {

  // We read one KV pair in advance in hasNext.
  // If hasNext has read the next KV pair from a new segment, but the
  // user has not called next() for that KV, then reset the readSegmentIndex
  // to the previous segment
  if (nextKVOffset == 0) {
    assert (readSegmentIndex != 0);
    assert (currentKVOffset != 0);
    readSegmentIndex --;
  }

  // just drop segments before the current active segment
  int i = 0;
  Iterator<Segment<K,V>> itr = segmentList.iterator();
  while (itr.hasNext()) {
    Segment<K,V> s = itr.next();
    if (i == readSegmentIndex) {
      break;
    }
    s.close();
    itr.remove();
    i++;
    LOG.debug("Dropping a segment");
  }

  // FirstSegmentOffset is the offset in the current segment from where we
  // need to start reading on the next reset
  firstSegmentOffset = currentKVOffset;
  readSegmentIndex = 0;

  LOG.debug("Setting the FirsSegmentOffset to " + currentKVOffset);
}
Example #18
Source File: ReduceTask.java From RDFS with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") private void doInMemMerge() throws IOException{ if (mapOutputsFilesInMemory.size() == 0) { return; } //name this output file same as the name of the first file that is //there in the current list of inmem files (this is guaranteed to //be absent on the disk currently. So we don't overwrite a prev. //created spill). Also we need to create the output file now since //it is not guaranteed that this file will be present after merge //is called (we delete empty files as soon as we see them //in the merge method) //figure out the mapId TaskID mapId = mapOutputsFilesInMemory.get(0).mapId; List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K,V>>(); long mergeOutputSize = createInMemorySegments(inMemorySegments, 0); int noInMemorySegments = inMemorySegments.size(); Path outputPath = mapOutputFile.getInputFileForWrite(mapId, reduceTask.getTaskID(), mergeOutputSize); Writer writer = new Writer(conf, rfs, outputPath, conf.getMapOutputKeyClass(), conf.getMapOutputValueClass(), codec, null); RawKeyValueIterator rIter = null; try { LOG.info("Initiating in-memory merge with " + noInMemorySegments + " segments..."); rIter = Merger.merge(conf, rfs, (Class<K>)conf.getMapOutputKeyClass(), (Class<V>)conf.getMapOutputValueClass(), inMemorySegments, inMemorySegments.size(), new Path(reduceTask.getTaskID().toString()), conf.getOutputKeyComparator(), reporter, spilledRecordsCounter, null); if (combinerRunner == null) { Merger.writeFile(rIter, writer, reporter, conf); } else { combineCollector.setWriter(writer); combinerRunner.combine(rIter, combineCollector); } writer.close(); LOG.info(reduceTask.getTaskID() + " Merge of the " + noInMemorySegments + " files in-memory complete." + " Local file is " + outputPath + " of size " + localFileSys.getFileStatus(outputPath).getLen()); } catch (Exception e) { //make sure that we delete the ondisk file that we created //earlier when we invoked cloneFileAttributes localFileSys.delete(outputPath, true); throw (IOException)new IOException ("Intermediate merge failed").initCause(e); } // Note the output of the merge FileStatus status = localFileSys.getFileStatus(outputPath); synchronized (mapOutputFilesOnDisk) { addToMapOutputFilesOnDisk(status); } }
Example #19
Source File: TestMerger.java From big-c with Apache License 2.0 | 4 votes |
private Segment<Text, Text> getCompressedSegment(int i) throws IOException {
  return new Segment<Text, Text>(getReader(i, true), false, 3000l);
}
Example #20
Source File: BlockMapOutputBuffer.java From RDFS with Apache License 2.0 | 4 votes |
@SuppressWarnings( { "unchecked", "deprecation" }) public BlockMapOutputBuffer(TaskUmbilicalProtocol umbilical, JobConf job, TaskReporter reporter, MapTask task) throws IOException, ClassNotFoundException { this.task = task; this.job = job; this.reporter = reporter; localFs = FileSystem.getLocal(job); partitions = job.getNumReduceTasks(); indexCacheList = new ArrayList<SpillRecord>(); if (partitions > 0) { partitioner = (Partitioner<K, V>) ReflectionUtils.newInstance(job .getPartitionerClass(), job); } else { partitioner = new Partitioner() { @Override public int getPartition(Object key, Object value, int numPartitions) { return -1; } @Override public void configure(JobConf job) { } }; } rfs = ((LocalFileSystem) localFs).getRaw(); float spillper = job.getFloat("io.sort.spill.percent", (float) 0.9); if (spillper > (float) 1.0 || spillper < (float) 0.0) { LOG.error("Invalid \"io.sort.spill.percent\": " + spillper); spillper = 0.8f; } lastSpillInMem = job.getBoolean("mapred.map.lastspill.memory", true); numBigRecordsWarnThreshold = job.getInt("mapred.map.bigrecord.spill.warn.threshold", 500); int sortmb = job.getInt("io.sort.mb", 100); boolean localMode = job.get("mapred.job.tracker", "local").equals("local"); if (localMode) { sortmb = job.getInt("io.sort.mb.localmode", 100); } if ((sortmb & 0x7FF) != sortmb) { throw new IOException("Invalid \"io.sort.mb\": " + sortmb); } LOG.info("io.sort.mb = " + sortmb); // buffers and accounting kvBufferSize = sortmb << 20; kvbuffer = new byte[kvBufferSize]; softBufferLimit = (int) (kvbuffer.length * spillper); // k/v serialization keyClass = (Class<K>) job.getMapOutputKeyClass(); valClass = (Class<V>) job.getMapOutputValueClass(); if (!BytesWritable.class.isAssignableFrom(keyClass) || !BytesWritable.class.isAssignableFrom(valClass)) { throw new IOException(this.getClass().getName() + " only support " + BytesWritable.class.getName() + " as key and value classes, MapOutputKeyClass is " + keyClass.getName() + ", MapOutputValueClass is " + valClass.getName()); } int numMappers = job.getNumMapTasks(); memoryBlockAllocator = new MemoryBlockAllocator(kvBufferSize, softBufferLimit, numMappers, partitions, this); // counters mapOutputByteCounter = reporter.getCounter(MAP_OUTPUT_BYTES); mapOutputRecordCounter = reporter.getCounter(MAP_OUTPUT_RECORDS); mapSpillSortCounter = new MapSpillSortCounters(reporter); reducePartitions = new ReducePartition[partitions]; inMemorySegments = new Segment[partitions]; for (int i = 0; i < partitions; i++) { reducePartitions[i] = new ReducePartition(i, this.memoryBlockAllocator, this.kvbuffer, this, this.reporter); } // compression if (job.getCompressMapOutput()) { Class<? extends CompressionCodec> codecClass = job .getMapOutputCompressorClass(DefaultCodec.class); codec = ReflectionUtils.newInstance(codecClass, job); } }
Example #21
Source File: TestMerger.java From big-c with Apache License 2.0 | 4 votes |
private Segment<Text, Text> getUncompressedSegment(int i) throws IOException {
  return new Segment<Text, Text>(getReader(i, false), false);
}
Example #22
Source File: ReduceTask.java From hadoop-gpu with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") private void doInMemMerge() throws IOException{ if (mapOutputsFilesInMemory.size() == 0) { return; } //name this output file same as the name of the first file that is //there in the current list of inmem files (this is guaranteed to //be absent on the disk currently. So we don't overwrite a prev. //created spill). Also we need to create the output file now since //it is not guaranteed that this file will be present after merge //is called (we delete empty files as soon as we see them //in the merge method) //figure out the mapId TaskID mapId = mapOutputsFilesInMemory.get(0).mapId; List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K,V>>(); long mergeOutputSize = createInMemorySegments(inMemorySegments, 0); int noInMemorySegments = inMemorySegments.size(); Path outputPath = mapOutputFile.getInputFileForWrite(mapId, reduceTask.getTaskID(), mergeOutputSize); Writer writer = new Writer(conf, rfs, outputPath, conf.getMapOutputKeyClass(), conf.getMapOutputValueClass(), codec, null); RawKeyValueIterator rIter = null; try { LOG.info("Initiating in-memory merge with " + noInMemorySegments + " segments..."); rIter = Merger.merge(conf, rfs, (Class<K>)conf.getMapOutputKeyClass(), (Class<V>)conf.getMapOutputValueClass(), inMemorySegments, inMemorySegments.size(), new Path(reduceTask.getTaskID().toString()), conf.getOutputKeyComparator(), reporter, spilledRecordsCounter, null); if (combinerRunner == null) { Merger.writeFile(rIter, writer, reporter, conf); } else { combineCollector.setWriter(writer); combinerRunner.combine(rIter, combineCollector); } writer.close(); LOG.info(reduceTask.getTaskID() + " Merge of the " + noInMemorySegments + " files in-memory complete." + " Local file is " + outputPath + " of size " + localFileSys.getFileStatus(outputPath).getLen()); } catch (Exception e) { //make sure that we delete the ondisk file that we created //earlier when we invoked cloneFileAttributes localFileSys.delete(outputPath, true); throw (IOException)new IOException ("Intermediate merge failed").initCause(e); } // Note the output of the merge FileStatus status = localFileSys.getFileStatus(outputPath); synchronized (mapOutputFilesOnDisk) { addToMapOutputFilesOnDisk(status); } }
Example #23
Source File: TestMerger.java From big-c with Apache License 2.0 | 4 votes |
@SuppressWarnings( { "unchecked" }) public void testMergeShouldReturnProperProgress( List<Segment<Text, Text>> segments) throws IOException { Path tmpDir = new Path("localpath"); Class<Text> keyClass = (Class<Text>) jobConf.getMapOutputKeyClass(); Class<Text> valueClass = (Class<Text>) jobConf.getMapOutputValueClass(); RawComparator<Text> comparator = jobConf.getOutputKeyComparator(); Counter readsCounter = new Counter(); Counter writesCounter = new Counter(); Progress mergePhase = new Progress(); RawKeyValueIterator mergeQueue = Merger.merge(conf, fs, keyClass, valueClass, segments, 2, tmpDir, comparator, getReporter(), readsCounter, writesCounter, mergePhase); final float epsilon = 0.00001f; // Reading 6 keys total, 3 each in 2 segments, so each key read moves the // progress forward 1/6th of the way. Initially the first keys from each // segment have been read as part of the merge setup, so progress = 2/6. Assert.assertEquals(2/6.0f, mergeQueue.getProgress().get(), epsilon); // The first next() returns one of the keys already read during merge setup Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(2/6.0f, mergeQueue.getProgress().get(), epsilon); // Subsequent next() calls should read one key and move progress Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(3/6.0f, mergeQueue.getProgress().get(), epsilon); Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(4/6.0f, mergeQueue.getProgress().get(), epsilon); // At this point we've exhausted all of the keys in one segment // so getting the next key will return the already cached key from the // other segment Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(4/6.0f, mergeQueue.getProgress().get(), epsilon); // Subsequent next() calls should read one key and move progress Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(5/6.0f, mergeQueue.getProgress().get(), epsilon); Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), epsilon); // Now there should be no more input Assert.assertFalse(mergeQueue.next()); Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), epsilon); Assert.assertTrue(mergeQueue.getKey() == null); Assert.assertEquals(0, mergeQueue.getValue().getData().length); }
Example #24
Source File: MergeManagerImpl.java From big-c with Apache License 2.0 | 4 votes |
@Override
public void merge(List<InMemoryMapOutput<K,V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  //name this output file same as the name of the first file that is
  //there in the current list of inmem files (this is guaranteed to
  //be absent on the disk currently. So we don't overwrite a prev.
  //created spill). Also we need to create the output file now since
  //it is not guaranteed that this file will be present after merge
  //is called (we delete empty files as soon as we see them
  //in the merge method)

  //figure out the mapId
  TaskAttemptID mapId = inputs.get(0).getMapId();
  TaskID mapTaskId = mapId.getTaskID();

  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
      createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  Path outputPath =
      mapOutputFile.getInputFileForWrite(mapTaskId,
          mergeOutputSize).suffix(Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator rIter = null;
  CompressAwarePath compressAwarePath;
  try {
    LOG.info("Initiating in-memory merge with " + noInMemorySegments +
        " segments...");

    rIter = Merger.merge(jobConf, rfs,
        (Class<K>)jobConf.getMapOutputKeyClass(),
        (Class<V>)jobConf.getMapOutputValueClass(),
        inMemorySegments, inMemorySegments.size(),
        new Path(reduceId.toString()),
        (RawComparator<K>)jobConf.getOutputKeyComparator(),
        reporter, spilledRecordsCounter, null, null);

    if (null == combinerClass) {
      Merger.writeFile(rIter, writer, reporter, jobConf);
    } else {
      combineCollector.setWriter(writer);
      combineAndSpill(rIter, reduceCombineInputCounter);
    }
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());

    LOG.info(reduceId +
        " Merge of the " + noInMemorySegments +
        " files in-memory complete." +
        " Local file is " + outputPath + " of size " +
        localFS.getFileStatus(outputPath).getLen());
  } catch (IOException e) {
    //make sure that we delete the ondisk file that we created
    //earlier when we invoked cloneFileAttributes
    localFS.delete(outputPath, true);
    throw e;
  }

  // Note the output of the merge
  closeOnDiskFile(compressAwarePath);
}
Example #25
Source File: BackupStore.java From big-c with Apache License 2.0 | 4 votes |
public boolean hasNext() throws IOException {

  if (lastSegmentEOF) {
    return false;
  }

  // We read the next KV from the cache to decide if there is any left.
  // Since hasNext can be called several times before the actual call to
  // next(), we use hasMore to avoid extra reads. hasMore is set to false
  // when the user actually consumes this record in next()
  if (hasMore) {
    return true;
  }

  Segment<K,V> seg = segmentList.get(readSegmentIndex);
  // Mark the current position. This would be set to currentKVOffset
  // when the user consumes this record in next().
  nextKVOffset = (int) seg.getActualPosition();
  if (seg.nextRawKey()) {
    currentKey = seg.getKey();
    seg.getValue(currentValue);
    hasMore = true;
    return true;
  } else {
    if (!seg.inMemory()) {
      seg.closeReader();
    }
  }

  // If this is the last segment, mark the lastSegmentEOF flag and return
  if (readSegmentIndex == segmentList.size() - 1) {
    nextKVOffset = -1;
    lastSegmentEOF = true;
    return false;
  }

  nextKVOffset = 0;
  readSegmentIndex ++;

  Segment<K,V> nextSegment = segmentList.get(readSegmentIndex);

  // We possibly are moving from a memory segment to a disk segment.
  // Reset so that we do not corrupt the in-memory segment buffer.
  // See HADOOP-5494
  if (!nextSegment.inMemory()) {
    currentValue.reset(currentDiskValue.getData(),
        currentDiskValue.getLength());
    nextSegment.init(null);
  }

  if (nextSegment.nextRawKey()) {
    currentKey = nextSegment.getKey();
    nextSegment.getValue(currentValue);
    hasMore = true;
    return true;
  } else {
    throw new IOException("New segment did not have even one K/V");
  }
}
Example #26
Source File: TestMerger.java From hadoop with Apache License 2.0 | 4 votes |
private Segment<Text, Text> getCompressedSegment(int i) throws IOException {
  return new Segment<Text, Text>(getReader(i, true), false, 3000l);
}
Example #27
Source File: TestMerger.java From hadoop with Apache License 2.0 | 4 votes |
private Segment<Text, Text> getUncompressedSegment(int i) throws IOException {
  return new Segment<Text, Text>(getReader(i, false), false);
}
Example #28
Source File: TestMerger.java From hadoop with Apache License 2.0 | 4 votes |
@SuppressWarnings( { "unchecked" }) public void testMergeShouldReturnProperProgress( List<Segment<Text, Text>> segments) throws IOException { Path tmpDir = new Path("localpath"); Class<Text> keyClass = (Class<Text>) jobConf.getMapOutputKeyClass(); Class<Text> valueClass = (Class<Text>) jobConf.getMapOutputValueClass(); RawComparator<Text> comparator = jobConf.getOutputKeyComparator(); Counter readsCounter = new Counter(); Counter writesCounter = new Counter(); Progress mergePhase = new Progress(); RawKeyValueIterator mergeQueue = Merger.merge(conf, fs, keyClass, valueClass, segments, 2, tmpDir, comparator, getReporter(), readsCounter, writesCounter, mergePhase); final float epsilon = 0.00001f; // Reading 6 keys total, 3 each in 2 segments, so each key read moves the // progress forward 1/6th of the way. Initially the first keys from each // segment have been read as part of the merge setup, so progress = 2/6. Assert.assertEquals(2/6.0f, mergeQueue.getProgress().get(), epsilon); // The first next() returns one of the keys already read during merge setup Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(2/6.0f, mergeQueue.getProgress().get(), epsilon); // Subsequent next() calls should read one key and move progress Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(3/6.0f, mergeQueue.getProgress().get(), epsilon); Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(4/6.0f, mergeQueue.getProgress().get(), epsilon); // At this point we've exhausted all of the keys in one segment // so getting the next key will return the already cached key from the // other segment Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(4/6.0f, mergeQueue.getProgress().get(), epsilon); // Subsequent next() calls should read one key and move progress Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(5/6.0f, mergeQueue.getProgress().get(), epsilon); Assert.assertTrue(mergeQueue.next()); Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), epsilon); // Now there should be no more input Assert.assertFalse(mergeQueue.next()); Assert.assertEquals(1.0f, mergeQueue.getProgress().get(), epsilon); Assert.assertTrue(mergeQueue.getKey() == null); Assert.assertEquals(0, mergeQueue.getValue().getData().length); }
Example #29
Source File: MergeManagerImpl.java From hadoop with Apache License 2.0 | 4 votes |
@Override
public void merge(List<InMemoryMapOutput<K,V>> inputs) throws IOException {
  if (inputs == null || inputs.size() == 0) {
    return;
  }

  //name this output file same as the name of the first file that is
  //there in the current list of inmem files (this is guaranteed to
  //be absent on the disk currently. So we don't overwrite a prev.
  //created spill). Also we need to create the output file now since
  //it is not guaranteed that this file will be present after merge
  //is called (we delete empty files as soon as we see them
  //in the merge method)

  //figure out the mapId
  TaskAttemptID mapId = inputs.get(0).getMapId();
  TaskID mapTaskId = mapId.getTaskID();

  List<Segment<K, V>> inMemorySegments = new ArrayList<Segment<K, V>>();
  long mergeOutputSize =
      createInMemorySegments(inputs, inMemorySegments, 0);
  int noInMemorySegments = inMemorySegments.size();

  Path outputPath =
      mapOutputFile.getInputFileForWrite(mapTaskId,
          mergeOutputSize).suffix(Task.MERGED_OUTPUT_PREFIX);

  FSDataOutputStream out =
      CryptoUtils.wrapIfNecessary(jobConf, rfs.create(outputPath));
  Writer<K, V> writer = new Writer<K, V>(jobConf, out,
      (Class<K>) jobConf.getMapOutputKeyClass(),
      (Class<V>) jobConf.getMapOutputValueClass(), codec, null, true);

  RawKeyValueIterator rIter = null;
  CompressAwarePath compressAwarePath;
  try {
    LOG.info("Initiating in-memory merge with " + noInMemorySegments +
        " segments...");

    rIter = Merger.merge(jobConf, rfs,
        (Class<K>)jobConf.getMapOutputKeyClass(),
        (Class<V>)jobConf.getMapOutputValueClass(),
        inMemorySegments, inMemorySegments.size(),
        new Path(reduceId.toString()),
        (RawComparator<K>)jobConf.getOutputKeyComparator(),
        reporter, spilledRecordsCounter, null, null);

    if (null == combinerClass) {
      Merger.writeFile(rIter, writer, reporter, jobConf);
    } else {
      combineCollector.setWriter(writer);
      combineAndSpill(rIter, reduceCombineInputCounter);
    }
    writer.close();
    compressAwarePath = new CompressAwarePath(outputPath,
        writer.getRawLength(), writer.getCompressedLength());

    LOG.info(reduceId +
        " Merge of the " + noInMemorySegments +
        " files in-memory complete." +
        " Local file is " + outputPath + " of size " +
        localFS.getFileStatus(outputPath).getLen());
  } catch (IOException e) {
    //make sure that we delete the ondisk file that we created
    //earlier when we invoked cloneFileAttributes
    localFS.delete(outputPath, true);
    throw e;
  }

  // Note the output of the merge
  closeOnDiskFile(compressAwarePath);
}
Example #30
Source File: BackupStore.java From hadoop with Apache License 2.0 | 4 votes |
public boolean hasNext() throws IOException {

  if (lastSegmentEOF) {
    return false;
  }

  // We read the next KV from the cache to decide if there is any left.
  // Since hasNext can be called several times before the actual call to
  // next(), we use hasMore to avoid extra reads. hasMore is set to false
  // when the user actually consumes this record in next()
  if (hasMore) {
    return true;
  }

  Segment<K,V> seg = segmentList.get(readSegmentIndex);
  // Mark the current position. This would be set to currentKVOffset
  // when the user consumes this record in next().
  nextKVOffset = (int) seg.getActualPosition();
  if (seg.nextRawKey()) {
    currentKey = seg.getKey();
    seg.getValue(currentValue);
    hasMore = true;
    return true;
  } else {
    if (!seg.inMemory()) {
      seg.closeReader();
    }
  }

  // If this is the last segment, mark the lastSegmentEOF flag and return
  if (readSegmentIndex == segmentList.size() - 1) {
    nextKVOffset = -1;
    lastSegmentEOF = true;
    return false;
  }

  nextKVOffset = 0;
  readSegmentIndex ++;

  Segment<K,V> nextSegment = segmentList.get(readSegmentIndex);

  // We possibly are moving from a memory segment to a disk segment.
  // Reset so that we do not corrupt the in-memory segment buffer.
  // See HADOOP-5494
  if (!nextSegment.inMemory()) {
    currentValue.reset(currentDiskValue.getData(),
        currentDiskValue.getLength());
    nextSegment.init(null);
  }

  if (nextSegment.nextRawKey()) {
    currentKey = nextSegment.getKey();
    nextSegment.getValue(currentValue);
    hasMore = true;
    return true;
  } else {
    throw new IOException("New segment did not have even one K/V");
  }
}