/// <summary>
/// Records the current read position as the point the next reset starts
/// from, and discards every segment that precedes the active one.
/// </summary>
/// <remarks>
/// After this call <c>firstSegmentOffset</c> equals <c>currentKVOffset</c>
/// and <c>readSegmentIndex</c> is 0, i.e. the active segment becomes the
/// first entry of <c>segmentList</c>.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
public virtual void Mark()
{
    // We read one KV pair in advance in hasNext.
    // If hasNext has read the next KV pair from a new segment, but the
    // user has not called next() for that KV, then reset the readSegmentIndex
    // to the previous segment
    if (nextKVOffset == 0)
    {
        System.Diagnostics.Debug.Assert((readSegmentIndex != 0));
        System.Diagnostics.Debug.Assert((currentKVOffset != 0));
        readSegmentIndex--;
    }

    // Just drop segments before the current active segment.
    // The list is consumed from the front by index: IEnumerator<T> offers no
    // HasNext/Next/Remove members, and a .NET list must not be modified while
    // it is being enumerated. Each pass closes and removes the head segment
    // until readSegmentIndex segments are gone or the list is empty.
    int dropped = 0;
    while (dropped != readSegmentIndex && segmentList.Count > 0)
    {
        Merger.Segment<K, V> s = segmentList[0];
        s.Close();
        segmentList.RemoveAt(0);
        dropped++;
        Log.Debug("Dropping a segment");
    }

    // FirstSegmentOffset is the offset in the current segment from where we
    // need to start reading on the next reset
    firstSegmentOffset = currentKVOffset;
    readSegmentIndex = 0;
    Log.Debug("Setting the FirsSegmentOffset to " + currentKVOffset);
}
/// <summary>Orders two segments by their length, shortest first.</summary>
/// <param name="o1">Left-hand segment.</param>
/// <param name="o2">Right-hand segment.</param>
/// <returns>
/// -1 when <paramref name="o1"/> is shorter, 1 when it is longer, and 0 when
/// both segments report the same length.
/// </returns>
public int Compare(Merger.Segment<K, V> o1, Merger.Segment<K, V> o2)
{
    long leftLength = o1.GetLength();
    long rightLength = o2.GetLength();
    if (leftLength < rightLength)
    {
        return -1;
    }
    if (leftLength > rightLength)
    {
        return 1;
    }
    return 0;
}
/// <summary>
/// Closes the active writer and registers the file it wrote as a new segment
/// at the end of the enclosing store's segment list.
/// </summary>
/// <exception cref="System.IO.IOException"/>
internal virtual void CreateInDiskSegment()
{
    System.Diagnostics.Debug.Assert(writer != null);

    // Close the writer before a segment is built over the same file.
    writer.Close();
    Merger.Segment<K, V> diskSegment =
        new Merger.Segment<K, V>(conf, fs, file, null, true);

    // The writer reference is only cleared once the segment exists.
    writer = null;

    _enclosing.segmentList.AddItem(diskSegment);
    BackupStore.Log.Debug(
        "Disk Segment added to List. Size is " + _enclosing.segmentList.Count);
}
/// <summary>
/// Advances the merge to the next key/value pair taken from the segment at
/// the top of the priority queue.
/// </summary>
/// <returns>
/// <c>true</c> when a pair was loaded into <c>key</c>/<c>value</c>;
/// <c>false</c> when the queue is empty, in which case the key/value state
/// has been reset.
/// </returns>
/// <exception cref="System.IO.IOException"/>
public virtual bool Next()
{
    bool exhausted = Size() == 0;

    // minSegment is non-null for every invocation except the first one.
    // On the first invocation the priority queue is ready for use as is;
    // afterwards the queue has to be adjusted before it is consulted again.
    if (!exhausted && minSegment != null)
    {
        AdjustPriorityQueue(minSegment);
        exhausted = Size() == 0;
        if (exhausted)
        {
            minSegment = null;
        }
    }

    if (exhausted)
    {
        ResetKeyValue();
        return false;
    }

    minSegment = Top();
    long bytesBefore = minSegment.GetReader().bytesRead;
    key = minSegment.GetKey();

    if (minSegment.InMemory())
    {
        minSegment.GetValue(value);
    }
    else
    {
        // Loading a value from an in-memory segment points the "value" DIB at
        // that segment's byte[]. Reading disk bytes into the same buffer would
        // corrupt the in-memory segment, so disk values go through a dedicated
        // DIB (diskIFileValue) and "value" is then re-pointed at its bytes.
        minSegment.GetValue(diskIFileValue);
        value.Reset(diskIFileValue.GetData(), diskIFileValue.GetLength());
    }

    // Account for the bytes consumed while loading this record.
    totalBytesProcessed += minSegment.GetReader().bytesRead - bytesBefore;
    mergeProgress.Set(totalBytesProcessed * progPerByte);
    return true;
}
/// <summary>
/// Moves <paramref name="reader"/> to its next raw key, updates the merge
/// progress with the bytes consumed, and then either re-orders the queue or
/// removes and closes the segment if it has no more keys.
/// </summary>
/// <param name="reader">Segment to advance.</param>
/// <exception cref="System.IO.IOException"/>
private void AdjustPriorityQueue(Merger.Segment<K, V> reader)
{
    long bytesBefore = reader.GetReader().bytesRead;
    bool advanced = reader.NextRawKey();
    totalBytesProcessed += reader.GetReader().bytesRead - bytesBefore;
    mergeProgress.Set(totalBytesProcessed * progPerByte);

    if (!advanced)
    {
        // Segment is drained: take it off the queue and release it.
        Pop();
        reader.Close();
        return;
    }

    AdjustTop();
}
/// <summary>
/// Switches the store into reset mode and rewinds every segment so that the
/// next read starts at <c>firstSegmentOffset</c> of the first segment.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void Reset()
{
    // Records written so far are turned into a new segment, but only when
    // we are not already in reset mode.
    if (!inReset && fileCache.isActive)
    {
        fileCache.CreateInDiskSegment();
    }
    else if (!inReset)
    {
        memCache.CreateInMemorySegment();
    }
    inReset = true;

    // Put each segment back at the position the next read should begin from.
    int index = 0;
    while (index < segmentList.Count)
    {
        Merger.Segment<K, V> segment = segmentList[index];
        bool isFirst = index == 0;
        if (!segment.InMemory())
        {
            // Disk readers are closed here; only the first segment gets a
            // new reader (positioned at firstSegmentOffset) at this point.
            segment.CloseReader();
            if (isFirst)
            {
                segment.ReinitReader(firstSegmentOffset);
                segment.GetReader().DisableChecksumValidation();
            }
        }
        else
        {
            int offset = isFirst ? firstSegmentOffset : 0;
            segment.GetReader().Reset(offset);
        }
        index++;
    }

    currentKVOffset = firstSegmentOffset;
    nextKVOffset = -1;
    readSegmentIndex = 0;
    hasMore = false;
    lastSegmentEOF = false;

    Log.Debug("Reset - First segment offset is " + firstSegmentOffset + " Segment List Size is " + segmentList.Count);
}
/// <summary>
/// Turns the current in-memory buffer into a segment and appends it to the
/// enclosing store's segment list.
/// </summary>
/// <remarks>
/// When nothing was written to the block, the whole block reservation is
/// released and no segment is created.
/// </remarks>
/// <exception cref="System.IO.IOException"/>
internal virtual void CreateInMemorySegment()
{
    // Nothing was written in this block (the record was larger than the
    // allocated block size): give the reservation back and stop.
    if (usedSize == 0)
    {
        ramManager.Unreserve(blockSize);
        return;
    }

    // spaceAvailable would have ensured that room for the EOF markers remains.
    System.Diagnostics.Debug.Assert(
        (blockSize - usedSize) >= BackupStore.EofMarkerSize);

    // Terminate the buffer with two EOF markers.
    for (int marker = 0; marker < 2; marker++)
    {
        WritableUtils.WriteVInt(dataOut, IFile.EofMarker);
    }
    usedSize += BackupStore.EofMarkerSize;

    // Release the part of the block that was never used.
    ramManager.Unreserve(blockSize - usedSize);

    IFile.Reader<K, V> memoryReader = new InMemoryReader<K, V>(
        null,
        (TaskAttemptID)_enclosing.tid,
        dataOut.GetData(),
        0,
        usedSize,
        _enclosing.conf);
    Merger.Segment<K, V> memorySegment = new Merger.Segment<K, V>(memoryReader, false);
    _enclosing.segmentList.AddItem(memorySegment);

    BackupStore.Log.Debug(
        "Added Memory Segment to List. List Size is " + _enclosing.segmentList.Count);
}
/// <summary>
/// Reports whether another key/value pair is available, reading it ahead
/// into <c>currentKey</c>/<c>currentValue</c> when necessary.
/// </summary>
/// <returns>
/// <c>true</c> when a pair is buffered and waiting to be consumed;
/// <c>false</c> once the last segment has been exhausted.
/// </returns>
/// <exception cref="System.IO.IOException">
/// Thrown when the segment that follows an exhausted one yields no record.
/// </exception>
public virtual bool HasNext()
{
    if (lastSegmentEOF)
    {
        return false;
    }

    // The next KV is read from the cache to decide whether anything is left.
    // hasNext may be called several times before next(), so hasMore guards
    // against extra reads; it is cleared when the record is consumed in next().
    if (hasMore)
    {
        return true;
    }

    Merger.Segment<K, V> current = segmentList[readSegmentIndex];

    // Remember where this record starts. It becomes currentKVOffset once the
    // user consumes the record in next().
    nextKVOffset = (int)current.GetActualPosition();

    if (current.NextRawKey())
    {
        currentKey = current.GetKey();
        current.GetValue(currentValue);
        hasMore = true;
        return true;
    }

    // The current segment is drained; disk segments release their reader.
    if (!current.InMemory())
    {
        current.CloseReader();
    }

    // If that was the last segment, flag EOF and report that nothing is left.
    bool wasLastSegment = readSegmentIndex == segmentList.Count - 1;
    if (wasLastSegment)
    {
        nextKVOffset = -1;
        lastSegmentEOF = true;
        return false;
    }

    nextKVOffset = 0;
    readSegmentIndex++;
    Merger.Segment<K, V> following = segmentList[readSegmentIndex];

    // We possibly are moving from a memory segment to a disk segment.
    // Reset so that we do not corrupt the in-memory segment buffer.
    // See HADOOP-5494
    if (!following.InMemory())
    {
        currentValue.Reset(currentDiskValue.GetData(), currentDiskValue.GetLength());
        following.Init(null);
    }

    if (!following.NextRawKey())
    {
        throw new IOException("New segment did not have even one K/V");
    }

    currentKey = following.GetKey();
    following.GetValue(currentValue);
    hasMore = true;
    return true;
}
/// <summary>
/// Merges the queued segments in passes until the number of remaining
/// segments is no larger than the pass factor, then returns this queue as the
/// iterator over the final merge.
/// </summary>
/// <remarks>
/// Each intermediate pass writes its output to a file named
/// "intermediate.&lt;passNo&gt;" under <paramref name="tmpDir"/>, wraps that
/// file in a new segment and inserts it back into <c>segments</c> at the
/// position given by a binary search with <c>segmentComparator</c>.
/// Progress is reported through <c>mergeProgress</c> as
/// <c>totalBytesProcessed * progPerByte</c>.
/// </remarks>
/// <param name="keyClass">Key type handed to the intermediate file writer.</param>
/// <param name="valueClass">Value type handed to the intermediate file writer.</param>
/// <param name="factor">
/// Requested merge factor; the per-pass factor is derived from it through
/// <c>GetPassFactor</c> and it is restored to its original value after every pass.
/// </param>
/// <param name="inMem">
/// Count subtracted from the segment total when computing the pass factor and
/// added to the factor on the first pass (presumably the number of in-memory
/// segments at the head of the list; confirm against callers).
/// </param>
/// <param name="tmpDir">Directory under which intermediate files are named.</param>
/// <param name="readsCounter">Counter passed to each segment when it is initialized.</param>
/// <param name="writesCounter">Counter passed to the intermediate file writer.</param>
/// <param name="mergePhase">
/// When non-null, replaces <c>mergeProgress</c> as the progress object to update.
/// </param>
/// <returns>This instance, once the last merge pass has been set up.</returns>
/// <exception cref="System.IO.IOException"/>
internal virtual RawKeyValueIterator Merge(Type keyClass, Type valueClass, int factor
    , int inMem, Path tmpDir, Counters.Counter readsCounter, Counters.Counter writesCounter
    , Progress mergePhase)
{
    Log.Info("Merging " + segments.Count + " sorted segments");
    /*
    * If there are inMemory segments, then they come first in the segments
    * list and then the sorted disk segments. Otherwise(if there are only
    * disk segments), then they are sorted segments if there are more than
    * factor segments in the segments list.
    */
    int numSegments = segments.Count;
    int origFactor = factor;
    int passNo = 1;
    if (mergePhase != null)
    {
        mergeProgress = mergePhase;
    }
    // Progress is scaled by the total number of bytes expected across all
    // merge passes; progPerByte is left untouched when that total is zero.
    long totalBytes = ComputeBytesInMerges(factor, inMem);
    if (totalBytes != 0)
    {
        progPerByte = 1.0f / (float)totalBytes;
    }
    //create the MergeStreams from the sorted map created in the constructor
    //and dump the final output to a file
    do
    {
        //get the factor for this pass of merge. We assume in-memory segments
        //are the first entries in the segment list and that the pass factor
        //doesn't apply to them
        factor = GetPassFactor(factor, passNo, numSegments - inMem);
        if (1 == passNo)
        {
            factor += inMem;
        }
        IList <Merger.Segment <K, V> > segmentsToMerge = new AList <Merger.Segment <K, V> >();
        int segmentsConsidered = 0;
        int numSegmentsToConsider = factor;
        long startBytes = 0;
        // starting bytes of segments of this merge
        while (true)
        {
            //extract the smallest 'factor' number of segments
            //Call cleanup on the empty segments (no key/value data)
            IList <Merger.Segment <K, V> > mStream = GetSegmentDescriptors(numSegmentsToConsider );
            foreach (Merger.Segment <K, V> segment in mStream)
            {
                // Initialize the segment at the last possible moment;
                // this helps in ensuring we don't use buffers until we need them
                segment.Init(readsCounter);
                // Bytes consumed while positioning on the first key are
                // accumulated in startBytes for the segments that are kept.
                long startPos = segment.GetReader().bytesRead;
                bool hasNext = segment.NextRawKey();
                long endPos = segment.GetReader().bytesRead;
                if (hasNext)
                {
                    startBytes += endPos - startPos;
                    segmentsToMerge.AddItem(segment);
                    segmentsConsidered++;
                }
                else
                {
                    //we ignore this segment for the merge
                    segment.Close();
                    numSegments--;
                }
            }
            //if we have the desired number of segments
            //or looked at all available segments, we break
            if (segmentsConsidered == factor || segments.Count == 0)
            {
                break;
            }
            // Some of the fetched segments were empty: ask only for as many
            // more as are still missing.
            numSegmentsToConsider = factor - segmentsConsidered;
        }
        //feed the streams to the priority queue
        Initialize(segmentsToMerge.Count);
        Clear();
        foreach (Merger.Segment <K, V> segment_1 in segmentsToMerge)
        {
            Put(segment_1);
        }
        //if we have lesser number of segments remaining, then just return the
        //iterator, else do another single level merge
        if (numSegments <= factor)
        {
            if (!includeFinalMerge)
            {
                // for reduce task
                // Reset totalBytesProcessed and recalculate totalBytes from the
                // remaining segments to track the progress of the final merge.
                // Final merge is considered as the progress of the reducePhase,
                // the 3rd phase of reduce task.
                totalBytesProcessed = 0;
                totalBytes = 0;
                for (int i = 0; i < segmentsToMerge.Count; i++)
                {
                    totalBytes += segmentsToMerge[i].GetRawDataLength();
                }
            }
            if (totalBytes != 0)
            {
                //being paranoid
                progPerByte = 1.0f / (float)totalBytes;
            }
            totalBytesProcessed += startBytes;
            if (totalBytes != 0)
            {
                mergeProgress.Set(totalBytesProcessed * progPerByte);
            }
            else
            {
                // Last pass and no segments left - we're done
                mergeProgress.Set(1.0f);
            }
            Log.Info("Down to the last merge-pass, with " + numSegments + " segments left of total size: " + (totalBytes - totalBytesProcessed) + " bytes");
            return(this);
        }
        else
        {
            Log.Info("Merging " + segmentsToMerge.Count + " intermediate segments out of a total of " + (segments.Count + segmentsToMerge.Count));
            // Remember the running total so the input size of this pass can
            // be derived after the intermediate file has been written.
            long bytesProcessedInPrevMerges = totalBytesProcessed;
            totalBytesProcessed += startBytes;
            //we want to spread the creation of temp files on multiple disks if
            //available under the space constraints
            long approxOutputSize = 0;
            foreach (Merger.Segment <K, V> s in segmentsToMerge)
            {
                approxOutputSize += s.GetLength() +
                    ChecksumFileSystem.GetApproxChkSumLength(s.GetLength ());
            }
            Path tmpFilename = new Path(tmpDir, "intermediate").Suffix("." + passNo);
            Path outputFile = lDirAlloc.GetLocalPathForWrite(tmpFilename.ToString(), approxOutputSize , conf);
            FSDataOutputStream @out = fs.Create(outputFile);
            @out = CryptoUtils.WrapIfNecessary(conf, @out);
            // NOTE(review): the writer is closed only on the success path; if
            // WriteFile throws, it appears to be left open - confirm whether
            // a try/finally is wanted here.
            IFile.Writer <K, V> writer = new IFile.Writer <K, V>(conf, @out, keyClass, valueClass , codec, writesCounter, true);
            WriteFile(this, writer, reporter, conf);
            writer.Close();
            //we finished one single level merge; now clean up the priority
            //queue
            this.Close();
            // Add the newly create segment to the list of segments to be merged
            Merger.Segment <K, V> tempSegment = new Merger.Segment <K, V>(conf, fs, outputFile, codec, false);
            // Insert new merged segment into the sorted list
            int pos = Sharpen.Collections.BinarySearch(segments, tempSegment, segmentComparator );
            if (pos < 0)
            {
                // binary search failed. So position to be inserted at is -pos-1
                pos = -pos - 1;
            }
            // NOTE(review): Add(index, item) is the Java List signature; on a
            // plain .NET IList<T> the positional insert is Insert(index, item).
            // Confirm that the type of 'segments' provides this overload.
            segments.Add(pos, tempSegment);
            numSegments = segments.Count;
            // Subtract the difference between expected size of new segment and
            // actual size of new segment(Expected size of new segment is
            // inputBytesOfThisMerge) from totalBytes. Expected size and actual
            // size will match(almost) if combiner is not called in merge.
            long inputBytesOfThisMerge = totalBytesProcessed - bytesProcessedInPrevMerges;
            totalBytes -= inputBytesOfThisMerge - tempSegment.GetRawDataLength();
            if (totalBytes != 0)
            {
                progPerByte = 1.0f / (float)totalBytes;
            }
            passNo++;
        }
        //we are worried about only the first pass merge factor. So reset the
        //factor to what it originally was
        factor = origFactor;
    }while (true);
}