/// <exception cref="System.IO.IOException"/>
public override void Merge(IList<MergeManagerImpl.CompressAwarePath> inputs)
{
    // sanity check
    if (inputs == null || inputs.IsEmpty())
    {
        MergeManagerImpl.Log.Info("No ondisk files to merge...");
        return;
    }
    long approxOutputSize = 0;
    int bytesPerSum = this._enclosing.jobConf.GetInt("io.bytes.per.checksum", 512);
    MergeManagerImpl.Log.Info("OnDiskMerger: We have " + inputs.Count + " map outputs on disk. Triggering merge...");
    // 1. Prepare the list of files to be merged.
    foreach (MergeManagerImpl.CompressAwarePath file in inputs)
    {
        approxOutputSize += this._enclosing.localFS.GetFileStatus(file).GetLen();
    }
    // add the checksum length
    approxOutputSize += ChecksumFileSystem.GetChecksumLength(approxOutputSize, bytesPerSum);
    // 2. Start the on-disk merge process
    Path outputPath = this._enclosing.localDirAllocator.GetLocalPathForWrite(inputs[0].ToString(),
        approxOutputSize, this._enclosing.jobConf).Suffix(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix);
    FSDataOutputStream @out = CryptoUtils.WrapIfNecessary(this._enclosing.jobConf,
        this._enclosing.rfs.Create(outputPath));
    IFile.Writer<K, V> writer = new IFile.Writer<K, V>(this._enclosing.jobConf, @out,
        (Type)this._enclosing.jobConf.GetMapOutputKeyClass(),
        (Type)this._enclosing.jobConf.GetMapOutputValueClass(),
        this._enclosing.codec, null, true);
    RawKeyValueIterator iter = null;
    MergeManagerImpl.CompressAwarePath compressAwarePath;
    Path tmpDir = new Path(this._enclosing.reduceId.ToString());
    try
    {
        iter = Merger.Merge(this._enclosing.jobConf, this._enclosing.rfs,
            (Type)this._enclosing.jobConf.GetMapOutputKeyClass(),
            (Type)this._enclosing.jobConf.GetMapOutputValueClass(),
            this._enclosing.codec,
            Sharpen.Collections.ToArray(inputs, new Path[inputs.Count]),
            true, this._enclosing.ioSortFactor, tmpDir,
            (RawComparator<K>)this._enclosing.jobConf.GetOutputKeyComparator(),
            this._enclosing.reporter, this._enclosing.spilledRecordsCounter,
            null, this._enclosing.mergedMapOutputsCounter, null);
        Merger.WriteFile(iter, writer, this._enclosing.reporter, this._enclosing.jobConf);
        writer.Close();
        compressAwarePath = new MergeManagerImpl.CompressAwarePath(outputPath,
            writer.GetRawLength(), writer.GetCompressedLength());
    }
    catch (IOException)
    {
        this._enclosing.localFS.Delete(outputPath, true);
        throw;
    }
    this._enclosing.CloseOnDiskFile(compressAwarePath);
    MergeManagerImpl.Log.Info(this._enclosing.reduceId + " Finished merging " + inputs.Count
        + " map output files on disk of total-size " + approxOutputSize + "."
        + " Local output file is " + outputPath + " of size "
        + this._enclosing.localFS.GetFileStatus(outputPath).GetLen());
}
public virtual void TestgetChecksumLength()
{
    Assert.Equal(8, ChecksumFileSystem.GetChecksumLength(0L, 512));
    Assert.Equal(12, ChecksumFileSystem.GetChecksumLength(1L, 512));
    Assert.Equal(12, ChecksumFileSystem.GetChecksumLength(512L, 512));
    Assert.Equal(16, ChecksumFileSystem.GetChecksumLength(513L, 512));
    Assert.Equal(16, ChecksumFileSystem.GetChecksumLength(1023L, 512));
    Assert.Equal(16, ChecksumFileSystem.GetChecksumLength(1024L, 512));
    Assert.Equal(408, ChecksumFileSystem.GetChecksumLength(100L, 1));
    Assert.Equal(4000000000008L, ChecksumFileSystem.GetChecksumLength(10000000000000L, 10));
}
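// The expected values above are consistent with a checksum layout of an
// 8-byte header followed by one 4-byte CRC per bytesPerSum-byte chunk of
// data. A minimal sketch of that arithmetic; the helper name is
// illustrative only and is not part of ChecksumFileSystem:
static long ExpectedChecksumLength(long size, int bytesPerSum)
{
    long chunks = (size + bytesPerSum - 1) / bytesPerSum; // ceil(size / bytesPerSum)
    return chunks * 4 + 8;                                // 4 bytes per CRC plus the 8-byte header
}
// e.g. ExpectedChecksumLength(513L, 512) == 16 and
// ExpectedChecksumLength(10000000000000L, 10) == 4000000000008L,
// matching the assertions above.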
/// <exception cref="System.IO.IOException"/>
public virtual int Run(string[] args)
{
    // silence the minidfs cluster
    Log hadoopLog = LogFactory.GetLog("org");
    if (hadoopLog is Log4JLogger)
    {
        ((Log4JLogger)hadoopLog).GetLogger().SetLevel(Level.Warn);
    }
    int reps = 1;
    if (args.Length == 1)
    {
        try
        {
            reps = System.Convert.ToInt32(args[0]);
        }
        catch (FormatException)
        {
            PrintUsage();
            return -1;
        }
    }
    else
    {
        if (args.Length > 1)
        {
            PrintUsage();
            return -1;
        }
    }
    Configuration conf = GetConf();
    // the size of the file to write
    long Size = conf.GetLong("dfsthroughput.file.size", 10L * 1024 * 1024 * 1024);
    BufferSize = conf.GetInt("dfsthroughput.buffer.size", 4 * 1024);
    string localDir = conf.Get("mapred.temp.dir");
    if (localDir == null)
    {
        localDir = conf.Get("hadoop.tmp.dir");
        conf.Set("mapred.temp.dir", localDir);
    }
    dir = new LocalDirAllocator("mapred.temp.dir");
    Runtime.SetProperty("test.build.data", localDir);
    System.Console.Out.WriteLine("Local = " + localDir);
    ChecksumFileSystem checkedLocal = FileSystem.GetLocal(conf);
    FileSystem rawLocal = checkedLocal.GetRawFileSystem();
    for (int i = 0; i < reps; ++i)
    {
        WriteAndReadLocalFile("local", conf, Size);
        WriteAndReadFile(rawLocal, "raw", conf, Size);
        WriteAndReadFile(checkedLocal, "checked", conf, Size);
    }
    MiniDFSCluster cluster = null;
    try
    {
        cluster = new MiniDFSCluster.Builder(conf).Racks(new string[] { "/foo" }).Build();
        cluster.WaitActive();
        FileSystem dfs = cluster.GetFileSystem();
        for (int i_1 = 0; i_1 < reps; ++i_1)
        {
            WriteAndReadFile(dfs, "dfs", conf, Size);
        }
    }
    finally
    {
        if (cluster != null)
        {
            cluster.Shutdown();
            // clean up minidfs junk
            rawLocal.Delete(new Path(localDir, "dfs"), true);
        }
    }
    return 0;
}
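// Sketch of how this benchmark might be driven for a quick local run,
// assuming the Run method above belongs to the BenchmarkThroughput tool and
// is launched through ToolRunner. The class name, the 64 MB file size, and
// the single repetition are illustrative assumptions, not taken from the
// code above.
public static void RunSmallBenchmark()
{
    Configuration conf = new Configuration();
    // shrink the file from the 10 GB default so the run finishes quickly
    conf.SetLong("dfsthroughput.file.size", 64L * 1024 * 1024);
    conf.SetInt("dfsthroughput.buffer.size", 4 * 1024);
    // args[0] is the repetition count parsed by Run above
    ToolRunner.Run(conf, new BenchmarkThroughput(), new string[] { "1" });
}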
/// <exception cref="System.IO.IOException"/>
internal virtual RawKeyValueIterator Merge(Type keyClass, Type valueClass, int factor,
    int inMem, Path tmpDir, Counters.Counter readsCounter, Counters.Counter writesCounter,
    Progress mergePhase)
{
    Log.Info("Merging " + segments.Count + " sorted segments");
    /*
     * If there are inMemory segments, then they come first in the segments
     * list and then the sorted disk segments. Otherwise (if there are only
     * disk segments), then they are sorted segments if there are more than
     * factor segments in the segments list.
     */
    int numSegments = segments.Count;
    int origFactor = factor;
    int passNo = 1;
    if (mergePhase != null)
    {
        mergeProgress = mergePhase;
    }
    long totalBytes = ComputeBytesInMerges(factor, inMem);
    if (totalBytes != 0)
    {
        progPerByte = 1.0f / (float)totalBytes;
    }
    do
    {
        // create the MergeStreams from the sorted map created in the constructor
        // and dump the final output to a file
        // get the factor for this pass of merge. We assume in-memory segments
        // are the first entries in the segment list and that the pass factor
        // doesn't apply to them
        factor = GetPassFactor(factor, passNo, numSegments - inMem);
        if (1 == passNo)
        {
            factor += inMem;
        }
        IList<Merger.Segment<K, V>> segmentsToMerge = new AList<Merger.Segment<K, V>>();
        int segmentsConsidered = 0;
        int numSegmentsToConsider = factor;
        long startBytes = 0; // starting bytes of segments of this merge
        while (true)
        {
            // extract the smallest 'factor' number of segments
            // Call cleanup on the empty segments (no key/value data)
            IList<Merger.Segment<K, V>> mStream = GetSegmentDescriptors(numSegmentsToConsider);
            foreach (Merger.Segment<K, V> segment in mStream)
            {
                // Initialize the segment at the last possible moment;
                // this helps in ensuring we don't use buffers until we need them
                segment.Init(readsCounter);
                long startPos = segment.GetReader().bytesRead;
                bool hasNext = segment.NextRawKey();
                long endPos = segment.GetReader().bytesRead;
                if (hasNext)
                {
                    startBytes += endPos - startPos;
                    segmentsToMerge.AddItem(segment);
                    segmentsConsidered++;
                }
                else
                {
                    // we ignore this segment for the merge
                    segment.Close();
                    numSegments--;
                }
            }
            // if we have the desired number of segments
            // or looked at all available segments, we break
            if (segmentsConsidered == factor || segments.Count == 0)
            {
                break;
            }
            numSegmentsToConsider = factor - segmentsConsidered;
        }
        // feed the streams to the priority queue
        Initialize(segmentsToMerge.Count);
        Clear();
        foreach (Merger.Segment<K, V> segment_1 in segmentsToMerge)
        {
            Put(segment_1);
        }
        // if we have a lesser number of segments remaining, then just return the
        // iterator, else do another single level merge
        if (numSegments <= factor)
        {
            if (!includeFinalMerge)
            {
                // for reduce task
                // Reset totalBytesProcessed and recalculate totalBytes from the
                // remaining segments to track the progress of the final merge.
                // Final merge is considered as the progress of the reducePhase,
                // the 3rd phase of reduce task.
                totalBytesProcessed = 0;
                totalBytes = 0;
                for (int i = 0; i < segmentsToMerge.Count; i++)
                {
                    totalBytes += segmentsToMerge[i].GetRawDataLength();
                }
            }
            if (totalBytes != 0)
            {
                // being paranoid
                progPerByte = 1.0f / (float)totalBytes;
            }
            totalBytesProcessed += startBytes;
            if (totalBytes != 0)
            {
                mergeProgress.Set(totalBytesProcessed * progPerByte);
            }
            else
            {
                // Last pass and no segments left - we're done
                mergeProgress.Set(1.0f);
            }
            Log.Info("Down to the last merge-pass, with " + numSegments
                + " segments left of total size: "
                + (totalBytes - totalBytesProcessed) + " bytes");
            return this;
        }
        else
        {
            Log.Info("Merging " + segmentsToMerge.Count
                + " intermediate segments out of a total of "
                + (segments.Count + segmentsToMerge.Count));
            long bytesProcessedInPrevMerges = totalBytesProcessed;
            totalBytesProcessed += startBytes;
            // we want to spread the creation of temp files on multiple disks if
            // available under the space constraints
            long approxOutputSize = 0;
            foreach (Merger.Segment<K, V> s in segmentsToMerge)
            {
                approxOutputSize += s.GetLength() + ChecksumFileSystem.GetApproxChkSumLength(s.GetLength());
            }
            Path tmpFilename = new Path(tmpDir, "intermediate").Suffix("." + passNo);
            Path outputFile = lDirAlloc.GetLocalPathForWrite(tmpFilename.ToString(), approxOutputSize, conf);
            FSDataOutputStream @out = fs.Create(outputFile);
            @out = CryptoUtils.WrapIfNecessary(conf, @out);
            IFile.Writer<K, V> writer = new IFile.Writer<K, V>(conf, @out, keyClass, valueClass,
                codec, writesCounter, true);
            WriteFile(this, writer, reporter, conf);
            writer.Close();
            // we finished one single level merge; now clean up the priority queue
            this.Close();
            // Add the newly created segment to the list of segments to be merged
            Merger.Segment<K, V> tempSegment = new Merger.Segment<K, V>(conf, fs, outputFile, codec, false);
            // Insert new merged segment into the sorted list
            int pos = Sharpen.Collections.BinarySearch(segments, tempSegment, segmentComparator);
            if (pos < 0)
            {
                // binary search failed. So position to be inserted at is -pos-1
                pos = -pos - 1;
            }
            segments.Add(pos, tempSegment);
            numSegments = segments.Count;
            // Subtract the difference between expected size of new segment and
            // actual size of new segment (expected size of new segment is
            // inputBytesOfThisMerge) from totalBytes. Expected size and actual
            // size will match (almost) if combiner is not called in merge.
            long inputBytesOfThisMerge = totalBytesProcessed - bytesProcessedInPrevMerges;
            totalBytes -= inputBytesOfThisMerge - tempSegment.GetRawDataLength();
            if (totalBytes != 0)
            {
                progPerByte = 1.0f / (float)totalBytes;
            }
            passNo++;
        }
        // we are worried about only the first pass merge factor. So reset the
        // factor to what it originally was
        factor = origFactor;
    } while (true);
}
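// The loop above relies on GetPassFactor (not shown in this excerpt) to size
// the first merge pass so that every later pass can merge exactly 'factor'
// segments. A minimal sketch of that rule, assuming the standard Hadoop
// Merger behaviour; the helper name is illustrative:
static int FirstPassFactor(int factor, int passNo, int numSegments)
{
    if (passNo > 1 || numSegments <= factor || factor == 1)
    {
        return factor;
    }
    int mod = (numSegments - 1) % (factor - 1);
    return mod == 0 ? factor : mod + 1;
}
// e.g. with factor = 10 and 25 segments the first pass merges 7, leaving 19;
// one more pass of 10 leaves 10, which the final pass returns as the iterator.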