public virtual void TestInMemoryAndOnDiskMerger()
{
    JobID jobId = new JobID("a", 0);
    TaskAttemptID reduceId1 = new TaskAttemptID(new TaskID(jobId, TaskType.Reduce, 0), 0);
    TaskAttemptID mapId1 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 1), 0);
    TaskAttemptID mapId2 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 2), 0);
    LocalDirAllocator lda = new LocalDirAllocator(MRConfig.LocalDir);
    MergeManagerImpl<Text, Text> mergeManager = new MergeManagerImpl<Text, Text>(reduceId1,
        jobConf, fs, lda, Reporter.Null, null, null, null, null, null, null, null,
        new Progress(), new MROutputFiles());
    // write map outputs
    IDictionary<string, string> map1 = new SortedDictionary<string, string>();
    map1["apple"] = "disgusting";
    map1["carrot"] = "delicious";
    IDictionary<string, string> map2 = new SortedDictionary<string, string>();
    map2["banana"] = "pretty good";
    byte[] mapOutputBytes1 = WriteMapOutput(conf, map1);
    byte[] mapOutputBytes2 = WriteMapOutput(conf, map2);
    InMemoryMapOutput<Text, Text> mapOutput1 = new InMemoryMapOutput<Text, Text>(conf,
        mapId1, mergeManager, mapOutputBytes1.Length, null, true);
    InMemoryMapOutput<Text, Text> mapOutput2 = new InMemoryMapOutput<Text, Text>(conf,
        mapId2, mergeManager, mapOutputBytes2.Length, null, true);
    System.Array.Copy(mapOutputBytes1, 0, mapOutput1.GetMemory(), 0, mapOutputBytes1.Length);
    System.Array.Copy(mapOutputBytes2, 0, mapOutput2.GetMemory(), 0, mapOutputBytes2.Length);
    // create merger and run merge
    MergeThread<InMemoryMapOutput<Text, Text>, Text, Text> inMemoryMerger =
        mergeManager.CreateInMemoryMerger();
    IList<InMemoryMapOutput<Text, Text>> mapOutputs1 = new AList<InMemoryMapOutput<Text, Text>>();
    mapOutputs1.AddItem(mapOutput1);
    mapOutputs1.AddItem(mapOutput2);
    inMemoryMerger.Merge(mapOutputs1);
    NUnit.Framework.Assert.AreEqual(1, mergeManager.onDiskMapOutputs.Count);
    TaskAttemptID reduceId2 = new TaskAttemptID(new TaskID(jobId, TaskType.Reduce, 3), 0);
    TaskAttemptID mapId3 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 4), 0);
    TaskAttemptID mapId4 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 5), 0);
    // write map outputs
    IDictionary<string, string> map3 = new SortedDictionary<string, string>();
    map3["apple"] = "awesome";
    map3["carrot"] = "amazing";
    IDictionary<string, string> map4 = new SortedDictionary<string, string>();
    map4["banana"] = "bla";
    byte[] mapOutputBytes3 = WriteMapOutput(conf, map3);
    byte[] mapOutputBytes4 = WriteMapOutput(conf, map4);
    InMemoryMapOutput<Text, Text> mapOutput3 = new InMemoryMapOutput<Text, Text>(conf,
        mapId3, mergeManager, mapOutputBytes3.Length, null, true);
    InMemoryMapOutput<Text, Text> mapOutput4 = new InMemoryMapOutput<Text, Text>(conf,
        mapId4, mergeManager, mapOutputBytes4.Length, null, true);
    System.Array.Copy(mapOutputBytes3, 0, mapOutput3.GetMemory(), 0, mapOutputBytes3.Length);
    System.Array.Copy(mapOutputBytes4, 0, mapOutput4.GetMemory(), 0, mapOutputBytes4.Length);
    // create merger and run merge
    MergeThread<InMemoryMapOutput<Text, Text>, Text, Text> inMemoryMerger2 =
        mergeManager.CreateInMemoryMerger();
    IList<InMemoryMapOutput<Text, Text>> mapOutputs2 = new AList<InMemoryMapOutput<Text, Text>>();
    mapOutputs2.AddItem(mapOutput3);
    mapOutputs2.AddItem(mapOutput4);
    inMemoryMerger2.Merge(mapOutputs2);
    NUnit.Framework.Assert.AreEqual(2, mergeManager.onDiskMapOutputs.Count);
    IList<MergeManagerImpl.CompressAwarePath> paths = new AList<MergeManagerImpl.CompressAwarePath>();
    IEnumerator<MergeManagerImpl.CompressAwarePath> iterator =
        mergeManager.onDiskMapOutputs.GetEnumerator();
    IList<string> keys = new AList<string>();
    IList<string> values = new AList<string>();
    while (iterator.HasNext())
    {
        MergeManagerImpl.CompressAwarePath next = iterator.Next();
        ReadOnDiskMapOutput(conf, fs, next, keys, values);
        paths.AddItem(next);
    }
    NUnit.Framework.Assert.AreEqual(keys, Arrays.AsList("apple", "banana", "carrot",
        "apple", "banana", "carrot"));
    NUnit.Framework.Assert.AreEqual(values, Arrays.AsList("awesome", "bla", "amazing",
        "disgusting", "pretty good", "delicious"));
    mergeManager.Close();
    mergeManager = new MergeManagerImpl<Text, Text>(reduceId2, jobConf, fs, lda,
        Reporter.Null, null, null, null, null, null, null, null, new Progress(),
        new MROutputFiles());
    MergeThread<MergeManagerImpl.CompressAwarePath, Text, Text> onDiskMerger =
        mergeManager.CreateOnDiskMerger();
    onDiskMerger.Merge(paths);
    NUnit.Framework.Assert.AreEqual(1, mergeManager.onDiskMapOutputs.Count);
    keys = new AList<string>();
    values = new AList<string>();
    ReadOnDiskMapOutput(conf, fs, mergeManager.onDiskMapOutputs.GetEnumerator().Next(),
        keys, values);
    NUnit.Framework.Assert.AreEqual(keys, Arrays.AsList("apple", "apple", "banana",
        "banana", "carrot", "carrot"));
    NUnit.Framework.Assert.AreEqual(values, Arrays.AsList("awesome", "disgusting",
        "pretty good", "bla", "amazing", "delicious"));
    mergeManager.Close();
    NUnit.Framework.Assert.AreEqual(0, mergeManager.inMemoryMapOutputs.Count);
    NUnit.Framework.Assert.AreEqual(0, mergeManager.inMemoryMergedMapOutputs.Count);
    NUnit.Framework.Assert.AreEqual(0, mergeManager.onDiskMapOutputs.Count);
}
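// The test above relies on two helpers, WriteMapOutput and ReadOnDiskMapOutput,
// that are outside this excerpt. The sketch below is a hedged reconstruction of
// what WriteMapOutput plausibly does: serialize a sorted map into IFile format
// in memory so it can be copied into an InMemoryMapOutput buffer. The
// Sharpen-style ByteArrayOutputStream/FSDataOutputStream wrappers, the
// IFile.Writer signature, and the method name WriteMapOutputSketch are
// assumptions for illustration, not the production helper.
private static byte[] WriteMapOutputSketch(Configuration conf,
    IDictionary<string, string> keysToValues)
{
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    FSDataOutputStream fsdos = new FSDataOutputStream(baos, null);
    // IFile is the key/value record format the shuffle reads back during merges.
    IFile.Writer<Text, Text> writer = new IFile.Writer<Text, Text>(conf, fsdos,
        typeof(Text), typeof(Text), null, null);
    foreach (string key in keysToValues.Keys)
    {
        writer.Append(new Text(key), new Text(keysToValues[key]));
    }
    writer.Close();
    return baos.ToByteArray();
}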
public MergeManagerImpl(TaskAttemptID reduceId, JobConf jobConf, FileSystem localFS,
    LocalDirAllocator localDirAllocator, Reporter reporter, CompressionCodec codec,
    Type combinerClass, Task.CombineOutputCollector<K, V> combineCollector,
    Counters.Counter spilledRecordsCounter, Counters.Counter reduceCombineInputCounter,
    Counters.Counter mergedMapOutputsCounter, ExceptionReporter exceptionReporter,
    Progress mergePhase, MapOutputFile mapOutputFile)
{
    this.reduceId = reduceId;
    this.jobConf = jobConf;
    this.localDirAllocator = localDirAllocator;
    this.exceptionReporter = exceptionReporter;
    this.reporter = reporter;
    this.codec = codec;
    this.combinerClass = combinerClass;
    this.combineCollector = combineCollector;
    this.reduceCombineInputCounter = reduceCombineInputCounter;
    this.spilledRecordsCounter = spilledRecordsCounter;
    this.mergedMapOutputsCounter = mergedMapOutputsCounter;
    this.mapOutputFile = mapOutputFile;
    this.mapOutputFile.SetConf(jobConf);
    this.localFS = localFS;
    this.rfs = ((LocalFileSystem)localFS).GetRaw();
    float maxInMemCopyUse = jobConf.GetFloat(MRJobConfig.ShuffleInputBufferPercent,
        MRJobConfig.DefaultShuffleInputBufferPercent);
    if (maxInMemCopyUse > 1.0 || maxInMemCopyUse < 0.0)
    {
        throw new ArgumentException("Invalid value for " + MRJobConfig.ShuffleInputBufferPercent
            + ": " + maxInMemCopyUse);
    }
    // Allow unit tests to fix Runtime memory
    this.memoryLimit = (long)(jobConf.GetLong(MRJobConfig.ReduceMemoryTotalBytes,
        Runtime.GetRuntime().MaxMemory()) * maxInMemCopyUse);
    this.ioSortFactor = jobConf.GetInt(MRJobConfig.IoSortFactor, 100);
    // Maximum percentage of the in-memory limit that a single shuffle can consume
    float singleShuffleMemoryLimitPercent = jobConf.GetFloat(
        MRJobConfig.ShuffleMemoryLimitPercent, DefaultShuffleMemoryLimitPercent);
    if (singleShuffleMemoryLimitPercent <= 0.0f || singleShuffleMemoryLimitPercent > 1.0f)
    {
        throw new ArgumentException("Invalid value for " + MRJobConfig.ShuffleMemoryLimitPercent
            + ": " + singleShuffleMemoryLimitPercent);
    }
    usedMemory = 0L;
    commitMemory = 0L;
    this.maxSingleShuffleLimit = (long)(memoryLimit * singleShuffleMemoryLimitPercent);
    this.memToMemMergeOutputsThreshold = jobConf.GetInt(MRJobConfig.ReduceMemtomemThreshold,
        ioSortFactor);
    this.mergeThreshold = (long)(this.memoryLimit * jobConf.GetFloat(
        MRJobConfig.ShuffleMergePercent, 0.90f));
    Log.Info("MergerManager: memoryLimit=" + memoryLimit + ", "
        + "maxSingleShuffleLimit=" + maxSingleShuffleLimit + ", "
        + "mergeThreshold=" + mergeThreshold + ", "
        + "ioSortFactor=" + ioSortFactor + ", "
        + "memToMemMergeOutputsThreshold=" + memToMemMergeOutputsThreshold);
    if (this.maxSingleShuffleLimit >= this.mergeThreshold)
    {
        throw new RuntimeException("Invalid configuration: "
            + "maxSingleShuffleLimit should be less than mergeThreshold; "
            + "maxSingleShuffleLimit: " + this.maxSingleShuffleLimit
            + ", mergeThreshold: " + this.mergeThreshold);
    }
    bool allowMemToMemMerge = jobConf.GetBoolean(MRJobConfig.ReduceMemtomemEnabled, false);
    if (allowMemToMemMerge)
    {
        this.memToMemMerger = new MergeManagerImpl.IntermediateMemoryToMemoryMerger(this,
            this, memToMemMergeOutputsThreshold);
        this.memToMemMerger.Start();
    }
    else
    {
        this.memToMemMerger = null;
    }
    this.inMemoryMerger = CreateInMemoryMerger();
    this.inMemoryMerger.Start();
    this.onDiskMerger = new MergeManagerImpl.OnDiskMerger(this, this);
    this.onDiskMerger.Start();
    this.mergePhase = mergePhase;
}
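// Hypothetical walk-through of the constructor's sizing arithmetic, using the
// 0.90f ShuffleMergePercent default visible above plus assumed defaults of 0.70
// for ShuffleInputBufferPercent and 0.25 for ShuffleMemoryLimitPercent; the heap
// size is made up for illustration.
private static void IllustrateShuffleSizing()
{
    long heap = 1024L * 1024 * 1024;                           // assume a 1 GiB reducer heap
    long memoryLimit = (long)(heap * 0.70f);                   // 70% of heap held for shuffle buffers
    long maxSingleShuffleLimit = (long)(memoryLimit * 0.25f);  // one fetch may use 25% of that
    long mergeThreshold = (long)(memoryLimit * 0.90f);         // merge once 90% of it is committed
    // The constructor's sanity check requires maxSingleShuffleLimit < mergeThreshold
    // (here roughly 188 MB < 676 MB); otherwise a single fetch could blow past the
    // point at which the in-memory merger is supposed to start draining memory.
    System.Console.WriteLine(memoryLimit + " " + maxSingleShuffleLimit + " " + mergeThreshold);
}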
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Sharpen.URISyntaxException"/>
/// <exception cref="System.Exception"/>
public virtual void TestOnDiskMerger()
{
    JobConf jobConf = new JobConf();
    int SortFactor = 5;
    jobConf.SetInt(MRJobConfig.IoSortFactor, SortFactor);
    MapOutputFile mapOutputFile = new MROutputFiles();
    FileSystem fs = FileSystem.GetLocal(jobConf);
    MergeManagerImpl<IntWritable, IntWritable> manager = new MergeManagerImpl<IntWritable,
        IntWritable>(null, jobConf, fs, null, null, null, null, null, null, null, null,
        null, null, mapOutputFile);
    MergeThread<MapOutput<IntWritable, IntWritable>, IntWritable, IntWritable> onDiskMerger =
        (MergeThread<MapOutput<IntWritable, IntWritable>, IntWritable, IntWritable>)
        Whitebox.GetInternalState(manager, "onDiskMerger");
    int mergeFactor = (int)Whitebox.GetInternalState(onDiskMerger, "mergeFactor");
    // make sure the io.sort.factor is set properly
    NUnit.Framework.Assert.AreEqual(mergeFactor, SortFactor);
    // Stop the onDiskMerger thread so that we can intercept the list of files
    // waiting to be merged.
    onDiskMerger.Suspend();
    // Send the list of fake files waiting to be merged
    Random rand = new Random();
    for (int i = 0; i < 2 * SortFactor; ++i)
    {
        Path path = new Path("somePath");
        MergeManagerImpl.CompressAwarePath cap = new MergeManagerImpl.CompressAwarePath(path,
            1L, rand.Next());
        manager.CloseOnDiskFile(cap);
    }
    // Check that the files pending to be merged are in sorted order.
    List<IList<MergeManagerImpl.CompressAwarePath>> pendingToBeMerged =
        (List<IList<MergeManagerImpl.CompressAwarePath>>)Whitebox.GetInternalState(
        onDiskMerger, "pendingToBeMerged");
    NUnit.Framework.Assert.IsTrue("No inputs were added to list pending to merge",
        pendingToBeMerged.Count > 0);
    for (int i_1 = 0; i_1 < pendingToBeMerged.Count; ++i_1)
    {
        IList<MergeManagerImpl.CompressAwarePath> inputs = pendingToBeMerged[i_1];
        NUnit.Framework.Assert.IsTrue("Not enough / too many inputs were going to be merged",
            inputs.Count > 0 && inputs.Count <= SortFactor);
        for (int j = 1; j < inputs.Count; ++j)
        {
            NUnit.Framework.Assert.IsTrue("Inputs to be merged were not sorted according to size",
                inputs[j].GetCompressedSize() >= inputs[j - 1].GetCompressedSize());
        }
    }
}
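// Whitebox.GetInternalState above is a test utility (shipped with Mockito/PowerMock
// in the Java original) that reads a private field via reflection. If that helper
// is unavailable, the sketch below is a rough C# equivalent; the name
// GetInternalStateSketch is illustrative, and fields declared on a base class such
// as MergeThread would additionally require walking up the type hierarchy.
private static object GetInternalStateSketch(object target, string fieldName)
{
    System.Reflection.FieldInfo field = target.GetType().GetField(fieldName,
        System.Reflection.BindingFlags.Instance
        | System.Reflection.BindingFlags.NonPublic
        | System.Reflection.BindingFlags.Public);
    if (field == null)
    {
        throw new System.MissingFieldException(target.GetType().FullName, fieldName);
    }
    return field.GetValue(target);
}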