/// <exception cref="System.IO.IOException"/>
public override void Commit()
{
    fs.Rename(tmpOutputPath, outputPath);
    MergeManagerImpl.CompressAwarePath compressAwarePath =
        new MergeManagerImpl.CompressAwarePath(outputPath, GetSize(), this.compressedSize);
    merger.CloseOnDiskFile(compressAwarePath);
}
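// --- Illustrative sketch (not part of the class above) ---
// Commit() publishes a finished map output with the classic
// write-to-temp-then-rename idiom, then hands the file to the merge
// manager. A minimal standalone sketch of the same idiom, using plain
// System.IO instead of Hadoop's FileSystem (file names are hypothetical):
using System.IO;

class RenameCommitDemo
{
    static void Main()
    {
        string tmpOutputPath = "mapout_0.out.tmp"; // hypothetical names
        string outputPath = "mapout_0.out";

        // Write the complete payload to the temporary path first...
        File.WriteAllText(tmpOutputPath, "merged map output bytes");

        // ...then "commit" by renaming, so a reader never observes a
        // partially written output file.
        File.Move(tmpOutputPath, outputPath);
    }
}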
public virtual void CloseOnDiskFile(MergeManagerImpl.CompressAwarePath file)
{
    lock (this)
    {
        onDiskMapOutputs.AddItem(file);
        // Kick off an on-disk merge once enough outputs have accumulated.
        if (onDiskMapOutputs.Count >= (2 * ioSortFactor - 1))
        {
            onDiskMerger.StartMerge(onDiskMapOutputs);
        }
    }
}
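// --- Illustrative sketch (not part of MergeManagerImpl) ---
// One reading of the (2 * ioSortFactor - 1) trigger above: with merge
// factor F, merging the F smallest of (2F - 1) files consumes F inputs
// and produces 1 output, leaving exactly F files -- one final merge
// pass's worth. A standalone sketch of that arithmetic (the variable
// names mirror the field above):
using System;

class MergeTriggerDemo
{
    static void Main()
    {
        int ioSortFactor = 5;               // merge factor F
        int trigger = 2 * ioSortFactor - 1; // StartMerge fires at 9 files

        // A merge of F files removes F inputs and adds 1 output:
        int remaining = (trigger - ioSortFactor) + 1;

        Console.WriteLine($"merge triggered at {trigger} files");  // 9
        Console.WriteLine($"files left afterwards: {remaining}");  // 5 == F
    }
}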
/// <exception cref="System.IO.IOException"/>
/// <exception cref="Sharpen.URISyntaxException"/>
/// <exception cref="System.Exception"/>
public virtual void TestOnDiskMerger()
{
    JobConf jobConf = new JobConf();
    int SortFactor = 5;
    jobConf.SetInt(MRJobConfig.IoSortFactor, SortFactor);
    MapOutputFile mapOutputFile = new MROutputFiles();
    FileSystem fs = FileSystem.GetLocal(jobConf);
    MergeManagerImpl<IntWritable, IntWritable> manager =
        new MergeManagerImpl<IntWritable, IntWritable>(null, jobConf, fs, null, null, null,
            null, null, null, null, null, null, null, mapOutputFile);
    MergeThread<MapOutput<IntWritable, IntWritable>, IntWritable, IntWritable> onDiskMerger =
        (MergeThread<MapOutput<IntWritable, IntWritable>, IntWritable, IntWritable>)
            Whitebox.GetInternalState(manager, "onDiskMerger");
    int mergeFactor = (int)Whitebox.GetInternalState(onDiskMerger, "mergeFactor");
    // Make sure the io.sort.factor is set properly.
    NUnit.Framework.Assert.AreEqual(mergeFactor, SortFactor);
    // Stop the onDiskMerger thread so that we can intercept the list of files
    // waiting to be merged.
    onDiskMerger.Suspend();
    // Send the list of fake files waiting to be merged.
    Random rand = new Random();
    for (int i = 0; i < 2 * SortFactor; ++i)
    {
        Path path = new Path("somePath");
        MergeManagerImpl.CompressAwarePath cap =
            new MergeManagerImpl.CompressAwarePath(path, 1L, rand.Next());
        manager.CloseOnDiskFile(cap);
    }
    // Check that the files pending to be merged are in sorted order.
    List<IList<MergeManagerImpl.CompressAwarePath>> pendingToBeMerged =
        (List<IList<MergeManagerImpl.CompressAwarePath>>)
            Whitebox.GetInternalState(onDiskMerger, "pendingToBeMerged");
    NUnit.Framework.Assert.IsTrue("No inputs were added to list pending to merge",
        pendingToBeMerged.Count > 0);
    for (int i = 0; i < pendingToBeMerged.Count; ++i)
    {
        IList<MergeManagerImpl.CompressAwarePath> inputs = pendingToBeMerged[i];
        for (int j = 1; j < inputs.Count; ++j)
        {
            NUnit.Framework.Assert.IsTrue("Not enough / too many inputs were going to be merged",
                inputs.Count > 0 && inputs.Count <= SortFactor);
            NUnit.Framework.Assert.IsTrue("Inputs to be merged were not sorted according to size: ",
                inputs[j].GetCompressedSize() >= inputs[j - 1].GetCompressedSize());
        }
    }
}
public override int CompareTo(object obj)
{
    if (obj is MergeManagerImpl.CompressAwarePath)
    {
        MergeManagerImpl.CompressAwarePath compPath = (MergeManagerImpl.CompressAwarePath)obj;
        if (this.compressedSize < compPath.GetCompressedSize())
        {
            return -1;
        }
        else if (this.GetCompressedSize() > compPath.GetCompressedSize())
        {
            return 1;
        }
    }
    // Not returning 0 here so that objects with the same size (but
    // different paths) are still added to the TreeSet.
    return base.CompareTo(obj);
}
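// --- Illustrative sketch (not part of CompressAwarePath) ---
// The comment above is load-bearing: an ordered set treats a comparison
// result of 0 as "duplicate" and drops the element. A self-contained
// sketch with .NET's SortedSet<T> (the closest BCL analogue of the Java
// TreeSet the comment names; SizedPath is a hypothetical stand-in for
// CompressAwarePath) shows an output silently lost when equal sizes
// compare as 0, and kept once the path breaks the tie, as
// base.CompareTo(obj) does above:
using System;
using System.Collections.Generic;

record SizedPath(string Path, long Size);

class TieBreakDemo
{
    static void Main()
    {
        // Comparer on size alone: two distinct paths of equal size collide.
        var bySizeOnly = Comparer<SizedPath>.Create(
            (a, b) => a.Size.CompareTo(b.Size));
        var lossy = new SortedSet<SizedPath>(bySizeOnly);
        lossy.Add(new SizedPath("file1.out", 100));
        lossy.Add(new SizedPath("file2.out", 100)); // rejected: compare == 0
        Console.WriteLine(lossy.Count);             // 1 -- an output vanished

        // Tie-break on the path and both map outputs survive.
        var bySizeThenPath = Comparer<SizedPath>.Create((a, b) =>
        {
            int c = a.Size.CompareTo(b.Size);
            return c != 0 ? c : string.CompareOrdinal(a.Path, b.Path);
        });
        var safe = new SortedSet<SizedPath>(bySizeThenPath);
        safe.Add(new SizedPath("file1.out", 100));
        safe.Add(new SizedPath("file2.out", 100));
        Console.WriteLine(safe.Count);              // 2
    }
}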
/// <exception cref="System.IO.IOException"/>
public override void Merge(IList<InMemoryMapOutput<K, V>> inputs)
{
    if (inputs == null || inputs.Count == 0)
    {
        return;
    }
    // Name this output file the same as the first file in the current
    // list of in-memory files (this is guaranteed to be absent on disk,
    // so we don't overwrite a previously created spill). We also need to
    // create the output file now, since it is not guaranteed that this
    // file will be present after merge is called (we delete empty files
    // as soon as we see them in the merge method).
    // Figure out the mapId.
    TaskAttemptID mapId = inputs[0].GetMapId();
    TaskID mapTaskId = mapId.GetTaskID();
    IList<Merger.Segment<K, V>> inMemorySegments = new AList<Merger.Segment<K, V>>();
    long mergeOutputSize =
        this._enclosing.CreateInMemorySegments(inputs, inMemorySegments, 0);
    int noInMemorySegments = inMemorySegments.Count;
    Path outputPath = this._enclosing.mapOutputFile
        .GetInputFileForWrite(mapTaskId, mergeOutputSize)
        .Suffix(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix);
    FSDataOutputStream @out = CryptoUtils.WrapIfNecessary(
        this._enclosing.jobConf, this._enclosing.rfs.Create(outputPath));
    IFile.Writer<K, V> writer = new IFile.Writer<K, V>(
        this._enclosing.jobConf, @out,
        (Type)this._enclosing.jobConf.GetMapOutputKeyClass(),
        (Type)this._enclosing.jobConf.GetMapOutputValueClass(),
        this._enclosing.codec, null, true);
    RawKeyValueIterator rIter = null;
    MergeManagerImpl.CompressAwarePath compressAwarePath;
    try
    {
        MergeManagerImpl.Log.Info("Initiating in-memory merge with "
            + noInMemorySegments + " segments...");
        rIter = Merger.Merge(
            this._enclosing.jobConf, this._enclosing.rfs,
            (Type)this._enclosing.jobConf.GetMapOutputKeyClass(),
            (Type)this._enclosing.jobConf.GetMapOutputValueClass(),
            inMemorySegments, inMemorySegments.Count,
            new Path(this._enclosing.reduceId.ToString()),
            (RawComparator<K>)this._enclosing.jobConf.GetOutputKeyComparator(),
            this._enclosing.reporter, this._enclosing.spilledRecordsCounter,
            null, null);
        if (null == this._enclosing.combinerClass)
        {
            Merger.WriteFile(rIter, writer, this._enclosing.reporter,
                this._enclosing.jobConf);
        }
        else
        {
            this._enclosing.combineCollector.SetWriter(writer);
            this._enclosing.CombineAndSpill(rIter, this._enclosing.reduceCombineInputCounter);
        }
        writer.Close();
        compressAwarePath = new MergeManagerImpl.CompressAwarePath(
            outputPath, writer.GetRawLength(), writer.GetCompressedLength());
        MergeManagerImpl.Log.Info(this._enclosing.reduceId + " Merge of the "
            + noInMemorySegments + " files in-memory complete."
            + " Local file is " + outputPath + " of size "
            + this._enclosing.localFS.GetFileStatus(outputPath).GetLen());
    }
    catch (IOException)
    {
        // Make sure we delete the on-disk file that we created earlier
        // when we invoked cloneFileAttributes.
        this._enclosing.localFS.Delete(outputPath, true);
        throw;
    }
    // Note the output of the merge.
    this._enclosing.CloseOnDiskFile(compressAwarePath);
}
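// --- Illustrative sketch (not Hadoop's implementation) ---
// Merger.Merge above performs a k-way merge over sorted IFile segments.
// A toy, self-contained version of the same technique over sorted
// integer runs, using .NET 6's PriorityQueue keyed by each run's head:
using System;
using System.Collections.Generic;

class KWayMergeDemo
{
    static IEnumerable<int> Merge(List<List<int>> runs)
    {
        var pq = new PriorityQueue<(List<int> Run, int Idx), int>();
        foreach (var run in runs)
        {
            if (run.Count > 0)
            {
                pq.Enqueue((run, 0), run[0]); // seed with each run's head
            }
        }
        while (pq.Count > 0)
        {
            var (run, idx) = pq.Dequeue();    // smallest head across all runs
            yield return run[idx];
            if (idx + 1 < run.Count)
            {
                pq.Enqueue((run, idx + 1), run[idx + 1]);
            }
        }
    }

    static void Main()
    {
        var runs = new List<List<int>>
        {
            new() { 1, 4, 9 },
            new() { 2, 3, 8 },
            new() { 5, 7 }
        };
        Console.WriteLine(string.Join(", ", Merge(runs))); // 1..9, sorted
    }
}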
public virtual void TestInMemoryAndOnDiskMerger()
{
    JobID jobId = new JobID("a", 0);
    TaskAttemptID reduceId1 = new TaskAttemptID(new TaskID(jobId, TaskType.Reduce, 0), 0);
    TaskAttemptID mapId1 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 1), 0);
    TaskAttemptID mapId2 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 2), 0);
    LocalDirAllocator lda = new LocalDirAllocator(MRConfig.LocalDir);
    MergeManagerImpl<Text, Text> mergeManager = new MergeManagerImpl<Text, Text>(
        reduceId1, jobConf, fs, lda, Reporter.Null, null, null, null, null, null,
        null, null, new Progress(), new MROutputFiles());
    // Write map outputs.
    IDictionary<string, string> map1 = new SortedDictionary<string, string>();
    map1["apple"] = "disgusting";
    map1["carrot"] = "delicious";
    IDictionary<string, string> map2 = new SortedDictionary<string, string>();
    map2["banana"] = "pretty good";
    byte[] mapOutputBytes1 = WriteMapOutput(conf, map1);
    byte[] mapOutputBytes2 = WriteMapOutput(conf, map2);
    InMemoryMapOutput<Text, Text> mapOutput1 = new InMemoryMapOutput<Text, Text>(
        conf, mapId1, mergeManager, mapOutputBytes1.Length, null, true);
    InMemoryMapOutput<Text, Text> mapOutput2 = new InMemoryMapOutput<Text, Text>(
        conf, mapId2, mergeManager, mapOutputBytes2.Length, null, true);
    System.Array.Copy(mapOutputBytes1, 0, mapOutput1.GetMemory(), 0, mapOutputBytes1.Length);
    System.Array.Copy(mapOutputBytes2, 0, mapOutput2.GetMemory(), 0, mapOutputBytes2.Length);
    // Create merger and run merge.
    MergeThread<InMemoryMapOutput<Text, Text>, Text, Text> inMemoryMerger =
        mergeManager.CreateInMemoryMerger();
    IList<InMemoryMapOutput<Text, Text>> mapOutputs1 =
        new AList<InMemoryMapOutput<Text, Text>>();
    mapOutputs1.AddItem(mapOutput1);
    mapOutputs1.AddItem(mapOutput2);
    inMemoryMerger.Merge(mapOutputs1);
    NUnit.Framework.Assert.AreEqual(1, mergeManager.onDiskMapOutputs.Count);
    TaskAttemptID reduceId2 = new TaskAttemptID(new TaskID(jobId, TaskType.Reduce, 3), 0);
    TaskAttemptID mapId3 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 4), 0);
    TaskAttemptID mapId4 = new TaskAttemptID(new TaskID(jobId, TaskType.Map, 5), 0);
    // Write map outputs.
    IDictionary<string, string> map3 = new SortedDictionary<string, string>();
    map3["apple"] = "awesome";
    map3["carrot"] = "amazing";
    IDictionary<string, string> map4 = new SortedDictionary<string, string>();
    map4["banana"] = "bla";
    byte[] mapOutputBytes3 = WriteMapOutput(conf, map3);
    byte[] mapOutputBytes4 = WriteMapOutput(conf, map4);
    InMemoryMapOutput<Text, Text> mapOutput3 = new InMemoryMapOutput<Text, Text>(
        conf, mapId3, mergeManager, mapOutputBytes3.Length, null, true);
    InMemoryMapOutput<Text, Text> mapOutput4 = new InMemoryMapOutput<Text, Text>(
        conf, mapId4, mergeManager, mapOutputBytes4.Length, null, true);
    System.Array.Copy(mapOutputBytes3, 0, mapOutput3.GetMemory(), 0, mapOutputBytes3.Length);
    System.Array.Copy(mapOutputBytes4, 0, mapOutput4.GetMemory(), 0, mapOutputBytes4.Length);
    // Create merger and run merge.
    MergeThread<InMemoryMapOutput<Text, Text>, Text, Text> inMemoryMerger2 =
        mergeManager.CreateInMemoryMerger();
    IList<InMemoryMapOutput<Text, Text>> mapOutputs2 =
        new AList<InMemoryMapOutput<Text, Text>>();
    mapOutputs2.AddItem(mapOutput3);
    mapOutputs2.AddItem(mapOutput4);
    inMemoryMerger2.Merge(mapOutputs2);
    NUnit.Framework.Assert.AreEqual(2, mergeManager.onDiskMapOutputs.Count);
    IList<MergeManagerImpl.CompressAwarePath> paths =
        new AList<MergeManagerImpl.CompressAwarePath>();
    IEnumerator<MergeManagerImpl.CompressAwarePath> iterator =
        mergeManager.onDiskMapOutputs.GetEnumerator();
    IList<string> keys = new AList<string>();
    IList<string> values = new AList<string>();
    while (iterator.HasNext())
    {
        MergeManagerImpl.CompressAwarePath next = iterator.Next();
        ReadOnDiskMapOutput(conf, fs, next, keys, values);
        paths.AddItem(next);
    }
    NUnit.Framework.Assert.AreEqual(keys, Arrays.AsList(
        "apple", "banana", "carrot", "apple", "banana", "carrot"));
    NUnit.Framework.Assert.AreEqual(values, Arrays.AsList(
        "awesome", "bla", "amazing", "disgusting", "pretty good", "delicious"));
    mergeManager.Close();
    mergeManager = new MergeManagerImpl<Text, Text>(
        reduceId2, jobConf, fs, lda, Reporter.Null, null, null, null, null, null,
        null, null, new Progress(), new MROutputFiles());
    MergeThread<MergeManagerImpl.CompressAwarePath, Text, Text> onDiskMerger =
        mergeManager.CreateOnDiskMerger();
    onDiskMerger.Merge(paths);
    NUnit.Framework.Assert.AreEqual(1, mergeManager.onDiskMapOutputs.Count);
    keys = new AList<string>();
    values = new AList<string>();
    ReadOnDiskMapOutput(conf, fs, mergeManager.onDiskMapOutputs.GetEnumerator().Next(),
        keys, values);
    NUnit.Framework.Assert.AreEqual(keys, Arrays.AsList(
        "apple", "apple", "banana", "banana", "carrot", "carrot"));
    NUnit.Framework.Assert.AreEqual(values, Arrays.AsList(
        "awesome", "disgusting", "pretty good", "bla", "amazing", "delicious"));
    mergeManager.Close();
    NUnit.Framework.Assert.AreEqual(0, mergeManager.inMemoryMapOutputs.Count);
    NUnit.Framework.Assert.AreEqual(0, mergeManager.inMemoryMergedMapOutputs.Count);
    NUnit.Framework.Assert.AreEqual(0, mergeManager.onDiskMapOutputs.Count);
}