public virtual void TestIFileReaderWithCodec()
{
    Configuration conf = new Configuration();
    FileSystem localFs = FileSystem.GetLocal(conf);
    FileSystem rfs = ((LocalFileSystem)localFs).GetRaw();
    Path path = new Path(new Path("build/test.ifile"), "data");
    DefaultCodec codec = new GzipCodec();
    codec.SetConf(conf);
    // Write an empty, compressed IFile so the reader has something to open.
    FSDataOutputStream @out = rfs.Create(path);
    IFile.Writer<Text, Text> writer = new IFile.Writer<Text, Text>(conf, @out,
        typeof(Text), typeof(Text), codec, null);
    writer.Close();
    FSDataInputStream @in = rfs.Open(path);
    IFile.Reader<Text, Text> reader = new IFile.Reader<Text, Text>(conf, @in,
        rfs.GetFileStatus(path).GetLen(), codec, null);
    reader.Close();
    // Test the checksum: after Close(), a read on the checksummed stream
    // should return exactly the trailing checksum bytes.
    byte[] ab = new byte[100];
    int read = reader.checksumIn.ReadWithChecksum(ab, 0, ab.Length);
    NUnit.Framework.Assert.AreEqual(read, reader.checksumIn.GetChecksum().Length);
}
public virtual void TestIFileWriterWithCodec()
{
    Configuration conf = new Configuration();
    FileSystem localFs = FileSystem.GetLocal(conf);
    FileSystem rfs = ((LocalFileSystem)localFs).GetRaw();
    Path path = new Path(new Path("build/test.ifile"), "data");
    DefaultCodec codec = new GzipCodec();
    codec.SetConf(conf);
    // Appending no records and closing immediately is enough to exercise
    // the codec setup/teardown path of the writer.
    IFile.Writer<Text, Text> writer = new IFile.Writer<Text, Text>(conf,
        rfs.Create(path), typeof(Text), typeof(Text), codec, null);
    writer.Close();
}
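// A minimal round-trip sketch (added for illustration, not one of the
// original tests). It assumes only the API already used above plus the
// reader's raw-record calls NextRawKey/NextRawValue, which mirror Hadoop's
// IFile.Reader.nextRawKey/nextRawValue; the path and record contents are
// hypothetical.
public virtual void IFileRoundTripSketch()
{
    Configuration conf = new Configuration();
    FileSystem rfs = ((LocalFileSystem)FileSystem.GetLocal(conf)).GetRaw();
    Path path = new Path(new Path("build/test.ifile"), "roundtrip");
    DefaultCodec codec = new GzipCodec();
    codec.SetConf(conf);
    // Write a couple of compressed records.
    IFile.Writer<Text, Text> writer = new IFile.Writer<Text, Text>(conf,
        rfs.Create(path), typeof(Text), typeof(Text), codec, null);
    writer.Append(new Text("k1"), new Text("v1"));
    writer.Append(new Text("k2"), new Text("v2"));
    writer.Close();
    // Read them back through the raw (byte-buffer) interface.
    IFile.Reader<Text, Text> reader = new IFile.Reader<Text, Text>(conf,
        rfs.Open(path), rfs.GetFileStatus(path).GetLen(), codec, null);
    DataInputBuffer key = new DataInputBuffer();
    DataInputBuffer value = new DataInputBuffer();
    int records = 0;
    while (reader.NextRawKey(key))
    {
        reader.NextRawValue(value);
        records++; // key/value now hold the serialized Text bytes of one record
    }
    NUnit.Framework.Assert.AreEqual(2, records);
    reader.Close();
}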
/// <exception cref="System.IO.IOException"/>
public virtual void RunValueIterator(Path tmpDir, TestReduceTask.Pair[] vals,
    Configuration conf, CompressionCodec codec)
{
    FileSystem localFs = FileSystem.GetLocal(conf);
    FileSystem rfs = ((LocalFileSystem)localFs).GetRaw();
    Path path = new Path(tmpDir, "data.in");
    IFile.Writer<Text, Text> writer = new IFile.Writer<Text, Text>(conf,
        rfs.Create(path), typeof(Text), typeof(Text), codec, null);
    foreach (TestReduceTask.Pair p in vals)
    {
        writer.Append(new Text(p.key), new Text(p.value));
    }
    writer.Close();
    RawKeyValueIterator rawItr = Merger.Merge<Text, Text>(conf, rfs, codec,
        new Path[] { path }, false, conf.GetInt(JobContext.IoSortFactor, 100),
        tmpDir, new Text.Comparator(), new TestReduceTask.NullProgress(),
        null, null, null);
    // WritableComparators are not generic
    Task.ValuesIterator valItr = new Task.ValuesIterator<Text, Text>(rawItr,
        WritableComparator.Get(typeof(Text)), typeof(Text), typeof(Text),
        conf, new TestReduceTask.NullProgress());
    int i = 0;
    while (valItr.More())
    {
        object key = valItr.GetKey();
        string keyString = key.ToString();
        // make sure it matches!
        NUnit.Framework.Assert.AreEqual(vals[i].key, keyString);
        // must have at least 1 value!
        NUnit.Framework.Assert.IsTrue(valItr.HasNext());
        while (valItr.HasNext())
        {
            string valueString = valItr.Next().ToString();
            // make sure the values match
            NUnit.Framework.Assert.AreEqual(vals[i].value, valueString);
            // make sure the keys match
            NUnit.Framework.Assert.AreEqual(vals[i].key, valItr.GetKey().ToString());
            i += 1;
        }
        // make sure the key hasn't changed under the hood
        NUnit.Framework.Assert.AreEqual(keyString, valItr.GetKey().ToString());
        valItr.NextKey();
    }
    NUnit.Framework.Assert.AreEqual(vals.Length, i);
    // make sure we have progress equal to 1.0
    NUnit.Framework.Assert.AreEqual(1.0f, rawItr.GetProgress().Get());
}
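// For context, a sketch (an assumption reconstructed from usage above, not
// the original source) of the two TestReduceTask helpers referenced in
// RunValueIterator: Pair is a plain key/value holder and NullProgress is a
// Progressable whose Progress() does nothing.
internal class Pair
{
    internal readonly string key;
    internal readonly string value;

    internal Pair(string k, string v)
    {
        key = k;
        value = v;
    }
}

internal class NullProgress : Progressable
{
    public virtual void Progress()
    {
        // Deliberately empty: these tests don't need progress reporting.
    }
}

// A hypothetical call site; note the keys must already be in sorted order,
// since Merger.Merge assumes sorted input:
//
// RunValueIterator(new Path("build/test"),
//     new TestReduceTask.Pair[] { new TestReduceTask.Pair("k1", "v1"),
//                                 new TestReduceTask.Pair("k1", "v2"),
//                                 new TestReduceTask.Pair("k2", "v3") },
//     new Configuration(), null);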
/// <exception cref="System.IO.IOException"/>
internal virtual RawKeyValueIterator Merge(Type keyClass, Type valueClass,
    int factor, int inMem, Path tmpDir, Counters.Counter readsCounter,
    Counters.Counter writesCounter, Progress mergePhase)
{
    Log.Info("Merging " + segments.Count + " sorted segments");
    /*
     * If there are in-memory segments, they come first in the segments
     * list, followed by the sorted disk segments. Otherwise (if there are
     * only disk segments), they are sorted segments if there are more
     * than factor segments in the segments list.
     */
    int numSegments = segments.Count;
    int origFactor = factor;
    int passNo = 1;
    if (mergePhase != null)
    {
        mergeProgress = mergePhase;
    }
    long totalBytes = ComputeBytesInMerges(factor, inMem);
    if (totalBytes != 0)
    {
        progPerByte = 1.0f / (float)totalBytes;
    }
    do
    {
        // Create the MergeStreams from the sorted map created in the
        // constructor and dump the final output to a file.
        // Get the factor for this pass of the merge. We assume in-memory
        // segments are the first entries in the segment list and that the
        // pass factor doesn't apply to them.
        factor = GetPassFactor(factor, passNo, numSegments - inMem);
        if (1 == passNo)
        {
            factor += inMem;
        }
        IList<Merger.Segment<K, V>> segmentsToMerge = new AList<Merger.Segment<K, V>>();
        int segmentsConsidered = 0;
        int numSegmentsToConsider = factor;
        long startBytes = 0; // starting bytes of segments of this merge
        while (true)
        {
            // Extract the smallest 'factor' number of segments, calling
            // cleanup on the empty segments (no key/value data).
            IList<Merger.Segment<K, V>> mStream = GetSegmentDescriptors(numSegmentsToConsider);
            foreach (Merger.Segment<K, V> segment in mStream)
            {
                // Initialize the segment at the last possible moment; this
                // helps in ensuring we don't use buffers until we need them.
                segment.Init(readsCounter);
                long startPos = segment.GetReader().bytesRead;
                bool hasNext = segment.NextRawKey();
                long endPos = segment.GetReader().bytesRead;
                if (hasNext)
                {
                    startBytes += endPos - startPos;
                    segmentsToMerge.AddItem(segment);
                    segmentsConsidered++;
                }
                else
                {
                    // We ignore this (empty) segment for the merge.
                    segment.Close();
                    numSegments--;
                }
            }
            // If we have the desired number of segments, or have looked at
            // all available segments, we break.
            if (segmentsConsidered == factor || segments.Count == 0)
            {
                break;
            }
            numSegmentsToConsider = factor - segmentsConsidered;
        }
        // Feed the streams to the priority queue.
        Initialize(segmentsToMerge.Count);
        Clear();
        foreach (Merger.Segment<K, V> segment_1 in segmentsToMerge)
        {
            Put(segment_1);
        }
        // If we have fewer segments remaining than the factor, just return
        // the iterator; otherwise do another single-level merge.
        if (numSegments <= factor)
        {
            if (!includeFinalMerge)
            {
                // for reduce task
                // Reset totalBytesProcessed and recalculate totalBytes from
                // the remaining segments to track the progress of the final
                // merge. The final merge counts as the progress of the
                // reducePhase, the 3rd phase of the reduce task.
                totalBytesProcessed = 0;
                totalBytes = 0;
                for (int i = 0; i < segmentsToMerge.Count; i++)
                {
                    totalBytes += segmentsToMerge[i].GetRawDataLength();
                }
            }
            if (totalBytes != 0)
            {
                // being paranoid
                progPerByte = 1.0f / (float)totalBytes;
            }
            totalBytesProcessed += startBytes;
            if (totalBytes != 0)
            {
                mergeProgress.Set(totalBytesProcessed * progPerByte);
            }
            else
            {
                mergeProgress.Set(1.0f);
            }
            // Last pass and no segments left - we're done.
            Log.Info("Down to the last merge-pass, with " + numSegments +
                " segments left of total size: " +
                (totalBytes - totalBytesProcessed) + " bytes");
            return this;
        }
        else
        {
            Log.Info("Merging " + segmentsToMerge.Count +
                " intermediate segments out of a total of " +
                (segments.Count + segmentsToMerge.Count));
            long bytesProcessedInPrevMerges = totalBytesProcessed;
            totalBytesProcessed += startBytes;
            // We want to spread the creation of temp files over multiple
            // disks if available, under the space constraints.
            long approxOutputSize = 0;
            foreach (Merger.Segment<K, V> s in segmentsToMerge)
            {
                approxOutputSize += s.GetLength() +
                    ChecksumFileSystem.GetApproxChkSumLength(s.GetLength());
            }
            Path tmpFilename = new Path(tmpDir, "intermediate").Suffix("." + passNo);
            Path outputFile = lDirAlloc.GetLocalPathForWrite(tmpFilename.ToString(),
                approxOutputSize, conf);
            FSDataOutputStream @out = fs.Create(outputFile);
            @out = CryptoUtils.WrapIfNecessary(conf, @out);
            IFile.Writer<K, V> writer = new IFile.Writer<K, V>(conf, @out, keyClass,
                valueClass, codec, writesCounter, true);
            WriteFile(this, writer, reporter, conf);
            writer.Close();
            // We finished one single-level merge; now clean up the
            // priority queue.
            this.Close();
            // Add the newly created segment to the list of segments to be
            // merged.
            Merger.Segment<K, V> tempSegment = new Merger.Segment<K, V>(conf, fs,
                outputFile, codec, false);
            // Insert the new merged segment into the sorted list.
            int pos = Sharpen.Collections.BinarySearch(segments, tempSegment,
                segmentComparator);
            if (pos < 0)
            {
                // Binary search failed, so the position to insert at is -pos-1.
                pos = -pos - 1;
            }
            segments.Add(pos, tempSegment);
            numSegments = segments.Count;
            // Subtract the difference between the expected size of the new
            // segment and its actual size (the expected size is
            // inputBytesOfThisMerge) from totalBytes. Expected and actual
            // sizes will match (almost) if the combiner is not called
            // during the merge.
            long inputBytesOfThisMerge = totalBytesProcessed - bytesProcessedInPrevMerges;
            totalBytes -= inputBytesOfThisMerge - tempSegment.GetRawDataLength();
            if (totalBytes != 0)
            {
                progPerByte = 1.0f / (float)totalBytes;
            }
            passNo++;
        }
        // We are worried only about the first-pass merge factor, so reset
        // the factor to what it originally was.
        factor = origFactor;
    } while (true);
}
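// For reference, a sketch of the pass-factor arithmetic (an assumption based
// on how GetPassFactor is used above, not necessarily its exact body): the
// first pass merges just enough segments that every subsequent pass can
// merge a full 'factor' segments, minimizing the total number of passes.
// Merging k segments replaces them with 1, reducing the count by k - 1, so
// after the first pass the remaining count should satisfy
// (numSegments - 1) % (factor - 1) == 0.
private int GetPassFactorSketch(int factor, int passNo, int numSegments)
{
    // Only the first pass is special; later passes use the full factor.
    if (passNo > 1 || numSegments <= factor || factor == 1)
    {
        return factor;
    }
    int mod = (numSegments - 1) % (factor - 1);
    if (mod == 0)
    {
        return factor;
    }
    return mod + 1;
}
// Worked example: with 14 segments and factor 10, the first pass merges
// (14 - 1) % (10 - 1) + 1 = 5 segments, leaving 14 - 5 + 1 = 10 segments
// for a single full final pass instead of two unbalanced passes.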