Example #1
            /// <exception cref="System.IO.IOException"/>
            public override void Merge(IList<MergeManagerImpl.CompressAwarePath> inputs)
            {
                // sanity check
                if (inputs == null || inputs.IsEmpty())
                {
                    MergeManagerImpl.Log.Info("No ondisk files to merge...");
                    return;
                }
                long approxOutputSize = 0;
                int  bytesPerSum      = this._enclosing.jobConf.GetInt("io.bytes.per.checksum", 512);

                MergeManagerImpl.Log.Info("OnDiskMerger: We have  " + inputs.Count + " map outputs on disk. Triggering merge..."
                                          );
                // 1. Prepare the list of files to be merged.
                foreach (MergeManagerImpl.CompressAwarePath file in inputs)
                {
                    approxOutputSize += this._enclosing.localFS.GetFileStatus(file).GetLen();
                }
                // add the checksum length
                approxOutputSize += ChecksumFileSystem.GetChecksumLength(approxOutputSize, bytesPerSum);
                // 2. Start the on-disk merge process
                Path outputPath = this._enclosing.localDirAllocator.GetLocalPathForWrite(
                    inputs[0].ToString(), approxOutputSize, this._enclosing.jobConf)
                    .Suffix(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix);
                FSDataOutputStream @out = CryptoUtils.WrapIfNecessary(
                    this._enclosing.jobConf, this._enclosing.rfs.Create(outputPath));

                IFile.Writer<K, V> writer = new IFile.Writer<K, V>(
                    this._enclosing.jobConf, @out,
                    (Type)this._enclosing.jobConf.GetMapOutputKeyClass(),
                    (Type)this._enclosing.jobConf.GetMapOutputValueClass(),
                    this._enclosing.codec, null, true);
                RawKeyValueIterator iter = null;

                MergeManagerImpl.CompressAwarePath compressAwarePath;
                Path tmpDir = new Path(this._enclosing.reduceId.ToString());

                try
                {
                    iter = Merger.Merge(
                        this._enclosing.jobConf, this._enclosing.rfs,
                        (Type)this._enclosing.jobConf.GetMapOutputKeyClass(),
                        (Type)this._enclosing.jobConf.GetMapOutputValueClass(),
                        this._enclosing.codec,
                        Sharpen.Collections.ToArray(inputs, new Path[inputs.Count]),
                        true, this._enclosing.ioSortFactor, tmpDir,
                        (RawComparator<K>)this._enclosing.jobConf.GetOutputKeyComparator(),
                        this._enclosing.reporter, this._enclosing.spilledRecordsCounter,
                        null, this._enclosing.mergedMapOutputsCounter, null);
                    Merger.WriteFile(iter, writer, this._enclosing.reporter, this._enclosing.jobConf);
                    writer.Close();
                    compressAwarePath = new MergeManagerImpl.CompressAwarePath(
                        outputPath, writer.GetRawLength(), writer.GetCompressedLength());
                }
                catch (IOException)
                {
                    // remove the partial output, then rethrow
                    this._enclosing.localFS.Delete(outputPath, true);
                    throw;
                }
                this._enclosing.CloseOnDiskFile(compressAwarePath);
                MergeManagerImpl.Log.Info(this._enclosing.reduceId + " Finished merging "
                    + inputs.Count + " map output files on disk of total-size "
                    + approxOutputSize + ". Local output file is " + outputPath
                    + " of size " + this._enclosing.localFS.GetFileStatus(outputPath).GetLen());
            }
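
The pre-sizing idiom above (sum the input file lengths, then pad with the checksum overhead before asking LocalDirAllocator for a write path) can be read in isolation. A minimal sketch, assuming the Sharpen-converted Hadoop types used in this example; the helper name EstimateMergedOutputSize and the Org.Apache.Hadoop.FS namespace are illustrative assumptions, not part of the original code:

    using System.Collections.Generic;
    using Org.Apache.Hadoop.FS;

    internal static class MergeSizeEstimator
    {
        // Hypothetical helper: estimate the on-disk size of a merged output,
        // including the trailing checksum data that ChecksumFileSystem writes.
        internal static long EstimateMergedOutputSize(FileSystem localFS,
                                                      IList<Path> inputs,
                                                      int bytesPerSum)
        {
            long approxOutputSize = 0;
            foreach (Path file in inputs)
            {
                // raw data bytes of each segment to be merged
                approxOutputSize += localFS.GetFileStatus(file).GetLen();
            }
            // one 4-byte CRC per bytesPerSum bytes of data, plus a fixed header
            // (see Example #2 for the exact arithmetic)
            return approxOutputSize
                   + ChecksumFileSystem.GetChecksumLength(approxOutputSize, bytesPerSum);
        }
    }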
Example #2
 public virtual void TestgetChecksumLength()
 {
     Assert.Equal(8, ChecksumFileSystem.GetChecksumLength(0L, 512));
     Assert.Equal(12, ChecksumFileSystem.GetChecksumLength(1L, 512));
     Assert.Equal(12, ChecksumFileSystem.GetChecksumLength(512L, 512));
     Assert.Equal(16, ChecksumFileSystem.GetChecksumLength(513L, 512));
     Assert.Equal(16, ChecksumFileSystem.GetChecksumLength(1023L, 512));
     Assert.Equal(16, ChecksumFileSystem.GetChecksumLength(1024L, 512));
     Assert.Equal(408, ChecksumFileSystem.GetChecksumLength(100L, 1));
     Assert.Equal(4000000000008L, ChecksumFileSystem.GetChecksumLength(10000000000000L, 10));
 }
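
The expected values in this test pin down the arithmetic: one 4-byte CRC word is stored per bytesPerSum bytes of data (rounded up), plus an 8-byte file header. A minimal sketch of that formula; the real implementation lives inside ChecksumFileSystem, and the helper name here is made up:

    // Reproduces the arithmetic behind the assertions above:
    // ceil(size / bytesPerSum) CRC words of 4 bytes each, plus an 8-byte header.
    static long ChecksumLength(long size, int bytesPerSum)
    {
        long chunks = (size + bytesPerSum - 1) / bytesPerSum; // ceiling division
        return chunks * 4 + 8;
    }

    // ChecksumLength(0L, 512)   => 8   (header only)
    // ChecksumLength(513L, 512) => 16  (two chunks)
    // ChecksumLength(100L, 1)   => 408 (100 chunks)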
Example #3
        /// <exception cref="System.IO.IOException"/>
        public virtual int Run(string[] args)
        {
            // silence the minidfs cluster
            Log hadoopLog = LogFactory.GetLog("org");

            if (hadoopLog is Log4JLogger)
            {
                ((Log4JLogger)hadoopLog).GetLogger().SetLevel(Level.Warn);
            }
            int reps = 1;

            if (args.Length == 1)
            {
                try
                {
                    reps = System.Convert.ToInt32(args[0]);
                }
                catch (FormatException)
                {
                    PrintUsage();
                    return -1;
                }
            }
            else if (args.Length > 1)
            {
                PrintUsage();
                return -1;
            }
            Configuration conf = GetConf();
            // the size of the file to write
            long size = conf.GetLong("dfsthroughput.file.size", 10L * 1024 * 1024 * 1024);

            BufferSize = conf.GetInt("dfsthroughput.buffer.size", 4 * 1024);
            string localDir = conf.Get("mapred.temp.dir");

            if (localDir == null)
            {
                localDir = conf.Get("hadoop.tmp.dir");
                conf.Set("mapred.temp.dir", localDir);
            }
            dir = new LocalDirAllocator("mapred.temp.dir");
            Runtime.SetProperty("test.build.data", localDir);
            System.Console.Out.WriteLine("Local = " + localDir);
            ChecksumFileSystem checkedLocal = FileSystem.GetLocal(conf);
            FileSystem         rawLocal     = checkedLocal.GetRawFileSystem();

            for (int i = 0; i < reps; ++i)
            {
                WriteAndReadLocalFile("local", conf, size);
                WriteAndReadFile(rawLocal, "raw", conf, size);
                WriteAndReadFile(checkedLocal, "checked", conf, size);
            }
            MiniDFSCluster cluster = null;

            try
            {
                cluster = new MiniDFSCluster.Builder(conf).Racks(new string[] { "/foo" }).Build();
                cluster.WaitActive();
                FileSystem dfs = cluster.GetFileSystem();
                for (int i = 0; i < reps; ++i)
                {
                    WriteAndReadFile(dfs, "dfs", conf, size);
                }
            }
            finally
            {
                if (cluster != null)
                {
                    cluster.Shutdown();
                    // clean up minidfs junk
                    rawLocal.Delete(new Path(localDir, "dfs"), true);
                }
            }
            return 0;
        }
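
One detail worth noting in this benchmark: the default file size is written as 10L * 1024 * 1024 * 1024. The L suffix matters, because without it the multiplication is performed in 32-bit int arithmetic and wraps before the result is widened to long. A quick illustration in plain C# (no Hadoop dependencies):

    long good = 10L * 1024 * 1024 * 1024;           // 64-bit math: 10737418240
    long bad = unchecked(10 * 1024 * 1024 * 1024);  // 32-bit math wraps first
    System.Console.Out.WriteLine(good);             // 10737418240
    System.Console.Out.WriteLine(bad);              // -2147483648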
Example #4
            /// <exception cref="System.IO.IOException"/>
            internal virtual RawKeyValueIterator Merge(Type keyClass, Type valueClass,
                                                       int factor, int inMem, Path tmpDir,
                                                       Counters.Counter readsCounter,
                                                       Counters.Counter writesCounter,
                                                       Progress mergePhase)
            {
                Log.Info("Merging " + segments.Count + " sorted segments");

                /*
                 * If there are in-memory segments, they come first in the segments
                 * list, followed by the sorted disk segments. Otherwise (if there
                 * are only disk segments) the list holds just sorted disk segments,
                 * of which there may be more than 'factor'.
                 */
                int numSegments = segments.Count;
                int origFactor  = factor;
                int passNo      = 1;

                if (mergePhase != null)
                {
                    mergeProgress = mergePhase;
                }
                long totalBytes = ComputeBytesInMerges(factor, inMem);

                if (totalBytes != 0)
                {
                    progPerByte = 1.0f / (float)totalBytes;
                }
                do
                {
                    //create the MergeStreams from the sorted map created in the constructor
                    //and dump the final output to a file
                    //get the factor for this pass of merge. We assume in-memory segments
                    //are the first entries in the segment list and that the pass factor
                    //doesn't apply to them
                    factor = GetPassFactor(factor, passNo, numSegments - inMem);
                    if (1 == passNo)
                    {
                        factor += inMem;
                    }
                    IList<Merger.Segment<K, V>> segmentsToMerge = new AList<Merger.Segment<K, V>>();
                    int  segmentsConsidered    = 0;
                    int  numSegmentsToConsider = factor;
                    long startBytes            = 0;
                    // starting bytes of segments of this merge
                    while (true)
                    {
                        //extract the smallest 'factor' number of segments
                        //Call cleanup on the empty segments (no key/value data)
                        IList<Merger.Segment<K, V>> mStream = GetSegmentDescriptors(numSegmentsToConsider);
                        foreach (Merger.Segment<K, V> segment in mStream)
                        {
                            // Initialize the segment at the last possible moment;
                            // this helps in ensuring we don't use buffers until we need them
                            segment.Init(readsCounter);
                            long startPos = segment.GetReader().bytesRead;
                            bool hasNext  = segment.NextRawKey();
                            long endPos   = segment.GetReader().bytesRead;
                            if (hasNext)
                            {
                                startBytes += endPos - startPos;
                                segmentsToMerge.AddItem(segment);
                                segmentsConsidered++;
                            }
                            else
                            {
                                // empty segment (no key/value data): close it and
                                // ignore it for the merge
                                segment.Close();
                                numSegments--;
                            }
                        }
                        //if we have the desired number of segments
                        //or looked at all available segments, we break
                        if (segmentsConsidered == factor || segments.Count == 0)
                        {
                            break;
                        }
                        numSegmentsToConsider = factor - segmentsConsidered;
                    }
                    //feed the streams to the priority queue
                    Initialize(segmentsToMerge.Count);
                    Clear();
                    foreach (Merger.Segment<K, V> segment_1 in segmentsToMerge)
                    {
                        Put(segment_1);
                    }
                    //if fewer segments remain than the merge factor, just return the
                    //iterator; otherwise do another single-level merge
                    if (numSegments <= factor)
                    {
                        if (!includeFinalMerge)
                        {
                            // for reduce task
                            // Reset totalBytesProcessed and recalculate totalBytes from the
                            // remaining segments to track the progress of the final merge.
                            // Final merge is considered as the progress of the reducePhase,
                            // the 3rd phase of reduce task.
                            totalBytesProcessed = 0;
                            totalBytes          = 0;
                            for (int i = 0; i < segmentsToMerge.Count; i++)
                            {
                                totalBytes += segmentsToMerge[i].GetRawDataLength();
                            }
                        }
                        if (totalBytes != 0)
                        {
                            //being paranoid
                            progPerByte = 1.0f / (float)totalBytes;
                        }
                        totalBytesProcessed += startBytes;
                        if (totalBytes != 0)
                        {
                            mergeProgress.Set(totalBytesProcessed * progPerByte);
                        }
                        else
                        {
                            mergeProgress.Set(1.0f);
                        }
                        // Last pass and no segments left - we're done
                        Log.Info("Down to the last merge-pass, with " + numSegments + " segments left of total size: "
                                 + (totalBytes - totalBytesProcessed) + " bytes");
                        return(this);
                    }
                    else
                    {
                        Log.Info("Merging " + segmentsToMerge.Count + " intermediate segments out of a total of "
                                 + (segments.Count + segmentsToMerge.Count));
                        long bytesProcessedInPrevMerges = totalBytesProcessed;
                        totalBytesProcessed += startBytes;
                        //we want to spread the creation of temp files on multiple disks if
                        //available under the space constraints
                        long approxOutputSize = 0;
                        foreach (Merger.Segment<K, V> s in segmentsToMerge)
                        {
                            approxOutputSize += s.GetLength()
                                                + ChecksumFileSystem.GetApproxChkSumLength(s.GetLength());
                        }
                        Path tmpFilename = new Path(tmpDir, "intermediate").Suffix("." + passNo);
                        Path outputFile = lDirAlloc.GetLocalPathForWrite(tmpFilename.ToString(),
                                                                         approxOutputSize, conf);
                        FSDataOutputStream @out = fs.Create(outputFile);
                        @out = CryptoUtils.WrapIfNecessary(conf, @out);
                        IFile.Writer<K, V> writer = new IFile.Writer<K, V>(conf, @out, keyClass,
                                                                           valueClass, codec,
                                                                           writesCounter, true);
                        WriteFile(this, writer, reporter, conf);
                        writer.Close();
                        //we finished one single level merge; now clean up the priority
                        //queue
                        this.Close();
                        // Add the newly created segment to the list of segments to be merged
                        Merger.Segment<K, V> tempSegment = new Merger.Segment<K, V>(conf, fs,
                                                                                    outputFile, codec, false);
                        // Insert the new merged segment into the sorted list
                        int pos = Sharpen.Collections.BinarySearch(segments, tempSegment,
                                                                   segmentComparator);
                        if (pos < 0)
                        {
                            // binary search failed. So position to be inserted at is -pos-1
                            pos = -pos - 1;
                        }
                        segments.Add(pos, tempSegment);
                        numSegments = segments.Count;
                        // Subtract the difference between the expected size of the new
                        // segment (inputBytesOfThisMerge) and its actual size from
                        // totalBytes. Expected and actual sizes will match (almost)
                        // if the combiner is not invoked during the merge.
                        long inputBytesOfThisMerge = totalBytesProcessed - bytesProcessedInPrevMerges;
                        totalBytes -= inputBytesOfThisMerge - tempSegment.GetRawDataLength();
                        if (totalBytes != 0)
                        {
                            progPerByte = 1.0f / (float)totalBytes;
                        }
                        passNo++;
                    }
                    //we are worried about only the first pass merge factor. So reset the
                    //factor to what it originally was
                    factor = origFactor;
                } while (true);
            }
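
The first-pass behavior above relies on GetPassFactor: the first pass may merge fewer than 'factor' segments so that every subsequent pass is a full factor-way merge. Only the call site appears in this example; the sketch below follows the rule as implemented in Hadoop's Merger and should be treated as an illustration rather than the exact converted code:

    // First-pass factor rule: after the first pass, the remaining segment
    // count should satisfy (numSegments - 1) % (factor - 1) == 0, so every
    // later pass (which replaces 'factor' segments with 1) is a full merge.
    static int GetPassFactor(int factor, int passNo, int numSegments)
    {
        if (passNo > 1 || numSegments <= factor || factor == 1)
        {
            return factor;
        }
        int mod = (numSegments - 1) % (factor - 1);
        if (mod == 0)
        {
            return factor;
        }
        return mod + 1;
    }

    // e.g. numSegments = 30, factor = 10: the first pass merges only 3
    // segments, leaving 28, which then shrink 28 -> 19 -> 10 via full
    // 10-way passes before the final 10-way merge returns the iterator.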