Example 1
        /// <exception cref="System.IO.IOException"/>
        private void CombineAndSpill(RawKeyValueIterator kvIter, Counters.Counter inCounter)
        {
            JobConf           job        = jobConf;
            Reducer           combiner   = ReflectionUtils.NewInstance(combinerClass, job);
            Type              keyClass   = (Type)job.GetMapOutputKeyClass();
            Type              valClass   = (Type)job.GetMapOutputValueClass();
            RawComparator <K> comparator = (RawComparator <K>)job.GetCombinerKeyGroupingComparator();

            try
            {
                Task.CombineValuesIterator values = new Task.CombineValuesIterator(
                    kvIter, comparator, keyClass, valClass, job, Reporter.Null, inCounter);
                while (values.More())
                {
                    combiner.Reduce(values.GetKey(), values, combineCollector, Reporter.Null);
                    values.NextKey();
                }
            }
            finally
            {
                combiner.Close();
            }
        }
Example 2
 public MergeQueue(Configuration conf, FileSystem fs, IList <Merger.Segment <K, V> > segments,
                   RawComparator <K> comparator, Progressable reporter, bool sortSegments,
                   CompressionCodec codec, TaskType taskType)
     : this(conf, fs, segments, comparator, reporter, sortSegments, taskType)
 {
     this.codec = codec;
 }
Example 3
 /// <exception cref="System.Exception"/>
 /// <exception cref="System.IO.IOException"/>
 public ReduceContextImpl(Configuration conf, TaskAttemptID taskid, RawKeyValueIterator input,
                          Counter inputKeyCounter, Counter inputValueCounter,
                          RecordWriter <KEYOUT, VALUEOUT> output, OutputCommitter committer,
                          StatusReporter reporter, RawComparator <KEYIN> comparator,
                          Type keyClass, Type valueClass)
     : base(conf, taskid, output, committer, reporter)
 {
     iterable = new ReduceContextImpl.ValueIterable(this);
     // current key
     // current value
     // first value in key
     // more w/ this key
     // more in file
     this.input                = input;
     this.inputKeyCounter      = inputKeyCounter;
     this.inputValueCounter    = inputValueCounter;
     this.comparator           = comparator;
     this.serializationFactory = new SerializationFactory(conf);
     this.keyDeserializer      = serializationFactory.GetDeserializer(keyClass);
     this.keyDeserializer.Open(buffer);
     this.valueDeserializer = serializationFactory.GetDeserializer(valueClass);
     this.valueDeserializer.Open(buffer);
     hasMore         = input.Next();
     this.keyClass   = keyClass;
     this.valueClass = valueClass;
     this.conf       = conf;
     this.taskid     = taskid;
 }
Example 4
 internal BinarySearchNode(TotalOrderPartitioner <K, V> _enclosing, K[] splitPoints,
                           RawComparator <K> comparator)
 {
     this._enclosing  = _enclosing;
     this.splitPoints = splitPoints;
     this.comparator  = comparator;
 }
Example 5
 /// <exception cref="System.IO.IOException"/>
 public static RawKeyValueIterator Merge <K, V>(Configuration conf, FileSystem fs,
                                                IList <Merger.Segment <K, V> > segments,
                                                int mergeFactor, int inMemSegments, Path tmpDir,
                                                RawComparator <K> comparator, Progressable reporter,
                                                bool sortSegments, Counters.Counter readsCounter,
                                                Counters.Counter writesCounter, Progress mergePhase)
 {
     System.Type keyClass   = typeof(K);
     System.Type valueClass = typeof(V);
     return(new Merger.MergeQueue <K, V>(conf, fs, segments, comparator, reporter, sortSegments,
                                         TaskType.Reduce).Merge(keyClass, valueClass, mergeFactor,
                                                                inMemSegments, tmpDir, readsCounter,
                                                                writesCounter, mergePhase));
 }
Example 6
 // Local directories
 /// <exception cref="System.IO.IOException"/>
 public static RawKeyValueIterator Merge <K, V>(Configuration conf, FileSystem fs,
                                                CompressionCodec codec, Path[] inputs, bool deleteInputs,
                                                int mergeFactor, Path tmpDir, RawComparator <K> comparator,
                                                Progressable reporter, Counters.Counter readsCounter,
                                                Counters.Counter writesCounter, Progress mergePhase)
 {
     System.Type keyClass   = typeof(K);
     System.Type valueClass = typeof(V);
     return(new Merger.MergeQueue <K, V>(conf, fs, inputs, deleteInputs, codec, comparator,
                                         reporter, null, TaskType.Reduce).Merge(keyClass, valueClass,
                                                                                mergeFactor, tmpDir,
                                                                                readsCounter, writesCounter,
                                                                                mergePhase));
 }
Example 7
 /// <summary>Read in the partition file and build indexing data structures.</summary>
 /// <remarks>
 /// Read in the partition file and build indexing data structures.
 /// If the keytype is
 /// <see cref="Org.Apache.Hadoop.IO.BinaryComparable"/>
 /// and
 /// <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 /// of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 /// will be built. Otherwise, keys will be located using a binary search of
 /// the partition keyset using the
 /// <see cref="Org.Apache.Hadoop.IO.RawComparator{T}"/>
 /// defined for this job. The input file must be sorted with the same
 /// comparator and contain
 /// <see cref="Org.Apache.Hadoop.Mapreduce.Task.JobContextImpl.GetNumReduceTasks()"/>
 /// - 1 keys.
 /// </remarks>
 public virtual void SetConf(Configuration conf)
 {
     // keytype from conf not static
     try
     {
         this.conf = conf;
         string     parts    = GetPartitionFile(conf);
         Path       partFile = new Path(parts);
          FileSystem fs       = (DefaultPath.Equals(parts)) ? FileSystem.GetLocal(conf)
                                                            : partFile.GetFileSystem(conf);
         // assume in DistributedCache
         Job  job         = Job.GetInstance(conf);
         Type keyClass    = (Type)job.GetMapOutputKeyClass();
         K[]  splitPoints = ReadPartitions(fs, partFile, keyClass, conf);
         if (splitPoints.Length != job.GetNumReduceTasks() - 1)
         {
             throw new IOException("Wrong number of partitions in keyset");
         }
         RawComparator <K> comparator = (RawComparator <K>)job.GetSortComparator();
         for (int i = 0; i < splitPoints.Length - 1; ++i)
         {
             if (comparator.Compare(splitPoints[i], splitPoints[i + 1]) >= 0)
             {
                 throw new IOException("Split points are out of order");
             }
         }
         bool natOrder = conf.GetBoolean(NaturalOrder, true);
         if (natOrder && typeof(BinaryComparable).IsAssignableFrom(keyClass))
         {
              partitions = BuildTrie((BinaryComparable[])splitPoints, 0, splitPoints.Length,
                                     new byte[0], conf.GetInt(MaxTrieDepth, 200));
         }
         else
         {
             // Now that blocks of identical splitless trie nodes are
             // represented reentrantly, and we develop a leaf for any trie
             // node with only one split point, the only reason for a depth
             // limit is to refute stack overflow or bloat in the pathological
             // case where the split points are long and mostly look like bytes
             // iii...iixii...iii   .  Therefore, we make the default depth
             // limit large but not huge.
              partitions = new TotalOrderPartitioner.BinarySearchNode(this, splitPoints, comparator);
         }
     }
     catch (IOException e)
     {
         throw new ArgumentException("Can't read partitions file", e);
     }
 }
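
The remarks above spell out what SetConf expects to find: a partition file written with the job's sort comparator and containing one key fewer than there are reduce tasks. Below is a minimal sketch of the submission-side wiring that produces such a file, assuming the Sharpen port keeps the Java API names (SetNumReduceTasks, SetPartitionerClass, SetPartitionFile and InputSampler.RandomSampler are taken from the Java original and do not appear in these snippets).

    // Hypothetical job setup for a total-order sort; assumes the usual
    // Org.Apache.Hadoop.Mapreduce / Org.Apache.Hadoop.IO namespaces are imported
    // and that the method names below survived the Java-to-C# conversion unchanged.
    Configuration conf = new Configuration();
    Job job = Job.GetInstance(conf, "total-order-sort");
    job.SetNumReduceTasks(4);                                  // SetConf will expect 4 - 1 = 3 split keys
    job.SetPartitionerClass(typeof(TotalOrderPartitioner<Text, Text>));
    // Tell the partitioner where SetConf should look for the partition file.
    TotalOrderPartitioner.SetPartitionFile(job.GetConfiguration(), new Path("_partitions"));
    // Sample the input and write the split points, sorted by the job's sort comparator.
    InputSampler.WritePartitionFile(job, new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10));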
Example 8
        /// <exception cref="System.IO.IOException"/>
        public virtual void TestMergeShouldReturnProperProgress(IList <Merger.Segment <Text, Text> > segments)
        {
            Path tmpDir     = new Path("localpath");
            Type keyClass   = (Type)jobConf.GetMapOutputKeyClass();
            Type valueClass = (Type)jobConf.GetMapOutputValueClass();
            RawComparator <Text> comparator = jobConf.GetOutputKeyComparator();

            Counters.Counter    readsCounter  = new Counters.Counter();
            Counters.Counter    writesCounter = new Counters.Counter();
            Progress            mergePhase    = new Progress();
            RawKeyValueIterator mergeQueue    = Merger.Merge(conf, fs, keyClass, valueClass, segments, 2,
                                                             tmpDir, comparator, GetReporter(), readsCounter,
                                                             writesCounter, mergePhase);
            float epsilon = 0.00001f;

            // Reading 6 keys total, 3 each in 2 segments, so each key read moves the
            // progress forward 1/6th of the way. Initially the first keys from each
            // segment have been read as part of the merge setup, so progress = 2/6.
            NUnit.Framework.Assert.AreEqual(2 / 6.0f, mergeQueue.GetProgress().Get(), epsilon);
            // The first next() returns one of the keys already read during merge setup
            NUnit.Framework.Assert.IsTrue(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(2 / 6.0f, mergeQueue.GetProgress().Get(), epsilon);
            // Subsequent next() calls should read one key and move progress
            NUnit.Framework.Assert.IsTrue(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(3 / 6.0f, mergeQueue.GetProgress().Get(), epsilon);
            NUnit.Framework.Assert.IsTrue(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(4 / 6.0f, mergeQueue.GetProgress().Get(), epsilon);
            // At this point we've exhausted all of the keys in one segment
            // so getting the next key will return the already cached key from the
            // other segment
            NUnit.Framework.Assert.IsTrue(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(4 / 6.0f, mergeQueue.GetProgress().Get(), epsilon);
            // Subsequent next() calls should read one key and move progress
            NUnit.Framework.Assert.IsTrue(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(5 / 6.0f, mergeQueue.GetProgress().Get(), epsilon);
            NUnit.Framework.Assert.IsTrue(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(1.0f, mergeQueue.GetProgress().Get(), epsilon);
            // Now there should be no more input
            NUnit.Framework.Assert.IsFalse(mergeQueue.Next());
            NUnit.Framework.Assert.AreEqual(1.0f, mergeQueue.GetProgress().Get(), epsilon);
            NUnit.Framework.Assert.IsTrue(mergeQueue.GetKey() == null);
            NUnit.Framework.Assert.AreEqual(0, mergeQueue.GetValue().GetData().Length);
        }
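
The comments above encode the progress model this test exercises: progress is the number of keys physically read from the segments divided by the total key count, and the merge setup itself reads the first key of every segment, so some Next() calls only hand back an already-buffered key without advancing progress. A tiny standalone illustration of that accounting with the same numbers (2 segments of 3 keys each), using no Hadoop types:

    // Progress = keysReadFromSegments / totalKeys. Setup reads one key per segment,
    // so the iterator starts at 2/6; a Next() that returns an already-buffered key
    // leaves the counter (and hence the progress) where it is.
    int totalKeys = 6;
    int keysRead  = 2;                                          // read during merge setup
    System.Console.WriteLine(keysRead / (float)totalKeys);      // 0.3333333 = 2/6
    keysRead += 1;                                              // one more key pulled from a segment
    System.Console.WriteLine(keysRead / (float)totalKeys);      // 0.5 = 3/6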
Example 9
 public MergeQueue(Configuration conf, FileSystem fs, IList <Merger.Segment <K, V> > segments,
                   RawComparator <K> comparator, Progressable reporter, bool sortSegments,
                   TaskType taskType)
 {
     this.conf       = conf;
     this.fs         = fs;
     this.comparator = comparator;
     this.segments   = segments;
     this.reporter   = reporter;
     if (taskType == TaskType.Map)
     {
         ConsiderFinalMergeForProgress();
     }
     if (sortSegments)
     {
         segments.Sort(segmentComparator);
     }
 }
Example 10
 /// <exception cref="System.IO.IOException"/>
 public SkippingReduceValuesIterator(ReduceTask _enclosing, RawKeyValueIterator @in,
                                     RawComparator <KEY> comparator, Type keyClass, Type valClass,
                                     Configuration conf, Task.TaskReporter reporter,
                                     TaskUmbilicalProtocol umbilical)
     : base(_enclosing)
 {
     this._enclosing       = _enclosing;
     this.umbilical        = umbilical;
      this.skipGroupCounter = ((Counters.Counter)reporter.GetCounter(TaskCounter.ReduceSkippedGroups));
      this.skipRecCounter   = ((Counters.Counter)reporter.GetCounter(TaskCounter.ReduceSkippedRecords));
      this.toWriteSkipRecs  = this._enclosing.ToWriteSkipRecs() &&
                              SkipBadRecords.GetSkipOutputPath(conf) != null;
     this.keyClass = keyClass;
     this.valClass = valClass;
     this.reporter = reporter;
     this.skipIt   = this._enclosing.GetSkipRanges().SkipRangeIterator();
     this.MayBeSkip();
 }
Example 11
        /// <summary>Write a partition file for the given job, using the Sampler provided.</summary>
        /// <remarks>
        /// Write a partition file for the given job, using the Sampler provided.
        /// Queries the sampler for a sample keyset, sorts by the output key
        /// comparator, selects the keys for each rank, and writes to the destination
        /// returned from
         /// <see cref="TotalOrderPartitioner{K, V}.GetPartitionFile(Org.Apache.Hadoop.Conf.Configuration)"/>.
        /// </remarks>
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <exception cref="System.Exception"/>
         public static void WritePartitionFile <K, V>(Job job, InputSampler.Sampler <K, V> sampler)
        {
            // getInputFormat, getOutputKeyComparator
            Configuration conf          = job.GetConfiguration();
            InputFormat   inf           = ReflectionUtils.NewInstance(job.GetInputFormatClass(), conf);
            int           numPartitions = job.GetNumReduceTasks();

            K[] samples = (K[])sampler.GetSample(inf, job);
            Log.Info("Using " + samples.Length + " samples");
            RawComparator <K> comparator = (RawComparator <K>)job.GetSortComparator();

            Arrays.Sort(samples, comparator);
            Path       dst = new Path(TotalOrderPartitioner.GetPartitionFile(conf));
            FileSystem fs  = dst.GetFileSystem(conf);

            if (fs.Exists(dst))
            {
                fs.Delete(dst, false);
            }
             SequenceFile.Writer writer = SequenceFile.CreateWriter(fs, conf, dst,
                                                                    job.GetMapOutputKeyClass(), typeof(NullWritable));
            NullWritable nullValue = NullWritable.Get();
            float        stepSize  = samples.Length / (float)numPartitions;
            int          last      = -1;

            for (int i = 1; i < numPartitions; ++i)
            {
                int k = Math.Round(stepSize * i);
                while (last >= k && comparator.Compare(samples[last], samples[k]) == 0)
                {
                    ++k;
                }
                writer.Append(samples[k], nullValue);
                last = k;
            }
            writer.Close();
        }
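
The loop above picks one sample per reducer rank: stepSize is samples.Length / numPartitions, and the split point for rank i is the sample at index Round(stepSize * i), nudged past duplicates. A small standalone illustration of that arithmetic with made-up numbers (plain C#, no Hadoop types):

    // With 100 sorted samples and 4 reduce tasks, stepSize = 100 / 4 = 25, so the
    // partition file receives the samples at ranks 25, 50 and 75 -- exactly the
    // numPartitions - 1 = 3 keys that TotalOrderPartitioner.SetConf expects to read back.
    int   numSamples    = 100;
    int   numPartitions = 4;
    float stepSize      = numSamples / (float)numPartitions;
    for (int i = 1; i < numPartitions; ++i)
    {
        System.Console.WriteLine("split point " + i + " -> sample index " + (int)System.Math.Round(stepSize * i));
    }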
Example 12
 /// <exception cref="System.IO.IOException"/>
 public MergeQueue(Configuration conf, FileSystem fs, Path[] inputs, bool deleteInputs,
                   CompressionCodec codec, RawComparator <K> comparator, Progressable reporter,
                   Counters.Counter mergedMapOutputsCounter, TaskType taskType)
 {
     this.conf       = conf;
     this.fs         = fs;
     this.codec      = codec;
     this.comparator = comparator;
     this.reporter   = reporter;
     if (taskType == TaskType.Map)
     {
         ConsiderFinalMergeForProgress();
     }
     foreach (Path file in inputs)
     {
         Log.Debug("MergeQ: adding: " + file);
          segments.AddItem(new Merger.Segment <K, V>(conf, fs, file, codec, !deleteInputs,
                                                     (file.ToString().EndsWith(Task.MergedOutputPrefix)
                                                          ? null : mergedMapOutputsCounter)));
     }
     // Sort segments on file-lengths
     segments.Sort(segmentComparator);
 }
Example 13
 /// <exception cref="System.IO.IOException"/>
 public MergeQueue(Configuration conf, FileSystem fs, Path[] inputs, bool deleteInputs,
                   CompressionCodec codec, RawComparator <K> comparator, Progressable reporter)
     : this(conf, fs, inputs, deleteInputs, codec, comparator, reporter, null, TaskType.Reduce)
 {
 }
Example 14
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="System.TypeLoadException"/>
        public override void Run(JobConf job, TaskUmbilicalProtocol umbilical)
        {
            job.SetBoolean(JobContext.SkipRecords, IsSkipping());
            if (IsMapOrReduce())
            {
                copyPhase   = GetProgress().AddPhase("copy");
                sortPhase   = GetProgress().AddPhase("sort");
                reducePhase = GetProgress().AddPhase("reduce");
            }
            // start thread that will handle communication with parent
            Task.TaskReporter reporter = StartReporter(umbilical);
            bool useNewApi             = job.GetUseNewReducer();

            Initialize(job, GetJobID(), reporter, useNewApi);
            // check if it is a cleanupJobTask
            if (jobCleanup)
            {
                RunJobCleanupTask(umbilical, reporter);
                return;
            }
            if (jobSetup)
            {
                RunJobSetupTask(umbilical, reporter);
                return;
            }
            if (taskCleanup)
            {
                RunTaskCleanupTask(umbilical, reporter);
                return;
            }
            // Initialize the codec
            codec = InitCodec();
            RawKeyValueIterator   rIter = null;
            ShuffleConsumerPlugin shuffleConsumerPlugin = null;
            Type combinerClass = conf.GetCombinerClass();

            Task.CombineOutputCollector combineCollector = (null != combinerClass)
                ? new Task.CombineOutputCollector(reduceCombineOutputCounter, reporter, conf)
                : null;
            Type clazz = job.GetClass <ShuffleConsumerPlugin>(MRConfig.ShuffleConsumerPlugin, typeof(Shuffle));

            shuffleConsumerPlugin = ReflectionUtils.NewInstance(clazz, job);
            Log.Info("Using ShuffleConsumerPlugin: " + shuffleConsumerPlugin);
            ShuffleConsumerPlugin.Context shuffleContext = new ShuffleConsumerPlugin.Context(
                GetTaskID(), job, FileSystem.GetLocal(job), umbilical, base.lDirAlloc, reporter,
                codec, combinerClass, combineCollector, spilledRecordsCounter, reduceCombineInputCounter
                , shuffledMapsCounter, reduceShuffleBytes, failedShuffleCounter, mergedMapOutputsCounter
                , taskStatus, copyPhase, sortPhase, this, mapOutputFile, localMapFiles);
            shuffleConsumerPlugin.Init(shuffleContext);
            rIter = shuffleConsumerPlugin.Run();
            // free up the data structures
            mapOutputFilesOnDisk.Clear();
            sortPhase.Complete();
            // sort is complete
            SetPhase(TaskStatus.Phase.Reduce);
            StatusUpdate(umbilical);
            Type          keyClass   = job.GetMapOutputKeyClass();
            Type          valueClass = job.GetMapOutputValueClass();
            RawComparator comparator = job.GetOutputValueGroupingComparator();

            if (useNewApi)
            {
                RunNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
            }
            else
            {
                RunOldReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
            }
            shuffleConsumerPlugin.Close();
            Done(umbilical, reporter);
        }
Example 15
        /// <exception cref="System.IO.IOException"/>
        private void RunOldReducer <INKEY, INVALUE, OUTKEY, OUTVALUE>(JobConf job, TaskUmbilicalProtocol umbilical,
                                                                      Task.TaskReporter reporter, RawKeyValueIterator rIter,
                                                                      RawComparator <INKEY> comparator)
        {
            System.Type keyClass   = typeof(INKEY);
            System.Type valueClass = typeof(INVALUE);
            Reducer <INKEY, INVALUE, OUTKEY, OUTVALUE> reducer =
                ReflectionUtils.NewInstance(job.GetReducerClass(), job);
            // make output collector
            string finalName = GetOutputName(GetPartition());
            RecordWriter <OUTKEY, OUTVALUE> @out =
                new ReduceTask.OldTrackingRecordWriter <OUTKEY, OUTVALUE>(this, job, reporter, finalName);
            RecordWriter <OUTKEY, OUTVALUE>    finalOut  = @out;
            OutputCollector <OUTKEY, OUTVALUE> collector = new _OutputCollector_419(finalOut, reporter);

            // indicate that progress update needs to be sent
            // apply reduce function
            try
            {
                //increment processed counter only if skipping feature is enabled
                bool incrProcCount = SkipBadRecords.GetReducerMaxSkipGroups(job) > 0 &&
                                     SkipBadRecords.GetAutoIncrReducerProcCount(job);
                ReduceTask.ReduceValuesIterator <INKEY, INVALUE> values = IsSkipping()
                    ? new ReduceTask.SkippingReduceValuesIterator <INKEY, INVALUE>(
                          this, rIter, comparator, keyClass, valueClass, job, reporter, umbilical)
                    : new ReduceTask.ReduceValuesIterator <INKEY, INVALUE>(
                          this, rIter, job.GetOutputValueGroupingComparator(), keyClass, valueClass, job, reporter);
                values.InformReduceProgress();
                while (values.More())
                {
                    reduceInputKeyCounter.Increment(1);
                    reducer.Reduce(values.GetKey(), values, collector, reporter);
                    if (incrProcCount)
                    {
                        reporter.IncrCounter(SkipBadRecords.CounterGroup, SkipBadRecords.CounterReduceProcessedGroups
                                             , 1);
                    }
                    values.NextKey();
                    values.InformReduceProgress();
                }
                reducer.Close();
                reducer = null;
                @out.Close(reporter);
                @out = null;
            }
            finally
            {
                IOUtils.Cleanup(Log, reducer);
                CloseQuietly(@out, reporter);
            }
        }
Example 16
 //the buffer used for storing
 //key/values
 //the array used to store the start offsets of
 //keys in keyValBuffer
 //the array used to store the lengths of
 //keys
 //the array used to store the value lengths
 //the array of startOffsets's indices. This will
 //be sorted at the end to contain a sorted array of
 //indices to offsets
 //the comparator for the map output
 //the number of key/values
 //the overhead of the arrays in memory
 //12 => 4 for keyoffsets, 4 for keylengths, 4 for valueLengths, and
 //4 for indices into startOffsets array in the
 //pointers array (ignored the partpointers list itself)
 //we maintain the max lengths of the key/val that we encounter.  During
 //iteration of the sorted results, we will create a DataOutputBuffer to
 //return the keys. The max size of the DataOutputBuffer will be the max
 //keylength that we encounter. Expose this value to model memory more
 //accurately.
 //Reference to the Progressable object for sending KeepAlive
 //Implementation of methods of the SorterBase interface
 //
 public virtual void Configure(JobConf conf)
 {
     comparator = conf.GetOutputKeyComparator();
 }
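
Configure only reads the comparator back out of the job; the submission-side counterpart is sketched below. SetOutputKeyComparatorClass is the JobConf method name from the Java API and is assumed to carry over to this port unchanged; Text.Comparator is just one example of a RawComparator implementation.

    // Hypothetical submission-side counterpart of Configure(): register the raw
    // comparator that conf.GetOutputKeyComparator() will later instantiate.
    JobConf conf = new JobConf();
    conf.SetOutputKeyComparatorClass(typeof(Text.Comparator));  // any RawComparator over the map output key type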
Example 17
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.Exception"/>
        /// <exception cref="System.TypeLoadException"/>
        private void RunNewReducer <INKEY, INVALUE, OUTKEY, OUTVALUE>(JobConf job, TaskUmbilicalProtocol umbilical,
                                                                      Task.TaskReporter reporter, RawKeyValueIterator rIter,
                                                                      RawComparator <INKEY> comparator)
        {
            System.Type keyClass   = typeof(INKEY);
            System.Type valueClass = typeof(INVALUE);
            // wrap value iterator to report progress.
            RawKeyValueIterator rawIter = rIter;

            rIter = new _RawKeyValueIterator_587(rawIter, reporter);
            // make a task context so we can get the classes
            TaskAttemptContext taskContext = new TaskAttemptContextImpl(job, GetTaskID(), reporter);
            // make a reducer
            Reducer <INKEY, INVALUE, OUTKEY, OUTVALUE> reducer =
                (Reducer <INKEY, INVALUE, OUTKEY, OUTVALUE>)ReflectionUtils.NewInstance(taskContext.GetReducerClass(), job);
            RecordWriter <OUTKEY, OUTVALUE> trackedRW =
                new ReduceTask.NewTrackingRecordWriter <OUTKEY, OUTVALUE>(this, taskContext);

            job.SetBoolean("mapred.skip.on", IsSkipping());
            job.SetBoolean(JobContext.SkipRecords, IsSkipping());
            Reducer.Context reducerContext = CreateReduceContext(reducer, job, GetTaskID(), rIter,
                                                                 reduceInputKeyCounter, reduceInputValueCounter,
                                                                 trackedRW, committer, reporter,
                                                                 comparator, keyClass, valueClass);
            try
            {
                reducer.Run(reducerContext);
            }
            finally
            {
                trackedRW.Close(reducerContext);
            }
        }
Example 18
 /// <exception cref="System.IO.IOException"/>
 public static RawKeyValueIterator Merge <K, V>(Configuration conf, FileSystem fs,
                                                IList <Merger.Segment <K, V> > segments, int mergeFactor,
                                                Path tmpDir, RawComparator <K> comparator, Progressable reporter,
                                                Counters.Counter readsCounter, Counters.Counter writesCounter,
                                                Progress mergePhase)
 {
     System.Type keyClass   = typeof(K);
     System.Type valueClass = typeof(V);
     return(Merge(conf, fs, keyClass, valueClass, segments, mergeFactor, tmpDir, comparator,
                  reporter, false, readsCounter, writesCounter, mergePhase));
 }
Example 19
        /// <exception cref="System.IO.IOException"/>
        private RawKeyValueIterator FinalMerge(JobConf job, FileSystem fs,
                                               IList <InMemoryMapOutput <K, V> > inMemoryMapOutputs,
                                               IList <MergeManagerImpl.CompressAwarePath> onDiskMapOutputs)
        {
            Log.Info("finalMerge called with " + inMemoryMapOutputs.Count + " in-memory map-outputs and "
                     + onDiskMapOutputs.Count + " on-disk map-outputs");
            long maxInMemReduce = GetMaxInMemReduceLimit();
            // merge config params
            Type keyClass   = (Type)job.GetMapOutputKeyClass();
            Type valueClass = (Type)job.GetMapOutputValueClass();
            bool keepInputs = job.GetKeepFailedTaskFiles();
            Path tmpDir     = new Path(reduceId.ToString());
            RawComparator <K> comparator = (RawComparator <K>)job.GetOutputKeyComparator();
            // segments required to vacate memory
            IList <Merger.Segment <K, V> > memDiskSegments = new AList <Merger.Segment <K, V> >();
            long inMemToDiskBytes   = 0;
            bool mergePhaseFinished = false;

            if (inMemoryMapOutputs.Count > 0)
            {
                TaskID mapId = inMemoryMapOutputs[0].GetMapId().GetTaskID();
                inMemToDiskBytes = CreateInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
                int numMemDiskSegments = memDiskSegments.Count;
                if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.Count)
                {
                    // If we reach here, it implies that we have less than io.sort.factor
                    // disk segments and this will be incremented by 1 (result of the
                    // memory segments merge). Since this total would still be
                    // <= io.sort.factor, we will not do any more intermediate merges,
                    // the merge of all these disk segments would be directly fed to the
                    // reduce method
                    mergePhaseFinished = true;
                    // must spill to disk, but can't retain in-mem for intermediate merge
                    Path outputPath = mapOutputFile.GetInputFileForWrite(mapId, inMemToDiskBytes)
                                          .Suffix(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix);
                    RawKeyValueIterator rIter = Merger.Merge(job, fs, keyClass, valueClass, memDiskSegments,
                                                             numMemDiskSegments, tmpDir, comparator, reporter,
                                                             spilledRecordsCounter, null, mergePhase);
                    FSDataOutputStream  @out   = CryptoUtils.WrapIfNecessary(job, fs.Create(outputPath));
                    IFile.Writer <K, V> writer = new IFile.Writer <K, V>(job, @out, keyClass, valueClass,
                                                                         codec, null, true);
                    try
                    {
                        Merger.WriteFile(rIter, writer, reporter, job);
                        writer.Close();
                        // add to list of final disk outputs.
                        onDiskMapOutputs.AddItem(new MergeManagerImpl.CompressAwarePath(
                            outputPath, writer.GetRawLength(), writer.GetCompressedLength()));
                        writer = null;
                    }
                    catch (IOException e)
                    {
                        if (null != outputPath)
                        {
                            try
                            {
                                fs.Delete(outputPath, true);
                            }
                            catch (IOException)
                            {
                                // NOTHING
                            }
                        }
                        throw;
                    }
                    finally
                    {
                        if (null != writer)
                        {
                            writer.Close();
                        }
                    }
                    Log.Info("Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes to disk to satisfy "
                             + "reduce memory limit");
                    inMemToDiskBytes = 0;
                    memDiskSegments.Clear();
                }
                else
                {
                    if (inMemToDiskBytes != 0)
                    {
                        Log.Info("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for "
                                 + "intermediate, on-disk merge");
                    }
                }
            }
            // segments on disk
            IList <Merger.Segment <K, V> > diskSegments = new AList <Merger.Segment <K, V> >();
            long onDiskBytes = inMemToDiskBytes;
            long rawBytes    = inMemToDiskBytes;

            MergeManagerImpl.CompressAwarePath[] onDisk = Sharpen.Collections.ToArray(onDiskMapOutputs
                                                                                      , new MergeManagerImpl.CompressAwarePath[onDiskMapOutputs.Count]);
            foreach (MergeManagerImpl.CompressAwarePath file in onDisk)
            {
                long fileLength = fs.GetFileStatus(file).GetLen();
                onDiskBytes += fileLength;
                rawBytes    += (file.GetRawDataLength() > 0) ? file.GetRawDataLength() : fileLength;
                Log.Debug("Disk file: " + file + " Length is " + fileLength);
                diskSegments.AddItem(new Merger.Segment <K, V>(job, fs, file, codec, keepInputs,
                                                               (file.ToString().EndsWith(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix)
                                                                    ? null : mergedMapOutputsCounter),
                                                               file.GetRawDataLength()));
            }
            Log.Info("Merging " + onDisk.Length + " files, " + onDiskBytes + " bytes from disk");
            diskSegments.Sort(new _IComparer_786());
            // build final list of segments from merged backed by disk + in-mem
            IList <Merger.Segment <K, V> > finalSegments = new AList <Merger.Segment <K, V> >();
            long inMemBytes = CreateInMemorySegments(inMemoryMapOutputs, finalSegments, 0);

            Log.Info("Merging " + finalSegments.Count + " segments, " + inMemBytes + " bytes from memory into reduce"
                     );
            if (0 != onDiskBytes)
            {
                int numInMemSegments = memDiskSegments.Count;
                diskSegments.AddRange(0, memDiskSegments);
                memDiskSegments.Clear();
                // Pass mergePhase only if there is a going to be intermediate
                // merges. See comment where mergePhaseFinished is being set
                Progress            thisPhase = (mergePhaseFinished) ? null : mergePhase;
                RawKeyValueIterator diskMerge = Merger.Merge(job, fs, keyClass, valueClass, codec
                                                             , diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, reporter, false
                                                             , spilledRecordsCounter, null, thisPhase);
                diskSegments.Clear();
                if (0 == finalSegments.Count)
                {
                    return(diskMerge);
                }
                finalSegments.AddItem(new Merger.Segment <K, V>(
                    new MergeManagerImpl.RawKVIteratorReader(this, diskMerge, onDiskBytes), true, rawBytes));
            }
            return(Merger.Merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.Count,
                                tmpDir, comparator, reporter, spilledRecordsCounter, null, null));
        }
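
The comment in the middle of this method marks its key decision: the in-memory segments are spilled and merged into one extra on-disk file only when doing so still keeps the number of disk segments within io.sort.factor, in which case no further intermediate merges are needed and mergePhaseFinished is set. A standalone illustration of that check with made-up numbers:

    // With io.sort.factor = 10, 7 on-disk map outputs and 5 in-memory segments:
    // 10 > 7, so the in-memory segments are merged into one extra disk file,
    // leaving 8 disk segments (<= 10) that can feed the reduce directly.
    int  ioSortFactor       = 10;
    int  onDiskCount        = 7;
    int  numMemDiskSegments = 5;
    bool mergePhaseFinished = numMemDiskSegments > 0 && ioSortFactor > onDiskCount;
    System.Console.WriteLine("merge phase finished early: " + mergePhaseFinished);   // True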
Example 20
 public BytesComparator(RawComparator <object> cmp)
 {
     // nothing
     this.cmp = cmp;
 }
Example 21
 public MergeQueue(Configuration conf, FileSystem fs, IList <Merger.Segment <K, V> > segments,
                   RawComparator <K> comparator, Progressable reporter)
     : this(conf, fs, segments, comparator, reporter, false, TaskType.Reduce)
 {
 }
Example 22
 /// <exception cref="System.IO.IOException"/>
 public ReduceValuesIterator(ReduceTask _enclosing, RawKeyValueIterator @in,
                             RawComparator <KEY> comparator, Type keyClass, Type valClass,
                             Configuration conf, Progressable reporter)
     : base(@in, comparator, keyClass, valClass, conf, reporter)
 {
     this._enclosing = _enclosing;
 }