Example #1
        /// <summary>
        /// Expand globs in the given <paramref name="filePattern"/> into a collection of
        /// file patterns so that in the expanded set no file pattern has a
        /// slash character ("/") inside a curly-bracket pair.
        /// </summary>
        /// <param name="filePattern">the file pattern to expand</param>
        /// <returns>the expanded file patterns</returns>
        /// <exception cref="System.IO.IOException"/>
        public static IList<string> Expand(string filePattern)
        {
            IList<string> fullyExpanded = new AList<string>();
            IList<GlobExpander.StringWithOffset> toExpand = new AList<GlobExpander.StringWithOffset>();
            toExpand.AddItem(new GlobExpander.StringWithOffset(filePattern, 0));
            // Repeatedly expand the leftmost slash-containing curly-bracket pair of
            // each pending pattern until no pattern can be expanded any further.
            while (!toExpand.IsEmpty())
            {
                GlobExpander.StringWithOffset path = toExpand.Remove(0);
                IList<GlobExpander.StringWithOffset> expanded = ExpandLeftmost(path);
                if (expanded == null)
                {
                    // No slash remains inside any curly-bracket pair: fully expanded.
                    fullyExpanded.AddItem(path.@string);
                }
                else
                {
                    // Re-queue the newly produced patterns for further expansion.
                    toExpand.AddRange(0, expanded);
                }
            }
            return fullyExpanded;
        }
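
A minimal usage sketch of the contract above. The call site is hypothetical, and the expected outputs follow the documented rule that only curly-bracket pairs containing a slash are split; they are illustrative, not taken from the original source:

        // Hypothetical call site illustrating the documented contract.
        public static void Main(string[] args)
        {
            // The braces contain a "/", so the pattern is split:
            foreach (string pattern in GlobExpander.Expand("{a/b,c/d}"))
            {
                System.Console.WriteLine(pattern);   // prints "a/b", then "c/d"
            }
            // No "/" inside the braces, so the pattern is returned unchanged:
            foreach (string pattern in GlobExpander.Expand("{a,b}/c"))
            {
                System.Console.WriteLine(pattern);   // prints "{a,b}/c"
            }
        }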
Example #2
        /// <exception cref="System.IO.IOException"/>
        private RawKeyValueIterator FinalMerge(JobConf job, FileSystem fs,
            IList<InMemoryMapOutput<K, V>> inMemoryMapOutputs,
            IList<MergeManagerImpl.CompressAwarePath> onDiskMapOutputs)
        {
            Log.Info("finalMerge called with " + inMemoryMapOutputs.Count + " in-memory map-outputs and "
                     + onDiskMapOutputs.Count + " on-disk map-outputs");
            long maxInMemReduce = GetMaxInMemReduceLimit();
            // merge config params
            Type keyClass   = (Type)job.GetMapOutputKeyClass();
            Type valueClass = (Type)job.GetMapOutputValueClass();
            bool keepInputs = job.GetKeepFailedTaskFiles();
            Path tmpDir     = new Path(reduceId.ToString());
            RawComparator<K> comparator = (RawComparator<K>)job.GetOutputKeyComparator();
            // segments required to vacate memory
            IList<Merger.Segment<K, V>> memDiskSegments = new AList<Merger.Segment<K, V>>();
            long inMemToDiskBytes   = 0;
            bool mergePhaseFinished = false;

            if (inMemoryMapOutputs.Count > 0)
            {
                TaskID mapId = inMemoryMapOutputs[0].GetMapId().GetTaskID();
                inMemToDiskBytes = CreateInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
                int numMemDiskSegments = memDiskSegments.Count;
                if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.Count)
                {
                    // If we reach here, we have fewer than io.sort.factor disk segments,
                    // and that count will grow by only 1 (the result of merging the
                    // memory segments). Since the total stays <= io.sort.factor, no
                    // further intermediate merges are needed: the merge of all these
                    // disk segments can be fed directly to the reduce method.
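                    // Illustrative numbers (hypothetical, not from the original source):
                    // with io.sort.factor = 10 and 6 on-disk outputs, spilling the merged
                    // in-memory segments adds a 7th disk segment; 7 <= 10, so a single
                    // final merge feeds the reduce with no intermediate merge rounds.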
                    mergePhaseFinished = true;
                    // must spill to disk, but can't retain in-mem for intermediate merge
                    Path outputPath = mapOutputFile.GetInputFileForWrite(mapId, inMemToDiskBytes)
                        .Suffix(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix);
                    RawKeyValueIterator rIter = Merger.Merge(job, fs, keyClass, valueClass,
                        memDiskSegments, numMemDiskSegments, tmpDir, comparator, reporter,
                        spilledRecordsCounter, null, mergePhase);
                    FSDataOutputStream @out = CryptoUtils.WrapIfNecessary(job, fs.Create(outputPath));
                    IFile.Writer<K, V> writer = new IFile.Writer<K, V>(job, @out, keyClass,
                        valueClass, codec, null, true);
                    try
                    {
                        Merger.WriteFile(rIter, writer, reporter, job);
                        writer.Close();
                        // Add the merged spill to the list of final disk outputs.
                        onDiskMapOutputs.AddItem(new MergeManagerImpl.CompressAwarePath(outputPath,
                            writer.GetRawLength(), writer.GetCompressedLength()));
                        writer = null;
                    }
                    catch (IOException)
                    {
                        if (null != outputPath)
                        {
                            try
                            {
                                fs.Delete(outputPath, true);
                            }
                            catch (IOException)
                            {
                                // NOTHING: best-effort cleanup of the partial output file.
                            }
                        }
                        throw;
                    }
                    finally
                    {
                        if (null != writer)
                        {
                            writer.Close();
                        }
                    }
                    Log.Info("Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes to disk to satisfy "
                             + "reduce memory limit");
                    inMemToDiskBytes = 0;
                    memDiskSegments.Clear();
                }
                else if (inMemToDiskBytes != 0)
                {
                    Log.Info("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes
                        + " bytes in memory for intermediate, on-disk merge");
                }
            }
            // segments on disk
            IList<Merger.Segment<K, V>> diskSegments = new AList<Merger.Segment<K, V>>();
            long onDiskBytes = inMemToDiskBytes;
            long rawBytes    = inMemToDiskBytes;

            MergeManagerImpl.CompressAwarePath[] onDisk = Sharpen.Collections.ToArray(
                onDiskMapOutputs, new MergeManagerImpl.CompressAwarePath[onDiskMapOutputs.Count]);
            foreach (MergeManagerImpl.CompressAwarePath file in onDisk)
            {
                long fileLength = fs.GetFileStatus(file).GetLen();
                onDiskBytes += fileLength;
                rawBytes    += (file.GetRawDataLength() > 0) ? file.GetRawDataLength() : fileLength;
                Log.Debug("Disk file: " + file + " Length is " + fileLength);
                diskSegments.AddItem(new Merger.Segment<K, V>(job, fs, file, codec, keepInputs,
                    (file.ToString().EndsWith(Org.Apache.Hadoop.Mapred.Task.MergedOutputPrefix)
                        ? null : mergedMapOutputsCounter), file.GetRawDataLength()));
            }
            Log.Info("Merging " + onDisk.Length + " files, " + onDiskBytes + " bytes from disk"
                     );
            diskSegments.Sort(new _IComparer_786());
            // build final list of segments from merged backed by disk + in-mem
            IList<Merger.Segment<K, V>> finalSegments = new AList<Merger.Segment<K, V>>();
            long inMemBytes = CreateInMemorySegments(inMemoryMapOutputs, finalSegments, 0);

            Log.Info("Merging " + finalSegments.Count + " segments, " + inMemBytes + " bytes from memory into reduce"
                     );
            if (0 != onDiskBytes)
            {
                int numInMemSegments = memDiskSegments.Count;
                diskSegments.AddRange(0, memDiskSegments);
                memDiskSegments.Clear();
                // Pass mergePhase only if there are going to be intermediate
                // merges. See the comment where mergePhaseFinished is set.
                Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
                RawKeyValueIterator diskMerge = Merger.Merge(job, fs, keyClass, valueClass,
                    codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator,
                    reporter, false, spilledRecordsCounter, null, thisPhase);
                diskSegments.Clear();
                if (0 == finalSegments.Count)
                {
                    return diskMerge;
                }
                finalSegments.AddItem(new Merger.Segment<K, V>(
                    new MergeManagerImpl.RawKVIteratorReader(this, diskMerge, onDiskBytes),
                    true, rawBytes));
            }
            return Merger.Merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.Count,
                tmpDir, comparator, reporter, spilledRecordsCounter, null, null);
        }
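
Taken together, the control flow above reduces to two decisions about the disk-side segments. The standalone sketch below restates them with hypothetical names; it illustrates the logic only and is not the Hadoop API:

        // Hypothetical helper; the predicates mirror the conditions used above.
        internal static class FinalMergeSketch
        {
            // Mirrors "numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.Count":
            // spilling the in-memory merge adds one disk segment, and the total stays
            // <= ioSortFactor, so no intermediate merge rounds are needed afterwards.
            public static bool MergePhaseFinishesEarly(int memDiskSegments, int onDiskOutputs, int ioSortFactor)
            {
                return memDiskSegments > 0 && ioSortFactor > onDiskOutputs;
            }

            // Mirrors the tail of FinalMerge: with no disk bytes, only the in-memory
            // segments are merged; with no remaining in-memory segments, the disk-merge
            // iterator is handed to the reduce directly; otherwise the disk merge is
            // wrapped as one more segment and merged together with memory.
            public static string FinalStep(long onDiskBytes, int finalInMemSegments)
            {
                if (onDiskBytes == 0)
                {
                    return "merge in-memory segments only";
                }
                if (finalInMemSegments == 0)
                {
                    return "return the disk-merge iterator directly";
                }
                return "wrap the disk merge as a segment and merge with memory";
            }
        }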