Example #1
0
 /// <summary>
 /// Builds the list of blocks for a single file and hands them, together with
 /// the supplied lookup maps, to <c>PopulateBlockInfo</c>.
 /// </summary>
 /// <remarks>
 /// Sets the <c>fileSize</c> and <c>blocks</c> fields. When the block locations
 /// cannot be obtained (<c>null</c>), <c>blocks</c> becomes an empty array and the
 /// maps are left untouched.
 /// </remarks>
 /// <param name="stat">Status of the file; a <c>LocatedFileStatus</c> supplies its own block locations.</param>
 /// <param name="conf">Configuration used to resolve the file system when locations must be queried.</param>
 /// <param name="isSplitable">When false the whole file becomes a single block.</param>
 /// <param name="rackToBlocks">Forwarded to <c>PopulateBlockInfo</c>.</param>
 /// <param name="blockToNodes">Forwarded to <c>PopulateBlockInfo</c>.</param>
 /// <param name="nodeToBlocks">Forwarded to <c>PopulateBlockInfo</c>.</param>
 /// <param name="rackToNodes">Forwarded to <c>PopulateBlockInfo</c>.</param>
 /// <param name="maxSize">Upper bound for one block's length; 0 means a block location is never subdivided.</param>
 /// <exception cref="System.IO.IOException"/>
 internal OneFileInfo(FileStatus stat, Configuration conf, bool isSplitable,
                      Dictionary<string, IList<CombineFileInputFormat.OneBlockInfo>> rackToBlocks,
                      Dictionary<CombineFileInputFormat.OneBlockInfo, string[]> blockToNodes,
                      Dictionary<string, ICollection<CombineFileInputFormat.OneBlockInfo>> nodeToBlocks,
                      Dictionary<string, ICollection<string>> rackToNodes, long maxSize)
 {
     // Starts at zero; set from the file status (non-splitable) or summed over
     // the block locations (splitable) below.
     this.fileSize = 0;
     // Get block locations, preferring the ones already attached to the status.
     BlockLocation[] locations;
     if (stat is LocatedFileStatus)
     {
         locations = ((LocatedFileStatus)stat).GetBlockLocations();
     }
     else
     {
         FileSystem fs = stat.GetPath().GetFileSystem(conf);
         locations = fs.GetFileBlockLocations(stat, 0, stat.GetLen());
     }
     if (locations == null)
     {
         // No location information at all: no blocks, nothing to register.
         blocks = new CombineFileInputFormat.OneBlockInfo[0];
     }
     else
     {
         if (locations.Length == 0 && !stat.IsDirectory())
         {
             // A non-directory without locations still gets one default
             // BlockLocation so that it produces a block below.
             locations = new BlockLocation[] { new BlockLocation() };
         }
         if (locations.Length == 0)
         {
             // Only reachable for a directory with no locations. The non-splitable
             // branch below reads locations[0], so it must not run on an empty
             // array; the splitable branch would produce zero blocks anyway.
             blocks = new CombineFileInputFormat.OneBlockInfo[0];
         }
         else if (!isSplitable)
         {
             // The file is not splitable: one block covering the full file
             // length, attributed to the first location.
             blocks    = new CombineFileInputFormat.OneBlockInfo[1];
             fileSize  = stat.GetLen();
             blocks[0] = new CombineFileInputFormat.OneBlockInfo(stat.GetPath(), 0, fileSize,
                                                                 locations[0].GetHosts(), locations[0].GetTopologyPaths());
         }
         else
         {
             AList<CombineFileInputFormat.OneBlockInfo> blocksList =
                 new AList<CombineFileInputFormat.OneBlockInfo>(locations.Length);
             for (int i = 0; i < locations.Length; i++)
             {
                 fileSize += locations[i].GetLength();
                 // Carve this location into pieces of at most maxSize bytes.
                 long left     = locations[i].GetLength();
                 long myOffset = locations[i].GetOffset();
                 long myLength = 0;
                 // do/while: a zero-length location still yields one (empty) block.
                 do
                 {
                     if (maxSize == 0)
                     {
                         // No maximum configured: take the whole remainder.
                         myLength = left;
                     }
                     else
                     {
                         if (left > maxSize && left < 2 * maxSize)
                         {
                             // Remainder is between max and 2*max: emit two halves
                             // instead of (max, left - max) to avoid creating a
                             // very small trailing piece.
                             myLength = left / 2;
                         }
                         else
                         {
                             myLength = Math.Min(maxSize, left);
                         }
                     }
                     CombineFileInputFormat.OneBlockInfo oneblock = new CombineFileInputFormat.OneBlockInfo(
                         stat.GetPath(), myOffset, myLength, locations[i].GetHosts(), locations[i].GetTopologyPaths());
                     left     -= myLength;
                     myOffset += myLength;
                     blocksList.AddItem(oneblock);
                 }while (left > 0);
             }
             blocks = Sharpen.Collections.ToArray(blocksList, new CombineFileInputFormat.OneBlockInfo
                                                  [blocksList.Count]);
         }
         // NOTE(review): presumably records every block in the four lookup maps;
         // confirm against PopulateBlockInfo.
         PopulateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);
     }
 }
Example #2
0
        /// <summary>
        /// Groups blocks into splits in three passes: node-local splits first,
        /// then rack-level splits, and finally splits built from whatever
        /// overflowed the rack pass.
        /// </summary>
        /// <param name="nodeToBlocks">Blocks available on each node; entries are pruned as blocks are placed.</param>
        /// <param name="blockToNodes">Blocks that are still unassigned; a block is removed once it is taken for a split.</param>
        /// <param name="rackToBlocks">Blocks available on each rack.</param>
        /// <param name="totLength">Total length still to be assigned when the node pass starts.</param>
        /// <param name="maxSize">Size at which an accumulating split is emitted; 0 disables the limit.</param>
        /// <param name="minSizeNode">Minimum size for a leftover node-local split; 0 disables such splits.</param>
        /// <param name="minSizeRack">Minimum size for a leftover rack-level split; 0 sends leftovers to overflow.</param>
        /// <param name="splits">Receives the created splits via <c>AddCreatedSplit</c>.</param>
        internal virtual void CreateSplits(
            IDictionary<string, ICollection<CombineFileInputFormat.OneBlockInfo>> nodeToBlocks,
            IDictionary<CombineFileInputFormat.OneBlockInfo, string[]> blockToNodes,
            IDictionary<string, IList<CombineFileInputFormat.OneBlockInfo>> rackToBlocks,
            long totLength, long maxSize, long minSizeNode, long minSizeRack,
            IList<InputSplit> splits)
        {
            // Blocks collected for the split currently being assembled, and
            // their combined length. Both are shared by all three passes.
            AList<CombineFileInputFormat.OneBlockInfo> pendingBlocks =
                new AList<CombineFileInputFormat.OneBlockInfo>();
            long pendingSize = 0;
            int  nodeCount   = nodeToBlocks.Count;
            long unassignedLength = totLength;
            Multiset<string>    splitCountByNode = HashMultiset.Create();
            ICollection<string> finishedNodes    = new HashSet<string>();

            // ---- Pass 1: node-local splits --------------------------------
            // At most one split is produced per node per sweep, and the nodes
            // are swept repeatedly so that splits are spread across them.
            do
            {
                IEnumerator<KeyValuePair<string, ICollection<CombineFileInputFormat.OneBlockInfo>>> nodeIter =
                    nodeToBlocks.GetEnumerator();
                while (nodeIter.HasNext())
                {
                    KeyValuePair<string, ICollection<CombineFileInputFormat.OneBlockInfo>> nodeEntry =
                        nodeIter.Next();
                    string node = nodeEntry.Key;
                    if (finishedNodes.Contains(node))
                    {
                        // This node was marked completed in an earlier sweep.
                        continue;
                    }
                    ICollection<CombineFileInputFormat.OneBlockInfo> nodeBlocks = nodeEntry.Value;

                    // Move blocks into pendingBlocks, dropping each one from
                    // blockToNodes so it cannot land in two different splits.
                    IEnumerator<CombineFileInputFormat.OneBlockInfo> blockIter = nodeBlocks.GetEnumerator();
                    while (blockIter.HasNext())
                    {
                        CombineFileInputFormat.OneBlockInfo nodeBlock = blockIter.Next();
                        if (!blockToNodes.Contains(nodeBlock))
                        {
                            // Already assigned to some other split.
                            blockIter.Remove();
                            continue;
                        }
                        pendingBlocks.AddItem(nodeBlock);
                        Sharpen.Collections.Remove(blockToNodes, nodeBlock);
                        pendingSize += nodeBlock.length;
                        if (maxSize == 0 || pendingSize < maxSize)
                        {
                            // Maximum not reached yet (or no maximum): keep collecting.
                            continue;
                        }
                        // The accumulated size reached the maximum: emit the split.
                        AddCreatedSplit(splits, Sharpen.Collections.Singleton(node), pendingBlocks);
                        unassignedLength -= pendingSize;
                        pendingSize = 0;
                        splitCountByNode.AddItem(node);
                        // Forget the placed blocks so they are not walked again.
                        nodeBlocks.RemoveAll(pendingBlocks);
                        pendingBlocks.Clear();
                        // One split per node per sweep; move on to the next node.
                        break;
                    }

                    if (pendingBlocks.Count == 0)
                    {
                        // Nothing in flight. The node is finished only if every
                        // one of its blocks went into a node-local split;
                        // otherwise it is visited again on the next sweep.
                        if (nodeBlocks.Count == 0)
                        {
                            finishedNodes.AddItem(node);
                        }
                        continue;
                    }

                    // The trailing blocks (or all of them when maxSize is 0) did
                    // not fill a split, so this node is finished either way.
                    if (minSizeNode != 0 && pendingSize >= minSizeNode && splitCountByNode.Count(node)
                        == 0)
                    {
                        // No split exists on this node yet, so a smaller one is
                        // accepted for the sake of parallelism.
                        AddCreatedSplit(splits, Sharpen.Collections.Singleton(node), pendingBlocks);
                        unassignedLength -= pendingSize;
                        splitCountByNode.AddItem(node);
                        nodeBlocks.RemoveAll(pendingBlocks);
                    }
                    else
                    {
                        // Return the unplaced blocks to the pool so the rack
                        // pass can pick them up.
                        foreach (CombineFileInputFormat.OneBlockInfo unplaced in pendingBlocks)
                        {
                            blockToNodes[unplaced] = unplaced.hosts;
                        }
                    }
                    pendingBlocks.Clear();
                    pendingSize = 0;
                    finishedNodes.AddItem(node);
                }
                // Sweep again until every node is completed or nothing is left.
            }
            while (finishedNodes.Count != nodeCount && unassignedLength != 0);

            // Every node has been marked completed or all blocks have been
            // assigned; anything remaining is handled by the rack pass.
            Log.Info("DEBUG: Terminated node allocation with : CompletedNodes: " + finishedNodes
                     .Count + ", size left: " + unassignedLength);

            // ---- Pass 2: rack-level splits --------------------------------
            // Rack leftovers below the minimum size are parked in overflowBlocks
            // and combined after all racks have been processed.
            AList<CombineFileInputFormat.OneBlockInfo> overflowBlocks =
                new AList<CombineFileInputFormat.OneBlockInfo>();
            ICollection<string> rackNames = new HashSet<string>();

            // Keep sweeping the racks until no unassigned block remains. Each
            // visit to a rack produces at most one split.
            while (blockToNodes.Count > 0)
            {
                IEnumerator<KeyValuePair<string, IList<CombineFileInputFormat.OneBlockInfo>>> rackIter =
                    rackToBlocks.GetEnumerator();
                while (rackIter.HasNext())
                {
                    KeyValuePair<string, IList<CombineFileInputFormat.OneBlockInfo>> rackEntry =
                        rackIter.Next();
                    rackNames.AddItem(rackEntry.Key);
                    IList<CombineFileInputFormat.OneBlockInfo> rackBlocks = rackEntry.Value;

                    bool reachedMax = false;
                    foreach (CombineFileInputFormat.OneBlockInfo rackBlock in rackBlocks)
                    {
                        if (!blockToNodes.Contains(rackBlock))
                        {
                            // Already placed elsewhere.
                            continue;
                        }
                        pendingBlocks.AddItem(rackBlock);
                        Sharpen.Collections.Remove(blockToNodes, rackBlock);
                        pendingSize += rackBlock.length;
                        if (maxSize != 0 && pendingSize >= maxSize)
                        {
                            // Maximum reached: emit the split and leave this rack.
                            AddCreatedSplit(splits, GetHosts(rackNames), pendingBlocks);
                            reachedMax = true;
                            break;
                        }
                    }

                    if (!reachedMax && !pendingBlocks.IsEmpty())
                    {
                        if (minSizeRack != 0 && pendingSize >= minSizeRack)
                        {
                            // Large enough for a rack-level split of its own.
                            AddCreatedSplit(splits, GetHosts(rackNames), pendingBlocks);
                        }
                        else
                        {
                            // Too small: park the blocks for the overflow pass.
                            Sharpen.Collections.AddAll(overflowBlocks, pendingBlocks);
                        }
                    }

                    // Reset the accumulators before the next rack.
                    pendingSize = 0;
                    pendingBlocks.Clear();
                    rackNames.Clear();
                }
            }
            System.Diagnostics.Debug.Assert(blockToNodes.IsEmpty());
            System.Diagnostics.Debug.Assert(pendingSize == 0);
            System.Diagnostics.Debug.Assert(pendingBlocks.IsEmpty());
            System.Diagnostics.Debug.Assert(rackNames.IsEmpty());

            // ---- Pass 3: overflow blocks ----------------------------------
            foreach (CombineFileInputFormat.OneBlockInfo overflowBlock in overflowBlocks)
            {
                pendingBlocks.AddItem(overflowBlock);
                pendingSize += overflowBlock.length;
                // A rack already in the set may be added again; that is harmless.
                foreach (string rackName in overflowBlock.racks)
                {
                    rackNames.AddItem(rackName);
                }
                if (maxSize == 0 || pendingSize < maxSize)
                {
                    continue;
                }
                // Maximum reached: emit the split and start a new one.
                AddCreatedSplit(splits, GetHosts(rackNames), pendingBlocks);
                pendingSize = 0;
                pendingBlocks.Clear();
                rackNames.Clear();
            }

            // Whatever is still pending becomes the final split.
            if (!pendingBlocks.IsEmpty())
            {
                AddCreatedSplit(splits, GetHosts(rackNames), pendingBlocks);
            }
        }