/// <summary>
/// Captures the block layout of a single input file and registers each block
/// into the shared rack/node/block lookup tables used for split construction.
/// </summary>
/// <param name="stat">file status of the file being indexed; if it is a
/// <c>LocatedFileStatus</c> its cached block locations are reused instead of
/// re-querying the file system.</param>
/// <param name="conf">configuration used to obtain the file system when
/// locations must be fetched.</param>
/// <param name="isSplitable">when false the whole file is represented as a
/// single block spanning its full length.</param>
/// <param name="rackToBlocks">shared map, populated via
/// <c>PopulateBlockInfo</c>: rack name -> blocks on that rack.</param>
/// <param name="blockToNodes">shared map, populated via
/// <c>PopulateBlockInfo</c>: block -> hosts holding it.</param>
/// <param name="nodeToBlocks">shared map, populated via
/// <c>PopulateBlockInfo</c>: host -> blocks on that host.</param>
/// <param name="rackToNodes">shared map, populated via
/// <c>PopulateBlockInfo</c>: rack name -> hosts on that rack.</param>
/// <param name="maxSize">maximum length of a generated block; 0 disables the
/// cap and each physical block is kept whole.</param>
/// <exception cref="System.IO.IOException"/>
internal OneFileInfo(FileStatus stat, Configuration conf, bool isSplitable, Dictionary<string, IList<CombineFileInputFormat.OneBlockInfo>> rackToBlocks, Dictionary<CombineFileInputFormat.OneBlockInfo, string[]> blockToNodes, Dictionary<string, ICollection<CombineFileInputFormat.OneBlockInfo>> nodeToBlocks, Dictionary<string, ICollection<string>> rackToNodes, long maxSize)
{
    // size of the file; accumulated below from the block lengths
    // (or taken directly from stat.GetLen() for the non-splitable case).
    this.fileSize = 0;
    // get block locations from file system; a LocatedFileStatus already
    // carries them, otherwise ask the file's FileSystem.
    BlockLocation[] locations;
    if (stat is LocatedFileStatus)
    {
        locations = ((LocatedFileStatus)stat).GetBlockLocations();
    }
    else
    {
        FileSystem fs = stat.GetPath().GetFileSystem(conf);
        locations = fs.GetFileBlockLocations(stat, 0, stat.GetLen());
    }
    // create a list of all blocks and their locations
    if (locations == null)
    {
        // no location information at all: represent the file with zero blocks.
        blocks = new CombineFileInputFormat.OneBlockInfo[0];
    }
    else
    {
        if (locations.Length == 0 && !stat.IsDirectory())
        {
            // zero-length (empty) file: synthesize one empty location so the
            // code below still produces a block entry for it.
            locations = new BlockLocation[] { new BlockLocation() };
        }
        if (!isSplitable)
        {
            // if the file is not splitable, just create the one block with
            // full file length, pinned to the first location's hosts/racks.
            blocks = new CombineFileInputFormat.OneBlockInfo[1];
            fileSize = stat.GetLen();
            blocks[0] = new CombineFileInputFormat.OneBlockInfo(stat.GetPath(), 0, fileSize, locations[0].GetHosts(), locations[0].GetTopologyPaths());
        }
        else
        {
            AList<CombineFileInputFormat.OneBlockInfo> blocksList = new AList<CombineFileInputFormat.OneBlockInfo>(locations.Length);
            for (int i = 0; i < locations.Length; i++)
            {
                fileSize += locations[i].GetLength();
                // each split can be a maximum of maxSize; carve the physical
                // block into consecutive [myOffset, myOffset+myLength) chunks.
                long left = locations[i].GetLength();
                long myOffset = locations[i].GetOffset();
                long myLength = 0;
                do
                {
                    if (maxSize == 0)
                    {
                        // no cap configured: take the whole remainder at once.
                        myLength = left;
                    }
                    else
                    {
                        if (left > maxSize && left < 2 * maxSize)
                        {
                            // if remainder is between max and 2*max - then
                            // instead of creating splits of size max, left-max we
                            // create splits of size left/2 and left/2. This is
                            // a heuristic to avoid creating really really small
                            // splits.
                            myLength = left / 2;
                        }
                        else
                        {
                            myLength = Math.Min(maxSize, left);
                        }
                    }
                    CombineFileInputFormat.OneBlockInfo oneblock = new CombineFileInputFormat.OneBlockInfo(stat.GetPath(), myOffset, myLength, locations[i].GetHosts(), locations[i].GetTopologyPaths());
                    left -= myLength;
                    myOffset += myLength;
                    blocksList.AddItem(oneblock);
                } while (left > 0);
            }
            blocks = Sharpen.Collections.ToArray(blocksList, new CombineFileInputFormat.OneBlockInfo[blocksList.Count]);
        }
        // index every generated block into the shared rack/node lookup maps.
        PopulateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);
    }
}
/// <summary>
/// Combines the blocks recorded in the node/rack lookup tables into
/// <c>InputSplit</c>s, in three phases: (1) node-local splits, iterating over
/// nodes repeatedly and emitting at most one split per node per pass so splits
/// are distributed evenly across nodes; (2) rack-local splits from whatever
/// blocks remain; (3) a final pass that combines leftover "overflow" blocks.
/// NOTE(review): the maps passed in are consumed destructively — entries are
/// removed from <paramref name="blockToNodes"/> and the per-node collections
/// as blocks are assigned.
/// </summary>
/// <param name="nodeToBlocks">host -> blocks on that host; per-node
/// collections are pruned in place as blocks are placed.</param>
/// <param name="blockToNodes">block -> hosts; acts as the pool of
/// still-unassigned blocks and is drained to empty.</param>
/// <param name="rackToBlocks">rack -> blocks on that rack, used in the
/// rack-local phase.</param>
/// <param name="totLength">total length of all blocks; used to detect when
/// everything has been assigned.</param>
/// <param name="maxSize">maximum split size; 0 means no cap (all blocks of a
/// node/rack are grouped into one split).</param>
/// <param name="minSizeNode">minimum size for a node-local leftover split;
/// 0 disables that rule.</param>
/// <param name="minSizeRack">minimum size for a rack-local leftover split;
/// smaller remainders go to the overflow list.</param>
/// <param name="splits">output list that created splits are appended to.</param>
internal virtual void CreateSplits(IDictionary<string, ICollection<CombineFileInputFormat.OneBlockInfo>> nodeToBlocks, IDictionary<CombineFileInputFormat.OneBlockInfo, string[]> blockToNodes, IDictionary<string, IList<CombineFileInputFormat.OneBlockInfo>> rackToBlocks, long totLength, long maxSize, long minSizeNode, long minSizeRack, IList<InputSplit> splits)
{
    // blocks accumulated for the split currently being built
    AList<CombineFileInputFormat.OneBlockInfo> validBlocks = new AList<CombineFileInputFormat.OneBlockInfo>();
    long curSplitSize = 0;
    int totalNodes = nodeToBlocks.Count;
    long totalLength = totLength;
    // counts splits already created per node, so each node gets at most one
    // "small" leftover split (see the minSizeNode rule below).
    Multiset<string> splitsPerNode = HashMultiset.Create();
    ICollection<string> completedNodes = new HashSet<string>();
    while (true)
    {
        // it is allowed for maxSize to be 0. Disable smoothing load for such cases
        // process all nodes and create splits that are local to a node. Generate
        // one split per node iteration, and walk over nodes multiple times to
        // distribute the splits across nodes.
        for (IEnumerator<KeyValuePair<string, ICollection<CombineFileInputFormat.OneBlockInfo>>> iter = nodeToBlocks.GetEnumerator(); iter.HasNext();)
        {
            KeyValuePair<string, ICollection<CombineFileInputFormat.OneBlockInfo>> one = iter.Next();
            string node = one.Key;
            // Skip the node if it has previously been marked as completed.
            if (completedNodes.Contains(node))
            {
                continue;
            }
            ICollection<CombineFileInputFormat.OneBlockInfo> blocksInCurrentNode = one.Value;
            // for each block, copy it into validBlocks. Delete it from
            // blockToNodes so that the same block does not appear in
            // two different splits.
            IEnumerator<CombineFileInputFormat.OneBlockInfo> oneBlockIter = blocksInCurrentNode.GetEnumerator();
            while (oneBlockIter.HasNext())
            {
                CombineFileInputFormat.OneBlockInfo oneblock = oneBlockIter.Next();
                // Remove all blocks which may already have been assigned to other
                // splits (i.e. no longer present in the unassigned pool).
                if (!blockToNodes.Contains(oneblock))
                {
                    oneBlockIter.Remove();
                    continue;
                }
                validBlocks.AddItem(oneblock);
                Sharpen.Collections.Remove(blockToNodes, oneblock);
                curSplitSize += oneblock.length;
                // if the accumulated split size exceeds the maximum, then
                // create this split.
                if (maxSize != 0 && curSplitSize >= maxSize)
                {
                    // create an input split and add it to the splits array
                    AddCreatedSplit(splits, Sharpen.Collections.Singleton(node), validBlocks);
                    totalLength -= curSplitSize;
                    curSplitSize = 0;
                    splitsPerNode.AddItem(node);
                    // Remove entries from blocksInNode so that we don't walk these
                    // again.
                    blocksInCurrentNode.RemoveAll(validBlocks);
                    validBlocks.Clear();
                    // Done creating a single split for this node. Move on to the next
                    // node so that splits are distributed across nodes.
                    break;
                }
            }
            if (validBlocks.Count != 0)
            {
                // This implies that the last few blocks (or all in case maxSize=0)
                // were not part of a split. The node is complete.
                // if there were any blocks left over and their combined size is
                // larger than minSplitNode, then combine them into one split.
                // Otherwise add them back to the unprocessed pool. It is likely
                // that they will be combined with other blocks from the
                // same rack later on.
                // This condition also kicks in when max split size is not set. All
                // blocks on a node will be grouped together into a single split.
                if (minSizeNode != 0 && curSplitSize >= minSizeNode && splitsPerNode.Count(node) == 0)
                {
                    // haven't created any split on this machine. so its ok to add a
                    // smaller one for parallelism. Otherwise group it in the rack for
                    // balanced size create an input split and add it to the splits
                    // array
                    AddCreatedSplit(splits, Sharpen.Collections.Singleton(node), validBlocks);
                    totalLength -= curSplitSize;
                    splitsPerNode.AddItem(node);
                    // Remove entries from blocksInNode so that we don't walk this again.
                    blocksInCurrentNode.RemoveAll(validBlocks);
                }
                else
                {
                    // The node is done. This was the last set of blocks for this node.
                    // Put the unplaced blocks back into the pool for later rack-allocation.
                    foreach (CombineFileInputFormat.OneBlockInfo oneblock in validBlocks)
                    {
                        blockToNodes[oneblock] = oneblock.hosts;
                    }
                }
                validBlocks.Clear();
                curSplitSize = 0;
                completedNodes.AddItem(node);
            }
            else
            {
                // No in-flight blocks.
                if (blocksInCurrentNode.Count == 0)
                {
                    // Node is done. All blocks were fit into node-local splits.
                    completedNodes.AddItem(node);
                }
                // else Run through the node again.
            }
        }
        // Check if node-local assignments are complete.
        if (completedNodes.Count == totalNodes || totalLength == 0)
        {
            // All nodes have been walked over and marked as completed or all blocks
            // have been assigned. The rest should be handled via rackLock assignment.
            Log.Info("DEBUG: Terminated node allocation with : CompletedNodes: " + completedNodes.Count + ", size left: " + totalLength);
            break;
        }
    }
    // if blocks in a rack are below the specified minimum size, then keep them
    // in 'overflow'. After the processing of all racks is complete, these
    // overflow blocks will be combined into splits.
    AList<CombineFileInputFormat.OneBlockInfo> overflowBlocks = new AList<CombineFileInputFormat.OneBlockInfo>();
    ICollection<string> racks = new HashSet<string>();
    // Process all racks over and over again until there is no more work to do.
    while (blockToNodes.Count > 0)
    {
        // Create one split for this rack before moving over to the next rack.
        // Come back to this rack after creating a single split for each of the
        // remaining racks.
        // Process one rack location at a time, Combine all possible blocks that
        // reside on this rack as one split. (constrained by minimum and maximum
        // split size).
        // iterate over all racks
        for (IEnumerator<KeyValuePair<string, IList<CombineFileInputFormat.OneBlockInfo>>> iter = rackToBlocks.GetEnumerator(); iter.HasNext();)
        {
            KeyValuePair<string, IList<CombineFileInputFormat.OneBlockInfo>> one = iter.Next();
            racks.AddItem(one.Key);
            IList<CombineFileInputFormat.OneBlockInfo> blocks = one.Value;
            // for each block, copy it into validBlocks. Delete it from
            // blockToNodes so that the same block does not appear in
            // two different splits.
            bool createdSplit = false;
            foreach (CombineFileInputFormat.OneBlockInfo oneblock in blocks)
            {
                if (blockToNodes.Contains(oneblock))
                {
                    validBlocks.AddItem(oneblock);
                    Sharpen.Collections.Remove(blockToNodes, oneblock);
                    curSplitSize += oneblock.length;
                    // if the accumulated split size exceeds the maximum, then
                    // create this split.
                    if (maxSize != 0 && curSplitSize >= maxSize)
                    {
                        // create an input split and add it to the splits array
                        AddCreatedSplit(splits, GetHosts(racks), validBlocks);
                        createdSplit = true;
                        break;
                    }
                }
            }
            // if we created a split, then just go to the next rack
            if (createdSplit)
            {
                curSplitSize = 0;
                validBlocks.Clear();
                racks.Clear();
                continue;
            }
            if (!validBlocks.IsEmpty())
            {
                if (minSizeRack != 0 && curSplitSize >= minSizeRack)
                {
                    // if there is a minimum size specified, then create a single split
                    // otherwise, store these blocks into overflow data structure
                    AddCreatedSplit(splits, GetHosts(racks), validBlocks);
                }
                else
                {
                    // There were a few blocks in this rack that
                    // remained to be processed. Keep them in 'overflow' block list.
                    // These will be combined later.
                    Sharpen.Collections.AddAll(overflowBlocks, validBlocks);
                }
            }
            curSplitSize = 0;
            validBlocks.Clear();
            racks.Clear();
        }
    }
    // Sanity checks: all blocks assigned or parked in overflow, and all
    // per-split accumulators reset.
    System.Diagnostics.Debug.Assert(blockToNodes.IsEmpty());
    System.Diagnostics.Debug.Assert(curSplitSize == 0);
    System.Diagnostics.Debug.Assert(validBlocks.IsEmpty());
    System.Diagnostics.Debug.Assert(racks.IsEmpty());
    // Process all overflow blocks
    foreach (CombineFileInputFormat.OneBlockInfo oneblock_1 in overflowBlocks)
    {
        validBlocks.AddItem(oneblock_1);
        curSplitSize += oneblock_1.length;
        // This might cause an exiting rack location to be re-added,
        // but it should be ok.
        for (int i = 0; i < oneblock_1.racks.Length; i++)
        {
            racks.AddItem(oneblock_1.racks[i]);
        }
        // if the accumulated split size exceeds the maximum, then
        // create this split.
        if (maxSize != 0 && curSplitSize >= maxSize)
        {
            // create an input split and add it to the splits array
            AddCreatedSplit(splits, GetHosts(racks), validBlocks);
            curSplitSize = 0;
            validBlocks.Clear();
            racks.Clear();
        }
    }
    // Process any remaining blocks, if any.
    if (!validBlocks.IsEmpty())
    {
        AddCreatedSplit(splits, GetHosts(racks), validBlocks);
    }
}