/// <summary>
/// determine the datanode locations of each block in a file
/// </summary>
/// <param name="file">the description of the file</param>
/// <returns>a list of datanode IP address/port descriptions, one for each block</returns>
public override IEnumerable<IPEndPoint[]> GetBlockLocations(HdfsFile file)
{
    // HDFS files have blocks of equal size, except for the last block, which may not be full
    long numberOfBlocksToRead = (file.length + file.blockSize - 1) / file.blockSize;

    // number of bytes' worth of blocks (4096 blocks) to query at a time; this limits the size of
    // each http request and the memory consumed by the JSON response
    long batchSizeToRead = file.blockSize * 4096L;

    // keep track of the offset within the file of the current batch of blocks
    long offset = 0;

    while (numberOfBlocksToRead > 0)
    {
        // make the operation request for this batch of blocks
        string op = "GET_BLOCK_LOCATIONS&offset=" + offset + "&length=" + batchSizeToRead;
        // and update the offset ready to read the next batch
        offset += batchSizeToRead;

        // get the details of this batch
        JObject json = GetJSon(file.path, op);
        JToken blocks = json["LocatedBlocks"]["locatedBlocks"];
        foreach (JToken block in blocks)
        {
            // look up the ip address and webhdfs port of each location
            yield return block["locations"]
                .Select(l => new IPEndPoint(IPAddress.Parse((string)l["ipAddr"]), (int)l["infoPort"]))
                .ToArray();
            --numberOfBlocksToRead;
        }
    }
}
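// Illustration only, not part of the original source: a minimal sketch of how GetBlockLocations
// might be consumed, assuming a hypothetical 'HdfsClientBase' base type for the client and the
// usual System, System.Linq, and System.Net namespaces. Each element of the returned sequence is
// the replica set for one block, in block order.
private static void PrintBlockLocations(HdfsClientBase client, HdfsFile file)
{
    int blockIndex = 0;
    foreach (IPEndPoint[] replicas in client.GetBlockLocations(file))
    {
        // a block is typically stored on several datanodes (one per replica)
        Console.WriteLine("block {0}: {1}", blockIndex, string.Join(", ", replicas.Select(r => r.ToString())));
        ++blockIndex;
    }
}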
/// <summary>
/// Return the length of a particular block in a file. All blocks except the last one are the same length
/// </summary>
/// <param name="index">index of the block in the file</param>
/// <param name="file">file being read</param>
/// <returns>length in bytes of the requested block</returns>
private long BlockLength(int index, HdfsFile file)
{
    // start location of the block in the file
    long offset = (long)index * file.blockSize;
    // number of bytes after the start of the block
    long bytesAfterBlockStart = file.length - offset;
    // either the standard block length, or the length of the final block if it is shorter
    return Math.Min(bytesAfterBlockStart, file.blockSize);
}
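// Worked example (illustrative numbers, not from the original source): for a 300 MB file
// with a 128 MB blockSize, BlockLength returns:
//   index 0: min(300 MB - 0,      128 MB) = 128 MB
//   index 1: min(300 MB - 128 MB, 128 MB) = 128 MB
//   index 2: min(300 MB - 256 MB, 128 MB) =  44 MB   (the short final block)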
/// <summary>
/// determine the datanode locations of each block in a file
/// </summary>
/// <param name="file">the description of the file</param>
/// <returns>a list of datanode IP address/port descriptions, one for each block</returns>
public override IEnumerable<IPEndPoint[]> GetBlockLocations(HdfsFile file)
{
    HdfsFileInfo fileInfo = this.Instance(file.path).GetFileInfo(file.path.AbsolutePath, true);
    foreach (HdfsBlockInfo info in fileInfo.blockArray)
    {
        IPEndPoint[] endpoints = new IPEndPoint[info.Endpoints.Length];
        for (int i = 0; i < endpoints.Length; ++i)
        {
            // each endpoint is stored as an "address:port" string
            string[] parts = info.Endpoints[i].Split(':');
            endpoints[i] = new IPEndPoint(IPAddress.Parse(parts[0]), Int32.Parse(parts[1]));
        }
        yield return endpoints;
    }
}
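// Note (an observation, not in the original source): Split(':') assumes IPv4 "address:port"
// strings; an IPv6 literal such as "[::1]:50075" contains colons itself. A sketch of a more
// defensive parse that splits on the final colon only:
private static IPEndPoint ParseEndpoint(string endpoint)
{
    int lastColon = endpoint.LastIndexOf(':');
    // strip any surrounding brackets from an IPv6 literal
    string host = endpoint.Substring(0, lastColon).Trim('[', ']');
    int port = Int32.Parse(endpoint.Substring(lastColon + 1));
    return new IPEndPoint(IPAddress.Parse(host), port);
}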
/// <summary>
/// given a file, determine how much of that file's data are stored on each datanode. Return every datanode that stores a
/// threshold percentage of the file's data as a candidate match for that file
/// </summary>
/// <param name="file">file to be matched</param>
/// <returns>a match including a (possibly-empty) set of candidate data nodes</returns>
protected override IEnumerable<Match> EnumerateFileWork(HdfsFile file)
{
    long numberOfBlocks = (file.length + file.blockSize - 1) / file.blockSize;

    long threshold;
    if (numberOfBlocks > 4)
    {
        // this is a 'real' multi-block file; only take a datanode that contains at least a third of it
        threshold = file.length / 3;
    }
    else
    {
        // this file has a single block or only a few: only return a matching node if it stores the whole file.
        // in the case of a file with only a couple of blocks, this selects the node that wrote (all the blocks in)
        // the file rather than one of the replicas
        threshold = file.length;
    }

    Match match = new Match
    {
        categories = this.client.GetBlockLocations(file)
            // first flatten the list of block locations into a sequence of 'endpoint,length' pairs, each indicating
            // that length bytes are stored at endpoint
            .SelectMany((endpoints, index) => endpoints.Select(endpoint => new KeyValuePair<IPEndPoint, long>(endpoint, BlockLength(index, file))))
            // then group by endpoint
            .GroupBy(x => x.Key)
            // within each group, sum the bytes to determine how many bytes in total are stored at each endpoint
            .Select(g => new KeyValuePair<IPEndPoint, long>(g.Key, g.Select(elt => elt.Value).Sum()))
            // keep only endpoints that store at least the threshold number of bytes
            .Where(x => x.Value >= threshold)
            // return the flattened array of candidate endpoints, if any
            .Select(x => x.Key).ToArray(),

        // if there isn't a matching worker, use null as the default endpoint, meaning the read will be redirected
        // to the name node. Set the block to indicate the entire file
        workStub = new DfsBlock
        {
            path = file.path,
            fileLength = file.length,
            offset = 0,
            length = file.length,
            dataNodeAddress = null
        }
    };

    yield return match;
}
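// Illustration only (hypothetical helper, not from the original source): the SelectMany/GroupBy
// pipeline above reduces per-block replica lists to total bytes stored per endpoint, then keeps
// the endpoints meeting the threshold. The same aggregation, isolated over (endpoint, bytes) pairs:
private static IPEndPoint[] CandidatesAbove(IEnumerable<KeyValuePair<IPEndPoint, long>> blockBytes, long threshold)
{
    return blockBytes
        .GroupBy(x => x.Key)                              // one group per datanode endpoint
        .Select(g => new { Endpoint = g.Key, Total = g.Sum(x => x.Value) })
        .Where(x => x.Total >= threshold)                 // keep nodes meeting the threshold
        .Select(x => x.Endpoint)
        .ToArray();
}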
/// <summary>
/// called to convert an input file into a list of blocks
/// </summary>
/// <param name="file">the input file</param>
/// <returns>the blocks in the file, along with a set of datanodes where each block is stored</returns>
protected override IEnumerable<Match> EnumerateFileWork(HdfsFile file)
{
    // get the blocks in the file, and convert each block to a base match, with file-specific metadata
    // filled in to the DfsBlock
    IEnumerable<Match> rawMatches = MakeBaseMatches(file, this.client.GetBlockLocations(file));

    // fill in the rest of the DfsBlock fields
    return rawMatches.Select((match, index) =>
    {
        match.workStub.path = file.path;
        match.workStub.fileLength = file.length;
        // all the blocks except possibly the last are the same size
        match.workStub.offset = (long)index * file.blockSize;
        long bytesRemaining = file.length - match.workStub.offset;
        match.workStub.length = Math.Min(file.blockSize, bytesRemaining);
        // this address will be used if the block is going to be read by a worker on a remote machine;
        // otherwise the correct address will be filled in when the worker is chosen
        match.workStub.dataNodeAddress = match.categories.First();
        return match;
    });
}
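// Worked example (illustrative numbers): for a 300 MB file with 128 MB blocks, the Select above
// produces three DfsBlocks: (offset 0, length 128 MB), (offset 128 MB, length 128 MB), and
// (offset 256 MB, length 44 MB), each carrying the first endpoint from its block's replica set
// as the default dataNodeAddress.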
/// <summary>
/// For a given file, map a sequence of locations to a sequence of "base" matches. The workStub
/// component of the match doesn't need any file-specific metadata filled in. The categories component of the
/// match is set to the locations of that block. The DfsBlock fields will
/// all be filled in after this sequence has been returned
/// </summary>
/// <param name="file">The file being read</param>
/// <param name="blocks">The block locations for the file</param>
/// <returns>A sequence of Matches with the categories set, and any file-specific metadata set</returns>
protected override IEnumerable<Match> MakeBaseMatches(HdfsFile file, IEnumerable<IPEndPoint[]> blocks)
{
    return blocks.Select(endpoints => new Match { workStub = new DfsBlock(), categories = endpoints });
}
/// <summary>
/// For a given file, map a sequence of locations to a sequence of "base" matches, where the workStub
/// component of the match has any filetype-specific metadata filled in and the categories component of the
/// match has been set to the locations of that block. The remaining DfsBlock fields will
/// all be filled in later, after this sequence has been returned
/// </summary>
/// <param name="file">The file being read</param>
/// <param name="blocks">The block locations for the file</param>
/// <returns>A sequence of Matches with the categories set, and any file-specific metadata set</returns>
protected abstract IEnumerable<Match> MakeBaseMatches(HdfsFile file, IEnumerable<IPEndPoint[]> blocks);
/// <summary>
/// given an hdfs file, return a sequence of work items, each with a set of matching categories
/// </summary>
/// <remarks>
/// this is currently HDFS-specific, although the HdfsFile class could easily be extended to support
/// other DFSs
/// </remarks>
/// <param name="file">file to expand</param>
/// <returns>a sequence of work items, each with a set of matching categories</returns>
protected abstract IEnumerable<Match> EnumerateFileWork(HdfsFile file);