Example #1
        /// <summary>
        /// determine the datanode locations of each block in a file
        /// </summary>
        /// <param name="file">the description of the file</param>
        /// <returns>a list of datanode IP address/port descriptions, one for each block</returns>
        public override IEnumerable<IPEndPoint[]> GetBlockLocations(HdfsFile file)
        {
            // HDFS files have blocks of equal size, except for the last block which may not be full
            long numberOfBlocksToRead = (file.length + file.blockSize - 1) / file.blockSize;
            // byte length of the batch of blocks to query at a time (4096 blocks' worth); this limits the size of each
            // http request and the memory consumed by the JSON response
            long batchSizeToRead = file.blockSize * 4096L;

            // keep track of the offset within the file of the current batch of blocks
            long offset = 0;

            while (numberOfBlocksToRead > 0)
            {
                // make the operation request for this batch of blocks
                string op = "GET_BLOCK_LOCATIONS&offset=" + offset + "&length=" + batchSizeToRead;

                // and update the offset ready to read the next batch
                offset += batchSizeToRead;

                // get the details of this batch
                JObject json   = GetJSon(file.path, op);
                JToken  blocks = json["LocatedBlocks"]["locatedBlocks"];
                foreach (JToken block in blocks)
                {
                    // look up the ip address and webhdfs port of each location
                    yield return block["locations"]
                                 .Select(l => new IPEndPoint(IPAddress.Parse((string)l["ipAddr"]), (int)l["infoPort"])).ToArray();

                    --numberOfBlocksToRead;
                }
            }
        }
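A hedged usage sketch for the enumerator above: the helper below is hypothetical (not part of Dfs.cs) and only assumes the IEnumerable<IPEndPoint[]> shape of the return value, so it could be called as BlockLocationDemo.Print(client.GetBlockLocations(file)) against any of the overrides on this page.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;

static class BlockLocationDemo
{
    // print the replica endpoints of every block: one IPEndPoint[] per block,
    // one entry per datanode holding a replica of that block
    public static void Print(IEnumerable<IPEndPoint[]> blockLocations)
    {
        int blockIndex = 0;
        foreach (IPEndPoint[] replicas in blockLocations)
        {
            Console.WriteLine("block {0}: {1}", blockIndex++,
                              string.Join(", ", replicas.Select(ep => ep.ToString())));
        }
    }
}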
Example #2
File: Dfs.cs Project: xyuan/Naiad
        /// <summary>
        /// Return the length of a particular block in a file. All blocks except the last one are the same length
        /// </summary>
        /// <param name="index">index of the block in the file</param>
        /// <param name="file">file being read</param>
        /// <returns>length in bytes of the requested block</returns>
        private long BlockLength(int index, HdfsFile file)
        {
            // start location of the block in the file
            long offset = (long)index * file.blockSize;
            // number of bytes after the start of the block
            long bytesAfterBlockStart = file.length - offset;

            // either the standard block length, or the length of the final block if it is shorter
            return Math.Min(bytesAfterBlockStart, file.blockSize);
        }
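To make the arithmetic concrete, the standalone sketch below (made-up figures, not from Dfs.cs) walks a 300 MB file with a 128 MB block size: the same Math.Min yields blocks of 128 MB, 128 MB, and a short 44 MB final block.

using System;

static class BlockLengthDemo
{
    static void Main()
    {
        long fileLength = 300L * 1024 * 1024;   // 300 MB file (made-up figure)
        long blockSize  = 128L * 1024 * 1024;   // 128 MB block size

        // same rounding-up block count as Example #1
        long numberOfBlocks = (fileLength + blockSize - 1) / blockSize;   // 3

        for (int index = 0; index < numberOfBlocks; ++index)
        {
            long offset = (long)index * blockSize;
            // prints 134217728, 134217728, then 46137344 (the 44 MB tail)
            Console.WriteLine("block {0}: {1} bytes", index, Math.Min(fileLength - offset, blockSize));
        }
    }
}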
Example #3
        public override IEnumerable<IPEndPoint[]> GetBlockLocations(HdfsFile file)
        {
            HdfsFileInfo fileInfo = this.Instance(file.path).GetFileInfo(file.path.AbsolutePath, true);

            foreach (HdfsBlockInfo info in fileInfo.blockArray)
            {
                IPEndPoint[] endpoints = new IPEndPoint[info.Endpoints.Length];
                for (int i = 0; i < endpoints.Length; ++i)
                {
                    string[] parts = info.Endpoints[i].Split(':');
                    endpoints[i] = new IPEndPoint(IPAddress.Parse(parts[0]), Int32.Parse(parts[1]));
                }
                yield return endpoints;
            }
        }
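One caveat: Split(':') assumes IPv4-style "address:port" strings, since an IPv6 literal contains colons of its own. A hypothetical standalone version of the same parse:

using System.Net;

static class EndpointParser
{
    // parse "address:port" the way the example does; assumes an IPv4 address
    public static IPEndPoint Parse(string endpoint)
    {
        string[] parts = endpoint.Split(':');
        return new IPEndPoint(IPAddress.Parse(parts[0]), int.Parse(parts[1]));
    }
}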
Example #4
File: Dfs.cs Project: xyuan/Naiad
        /// <summary>
        /// given a file, determine how much of that file's data are stored on each datanode. Return every datanode that stores a
        /// threshold percentage of the file's data as a candidate match for that file
        /// </summary>
        /// <param name="file">file to be matched</param>
        /// <returns>a match including a (possibly-empty) set of candidate data nodes</returns>
        protected override IEnumerable<Match> EnumerateFileWork(HdfsFile file)
        {
            long numberOfBlocks = (file.length + file.blockSize - 1) / file.blockSize;
            long threshold;

            if (numberOfBlocks > 4)
            {
                // this is a 'real' multi-block file; only take a datanode that contains at least a third of it
                threshold = file.length / 3;
            }
            else
            {
                // this file has only a single block or a few blocks: only return a matching node if it stores the whole file.
                // for a file with only a couple of blocks, this selects the node that wrote (all the blocks in) the file
                // rather than one of the replicas
                threshold = file.length;
            }

            Match match = new Match
            {
                categories = this.client.GetBlockLocations(file)
                             // first flatten the list of block locations into a sequence of 'endpoint, length' pairs, each
                             // indicating that length bytes are stored at endpoint
                             .SelectMany((endpoints, index) =>
                                         endpoints.Select(endpoint => new KeyValuePair<IPEndPoint, long>(endpoint, BlockLength(index, file))))
                             // then group by endpoint
                             .GroupBy(x => x.Key)
                             // within each group, sum the bytes to determine how many bytes in total are stored at each endpoint
                             .Select(g => new KeyValuePair<IPEndPoint, long>(g.Key, g.Select(elt => elt.Value).Sum()))
                             // keep only endpoints that store at least the threshold number of bytes
                             .Where(x => x.Value >= threshold)
                             // return the flattened array of candidate endpoints, if any
                             .Select(x => x.Key).ToArray(),

                // if there isn't a matching worker, use null as the default endpoint, meaning the read will be redirected to the
                // name node. Set the block to indicate the entire file
                workStub = new DfsBlock
                {
                    path            = file.path,
                    fileLength      = file.length,
                    offset          = 0,
                    length          = file.length,
                    dataNodeAddress = null
                }
            };

            yield return match;
        }
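The LINQ pipeline is easiest to follow on synthetic data. The sketch below (made-up endpoints and a five-block file with every block assumed full; nothing here touches HDFS) reproduces the flatten/group/sum/threshold steps with the 'at least a third' rule:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;

static class ThresholdDemo
{
    static void Main()
    {
        long blockLen = 128L * 1024 * 1024;
        IPEndPoint a = new IPEndPoint(IPAddress.Parse("10.0.0.1"), 50075);
        IPEndPoint b = new IPEndPoint(IPAddress.Parse("10.0.0.2"), 50075);
        IPEndPoint c = new IPEndPoint(IPAddress.Parse("10.0.0.3"), 50075);

        // five blocks, two replicas each: a holds every block, b four, c one
        IPEndPoint[][] blocks =
        {
            new[] { a, b }, new[] { a, c }, new[] { a, b }, new[] { a, b }, new[] { a, b }
        };

        long fileLength = 5 * blockLen;
        long threshold = fileLength / 3;   // the multi-block rule from the example

        IPEndPoint[] candidates = blocks
            // flatten to (endpoint, bytes) pairs; every block is full in this sketch
            .SelectMany(endpoints => endpoints.Select(ep => new KeyValuePair<IPEndPoint, long>(ep, blockLen)))
            .GroupBy(x => x.Key)
            .Select(g => new KeyValuePair<IPEndPoint, long>(g.Key, g.Sum(elt => elt.Value)))
            .Where(x => x.Value >= threshold)
            .Select(x => x.Key)
            .ToArray();

        // c holds one block of five (less than a third of the file) and is filtered out,
        // so this prints only 10.0.0.1:50075 and 10.0.0.2:50075
        Console.WriteLine(string.Join(", ", candidates.Select(ep => ep.ToString())));
    }
}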
Example #5
File: Dfs.cs Project: xyuan/Naiad
        /// <summary>
        /// called to convert an input file into a list of blocks
        /// </summary>
        /// <param name="file">the input file</param>
        /// <returns>the blocks in the file, along with a set of datanodes where each block is stored</returns>
        protected override IEnumerable<Match> EnumerateFileWork(HdfsFile file)
        {
            // get the blocks in the file, and convert each block to a base match, with file-specific metadata
            // filled in to the DfsBlock
            IEnumerable<Match> rawMatches = MakeBaseMatches(file, this.client.GetBlockLocations(file));

            // fill in the rest of the DfsBlock fields
            return rawMatches.Select((match, index) =>
            {
                match.workStub.path = file.path;
                match.workStub.fileLength = file.length;
                // all the blocks are the same size
                match.workStub.offset = (long)index * file.blockSize;
                long bytesRemaining = file.length - match.workStub.offset;
                match.workStub.length = Math.Min(file.blockSize, bytesRemaining);
                // this address will be used if the block is going to be read by a worker on a remote machine
                // otherwise the correct address will be filled in when the worker is chosen
                match.workStub.dataNodeAddress = match.categories.First();

                return match;
            });
        }
Example #6
File: Dfs.cs Project: xyuan/Naiad
 /// <summary>
 /// For a given file, map a sequence of locations to a sequence of "base" matches. The workStub
 /// component of the match doesn't need any file-specific metadata filled in. The categories component of the
 /// match is set to the locations of that block. The DfsBlock fields will
 /// all be filled in after this sequence has been returned
 /// </summary>
 /// <param name="file">The file being read</param>
 /// <param name="blocks">The block locations for the file</param>
 /// <returns>A sequence of Matches with the categories set; the workStubs are left as empty DfsBlocks</returns>
 protected override IEnumerable<Match> MakeBaseMatches(HdfsFile file, IEnumerable<IPEndPoint[]> blocks)
 {
     return blocks.Select(endpoints => new Match {
         workStub = new DfsBlock(), categories = endpoints
     });
 }
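Examples #5 and #6 are two phases of one pipeline: the base pass attaches only the block locations, and the second pass fills in the block geometry. Below is a self-contained sketch of that split, using stripped-down stand-ins for Match and DfsBlock (assumptions for illustration; the real Naiad types carry more fields):

using System;
using System.Linq;
using System.Net;

class DfsBlock { public long offset; public long length; public IPEndPoint dataNodeAddress; }
class Match { public DfsBlock workStub; public IPEndPoint[] categories; }

static class TwoPhaseDemo
{
    static void Main()
    {
        long blockSize = 128L * 1024 * 1024;
        long fileLength = 2 * blockSize + 1;   // two full blocks plus a one-byte tail
        IPEndPoint node = new IPEndPoint(IPAddress.Parse("10.0.0.1"), 50075);
        IPEndPoint[][] locations = { new[] { node }, new[] { node }, new[] { node } };

        // phase 1 (as in Example #6): categories only, empty workStub
        var raw = locations.Select(eps => new Match { workStub = new DfsBlock(), categories = eps });

        // phase 2 (as in Example #5): fill in the block geometry
        var matches = raw.Select((match, index) =>
        {
            match.workStub.offset = (long)index * blockSize;
            match.workStub.length = Math.Min(blockSize, fileLength - match.workStub.offset);
            match.workStub.dataNodeAddress = match.categories.First();
            return match;
        });

        // prints offsets 0, 134217728, 268435456 with lengths 134217728, 134217728, 1
        foreach (var m in matches)
            Console.WriteLine("offset {0}, length {1} @ {2}",
                              m.workStub.offset, m.workStub.length, m.workStub.dataNodeAddress);
    }
}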
Example #7
File: Dfs.cs Project: xyuan/Naiad
 /// <summary>
 /// For a given file, map a sequence of locations to a sequence of "base" matches, where the workStub
 /// component of the match has any filetype-specific metadata filled in and the categories component of the
 /// match has been set to the locations of that block. The DfsBlock fields will
 /// all be filled in later, after this sequence has been returned
 /// </summary>
 /// <param name="file">The file being read</param>
 /// <param name="blocks">The block locations for the file</param>
 /// <returns>A sequence of Matches with the categories set, and any file-specific metadata set</returns>
 protected abstract IEnumerable<Match> MakeBaseMatches(HdfsFile file, IEnumerable<IPEndPoint[]> blocks);
Example #8
File: Dfs.cs Project: xyuan/Naiad
 /// <summary>
 /// given an HDFS file, return a sequence of work items, each with a set of matching categories
 /// </summary>
 /// <remarks>
 /// this is currently HDFS-specific, although the HdfsFile class could easily be extended to support
 /// other DFSs
 /// </remarks>
 /// <param name="file">file to expand</param>
 /// <returns>a sequence of work items, each with a set of matching categories</returns>
 protected abstract IEnumerable<Match> EnumerateFileWork(HdfsFile file);