/// <summary>
/// Locate the start of the first record that begins past the end of the block we were asked to read.
/// That position marks the end of the range of data belonging to this block.
/// </summary>
/// <param name="workItem">the block being read</param>
/// <returns>
/// the end of this block's data range, relative to the start of the block: exactly the block length when
/// no bytes follow the block in the file, otherwise the block length plus the position reached in the
/// spill stream after syncing to the next record
/// </returns>
private long FindSpillExtent(TItem workItem)
{
    // how many bytes of the file lie beyond the end of our block
    long bytesPastBlock = workItem.fileLength - workItem.offset - workItem.length;

    if (bytesPastBlock > 0)
    {
        // open a stream positioned immediately after the block and covering the remainder of the file
        using (Stream tailStream = client.GetDfsStreamReader(
                   workItem.path,
                   // begin at the first byte after this block
                   workItem.offset + workItem.length,
                   // and allow reading all the way to the end of the file
                   bytesPastBlock,
                   // request size used while syncing; kept small when records are expected to be small so
                   // that we don't prefetch and buffer much of the following block
                   this.syncRequestLength))
        {
            // let the format-specific callback advance the stream to the start of the next record;
            // in the worst case this consumes the stream right up to its end
            this.syncToNextRecord(workItem, tailStream);

            // the stream position is measured from the end of the block, so adding the block length
            // gives the next record's offset relative to the start of the block
            return workItem.length + tailStream.Position;
        }
    }

    // nothing follows the block: this is the last block, so its range ends exactly at the block's end
    return workItem.length;
}
/// <summary>
/// Execute a work item: open a stream over the byte range the work item describes (path, offset and length)
/// and hand it to the deserializer, yielding each batch of records it produces.
/// </summary>
/// <remarks>
/// This is an iterator, so nothing is read until the result is enumerated, and the stream is disposed
/// when enumeration finishes or the enumerator is disposed.
/// </remarks>
/// <param name="workItem">The work item to be executed, identifying the data to read</param>
/// <returns>A sequence of array segments, each containing a sequence of records to be output</returns>
public IEnumerable<ArraySegment<TOutput>> DoWork(DfsBlock workItem)
{
    // a 1k buffer is used because the stream is read a byte at a time
    const int readBufferLength = 1024;

    // ask the Hdfs client for a stream covering the work item's range
    using (Stream blockStream = client.GetDfsStreamReader(
               workItem.path,
               workItem.offset,
               workItem.length,
               readBufferLength,
               workItem.dataNodeAddress))
    {
        // forward every batch produced by the deserializer unchanged
        foreach (ArraySegment<TOutput> recordBatch in this.deserialize(blockStream))
        {
            yield return recordBatch;
        }
    }
}