/// <summary>
/// Create a worker for a file-at-a-time HDFS reader.
/// </summary>
/// <param name="client">HDFS client used to read files.</param>
/// <param name="deserialize">Function that consumes a stream holding an entire webhdfs file and
/// yields a sequence of batches, each an ArraySegment of output records.</param>
public DfsFileWorker(
    HdfsClientBase client,
    Func<Stream, IEnumerable<ArraySegment<TOutput>>> deserialize)
{
    this.client = client;
    this.deserialize = deserialize;
    // remember every address the local node is listening on
    this.localAddresses = Dns.GetHostAddresses(Dns.GetHostName());
}
/// <summary>
/// Create a worker that deserializes lines of text from an HDFS file.
/// </summary>
/// <param name="client">HDFS client to use for reading data.</param>
/// <param name="batchSize">Number of lines to return at a time.</param>
public DfsTextWorker(HdfsClientBase client, int batchSize)
    : base(
        // request 4k chunks when scanning past the end of the block for the end of the final line
        4 * 1024,
        (block, stream) => SyncToNextLine(stream),
        (block, stream) => Deserialize(stream, batchSize),
        client)
{
}
/// <summary>
/// Create a worker that reads DFS files broken into blocks.
/// </summary>
/// <param name="syncRequestLength">Size of each DFS request when seeking past the end of the block
/// for the start of the next record. If records are expected to be small this should also be small,
/// to avoid prefetching and buffering a lot of the next block's data.</param>
/// <param name="syncToNextRecord">Action that syncs to the start of the next record. The first
/// argument is the block item being read, which may carry metadata about sync markers; the second
/// is the stream to scan.</param>
/// <param name="deserialize">Function that deserializes records from a stream.</param>
/// <param name="client">Client used to read HDFS data.</param>
public DfsBlockWorker(
    int syncRequestLength,
    Action<TItem, Stream> syncToNextRecord,
    Func<TItem, Stream, IEnumerable<ArraySegment<TOutput>>> deserialize,
    HdfsClientBase client)
{
    this.syncRequestLength = syncRequestLength;
    this.syncToNextRecord = syncToNextRecord;
    this.deserialize = deserialize;
    this.client = client;
    // cache the local IP addresses
    this.localAddresses = Dns.GetHostAddresses(Dns.GetHostName());
}
/// <summary>
/// Create a coordinator for reading DFS files with fixed-length blocks, made of text records.
/// All behavior is inherited from the base coordinator; this constructor only forwards the client.
/// </summary>
/// <param name="client">Client used for reading HDFS data and metadata.</param>
public DfsTextCoordinator(HdfsClientBase client) : base(client) { }
/// <summary>
/// Return a new coordinator for a block-based HDFS reader.
/// All behavior is inherited from the base coordinator; this constructor only forwards the client.
/// </summary>
/// <param name="client">HDFS client used to read files and metadata.</param>
public DfsBlockCoordinator(HdfsClientBase client) : base(client) { }
/// <summary>
/// Create a new coordinator for file-at-a-time DFS reads.
/// All behavior is inherited from the base coordinator; this constructor only forwards the client.
/// </summary>
/// <param name="client">HDFS client.</param>
public DfsFileCoordinator(HdfsClientBase client) : base(client) { }
/// <summary>
/// Return a new coordinator for a DFS reader.
/// </summary>
/// <param name="client">HDFS client.</param>
public DfsBaseCoordinator(HdfsClientBase client)
{
    // start with an empty data-node lookup; entries are added as nodes are discovered
    this.dataNodes = new Dictionary<IPAddress, IPEndPoint>();
    this.client = client;
}
/// <summary>
/// A file with the specified path.
/// </summary>
/// <param name="config">Cluster configuration.</param>
/// <param name="jobFolderUri">Uri to base folder.</param>
/// <param name="job">Job accessing this file.</param>
/// <param name="client">Azure client.</param>
/// <param name="path">Path to the file.</param>
/// <param name="canCache">True if the file can be cached (it is immutable for sure).</param>
/// <param name="isFolder">If true this must be a folder.</param>
public DfsFile(ClusterConfiguration config, Uri jobFolderUri, DryadLinqJobSummary job, HdfsClientBase client, string path, bool canCache, bool isFolder)
    : base(config, job)
{
    this.client = client;
    this.baseUri = jobFolderUri;
    this.uri = UriFromPath(jobFolderUri, path);
    this.Exception = null;
    this.ShouldCacheLocally = canCache;
    this.RepresentsAFolder = isFolder;
    // size is resolved lazily; -1 marks it as not yet known
    this.size = -1;

    // NOTE(review): looks like leftover diagnostic output — consider routing through a logger
    Console.WriteLine("DfsFile Uri={0}", this.uri);

    if (!string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
    {
        this.LocalCachePath = Path.Combine(CachedClusterResidentObject.CacheDirectory, PathFromUri(this.baseUri, this.uri));
    }
}
/// <summary>
/// Must be called after setting all properties.
/// </summary>
/// <returns>null if initialization succeeds; otherwise the message of the exception that
/// caused initialization to fail.</returns>
public override string Initialize()
{
    try
    {
        this.DfsClient = new HdfsClient(this.UserName);
        return null;
    }
    catch (Exception ex)
    {
        // log the full exception (including stack trace) before returning just the message
        Console.WriteLine(ex);
        return ex.Message;
    }
}