Example #1
 /// <summary>
 /// Create a worker for a file-at-a-time HDFS reader.
 /// </summary>
 /// <param name="client">HDFS client used to read files.</param>
 /// <param name="deserialize">Function that takes a stream containing an entire WebHDFS file and returns a sequence
 /// of batches, each containing an ArraySegment of output records.</param>
 public DfsFileWorker(
     HdfsClientBase client, Func<Stream, IEnumerable<ArraySegment<TOutput>>> deserialize)
 {
     // cache all the addresses the local node is listening on
     this.localAddresses = Dns.GetHostAddresses(Dns.GetHostName());
     this.deserialize    = deserialize;
     this.client         = client;
 }
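
For illustration, a usage sketch: it assumes DfsFileWorker is generic in TOutput (bound to string here), that an HdfsClientBase implementation named hdfsClient is in scope, and that splitting each file into lines is an appropriate deserializer; none of these specifics come from the API above.

 // Hypothetical usage: TOutput = string, one batch per file.
 var worker = new DfsFileWorker<string>(
     hdfsClient,
     stream =>
     {
         var lines = new List<string>();
         using (var reader = new StreamReader(stream))
         {
             string line;
             while ((line = reader.ReadLine()) != null)
             {
                 lines.Add(line);
             }
         }
         // return the whole file as a single batch of records
         return new[] { new ArraySegment<string>(lines.ToArray()) };
     });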
Example #2
 /// <summary>
 /// Create a worker for deserializing lines of text from an HDFS file.
 /// </summary>
 /// <param name="client">HDFS client to use for reading data.</param>
 /// <param name="batchSize">Number of lines to return at a time.</param>
 public DfsTextWorker(HdfsClientBase client, int batchSize)
     : base(
         // use 4k blocks when scanning past the end of the block to find the end of the final line
         4 * 1024,
         (item, stream) => SyncToNextLine(stream),
         (item, stream) => Deserialize(stream, batchSize),
         client)
 {
 }
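
A minimal construction sketch, again assuming an HdfsClientBase instance named hdfsClient is in scope; the batch size is an arbitrary illustrative value.

 // Hypothetical usage: deserialize lines in batches of 1000.
 var textWorker = new DfsTextWorker(hdfsClient, batchSize: 1000);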
Example #3
 /// <summary>
 /// Create a worker to read DFS files broken into blocks.
 /// </summary>
 /// <param name="syncRequestLength">Size of each DFS request when seeking past the end of the block for the start of the
 /// next record. If records are expected to be small, this should also be small, to avoid prefetching and buffering a lot of the
 /// next block's data.</param>
 /// <param name="syncToNextRecord">Action to sync to the start of the next record. The first argument is the block item
 /// being read, which may contain metadata about sync markers. The second argument is the stream to scan.</param>
 /// <param name="deserialize">Function to deserialize records in a stream.</param>
 /// <param name="client">Client used to read HDFS data.</param>
 public DfsBlockWorker(
     int syncRequestLength,
     Action<TItem, Stream> syncToNextRecord,
     Func<TItem, Stream, IEnumerable<ArraySegment<TOutput>>> deserialize,
     HdfsClientBase client)
 {
     // cache the local IP addresses
     this.localAddresses = Dns.GetHostAddresses(Dns.GetHostName());

     this.syncRequestLength = syncRequestLength;
     this.syncToNextRecord  = syncToNextRecord;
     this.deserialize       = deserialize;
     this.client            = client;
 }
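
A construction sketch, assuming DfsBlockWorker is generic in TItem and TOutput; the DfsBlock item type and both helper methods are hypothetical stand-ins, not part of the API shown.

 // Hypothetical usage: small sync requests because records are small.
 var blockWorker = new DfsBlockWorker<DfsBlock, string>(
     4 * 1024,                                    // syncRequestLength
     (item, stream) => SkipToNextNewline(stream), // assumed helper scanning for '\n'
     (item, stream) => ReadLineBatches(stream),   // assumed helper yielding ArraySegment<string> batches
     hdfsClient);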
Example #4
 /// <summary>
 /// Create a coordinator for reading DFS files with fixed-length blocks made of text records.
 /// </summary>
 /// <param name="client">Client used for reading HDFS data and metadata.</param>
 public DfsTextCoordinator(HdfsClientBase client) : base(client)
 {
 }
Example #5
 /// <summary>
 /// Return a new coordinator for a block-based HDFS reader.
 /// </summary>
 /// <param name="client">HDFS client used to read files and metadata.</param>
 public DfsBlockCoordinator(HdfsClientBase client) : base(client)
 {
 }
Example #6
 /// <summary>
 /// Create a new coordinator for file-at-a-time DFS reads.
 /// </summary>
 /// <param name="client">HDFS client.</param>
 public DfsFileCoordinator(HdfsClientBase client) : base(client)
 {
 }
Example #7
 /// <summary>
 /// Return a new coordinator for a DFS reader.
 /// </summary>
 /// <param name="client">HDFS client.</param>
 public DfsBaseCoordinator(HdfsClientBase client)
 {
     this.client    = client;
     this.dataNodes = new Dictionary<IPAddress, IPEndPoint>();
 }
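
The coordinators in Examples #4 through #7 all take only the HDFS client; the DfsBaseCoordinator constructor in Example #7 runs as part of each derived constructor. A construction sketch, once more assuming hdfsClient is in scope:

 // Hypothetical usage: one client shared by several coordinators.
 var fileCoordinator  = new DfsFileCoordinator(hdfsClient);
 var blockCoordinator = new DfsBlockCoordinator(hdfsClient);
 var textCoordinator  = new DfsTextCoordinator(hdfsClient);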
Example #8
 /// <summary>
 /// A file with the specified path.
 /// </summary>
 /// <param name="config">Cluster configuration.</param>
 /// <param name="jobFolderUri">Uri to the base folder.</param>
 /// <param name="job">Job accessing this file.</param>
 /// <param name="client">Azure client.</param>
 /// <param name="path">Path to the file.</param>
 /// <param name="canCache">True if the file can be cached (it is immutable for sure).</param>
 /// <param name="isFolder">If true, this must be a folder.</param>
 public DfsFile(ClusterConfiguration config, Uri jobFolderUri, DryadLinqJobSummary job, HdfsClientBase client, string path, bool canCache, bool isFolder)
     : base(config, job)
 {
     this.client = client;
     this.Exception = null;
     this.baseUri = jobFolderUri;
     this.uri = UriFromPath(jobFolderUri, path);
     this.ShouldCacheLocally = canCache;
     this.RepresentsAFolder = isFolder;
     this.size = -1;

     Console.WriteLine("DfsFile Uri={0}", this.uri);
     if (!string.IsNullOrEmpty(CachedClusterResidentObject.CacheDirectory))
         this.LocalCachePath = Path.Combine(CachedClusterResidentObject.CacheDirectory, PathFromUri(this.baseUri, this.uri));
 }
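
A call sketch for this constructor; the config, job, and hdfsClient variables and all argument values are placeholders, not values taken from the source.

 // Hypothetical usage: wrap an immutable job output file so it can be cached.
 var file = new DfsFile(
     config,                                   // assumed ClusterConfiguration
     new Uri("hdfs://namenode:9000/jobs/42/"), // assumed job folder
     job,                                      // assumed DryadLinqJobSummary
     hdfsClient,
     "output/part-00000",                      // assumed path under the job folder
     canCache: true,
     isFolder: false);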
 /// <summary>
 /// Must be called after setting all properties.
 /// Returns null if initialization succeeds, otherwise an error message.
 /// </summary>
 public override string Initialize()
 {
     try
     {
         this.DfsClient = new HdfsClient(this.UserName);
         return null;
     }
     catch (Exception ex)
     {
         Console.WriteLine(ex);
         return ex.Message;
     }
 }
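
Callers test the returned string rather than catching exceptions; a sketch of that convention, where settings is a placeholder for the object exposing Initialize:

 // Hypothetical caller: null means success, anything else is the error text.
 string error = settings.Initialize();
 if (error != null)
 {
     Console.WriteLine("Initialization failed: {0}", error);
 }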