/// <summary>
/// Write a sequence of strings as HDFS text files, partitioned by time as well as key.
/// Within a given time and part, records are written in an undefined order.
/// </summary>
/// <remarks>
/// The target directory is ensured to exist before the dataflow stage is constructed.
/// Files are encoded as UTF-8 without a byte order mark, one record per line.
/// </remarks>
/// <typeparam name="TTime">type of the record time</typeparam>
/// <param name="source">stream of records to write</param>
/// <param name="prefix">webhdfs directory to write the partitioned data into</param>
/// <param name="bufferSize">buffer size to use for the text serializer</param>
/// <param name="blockSize">hdfs block size to use, or -1 for the file system default value</param>
/// <param name="segmentThreshold">file size to write before closing the file and opening another one</param>
/// <returns>stream of filenames written</returns>
public static Stream<Uri, TTime> ToHdfsText<TTime>(
    this Stream<string, TTime> source,
    Uri prefix,
    int bufferSize = 1024 * 1024,
    long blockSize = -1,
    long segmentThreshold = 254 * 1024 * 1024) where TTime : Time<TTime>
{
    // make sure we'll be able to write the partitioned data
    HdfsClient client = new HdfsClient();
    client.EnsureDirectory(prefix, false);

    // don't write byte order marks at the start of the files (first argument),
    // and throw on invalid byte sequences rather than silently substituting (second argument)
    Encoding utf8 = new UTF8Encoding(false, true);

    return source.ToHdfsBinary(
        // name each part file from the prefix plus the process, thread, time and segment
        (processId, threadId, time, segment) =>
            Utils.DefaultPartFormat(prefix, processId, threadId, time, segment),
        // honor the caller-supplied buffer size for the text writer, as documented,
        // instead of a hard-coded constant
        stream => new Utils.FStreamWriter(stream, utf8, bufferSize),
        (writer, arraySegment) =>
        {
            // the valid records occupy [Offset, Offset + Count) of the backing array,
            // so the loop must start at Offset rather than at index 0
            int end = arraySegment.Offset + arraySegment.Count;
            for (int i = arraySegment.Offset; i < end; i++)
            {
                writer.WriteLine(arraySegment.Array[i]);
            }
        },
        bufferSize,
        blockSize,
        segmentThreshold);
}
/// <summary>
/// Method to write a stream of records to a collection of HDFS files using the default Naiad binary serializer,
/// partitioned by time as well as key. Within a given time and part, records are written in an undefined order.
/// </summary>
/// <remarks>
/// The target directory is ensured to exist before the dataflow stage is constructed.
/// Records are serialized with the serialization format of the controller that owns <paramref name="source"/>.
/// </remarks>
/// <typeparam name="TOutput">type of the records to write</typeparam>
/// <typeparam name="TTime">type of the record time</typeparam>
/// <param name="source">stream of records to write</param>
/// <param name="prefix">webhdfs directory to write the partitioned data into</param>
/// <param name="bufferSize">buffer size to use for the serializer</param>
/// <param name="blockSize">hdfs block size to use, or -1 for the file system default value</param>
/// <param name="segmentThreshold">file size to write before closing the file and opening another one</param>
/// <returns>stream of filenames written</returns>
public static Stream<Uri, TTime> ToHdfsBinary<TOutput, TTime>(
    this Stream<TOutput, TTime> source,
    Uri prefix,
    int bufferSize = 1024 * 1024,
    long blockSize = -1,
    long segmentThreshold = 254 * 1024 * 1024) where TTime : Time<TTime>
{
    // make sure we'll be able to write the partitioned data
    HdfsClient client = new HdfsClient();
    client.EnsureDirectory(prefix, false);

    return source.ToHdfsBinary(
        // name each part file from the prefix plus the process, thread, time and segment
        (processId, threadId, time, segment) =>
            Utils.DefaultPartFormat(prefix, processId, threadId, time, segment),
        // one binary writer per output stream, using the computation's serialization format
        stream => new NaiadWriter<TOutput>(
            stream,
            source.ForStage.Computation.Controller.SerializationFormat,
            bufferSize),
        (writer, arraySegment) =>
        {
            // the valid records occupy [Offset, Offset + Count) of the backing array,
            // so the loop must start at Offset rather than at index 0
            int end = arraySegment.Offset + arraySegment.Count;
            for (int i = arraySegment.Offset; i < end; i++)
            {
                writer.Write(arraySegment.Array[i]);
            }
        },
        bufferSize,
        blockSize,
        segmentThreshold);
}