/// <summary>
/// handle a batch of records for a single time: find (or create on first use) the sequence writer
/// for that time, give it a chance to roll over to a new file, then serialize the batch into the
/// writer's current file
/// </summary>
/// <param name="message">batch of records, carrying the time the records belong to</param>
public override void OnReceive(Message<TOutput, TTime> message)
{
    var batchTime = message.time;

    WriterStreamSequence<TWriter> sequence;
    if (!writers.TryGetValue(batchTime, out sequence))
    {
        // filenames for this time are generated per segment from the process id, worker id and time
        Func<int, Uri> segmentToUri = index =>
            this.pathFunction(
                this.Stage.Computation.Controller.Configuration.ProcessID,
                this.workerId,
                batchTime,
                index);

        // first batch seen for this time: build its sequence writer and remember it
        sequence = new WriterStreamSequence<TWriter>(
            segmentToUri,
            uri => streamFunction(this.client, uri),
            writerFunction,
            fileLengthThreshold);
        writers.Add(batchTime, sequence);

        // request a notification for this time so that OnNotify can close the sequence writer
        this.NotifyAt(batchTime);
    }

    // the writer closes the current file and opens the next one if the length threshold has
    // been exceeded; this must happen before the batch is written
    sequence.CheckForFileBoundary();

    // write only the valid prefix of the payload buffer
    var records = new ArraySegment<TOutput>(message.payload, 0, message.length);
    this.serialize(sequence.Writer, records);
}
/// <summary>
/// Serialize a sequence of records to a collection of files partitioned by process and thread. For each
/// process/thread this writes a sequence of files; each time a file reaches a threshold number of bytes,
/// it is closed and another is opened. This keeps individual files of bounded length, allowing for more
/// parallelism when reading them later.
/// </summary>
/// <typeparam name="TOutput">type of record to serialize</typeparam>
/// <typeparam name="TWriter">type of the serializer</typeparam>
/// <param name="source">stream of records to serialize</param>
/// <param name="pathFunction">function from processId, threadId and sequence number to filename</param>
/// <param name="streamFunction">function to create an output stream given a filename</param>
/// <param name="writerFunction">function to create a serializer from a stream</param>
/// <param name="serialize">action to serialize a batch of records</param>
/// <param name="fileLengthThreshold">length in bytes of a file after which it is closed and a new one is opened</param>
/// <returns>a handle that can be waited on for the computation to complete</returns>
public static Subscription WriteBySubscription<TOutput, TWriter>(
    this Stream<TOutput, Epoch> source,
    Func<int, int, int, Uri> pathFunction,
    Func<Uri, Stream> streamFunction,
    Func<Stream, TWriter> writerFunction,
    Action<TWriter, ArraySegment<TOutput>> serialize,
    long fileLengthThreshold) where TWriter : class, IDisposable, IFlushable
{
    // dictionary of sequence writers, indexed by worker id; every access is made under lock (writers)
    var writers = new Dictionary<int, WriterStreamSequence<TWriter>>();

    return source.Subscribe(
        // OnRecv callback
        (message, workerid) =>
        {
            WriterStreamSequence<TWriter> writer;

            lock (writers)
            {
                if (!writers.TryGetValue(workerid, out writer))
                {
                    // make a filename generator for the specified worker and process
                    Func<int, Uri> format = segment =>
                        pathFunction(
                            source.ForStage.Computation.Controller.Configuration.ProcessID,
                            workerid,
                            segment);

                    // make the sequence writer for the specified worker and process
                    writer = new WriterStreamSequence<TWriter>(
                        format, streamFunction, writerFunction, fileLengthThreshold);
                    writers.Add(workerid, writer);
                }
            }

            // before serializing a batch of records, check to see if the current file has gone over
            // its length threshold; if so the current file will be closed, and the next one will be
            // opened
            writer.CheckForFileBoundary();

            // serialize the batch of records to the current file
            serialize(writer.Writer, new ArraySegment<TOutput>(message.payload, 0, message.length));
        },
        // OnNotify callback: nothing to do per epoch, files are only closed on completion
        (epoch, workerid) => { },
        // OnCompleted callback
        workerid =>
        {
            lock (writers)
            {
                // a worker that never received a batch has no writer, so there is nothing to close
                WriterStreamSequence<TWriter> writer;
                if (writers.TryGetValue(workerid, out writer))
                {
                    // drop the entry first so that a failing Close cannot leave a dead writer registered
                    writers.Remove(workerid);
                    writer.Close();
                }
            }
        });
}
/// <summary>
/// finish the sequence writer registered for a time: forget it, close it, and send the name of
/// each file it wrote to this vertex's output for that time
/// </summary>
/// <param name="time">time whose sequence writer should be closed; a writer must have been
/// registered for it (the dictionary lookup throws otherwise)</param>
public override void OnNotify(TTime time)
{
    // take ownership of the writer and drop it from the dictionary before closing it
    WriterStreamSequence<TWriter> finished = writers[time];
    writers.Remove(time);

    finished.Close();

    // emit the filename of each file written by this writer
    var buffer = this.Output.GetBufferForTime(time);
    foreach (Uri written in finished.Filenames)
    {
        buffer.Send(written);
    }
}