/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
/// <typeparam name="K">Type of the key</typeparam>
/// <typeparam name="V">Type of the value</typeparam>
/// <typeparam name="S">Type of the state maintained per key</typeparam>
/// <param name="self">The source DStream of (key, value) pairs</param>
/// <param name="updateFunc">State update function. If this function returns None, then corresponding state key-value pair will be eliminated.</param>
/// <param name="numPartitions">Number of partitions to use; when 0 or negative, the SparkContext's default parallelism is used</param>
/// <returns>A DStream of (key, state) pairs</returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(
    this DStream<KeyValuePair<K, V>> self,
    Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
    int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }

    // When the upstream DStream is pipelinable, capture its pending transformation so the
    // helper can fold it into the state function and a single worker pass covers both.
    Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc =
        self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;

    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func =
        new UpdateStateByKeysHelper<K, V, S>(updateFunc, prevFunc, numPartitions).Execute;

    // NOTE(review): BinaryFormatter is obsolete (removed in .NET 9). It is retained here
    // because the serialized delegate bytes are consumed by the existing worker-side
    // deserializer; replacing it requires a coordinated protocol change.
    var formatter = new BinaryFormatter();

    // Fix: the MemoryStream is IDisposable and was previously never disposed.
    using (var stream = new MemoryStream())
    {
        formatter.Serialize(stream, func);

        return new DStream<KeyValuePair<K, S>>(
            SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
                // A pipelinable DStream's own function was folded into prevFunc above,
                // so hook the state DStream to its predecessor's proxy and serialization mode.
                self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
                stream.ToArray(),
                self.serializedMode.ToString(),
                (self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
            self.streamingContext);
    }
}
/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
/// <typeparam name="K">Type of the key</typeparam>
/// <typeparam name="V">Type of the value</typeparam>
/// <typeparam name="S">Type of the state maintained per key</typeparam>
/// <param name="self">The source DStream of (key, value) tuples</param>
/// <param name="updateFunc">State update function - (pid, IEnumerable[K, [newValues, oldState]]) => IEnumerable[K, newState]</param>
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions">Number of partitions to use; when 0 or negative, the SparkContext's default parallelism is used</param>
/// <returns>A DStream of (key, state) tuples</returns>
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(
    this DStream<Tuple<K, V>> self,
    Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc,
    RDD<Tuple<K, S>> initialState = null,
    int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }

    // completes pipelinable dstream by adding the last pipelinable operation
    // before transforming to CSharpStateDStream so that UpdateStateByKey's
    // parallel job covers all pipelinable operations before shuffling
    var ds = self.Transform(new AddShuffleKeyHelper<K, V>(numPartitions).Execute);

    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func =
        new UpdateStateByKeysHelper<K, V, S>(updateFunc, initialState, numPartitions).Execute;

    // NOTE(review): BinaryFormatter is obsolete (removed in .NET 9). It is retained here
    // because the serialized delegate bytes are consumed by the existing worker-side
    // deserializer; replacing it requires a coordinated protocol change.
    var formatter = new BinaryFormatter();

    // Fix: the MemoryStream is IDisposable and was previously never disposed.
    using (var stream = new MemoryStream())
    {
        formatter.Serialize(stream, func);

        return new DStream<Tuple<K, S>>(
            SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
                ds.DStreamProxy,
                stream.ToArray(),
                "CSharpStateDStream",
                ds.serializedMode.ToString(),
                ds.serializedMode.ToString()),
            self.streamingContext);
    }
}