/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<Tuple<K, V>> self, StateSpec<K, V, S, M> stateSpec)
{
    if (stateSpec.numPartitions <= 0)
    {
        stateSpec = stateSpec.NumPartitions(self.streamingContext.SparkContext.DefaultParallelism);
    }

    Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<Tuple<K, V>>).func : null;

    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new MapWithStateHelper<K, V, S, M>(prevFunc, stateSpec).Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);

    var mapWithStateDStream = new DStream<MapWithStateRDDRecord<K, S, M>>(
        SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
            self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
            stream.ToArray(),
            "CSharpStateDStream",
            self.serializedMode.ToString(),
            (self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
        self.streamingContext);

    DStream<M> mappedDataDStream = mapWithStateDStream.FlatMap(r => r.mappedData);
    DStream<Tuple<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
        r => r.stateMap.Select(entry => new Tuple<K, S>(entry.Key, entry.Value.state)));

    return new MapWithStateDStream<K, V, S, M>(mappedDataDStream, snapshotsDStream);
}
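// Illustrative usage sketch (not part of the original source): applies the MapWithState extension
// above to a stream of (word, count) tuples. Constructing the StateSpec is outside this file, so a
// pre-configured stateSpec is taken as a parameter here; only the method defined above is exercised.
private static MapWithStateDStream<string, int, int, M> MapWithStateExample<M>(
    DStream<Tuple<string, int>> wordPairs,
    StateSpec<string, int, int, M> stateSpec)
{
    // Returns a MapWithStateDStream carrying both the mapped output and the per-key state snapshots.
    return wordPairs.MapWithState(stateSpec);
}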
/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="S"></typeparam>
/// <param name="self"></param>
/// <param name="updateFunc">State update function. If this function returns null, the corresponding state key-value pair will be eliminated.</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
    Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
    int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }

    Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;

    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new UpdateStateByKeysHelper<K, V, S>(updateFunc, prevFunc, numPartitions).Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);

    return new DStream<KeyValuePair<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
        self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
        stream.ToArray(),
        "CSharpStateDStream",
        self.serializedMode.ToString(),
        (self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
        self.streamingContext);
}
/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="S"></typeparam>
/// <param name="self"></param>
/// <param name="updateFunc">State update function - (pid, IEnumerable[K, [newValues, oldState]]) => IEnumerable[K, newState]</param>
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
    Func<int, IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc,
    RDD<Tuple<K, S>> initialState = null,
    int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }

    // completes pipelinable dstream by adding the last pipelinable operation
    // before transforming to CSharpStateDStream so that UpdateStateByKey's
    // parallel job covers all pipelinable operations before shuffling
    var ds = self.Transform(new AddShuffleKeyHelper<K, V>(numPartitions).Execute);

    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new UpdateStateByKeysHelper<K, V, S>(updateFunc, initialState, numPartitions).Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);

    return new DStream<Tuple<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
        ds.DStreamProxy,
        stream.ToArray(),
        "CSharpStateDStream",
        ds.serializedMode.ToString(),
        ds.serializedMode.ToString()),
        self.streamingContext);
}
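// Illustrative usage sketch (not part of the original source): keeps a running total per key with the
// UpdateStateByKey overload above. Assumes "using System.Linq;" is in scope for Select() and Sum().
private static DStream<Tuple<string, int>> RunningCountExample(DStream<Tuple<string, int>> wordPairs)
{
    return wordPairs.UpdateStateByKey<string, int, int>(
        (pid, entries) => entries.Select(e =>
            // e.Item2.Item1 holds the new values for the key in this batch, e.Item2.Item2 the previous state.
            new Tuple<string, int>(e.Item1, e.Item2.Item1.Sum() + e.Item2.Item2)));
}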
/// <summary>
/// Return a new DStream by applying ReduceByKey to each RDD.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="reduceFunc"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<Tuple<K, V>> ReduceByKey<K, V>(this DStream<Tuple<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
{
    var locallyCombined = self.MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
    var shuffled = locallyCombined.PartitionBy(numPartitions);
    return shuffled.MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
}
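// Illustrative usage sketch (not part of the original source): a per-batch word count built on the
// ReduceByKey overload above. "wordPairs" is an assumed DStream of (word, 1) tuples created elsewhere.
private static DStream<Tuple<string, int>> ReduceByKeyExample(DStream<Tuple<string, int>> wordPairs)
{
    // Sum the counts for each word within every batch RDD.
    return wordPairs.ReduceByKey((a, b) => a + b);
}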
/// <summary>
/// Return a new DStream by applying 'join' between RDDs of this DStream and `other` DStream.
/// Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="W"></typeparam>
/// <param name="self"></param>
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, Tuple<V, W>>> Join<K, V, W>(this DStream<KeyValuePair<K, V>> self, DStream<KeyValuePair<K, W>> other, int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }
    return self.TransformWith<KeyValuePair<K, W>, KeyValuePair<K, Tuple<V, W>>>(new JoinHelper<K, V, W>(numPartitions).Execute, other);
}
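// Illustrative usage sketch (not part of the original source): joins two keyed streams with the Join
// overload above. Both input streams are assumed to be keyed by the same user id.
private static DStream<KeyValuePair<string, Tuple<int, string>>> JoinExample(
    DStream<KeyValuePair<string, int>> clicksByUser,
    DStream<KeyValuePair<string, string>> namesByUser)
{
    // Inner join per batch; only keys present in both RDDs of the batch are emitted.
    return clicksByUser.Join(namesByUser);
}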
/// <summary>
/// Return a new DStream in which each RDD is partitioned into numPartitions partitions.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> PartitionBy<K, V>(this DStream<KeyValuePair<K, V>> self, int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }
    return self.Transform<KeyValuePair<K, V>>(new PartitionByHelper<K, V>(numPartitions).Execute);
}
/// <summary>
/// Return a new DStream by unifying data of another DStream with this DStream.
/// </summary>
/// <param name="other">Another DStream having the same interval (i.e., slideDuration) as this DStream.</param>
/// <returns></returns>
public DStream<T> Union(DStream<T> other)
{
    if (SlideDuration != other.SlideDuration)
    {
        throw new ArgumentException("The two DStreams should have the same slide duration");
    }
    return TransformWith((rdd1, rdd2) => rdd1.Union(rdd2), other, true);
}
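// Illustrative usage sketch (not part of the original source): merges two sources of the same element
// type with Union above. Both streams must have the same slide duration, otherwise Union throws.
private static DStream<string> UnionExample(DStream<string> primaryLines, DStream<string> backupLines)
{
    // Each batch of the result contains the elements of both input streams.
    return primaryLines.Union(backupLines);
}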
/// <summary>
/// Return a new DStream by applying incremental `reduceByKey` over a sliding window.
///
/// The reduced value for a new window is calculated using the old window's reduced value:
/// 1. reduce the new values that entered the window (e.g., adding new counts)
/// 2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)
///
/// `invReduceFunc` can be null; in that case all the RDDs in the window are reduced from scratch,
/// which can be slower than providing `invReduceFunc`.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="reduceFunc">associative reduce function</param>
/// <param name="invReduceFunc">inverse function of `reduceFunc`</param>
/// <param name="windowSeconds">width of the window; must be a multiple of this DStream's batching interval</param>
/// <param name="slideSeconds">sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of this DStream's batching interval</param>
/// <param name="numPartitions">number of partitions of each RDD in the new DStream</param>
/// <param name="filterFunc">function to filter expired key-value pairs; only pairs that satisfy the function are retained; set this to null if you do not want to filter</param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> ReduceByKeyAndWindow<K, V>(this DStream<KeyValuePair<K, V>> self,
    Func<V, V, V> reduceFunc,
    Func<V, V, V> invReduceFunc,
    int windowSeconds,
    int slideSeconds = 0,
    int numPartitions = 0,
    Func<KeyValuePair<K, V>, bool> filterFunc = null)
{
    self.ValidatWindowParam(windowSeconds, slideSeconds);

    if (slideSeconds <= 0)
    {
        slideSeconds = self.SlideDuration;
    }

    // dstream to be transformed by subtracting old RDDs and adding new RDDs based on the window
    var reduced = self.ReduceByKey(reduceFunc, numPartitions);

    Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = reduced.Piplinable ? (reduced as TransformedDStream<KeyValuePair<K, V>>).func : null;

    var helper = new ReduceByKeyAndWindowHelper<K, V>(reduceFunc, invReduceFunc, numPartitions, filterFunc, prevFunc);

    // function to reduce the new values that entered the window (e.g., adding new counts)
    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> reduceF = helper.Reduce;

    MemoryStream stream = new MemoryStream();
    var formatter = new BinaryFormatter();
    formatter.Serialize(stream, reduceF);

    // function to "inverse reduce" the old values that left the window (e.g., subtracting old counts)
    MemoryStream invStream = null;
    if (invReduceFunc != null)
    {
        Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> invReduceF = helper.InvReduce;
        invStream = new MemoryStream();
        formatter.Serialize(invStream, invReduceF);
    }

    return new DStream<KeyValuePair<K, V>>(
        SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpReducedWindowedDStream(
            reduced.Piplinable ? reduced.prevDStreamProxy : reduced.DStreamProxy,
            stream.ToArray(),
            invStream == null ? null : invStream.ToArray(),
            windowSeconds,
            slideSeconds,
            (reduced.Piplinable ? reduced.prevSerializedMode : reduced.serializedMode).ToString()),
        self.streamingContext);
}
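// Illustrative usage sketch (not part of the original source): a 30-second windowed word count that
// slides every 10 seconds, using the incremental ReduceByKeyAndWindow above. The inverse function
// subtracts the counts of the batches that just left the window.
private static DStream<KeyValuePair<string, int>> WindowedCountExample(DStream<KeyValuePair<string, int>> wordPairs)
{
    return wordPairs.ReduceByKeyAndWindow(
        (a, b) => a + b,      // add counts entering the window
        (a, b) => a - b,      // subtract counts leaving the window
        30,                   // window width in seconds
        10);                  // slide interval in seconds
}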
/// <summary>
/// Return a new DStream by applying combineByKey to each RDD.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="C"></typeparam>
/// <param name="self"></param>
/// <param name="createCombiner"></param>
/// <param name="mergeValue"></param>
/// <param name="mergeCombiners"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, C>> CombineByKey<K, V, C>(
    this DStream<KeyValuePair<K, V>> self,
    Func<C> createCombiner,
    Func<C, V, C> mergeValue,
    Func<C, C, C> mergeCombiners,
    int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }
    return self.Transform<KeyValuePair<K, C>>(new CombineByKeyHelper<K, V, C>(createCombiner, mergeValue, mergeCombiners, numPartitions).Execute);
}
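// Illustrative usage sketch (not part of the original source): tracks (sum, count) per key with the
// CombineByKey overload above, the usual building block for computing per-key averages downstream.
private static DStream<KeyValuePair<string, Tuple<double, int>>> SumAndCountExample(
    DStream<KeyValuePair<string, double>> values)
{
    return values.CombineByKey<string, double, Tuple<double, int>>(
        () => new Tuple<double, int>(0.0, 0),                                     // create an empty combiner
        (acc, v) => new Tuple<double, int>(acc.Item1 + v, acc.Item2 + 1),         // fold a value into the combiner
        (a, b) => new Tuple<double, int>(a.Item1 + b.Item1, a.Item2 + b.Item2));  // merge two combiners
}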
// Initializes this TransformedDStream from its parent DStream and the transformation function.
// If the parent is itself an uncached, uncheckpointed TransformedDStream, the two functions are
// chained (pipelined) so that both run in a single pass over each RDD.
internal void Init<T>(DStream<T> prev, Func<double, RDD<dynamic>, RDD<dynamic>> f)
{
    streamingContext = prev.streamingContext;
    serializedMode = SerializedMode.Byte;
    isCached = false;
    isCheckpointed = false;
    dstreamProxy = null;

    if (prev is TransformedDStream<T> && !prev.isCached && !prev.isCheckpointed)
    {
        // pipeline with the parent's function instead of introducing another DStream boundary
        prevFunc = (prev as TransformedDStream<T>).func;
        func = new NewFuncWrapper(f, prevFunc).Execute;
        prevDStreamProxy = prev.prevDStreamProxy;
        prevSerializedMode = prev.prevSerializedMode;
    }
    else
    {
        prevDStreamProxy = prev.dstreamProxy;
        prevSerializedMode = prev.serializedMode;
        func = f;
    }
}
/// <summary>
/// Create an input stream that directly pulls messages from a Kafka broker starting at specific offsets.
///
/// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
/// in each batch interval and processes them without storing them.
///
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
/// You can access the offsets used in each batch from the generated RDDs (see
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
/// The information on consumed offsets can be recovered from the checkpoint.
/// See the programming guide for details (constraints, etc.).
/// </summary>
/// <param name="ssc">Spark Streaming Context</param>
/// <param name="topics">list of topic names to consume</param>
/// <param name="kafkaParams">
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
/// with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="readFunc">user function to process the Kafka data</param>
/// <returns>A DStream object</returns>
public static DStream<T> CreateDirectStream<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
{
    int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);
    if (ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) <= 0)
    {
        var dstream = new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
        return dstream.MapPartitionsWithIndex(readFunc, true);
    }

    var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
    var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
    var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
    Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);
    byte[] readFuncBytes = stream.ToArray();
    string serializationMode = SerializedMode.Pair.ToString();

    return new DStream<T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc);
}
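// Illustrative usage sketch (not part of the original source): reads a hypothetical topic and decodes
// each message value to a UTF-8 string using the CreateDirectStream overload defined above. Assumes
// "using System.Linq;" and "using System.Text;" are in scope; the broker addresses and topic name are
// placeholders, not real endpoints.
private static DStream<string> CreateDirectStreamExample(StreamingContext ssc)
{
    var topics = new List<string> { "sample-topic" };
    var kafkaParams = new Dictionary<string, string>
    {
        { "metadata.broker.list", "broker1:9092,broker2:9092" }
    };
    var fromOffsets = new Dictionary<string, long>(); // empty: start from the configured default offsets

    // Decode each (key, value) message pair into a UTF-8 string, ignoring the partition id.
    return CreateDirectStream<string>(ssc, topics, kafkaParams, fromOffsets,
        (pid, messages) => messages.Select(kv => Encoding.UTF8.GetString(kv.Value)));
}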
/// <summary>
/// Return a new DStream in which each RDD is generated by applying a function
/// on each RDD of this DStream and 'other' DStream.
///
/// `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
/// arguments of (`time`, `rdd_a`, `rdd_b`)
/// </summary>
/// <typeparam name="U"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="f"></param>
/// <param name="other"></param>
/// <param name="keepSerializer"></param>
/// <returns></returns>
public DStream<V> TransformWith<U, V>(Func<double, RDD<T>, RDD<U>, RDD<V>> f, DStream<U> other, bool keepSerializer = false)
{
    Func<double, RDD<dynamic>, RDD<dynamic>> prevF = Piplinable ? (this as TransformedDStream<T>).func : null;
    Func<double, RDD<dynamic>, RDD<dynamic>> otherF = other.Piplinable ? (other as TransformedDStream<U>).func : null;

    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new TransformWithDynamicHelper<T, U, V>(f, prevF, otherF).Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);

    return new DStream<V>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpTransformed2DStream(
        Piplinable ? prevDStreamProxy : DStreamProxy,
        other.Piplinable ? other.prevDStreamProxy : other.DStreamProxy,
        stream.ToArray(),
        (Piplinable ? prevSerializedMode : serializedMode).ToString(),
        (other.Piplinable ? other.prevSerializedMode : other.serializedMode).ToString()),
        streamingContext,
        keepSerializer ? serializedMode : SerializedMode.Byte);
}
/// <summary>
/// Return a new DStream by applying 'cogroup' between RDDs of this DStream and `other` DStream.
/// Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="W"></typeparam>
/// <param name="self"></param>
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<Tuple<K, Tuple<List<V>, List<W>>>> GroupWith<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }
    return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<List<V>, List<W>>>>(new GroupWithHelper<K, V, W>(numPartitions).Execute, other);
}
/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="S"></typeparam>
/// <param name="self"></param>
/// <param name="updateFunc"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
    Func<IEnumerable<V>, S, S> updateFunc,
    int numPartitions = 0)
{
    return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, numPartitions);
}
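// Illustrative usage sketch (not part of the original source): the per-key form above only needs a
// (newValues, oldState) => newState function. Assumes "using System.Linq;" is in scope for Sum().
private static DStream<KeyValuePair<string, int>> SimpleRunningCountExample(DStream<KeyValuePair<string, int>> wordPairs)
{
    // For every key, add the values that arrived in this batch to the previously stored total.
    return wordPairs.UpdateStateByKey<string, int, int>((newValues, oldTotal) => oldTotal + newValues.Sum());
}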
/// <summary>
/// Return a new DStream by applying ReduceByKey to each RDD.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="reduceFunc"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, V>> ReduceByKey<K, V>(this DStream<KeyValuePair<K, V>> self, Func<V, V, V> reduceFunc, int numPartitions = 0)
{
    return self.CombineByKey(() => default(V), reduceFunc, reduceFunc, numPartitions);
}
/// <summary>
/// Return a new DStream by applying 'full outer join' between RDDs of this DStream and `other` DStream.
/// Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="W"></typeparam>
/// <param name="self"></param>
/// <param name="other"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<Tuple<K, Tuple<Option<V>, Option<W>>>> FullOuterJoin<K, V, W>(this DStream<Tuple<K, V>> self, DStream<Tuple<K, W>> other, int numPartitions = 0)
{
    if (numPartitions <= 0)
    {
        numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
    }
    return self.TransformWith<Tuple<K, W>, Tuple<K, Tuple<Option<V>, Option<W>>>>(new FullOuterJoinHelper<K, V, W>(numPartitions).Execute, other);
}
/// <summary>
/// Return a new DStream by applying `GroupByKey` over a sliding window.
/// Similar to `DStream.GroupByKey()`, but applies it over a sliding window.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="windowSeconds">width of the window; must be a multiple of this DStream's batching interval</param>
/// <param name="slideSeconds">
/// sliding interval of the window (i.e., the interval after which
/// the new DStream will generate RDDs); must be a multiple of this
/// DStream's batching interval
/// </param>
/// <param name="numPartitions">Number of partitions of each RDD in the new DStream.</param>
/// <returns></returns>
public static DStream<KeyValuePair<K, IEnumerable<V>>> GroupByKeyAndWindow<K, V>(this DStream<KeyValuePair<K, V>> self,
    int windowSeconds, int slideSeconds, int numPartitions = 0)
{
    var ls = self.MapValues(x => new List<V> { x });
    var grouped = ls.ReduceByKeyAndWindow(
        (a, b) => { a.AddRange(b); return a; },
        (a, b) => { a.RemoveRange(0, b.Count); return a; },
        windowSeconds, slideSeconds, numPartitions);

    return grouped.MapValues(x => x.AsEnumerable());
}
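// Illustrative usage sketch (not part of the original source): groups each key's values over a
// 60-second window that slides every 20 seconds, using GroupByKeyAndWindow above.
private static DStream<KeyValuePair<string, IEnumerable<int>>> GroupByWindowExample(
    DStream<KeyValuePair<string, int>> events)
{
    return events.GroupByKeyAndWindow(60, 20);
}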
/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="S"></typeparam>
/// <param name="self"></param>
/// <param name="updateFunc">State update function - IEnumerable[K, [newValues, oldState]] => IEnumerable[K, newState]</param>
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<Tuple<K, S>> UpdateStateByKey<K, V, S>(this DStream<Tuple<K, V>> self,
    Func<IEnumerable<Tuple<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<Tuple<K, S>>> updateFunc,
    RDD<Tuple<K, S>> initialState = null,
    int numPartitions = 0)
{
    return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<Tuple<K, Tuple<IEnumerable<V>, S>>, Tuple<K, S>>(updateFunc).Execute, initialState, numPartitions);
}
/// <summary>
/// Return a new DStream by applying groupByKey on each RDD.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, List<V>>> GroupByKey<K, V>(this DStream<KeyValuePair<K, V>> self, int numPartitions = 0)
{
    return self.Transform<KeyValuePair<K, List<V>>>(new GroupByKeyHelper<K, V>(numPartitions).Execute);
}
/// <summary>
/// Return a new DStream by applying a flatmap function to the value
/// of each key-value pair in this DStream without changing the key.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="U"></typeparam>
/// <param name="self"></param>
/// <param name="func"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, U>> FlatMapValues<K, V, U>(this DStream<KeyValuePair<K, V>> self, Func<V, IEnumerable<U>> func)
{
    return self.FlatMap(new FlatMapValuesHelper<K, V, U>(func).Execute, true);
}
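// Illustrative usage sketch (not part of the original source): splits a comma-separated value into
// several records while keeping the key, via FlatMapValues above.
private static DStream<KeyValuePair<string, string>> FlatMapValuesExample(DStream<KeyValuePair<string, string>> lines)
{
    // One output pair per comma-separated token, all sharing the original key.
    return lines.FlatMapValues(v => v.Split(','));
}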
/// <summary>
/// Return a new DStream in which each RDD is generated by applying a function
/// on each RDD of this DStream and 'other' DStream.
///
/// `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
/// arguments of (`time`, `rdd_a`, `rdd_b`)
/// </summary>
/// <typeparam name="U"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="f"></param>
/// <param name="other"></param>
/// <param name="keepSerializer"></param>
/// <returns></returns>
public DStream<V> TransformWith<U, V>(Func<RDD<T>, RDD<U>, RDD<V>> f, DStream<U> other, bool keepSerializer = false)
{
    return TransformWith<U, V>(new TransformWithHelper<T, U, V>(f).Execute, other, keepSerializer);
}
/// <summary>
/// Return a new DStream in which each RDD is generated by applying a function
/// on each RDD of this DStream and 'other' DStream.
///
/// `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
/// arguments of (`time`, `rdd_a`, `rdd_b`)
/// </summary>
/// <typeparam name="U"></typeparam>
/// <typeparam name="V"></typeparam>
/// <param name="f"></param>
/// <param name="other"></param>
/// <param name="keepSerializer"></param>
/// <returns></returns>
public DStream<V> TransformWith<U, V>(Func<double, RDD<T>, RDD<U>, RDD<V>> f, DStream<U> other, bool keepSerializer = false)
{
    Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new TransformWithDynamicHelper<T, U, V>(f).Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);

    return new DStream<V>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpTransformed2DStream(
        DStreamProxy,
        other.DStreamProxy,
        stream.ToArray(),
        serializedMode.ToString(),
        other.serializedMode.ToString()),
        streamingContext,
        keepSerializer ? serializedMode : SerializedMode.Byte);
}
/// <summary>
/// Return a new DStream by applying a map function to the value of
/// each key-value pair in this DStream without changing the key.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="U"></typeparam>
/// <param name="self"></param>
/// <param name="func"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, U>> MapValues<K, V, U>(this DStream<KeyValuePair<K, V>> self, Func<V, U> func)
{
    return self.Map(new MapValuesHelper<K, V, U>(func).Execute, true);
}
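// Illustrative usage sketch (not part of the original source): normalizes the value of each pair
// without touching the key, via MapValues above.
private static DStream<KeyValuePair<string, string>> MapValuesExample(DStream<KeyValuePair<string, string>> pairs)
{
    return pairs.MapValues(v => v.ToLowerInvariant());
}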
internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<KeyValuePair<K, S>> snapshotsDStream)
    : base(mappedDataDStream.DStreamProxy, mappedDataDStream.streamingContext)
{
    this.snapshotsDStream = snapshotsDStream;
}