/// <summary>
        /// Return a new "state" DStream where the state for each key is updated by applying
        /// the given function on the previous state of the key and the new values of the key.
        /// </summary>
        public static MapWithStateDStream <K, V, S, M> MapWithState <K, V, S, M>(this DStream <Tuple <K, V> > self, StateSpec <K, V, S, M> stateSpec)
        {
            if (stateSpec.numPartitions <= 0)
            {
                stateSpec = stateSpec.NumPartitions(self.streamingContext.SparkContext.DefaultParallelism);
            }

            Func <double, RDD <dynamic>, RDD <dynamic> > prevFunc = self.Piplinable ? (self as TransformedDStream <Tuple <K, V> >).func : null;

            Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > func = new MapWithStateHelper <K, V, S, M>(prevFunc, stateSpec).Execute;

            var formatter = new BinaryFormatter();
            var stream    = new MemoryStream();

            formatter.Serialize(stream, func);

            var mapWithStateDStream = new DStream <MapWithStateRDDRecord <K, S, M> >(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
                                                                                         self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
                                                                                         stream.ToArray(),
                                                                                         "CSharpStateDStream",
                                                                                         self.serializedMode.ToString(),
                                                                                         (self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
                                                                                     self.streamingContext);

            DStream <M>             mappedDataDStream = mapWithStateDStream.FlatMap(r => r.mappedData);
            DStream <Tuple <K, S> > snapshotsDStream  = mapWithStateDStream.FlatMap(
                r => r.stateMap.Select(entry => new Tuple <K, S>(entry.Key, entry.Value.state)));

            return(new MapWithStateDStream <K, V, S, M>(mappedDataDStream, snapshotsDStream));
        }
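        // Usage sketch (illustrative, not from the original source): keyed stateful processing with
        // MapWithState. The StateSpec builder API is not shown in these snippets, so `spec` is
        // assumed to be constructed elsewhere; NumPartitions(...) is grounded by its use above.
        internal static void MapWithStateExample(
            DStream<Tuple<string, int>> events, StateSpec<string, int, int, Tuple<string, int>> spec)
        {
            // The returned stream carries the mapped records (type M); state snapshots are exposed
            // through the MapWithStateDStream wrapper built above.
            MapWithStateDStream<string, int, int, Tuple<string, int>> stateful =
                events.MapWithState(spec.NumPartitions(8));
        }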
        /// <summary>
        /// Return a new "state" DStream where the state for each key is updated by applying
        /// the given function on the previous state of the key and the new values of the key.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <typeparam name="S"></typeparam>
        /// <param name="self"></param>
        /// <param name="updateFunc">State update function. If this function returns None, then corresponding state key-value pair will be eliminated.</param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <KeyValuePair <K, S> > UpdateStateByKey <K, V, S>(this DStream <KeyValuePair <K, V> > self,
                                                                                Func <IEnumerable <KeyValuePair <K, Tuple <IEnumerable <V>, S> > >, IEnumerable <KeyValuePair <K, S> > > updateFunc,
                                                                                int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            Func <double, RDD <dynamic>, RDD <dynamic> > prevFunc = self.Piplinable ? (self as TransformedDStream <KeyValuePair <K, V> >).func : null;

            Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > func = new UpdateStateByKeysHelper <K, V, S>(updateFunc, prevFunc, numPartitions).Execute;

            var formatter = new BinaryFormatter();
            var stream    = new MemoryStream();

            formatter.Serialize(stream, func);

            return(new DStream <KeyValuePair <K, S> >(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
                                                          self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
                                                          stream.ToArray(),
                                                          self.serializedMode.ToString(),
                                                          (self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
                                                      self.streamingContext));
        }
        /// <summary>
        /// Return a new "state" DStream where the state for each key is updated by applying
        /// the given function on the previous state of the key and the new values of the key.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <typeparam name="S"></typeparam>
        /// <param name="self"></param>
        /// <param name="updateFunc">State update function - (pid, IEnumerable[K, [newValues, oldState]]) => IEnumerable[K, newState]</param>
        /// <param name="initialState">Initial state value of each key</param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <Tuple <K, S> > UpdateStateByKey <K, V, S>(this DStream <Tuple <K, V> > self,
                                                                         Func <int, IEnumerable <Tuple <K, Tuple <IEnumerable <V>, S> > >, IEnumerable <Tuple <K, S> > > updateFunc,
                                                                         RDD <Tuple <K, S> > initialState = null, int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            // completes pipelinable dstream by adding the last pipelinable operation
            // before transforming to CSharpStateDStream so that UpdateStateByKey's
            // parallel job covers all pipelinable operations before shuffling
            var ds = self.Transform(new AddShuffleKeyHelper <K, V>(numPartitions).Execute);

            Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > func = new UpdateStateByKeysHelper <K, V, S>(updateFunc, initialState, numPartitions).Execute;

            var formatter = new BinaryFormatter();
            var stream    = new MemoryStream();

            formatter.Serialize(stream, func);

            return(new DStream <Tuple <K, S> >(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
                                                   ds.DStreamProxy,
                                                   stream.ToArray(),
                                                   "CSharpStateDStream",
                                                   ds.serializedMode.ToString(),
                                                   ds.serializedMode.ToString()),
                                               self.streamingContext));
        }
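        // Usage sketch (illustrative, not from the original source): per-key running counts seeded
        // from an initial-state RDD. Assumes `pairs` is a DStream<Tuple<string, int>> and `initial`
        // is an RDD<Tuple<string, int>> built elsewhere; System.Linq is assumed to be imported.
        internal static void UpdateStateByKeyExample(
            DStream<Tuple<string, int>> pairs, RDD<Tuple<string, int>> initial)
        {
            // For each key, the new state is the old state plus the sum of the values seen in this batch.
            DStream<Tuple<string, int>> runningCounts = pairs.UpdateStateByKey<string, int, int>(
                (pid, entries) => entries.Select(e =>
                    new Tuple<string, int>(e.Item1, e.Item2.Item2 + e.Item2.Item1.Sum())),
                initial);
        }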
        /// <summary>
        /// Return a new DStream by applying ReduceByKey to each RDD.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <param name="self"></param>
        /// <param name="reduceFunc"></param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <Tuple <K, V> > ReduceByKey <K, V>(this DStream <Tuple <K, V> > self, Func <V, V, V> reduceFunc, int numPartitions = 0)
        {
            var locallyCombined = self.MapPartitionsWithIndex(new GroupByMergeHelper <K, V>(reduceFunc).Execute, true);

            var shuffled = locallyCombined.PartitionBy(numPartitions);

            return(shuffled.MapPartitionsWithIndex(new GroupByMergeHelper <K, V>(reduceFunc).Execute, true));
        }
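        // Usage sketch (illustrative, not from the original source): per-batch word counts. Assumes
        // `wordPairs` is a DStream<Tuple<string, int>> of (word, 1) pairs built elsewhere.
        internal static void ReduceByKeyExample(DStream<Tuple<string, int>> wordPairs)
        {
            // Values are combined locally before the shuffle, then merged again per partition.
            DStream<Tuple<string, int>> wordCounts = wordPairs.ReduceByKey((a, b) => a + b, numPartitions: 4);
        }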
        /// <summary>
        /// Return a new DStream by applying 'join' between RDDs of this DStream and `other` DStream.
        /// Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <typeparam name="W"></typeparam>
        /// <param name="self"></param>
        /// <param name="other"></param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <KeyValuePair <K, Tuple <V, W> > > Join <K, V, W>(this DStream <KeyValuePair <K, V> > self, DStream <KeyValuePair <K, W> > other, int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            return(self.TransformWith <KeyValuePair <K, W>, KeyValuePair <K, Tuple <V, W> > >(new JoinHelper <K, V, W>(numPartitions).Execute, other));
        }
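        // Usage sketch (illustrative, not from the original source): per-batch inner join of two
        // keyed streams. Assumes `views` and `clicks` are DStream<KeyValuePair<string, int>>
        // instances (page -> view count, page -> click count) built elsewhere.
        internal static void JoinExample(
            DStream<KeyValuePair<string, int>> views, DStream<KeyValuePair<string, int>> clicks)
        {
            // Each joined record pairs the view count and click count for a page present in both streams.
            DStream<KeyValuePair<string, Tuple<int, int>>> joined = views.Join(clicks, numPartitions: 4);
        }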
        /// <summary>
        /// Return a new DStream in which each RDD is repartitioned into numPartitions partitions.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <param name="self"></param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <KeyValuePair <K, V> > PartitionBy <K, V>(this DStream <KeyValuePair <K, V> > self, int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            return(self.Transform <KeyValuePair <K, V> >(new PartitionByHelper <K, V>(numPartitions).Execute));
        }
        /// <summary>
        /// Return a new DStream by unifying data of another DStream with this DStream.
        /// </summary>
        /// <param name="other">Another DStream having the same interval (i.e., slideDuration) as this DStream.</param>
        /// <returns></returns>
        public DStream <T> Union(DStream <T> other)
        {
            if (SlideDuration != other.SlideDuration)
            {
                throw new ArgumentException("the two DStreams should have the same slide duration");
            }

            return(TransformWith((rdd1, rdd2) => rdd1.Union(rdd2), other, true));
        }
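        // Usage sketch (illustrative, not from the original source): merge two streams with the same
        // slide duration. Assumes `errors1` and `errors2` are DStream<string> instances created from
        // the same StreamingContext.
        internal static void UnionExample(DStream<string> errors1, DStream<string> errors2)
        {
            // Throws ArgumentException if the two slide durations differ.
            DStream<string> allErrors = errors1.Union(errors2);
        }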
        /// <summary>
        /// Return a new DStream by applying incremental `reduceByKey` over a sliding window.
        ///
        /// The reduced value over a new window is calculated using the old window's reduced value:
        ///  1. reduce the new values that entered the window (e.g., adding new counts)
        ///  2. "inverse reduce" the old values that left the window (e.g., subtracting old counts)
        ///
        /// `invReduceFunc` can be null; in that case all the RDDs in the window are reduced from scratch, which can be slower than providing `invReduceFunc`.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <param name="self"></param>
        /// <param name="reduceFunc">associative reduce function</param>
        /// <param name="invReduceFunc">inverse function of `reduceFunc`</param>
        /// <param name="windowSeconds">width of the window; must be a multiple of this DStream's batching interval</param>
        /// <param name="slideSeconds">sliding interval of the window (i.e., the interval after which the new DStream will generate RDDs); must be a multiple of this DStream's batching interval</param>
        /// <param name="numPartitions">number of partitions of each RDD in the new DStream.</param>
        /// <param name="filterFunc">function to filter expired key-value pairs; only pairs that satisfy the function are retained set this to null if you do not want to filter</param>
        /// <returns></returns>
        public static DStream <KeyValuePair <K, V> > ReduceByKeyAndWindow <K, V>(this DStream <KeyValuePair <K, V> > self,
                                                                                 Func <V, V, V> reduceFunc,
                                                                                 Func <V, V, V> invReduceFunc,
                                                                                 int windowSeconds,
                                                                                 int slideSeconds  = 0,
                                                                                 int numPartitions = 0,
                                                                                 Func <KeyValuePair <K, V>, bool> filterFunc = null)
        {
            self.ValidatWindowParam(windowSeconds, slideSeconds);

            if (slideSeconds <= 0)
            {
                slideSeconds = self.SlideDuration;
            }

            // dstream to be transformed by subtracting old RDDs and adding new RDDs based on the window
            var reduced = self.ReduceByKey(reduceFunc, numPartitions);

            Func <double, RDD <dynamic>, RDD <dynamic> > prevFunc = reduced.Piplinable ? (reduced as TransformedDStream <KeyValuePair <K, V> >).func : null;

            var helper = new ReduceByKeyAndWindowHelper <K, V>(reduceFunc, invReduceFunc, numPartitions, filterFunc, prevFunc);
            // function to reduce the new values that entered the window (e.g., adding new counts)
            Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > reduceF = helper.Reduce;

            MemoryStream stream    = new MemoryStream();
            var          formatter = new BinaryFormatter();

            formatter.Serialize(stream, reduceF);

            // function to "inverse reduce" the old values that left the window (e.g., subtracting old counts)
            MemoryStream invStream = null;

            if (invReduceFunc != null)
            {
                Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > invReduceF = helper.InvReduce;

                invStream = new MemoryStream();
                formatter.Serialize(invStream, invReduceF);
            }

            return(new DStream <KeyValuePair <K, V> >(
                       SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpReducedWindowedDStream(
                           reduced.Piplinable ? reduced.prevDStreamProxy : reduced.DStreamProxy,
                           stream.ToArray(),
                           invStream == null ? null : invStream.ToArray(),
                           windowSeconds,
                           slideSeconds,
                           (reduced.Piplinable ? reduced.prevSerializedMode : reduced.serializedMode).ToString()),
                       self.streamingContext
                       ));
        }
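        // Usage sketch (illustrative, not from the original source): word counts over a 30-second
        // window sliding every 10 seconds, maintained incrementally via the inverse function.
        // Assumes `wordPairs` is a DStream<KeyValuePair<string, int>> of (word, 1) pairs built elsewhere.
        internal static void ReduceByKeyAndWindowExample(DStream<KeyValuePair<string, int>> wordPairs)
        {
            DStream<KeyValuePair<string, int>> windowedCounts = wordPairs.ReduceByKeyAndWindow(
                (a, b) => a + b,                     // add counts entering the window
                (a, b) => a - b,                     // "inverse reduce" counts leaving the window
                windowSeconds: 30,
                slideSeconds: 10,
                numPartitions: 4,
                filterFunc: kv => kv.Value > 0);     // drop keys whose windowed count fell to zero
        }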
        /// <summary>
        /// Return a new DStream by applying combineByKey to each RDD.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <typeparam name="C"></typeparam>
        /// <param name="self"></param>
        /// <param name="createCombiner"></param>
        /// <param name="mergeValue"></param>
        /// <param name="mergeCombiners"></param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <KeyValuePair <K, C> > CombineByKey <K, V, C>(
            this DStream <KeyValuePair <K, V> > self,
            Func <C> createCombiner,
            Func <C, V, C> mergeValue,
            Func <C, C, C> mergeCombiners,
            int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            return(self.Transform <KeyValuePair <K, C> >(new CombineByKeyHelper <K, V, C>(createCombiner, mergeValue, mergeCombiners, numPartitions).Execute));
        }
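        // Usage sketch (illustrative, not from the original source): per-key averages via a
        // (sum, count) combiner. Assumes `readings` is a DStream<KeyValuePair<string, double>> of
        // (sensorId, value) pairs built elsewhere.
        internal static void CombineByKeyExample(DStream<KeyValuePair<string, double>> readings)
        {
            DStream<KeyValuePair<string, Tuple<double, int>>> sumAndCount =
                readings.CombineByKey<string, double, Tuple<double, int>>(
                    () => new Tuple<double, int>(0.0, 0),                                    // createCombiner
                    (acc, v) => new Tuple<double, int>(acc.Item1 + v, acc.Item2 + 1),        // mergeValue
                    (a, b) => new Tuple<double, int>(a.Item1 + b.Item1, a.Item2 + b.Item2),  // mergeCombiners
                    numPartitions: 4);

            // The average itself can be derived with MapValues (shown later in these snippets).
            DStream<KeyValuePair<string, double>> averages = sumAndCount.MapValues(sc => sc.Item1 / sc.Item2);
        }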
        internal void Init <T>(DStream <T> prev, Func <double, RDD <dynamic>, RDD <dynamic> > f)
        {
            streamingContext = prev.streamingContext;
            serializedMode   = SerializedMode.Byte;
            isCached         = false;
            isCheckpointed   = false;
            dstreamProxy     = null;

            if (prev is TransformedDStream <T> && !prev.isCached && !prev.isCheckpointed)
            {
                prevFunc           = (prev as TransformedDStream <T>).func;
                func               = new NewFuncWrapper(f, prevFunc).Execute;
                prevDStreamProxy   = prev.prevDStreamProxy;
                prevSerializedMode = prev.prevSerializedMode;
            }
            else
            {
                prevDStreamProxy   = prev.dstreamProxy;
                prevSerializedMode = prev.serializedMode;
                func = f;
            }
        }
        /// <summary>
        /// Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
        ///
        /// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
        /// in each batch duration and processes them without storing.
        ///
        /// This does not use Zookeeper to store offsets. The consumed offsets are tracked
        /// by the stream itself. For interoperability with Kafka monitoring tools that depend on
        /// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
        /// You can access the offsets used in each batch from the generated RDDs (see
        /// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
        /// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
        /// The information on consumed offset can be recovered from the checkpoint.
        /// See the programming guide for details (constraints, etc.).
        ///
        /// </summary>
        /// <param name="ssc">Spark Streaming Context</param>
        /// <param name="topics">list of topic_name to consume.</param>
        /// <param name="kafkaParams">
        ///     Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
        ///     with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
        /// </param>
        /// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
        /// <param name="readFunc">user function to process the kafka data.</param>
        /// <returns>A DStream object</returns>
        public static DStream <T> CreateDirectStream <T>(StreamingContext ssc, List <string> topics, Dictionary <string, string> kafkaParams, Dictionary <string, long> fromOffsets, Func <int, IEnumerable <KeyValuePair <byte[], byte[]> >, IEnumerable <T> > readFunc)
        {
            int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);

            if (ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) <= 0)
            {
                var dstream = new DStream <KeyValuePair <byte[], byte[]> >(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
                return(dstream.MapPartitionsWithIndex(readFunc, true));
            }

            var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper <KeyValuePair <byte[], byte[]>, T>(readFunc, true);
            var transformHelper        = new TransformHelper <KeyValuePair <byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
            var transformDynamicHelper = new TransformDynamicHelper <KeyValuePair <byte[], byte[]>, T>(transformHelper.Execute);
            Func <double, RDD <dynamic>, RDD <dynamic> > func = transformDynamicHelper.Execute;
            var formatter = new BinaryFormatter();
            var stream    = new MemoryStream();

            formatter.Serialize(stream, func);
            byte[] readFuncBytes     = stream.ToArray();
            string serializationMode = SerializedMode.Pair.ToString();

            return(new DStream <T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc));
        }
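        // Usage sketch (illustrative, not from the original source): direct Kafka stream whose raw
        // (key, value) byte pairs are decoded into UTF-8 string values. The enclosing static class
        // name is not shown in these snippets, so KafkaUtils is assumed here; topic, broker list and
        // offsets are illustrative values. System.Text and System.Linq are assumed to be imported.
        internal static DStream<string> CreateDirectStreamExample(StreamingContext ssc)
        {
            var topics      = new List<string> { "events" };
            var kafkaParams = new Dictionary<string, string>
            {
                { "metadata.broker.list", "broker1:9092,broker2:9092" }
            };
            // Left empty here; explicit per-topic/partition starting offsets could be supplied instead.
            var fromOffsets = new Dictionary<string, long>();

            return KafkaUtils.CreateDirectStream<string>(
                ssc, topics, kafkaParams, fromOffsets,
                (pid, messages) => messages.Select(kv => Encoding.UTF8.GetString(kv.Value)));
        }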
        /// <summary>
        /// Return a new DStream in which each RDD is generated by applying a function
        /// on each RDD of this DStream and 'other' DStream.
        ///
        /// `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
        /// arguments of (`time`, `rdd_a`, `rdd_b`)
        /// </summary>
        /// <typeparam name="U"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <param name="f"></param>
        /// <param name="other"></param>
        /// <param name="keepSerializer"></param>
        /// <returns></returns>
        public DStream <V> TransformWith <U, V>(Func <double, RDD <T>, RDD <U>, RDD <V> > f, DStream <U> other, bool keepSerializer = false)
        {
            Func <double, RDD <dynamic>, RDD <dynamic> > prevF  = Piplinable ? (this as TransformedDStream <T>).func : null;
            Func <double, RDD <dynamic>, RDD <dynamic> > otherF = other.Piplinable ? (other as TransformedDStream <U>).func : null;

            Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > func = new TransformWithDynamicHelper <T, U, V>(f, prevF, otherF).Execute;

            var formatter = new BinaryFormatter();
            var stream    = new MemoryStream();

            formatter.Serialize(stream, func);

            return(new DStream <V>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpTransformed2DStream(
                                       Piplinable ? prevDStreamProxy : DStreamProxy,
                                       other.Piplinable ? other.prevDStreamProxy : other.DStreamProxy,
                                       stream.ToArray(),
                                       (Piplinable ? prevSerializedMode : serializedMode).ToString(),
                                       (other.Piplinable ? other.prevSerializedMode : other.serializedMode).ToString()),
                                   streamingContext,
                                   keepSerializer ? serializedMode : SerializedMode.Byte));
        }
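        // Usage sketch (illustrative, not from the original source): combine two streams per batch
        // with a custom function that also receives the batch time. Assumes `left` and `right` are
        // DStream<KeyValuePair<string, int>> instances built elsewhere; RDD<T>.Union is grounded by
        // its use in Union() above.
        internal static void TransformWithExample(
            DStream<KeyValuePair<string, int>> left, DStream<KeyValuePair<string, int>> right)
        {
            var merged = left.TransformWith<KeyValuePair<string, int>, KeyValuePair<string, int>>(
                (time, rdd1, rdd2) => rdd1.Union(rdd2),   // `time` is the batch time; unused in this sketch
                right);
        }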
        /// <summary>
        /// Return a new DStream by applying 'cogroup' between RDDs of this DStream and `other` DStream.
        /// Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <typeparam name="W"></typeparam>
        /// <param name="self"></param>
        /// <param name="other"></param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <Tuple <K, Tuple <List <V>, List <W> > > > GroupWith <K, V, W>(this DStream <Tuple <K, V> > self, DStream <Tuple <K, W> > other, int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            return(self.TransformWith <Tuple <K, W>, Tuple <K, Tuple <List <V>, List <W> > > >(new GroupWithHelper <K, V, W>(numPartitions).Execute, other));
        }
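        // Usage sketch (illustrative, not from the original source): cogroup two keyed streams so
        // that each key maps to the list of values from each side. Assumes `orders` and `payments`
        // are DStream<Tuple<string, int>> instances built elsewhere.
        internal static void GroupWithExample(
            DStream<Tuple<string, int>> orders, DStream<Tuple<string, int>> payments)
        {
            // For every key: Item1 holds this batch's order amounts, Item2 the payment amounts.
            DStream<Tuple<string, Tuple<List<int>, List<int>>>> cogrouped =
                orders.GroupWith(payments, numPartitions: 4);
        }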
 /// <summary>
 /// Return a new "state" DStream where the state for each key is updated by applying
 /// the given function on the previous state of the key and the new values of the key.
 /// </summary>
 /// <typeparam name="K"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <typeparam name="S"></typeparam>
 /// <param name="self"></param>
 /// <param name="updateFunc"></param>
 /// <param name="numPartitions"></param>
 /// <returns></returns>
 public static DStream <KeyValuePair <K, S> > UpdateStateByKey <K, V, S>(this DStream <KeyValuePair <K, V> > self,
                                                                         Func <IEnumerable <V>, S, S> updateFunc,
                                                                         int numPartitions = 0)
 {
     return(UpdateStateByKey <K, V, S>(self, new UpdateStateByKeyHelper <K, V, S>(updateFunc).Execute, numPartitions));
 }
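 // Usage sketch (illustrative, not from the original source): per-key running count using the
 // simple (newValues, oldState) => newState overload. Assumes `pairs` is a
 // DStream<KeyValuePair<string, int>> built elsewhere; System.Linq is assumed to be imported.
 internal static void UpdateStateByKeyExample(DStream<KeyValuePair<string, int>> pairs)
 {
     DStream<KeyValuePair<string, int>> runningCounts =
         pairs.UpdateStateByKey<string, int, int>((newValues, oldCount) => oldCount + newValues.Sum());
 }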
 /// <summary>
 /// Return a new DStream by applying ReduceByKey to each RDD.
 /// </summary>
 /// <typeparam name="K"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <param name="self"></param>
 /// <param name="reduceFunc"></param>
 /// <param name="numPartitions"></param>
 /// <returns></returns>
 public static DStream <KeyValuePair <K, V> > ReduceByKey <K, V>(this DStream <KeyValuePair <K, V> > self, Func <V, V, V> reduceFunc, int numPartitions = 0)
 {
     return(self.CombineByKey(() => default(V), reduceFunc, reduceFunc, numPartitions));
 }
        /// <summary>
        /// Return a new DStream by applying 'full outer join' between RDDs of this DStream and `other` DStream.
        /// Hash partitioning is used to generate the RDDs with `numPartitions` partitions.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <typeparam name="W"></typeparam>
        /// <param name="self"></param>
        /// <param name="other"></param>
        /// <param name="numPartitions"></param>
        /// <returns></returns>
        public static DStream <Tuple <K, Tuple <Option <V>, Option <W> > > > FullOuterJoin <K, V, W>(this DStream <Tuple <K, V> > self, DStream <Tuple <K, W> > other, int numPartitions = 0)
        {
            if (numPartitions <= 0)
            {
                numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
            }

            return(self.TransformWith <Tuple <K, W>, Tuple <K, Tuple <Option <V>, Option <W> > > >(new FullOuterJoinHelper <K, V, W>(numPartitions).Execute, other));
        }
        /// <summary>
        /// Return a new DStream by applying `GroupByKey` over a sliding window.
        /// Similar to `DStream.GroupByKey()`, but applies it over a sliding window.
        /// </summary>
        /// <typeparam name="K"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <param name="self"></param>
        /// <param name="windowSeconds">width of the window; must be a multiple of this DStream's batching interval</param>
        /// <param name="slideSeconds">
        ///     sliding interval of the window (i.e., the interval after which
        ///     the new DStream will generate RDDs); must be a multiple of this
        ///     DStream's batching interval
        /// </param>
        /// <param name="numPartitions">Number of partitions of each RDD in the new DStream.</param>
        /// <returns></returns>
        public static DStream <KeyValuePair <K, IEnumerable <V> > > GroupByKeyAndWindow <K, V>(this DStream <KeyValuePair <K, V> > self,
                                                                                               int windowSeconds, int slideSeconds, int numPartitions = 0)
        {
            var ls = self.MapValues(x => new List <V> { x });

            var grouped = ls.ReduceByKeyAndWindow(
                (a, b) => { a.AddRange(b); return(a); },
                (a, b) => { a.RemoveRange(0, b.Count); return(a); },
                windowSeconds, slideSeconds, numPartitions);

            return(grouped.MapValues(x => x.AsEnumerable()));
        }
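        // Usage sketch (illustrative, not from the original source): collect all values per key over
        // a 60-second window sliding every 20 seconds. Assumes `clicks` is a
        // DStream<KeyValuePair<string, int>> built elsewhere.
        internal static void GroupByKeyAndWindowExample(DStream<KeyValuePair<string, int>> clicks)
        {
            DStream<KeyValuePair<string, IEnumerable<int>>> grouped =
                clicks.GroupByKeyAndWindow(windowSeconds: 60, slideSeconds: 20, numPartitions: 4);
        }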
 /// <summary>
 /// Return a new "state" DStream where the state for each key is updated by applying
 /// the given function on the previous state of the key and the new values of the key.
 /// </summary>
 /// <typeparam name="K"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <typeparam name="S"></typeparam>
 /// <param name="self"></param>
 /// <param name="updateFunc">State update function - IEnumerable[K, [newValues, oldState]] => IEnumerable[K, newState]</param>
 /// <param name="initialState">Initial state value of each key</param>
 /// <param name="numPartitions"></param>
 /// <returns></returns>
 public static DStream <Tuple <K, S> > UpdateStateByKey <K, V, S>(this DStream <Tuple <K, V> > self,
                                                                  Func <IEnumerable <Tuple <K, Tuple <IEnumerable <V>, S> > >, IEnumerable <Tuple <K, S> > > updateFunc, RDD <Tuple <K, S> > initialState = null,
                                                                  int numPartitions = 0)
 {
     return(UpdateStateByKey <K, V, S>(self, new MapPartitionsHelper <Tuple <K, Tuple <IEnumerable <V>, S> >, Tuple <K, S> >(updateFunc).Execute, initialState, numPartitions));
 }
 /// <summary>
 /// Return a new DStream by applying groupByKey on each RDD.
 /// </summary>
 /// <typeparam name="K"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <param name="self"></param>
 /// <param name="numPartitions"></param>
 /// <returns></returns>
 public static DStream <KeyValuePair <K, List <V> > > GroupByKey <K, V>(this DStream <KeyValuePair <K, V> > self, int numPartitions = 0)
 {
     return(self.Transform <KeyValuePair <K, List <V> > >(new GroupByKeyHelper <K, V>(numPartitions).Execute));
 }
 /// <summary>
 /// Return a new DStream by applying a flatmap function to the value
 /// of each key-value pairs in this DStream without changing the key.
 /// </summary>
 /// <typeparam name="K"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <typeparam name="U"></typeparam>
 /// <param name="self"></param>
 /// <param name="func"></param>
 /// <returns></returns>
 public static DStream <KeyValuePair <K, U> > FlatMapValues <K, V, U>(this DStream <KeyValuePair <K, V> > self, Func <V, IEnumerable <U> > func)
 {
     return(self.FlatMap(new FlatMapValuesHelper <K, V, U>(func).Execute, true));
 }
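 // Usage sketch (illustrative, not from the original source): expand each value into several
 // records while keeping the key. Assumes `sentences` is a DStream<KeyValuePair<string, string>>
 // of (source, sentence) pairs built elsewhere.
 internal static void FlatMapValuesExample(DStream<KeyValuePair<string, string>> sentences)
 {
     // One (source, word) pair per word; the keys are untouched.
     DStream<KeyValuePair<string, string>> words = sentences.FlatMapValues(s => s.Split(' '));
 }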
 /// <summary>
 /// Return a new DStream in which each RDD is generated by applying a function
 /// on each RDD of this DStream and 'other' DStream.
 ///
 /// `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
 /// arguments of (`time`, `rdd_a`, `rdd_b`)
 /// </summary>
 /// <typeparam name="U"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <param name="f"></param>
 /// <param name="other"></param>
 /// <param name="keepSerializer"></param>
 /// <returns></returns>
 public DStream <V> TransformWith <U, V>(Func <RDD <T>, RDD <U>, RDD <V> > f, DStream <U> other, bool keepSerializer = false)
 {
     return(TransformWith <U, V>(new TransformWithHelper <T, U, V>(f).Execute, other, keepSerializer));
 }
        /// <summary>
        /// Return a new DStream in which each RDD is generated by applying a function
        /// on each RDD of this DStream and 'other' DStream.
        ///
        /// `func` can have two arguments of (`rdd_a`, `rdd_b`) or have three
        /// arguments of (`time`, `rdd_a`, `rdd_b`)
        /// </summary>
        /// <typeparam name="U"></typeparam>
        /// <typeparam name="V"></typeparam>
        /// <param name="f"></param>
        /// <param name="other"></param>
        /// <param name="keepSerializer"></param>
        /// <returns></returns>
        public DStream <V> TransformWith <U, V>(Func <double, RDD <T>, RDD <U>, RDD <V> > f, DStream <U> other, bool keepSerializer = false)
        {
            Func <double, RDD <dynamic>, RDD <dynamic>, RDD <dynamic> > func = new TransformWithDynamicHelper <T, U, V>(f).Execute;

            var formatter = new BinaryFormatter();
            var stream    = new MemoryStream();

            formatter.Serialize(stream, func);

            return(new DStream <V>(SparkCLREnvironment.SparkCLRProxy.CreateCSharpTransformed2DStream(
                                       DStreamProxy,
                                       other.DStreamProxy,
                                       stream.ToArray(),
                                       serializedMode.ToString(),
                                       other.serializedMode.ToString()),
                                   streamingContext,
                                   keepSerializer ? serializedMode : SerializedMode.Byte));
        }
 /// <summary>
 /// Return a new DStream by applying a map function to the value of
 /// each key-value pairs in this DStream without changing the key.
 /// </summary>
 /// <typeparam name="K"></typeparam>
 /// <typeparam name="V"></typeparam>
 /// <typeparam name="U"></typeparam>
 /// <param name="self"></param>
 /// <param name="func"></param>
 /// <returns></returns>
 public static DStream <KeyValuePair <K, U> > MapValues <K, V, U>(this DStream <KeyValuePair <K, V> > self, Func <V, U> func)
 {
     return(self.Map(new MapValuesHelper <K, V, U>(func).Execute, true));
 }
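 // Usage sketch (illustrative, not from the original source): transform each value while keeping
 // the key. Assumes `readings` is a DStream<KeyValuePair<string, double>> of (sensorId, celsius)
 // pairs built elsewhere.
 internal static void MapValuesExample(DStream<KeyValuePair<string, double>> readings)
 {
     // Convert each reading to Fahrenheit; keys are untouched.
     DStream<KeyValuePair<string, double>> fahrenheit = readings.MapValues(c => c * 9.0 / 5.0 + 32.0);
 }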
 internal MapWithStateDStream(DStream <M> mappedDataDStream, DStream <KeyValuePair <K, S> > snapshotsDStream)
     : base(mappedDataDStream.DStreamProxy, mappedDataDStream.streamingContext)
 {
     this.snapshotsDStream = snapshotsDStream;
 }