/// <summary>
        /// Return a new DStream by applying ReduceByKey to each RDD.
        /// </summary>
        /// <remarks>
        /// The result is built from three chained steps: a <c>GroupByMergeHelper</c> created from
        /// <paramref name="reduceFunc"/> is run over every partition, the output is passed to
        /// <c>PartitionBy</c> together with <paramref name="numPartitions"/>, and a second
        /// <c>GroupByMergeHelper</c> is then run over the repartitioned stream.
        /// </remarks>
        /// <typeparam name="K">Type of the first tuple item, used as the key.</typeparam>
        /// <typeparam name="V">Type of the second tuple item; also the operand and result type of <paramref name="reduceFunc"/>.</typeparam>
        /// <param name="self">The DStream of key/value tuples to reduce.</param>
        /// <param name="reduceFunc">Function that combines two values of type <typeparamref name="V"/> into one.</param>
        /// <param name="numPartitions">
        /// Partition count forwarded unchanged to <c>PartitionBy</c>; defaults to 0.
        /// How a non-positive value is treated is decided inside <c>PartitionBy</c>
        /// (presumably "use the default parallelism" - confirm there).
        /// </param>
        /// <returns>The DStream produced by the second merge step.</returns>
        public static DStream <Tuple <K, V> > ReduceByKey <K, V>(this DStream <Tuple <K, V> > self, Func <V, V, V> reduceFunc, int numPartitions = 0)
        {
            // Each merge pass gets its own helper instance: one before the
            // repartitioning step and one after it.
            return self
                   .MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true)
                   .PartitionBy(numPartitions)
                   .MapPartitionsWithIndex(new GroupByMergeHelper<K, V>(reduceFunc).Execute, true);
        }
Example #2
0
        /// <summary>
        /// Create an input stream that directly pulls messages from a Kafka Broker, starting at specific offsets.
        ///
        /// This is not a receiver based Kafka input stream; it directly pulls the messages from Kafka
        /// in each batch duration and processes them without storing.
        ///
        /// This does not use Zookeeper to store offsets. The consumed offsets are tracked
        /// by the stream itself. For interoperability with Kafka monitoring tools that depend on
        /// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
        /// You can access the offsets used in each batch from the generated RDDs (see
        /// <c>org.apache.spark.streaming.kafka.HasOffsetRanges</c>).
        /// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
        /// The information on consumed offsets can be recovered from the checkpoint.
        /// See the programming guide for details (constraints, etc.).
        ///
        /// </summary>
        /// <remarks>
        /// The configuration value "spark.mobius.streaming.kafka.numReceivers" (read with a default of 0)
        /// selects how <paramref name="readFunc"/> is applied:
        /// when it is not greater than 0, the underlying stream is created without a serialized function and
        /// <paramref name="readFunc"/> is applied afterwards through <c>MapPartitionsWithIndex</c>;
        /// otherwise <paramref name="readFunc"/> is wrapped in helper objects, serialized to a byte array
        /// and handed to <c>DirectKafkaStreamWithRepartition</c> together with the serialization mode name.
        /// </remarks>
        /// <typeparam name="T">Element type of the sequence produced by <paramref name="readFunc"/> and of the returned DStream.</typeparam>
        /// <param name="ssc">Spark Streaming Context</param>
        /// <param name="topics">List of topic names to consume.</param>
        /// <param name="kafkaParams">
        ///     Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
        ///     with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
        /// </param>
        /// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
        /// <param name="readFunc">
        ///     User function to process the Kafka data. It receives an int (presumably the partition index,
        ///     since it is applied via <c>MapPartitionsWithIndex</c>) and the key/value byte-array pairs,
        ///     and returns the processed elements.
        /// </param>
        /// <returns>A DStream object</returns>
        public static DStream <T> CreateDirectStream <T>(StreamingContext ssc, List <string> topics, Dictionary <string, string> kafkaParams, Dictionary <string, long> fromOffsets, Func <int, IEnumerable <KeyValuePair <byte[], byte[]> >, IEnumerable <T> > readFunc)
        {
            int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);

            if (ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) <= 0)
            {
                // No serialized function is passed to the proxy on this path (both trailing
                // arguments are null); readFunc is applied to the raw key/value pairs here instead.
                var dstream = new DStream <KeyValuePair <byte[], byte[]> >(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
                return dstream.MapPartitionsWithIndex(readFunc, true);
            }

            // Wrap readFunc layer by layer until it has the (double, RDD<dynamic>) -> RDD<dynamic>
            // shape that is serialized and passed to the proxy below.
            var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper <KeyValuePair <byte[], byte[]>, T>(readFunc, true);
            var transformHelper        = new TransformHelper <KeyValuePair <byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
            var transformDynamicHelper = new TransformDynamicHelper <KeyValuePair <byte[], byte[]>, T>(transformHelper.Execute);
            Func <double, RDD <dynamic>, RDD <dynamic> > func = transformDynamicHelper.Execute;

            // NOTE(review): BinaryFormatter is obsolete and removed from recent .NET versions;
            // it is kept here because the serialized bytes are consumed elsewhere in the project.
            byte[] readFuncBytes;
            using (var stream = new MemoryStream())
            {
                // The stream is only a scratch buffer; dispose it as soon as the bytes are copied out.
                new BinaryFormatter().Serialize(stream, func);
                readFuncBytes = stream.ToArray();
            }

            string serializationMode = SerializedMode.Pair.ToString();

            return new DStream <T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc);
        }