/// <summary>
/// Create an input stream that directly pulls messages from a Kafka broker, starting at specific offsets.
///
/// This is not a receiver-based Kafka input stream; it pulls messages from Kafka
/// in each batch duration and processes them without storing them.
///
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
/// You can access the offsets used in each batch from the generated RDDs (see
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
/// The information on consumed offsets can be recovered from the checkpoint.
/// See the programming guide for details (constraints, etc.).
/// </summary>
/// <param name="ssc">Spark Streaming Context</param>
/// <param name="topics">List of topic names to consume.</param>
/// <param name="kafkaParams">
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
/// with Kafka broker(s) (NOT Zookeeper servers), specified in host1:port1,host2:port2 form.
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="numPartitions">
/// User hint on how many Kafka RDD partitions to create instead of aligning with the Kafka partitions;
/// unbalanced Kafka partitions and/or under-distributed data will be redistributed evenly across
/// a possibly larger number of RDD partitions.
/// If numPartitions = -1, repartition based on spark.streaming.kafka.maxRatePerTask, or do nothing if that config is not defined.
/// If numPartitions = 0, repartition using the original Kafka partition count.
/// If numPartitions > 0, repartition using this parameter.
/// </param>
/// <param name="readFunc">User function to process the Kafka data.</param>
/// <returns>A DStream object</returns>
public static DStream<T> CreateDirectStreamWithRepartitionAndReadFunc<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams,
    Dictionary<string, long> fromOffsets, int numPartitions, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
{
    // Wrap the user read function so it is applied per partition (with partition index),
    // then lift it into the RDD<dynamic> -> RDD<dynamic> transform shape expected by the proxy layer.
    var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
    var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
    var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
    Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;

    // Serialize the composed transform so it can be handed to the streaming context proxy.
    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);
    byte[] readFuncBytes = stream.ToArray();

    string serializationMode = SerializedMode.Pair.ToString();
    return new DStream<T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc);
}
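// Usage sketch (illustrative only, not part of this API): shows one way a caller might invoke
// CreateDirectStreamWithRepartitionAndReadFunc with an explicit partition hint. The streaming
// context "ssc", the topic name, the broker list, and the UTF-8 decoding read function are
// assumptions made for this example; an empty fromOffsets dictionary is passed so no explicit
// starting offsets are supplied. Assumes using System.Linq and System.Text.
//
//   var topics = new List<string> { "events" };                                  // hypothetical topic
//   var kafkaParams = new Dictionary<string, string>
//   {
//       { "metadata.broker.list", "broker1:9092,broker2:9092" }                  // placeholder brokers, not Zookeeper
//   };
//   var fromOffsets = new Dictionary<string, long>();                            // no explicit starting offsets
//   DStream<string> lines = CreateDirectStreamWithRepartitionAndReadFunc<string>(
//       ssc, topics, kafkaParams, fromOffsets,
//       numPartitions: 32,                                                       // > 0 => repartition into 32 RDD partitions
//       readFunc: (partitionIndex, messages) =>
//           messages.Select(kv => Encoding.UTF8.GetString(kv.Value)));           // decode message values as UTF-8 strings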
/// <summary>
/// Create an input stream that directly pulls messages from a Kafka broker, starting at specific offsets.
///
/// This is not a receiver-based Kafka input stream; it pulls messages from Kafka
/// in each batch duration and processes them without storing them.
///
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
/// You can access the offsets used in each batch from the generated RDDs (see
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
/// The information on consumed offsets can be recovered from the checkpoint.
/// See the programming guide for details (constraints, etc.).
/// </summary>
/// <param name="ssc">Spark Streaming Context</param>
/// <param name="topics">List of topic names to consume.</param>
/// <param name="kafkaParams">
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
/// with Kafka broker(s) (NOT Zookeeper servers), specified in host1:port1,host2:port2 form.
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="readFunc">User function to process the Kafka data.</param>
/// <returns>A DStream object</returns>
public static DStream<T> CreateDirectStream<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams,
    Dictionary<string, long> fromOffsets, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
{
    int numPartitions = GetNumPartitionsFromConfig(ssc, topics, kafkaParams);
    if (ssc.SparkContext.SparkConf.SparkConfProxy.GetInt("spark.mobius.streaming.kafka.numReceivers", 0) <= 0)
    {
        // No C# Kafka receivers configured: create the direct stream via the proxy and
        // apply the user read function per partition on the C# side.
        var dstream = new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
        return dstream.MapPartitionsWithIndex(readFunc, true);
    }

    // Otherwise, wrap and serialize the read function so it is passed down to the proxy
    // together with the direct stream definition.
    var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
    var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
    var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
    Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;

    var formatter = new BinaryFormatter();
    var stream = new MemoryStream();
    formatter.Serialize(stream, func);
    byte[] readFuncBytes = stream.ToArray();

    string serializationMode = SerializedMode.Pair.ToString();
    return new DStream<T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc);
}
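// Usage sketch (illustrative only, not part of this API): unlike the overload above, this
// variant derives the partition count from configuration via GetNumPartitionsFromConfig and
// branches on spark.mobius.streaming.kafka.numReceivers. The streaming context "ssc", topic
// name, broker list, and UTF-8 decoding read function are assumptions made for this example.
//
//   var topics = new List<string> { "events" };                                  // hypothetical topic
//   var kafkaParams = new Dictionary<string, string>
//   {
//       { "bootstrap.servers", "broker1:9092,broker2:9092" }                     // placeholder brokers
//   };
//   var fromOffsets = new Dictionary<string, long>();                            // no explicit starting offsets
//   DStream<string> lines = CreateDirectStream<string>(
//       ssc, topics, kafkaParams, fromOffsets,
//       (partitionIndex, messages) =>
//           messages.Select(kv => Encoding.UTF8.GetString(kv.Value)));           // decode message values as UTF-8 strings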