Example #1
        internal static void DStreamDirectKafkaWithRepartitionSample()
        {
            count = 0;

            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
            {
                var conf = new SparkConf();
                SparkContext sc = new SparkContext(conf);
                StreamingContext context = new StreamingContext(sc, 2000L);
                context.Checkpoint(checkpointPath);

                var kafkaParams = new List<Tuple<string, string>>
                {
                    new Tuple<string, string>("metadata.broker.list", brokers),
                    new Tuple<string, string>("auto.offset.reset", "smallest")
                };

                conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
                var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic },
                    kafkaParams, Enumerable.Empty<Tuple<string, long>>());

                dstream.ForeachRDD((time, rdd) =>
                {
                    long batchCount = rdd.Count();
                    int numPartitions = rdd.GetNumPartitions();

                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Count: " + batchCount);
                    Console.WriteLine("Partitions: " + numPartitions);

                    // Only the first batch has data and is repartitioned into 10 partitions.
                    if (count++ == 0)
                    {
                        Assert.AreEqual(messages, batchCount);
                        Assert.IsTrue(numPartitions >= partitions);
                    }
                    else
                    {
                        Assert.AreEqual(0, batchCount);
                        Assert.IsTrue(numPartitions == 0);
                    }
                });

                return context;
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
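The sample above references fields that are defined elsewhere in the Mobius samples class (count, brokers, topic, partitions, messages). A minimal sketch of plausible declarations, with illustrative values only:

        // Hypothetical field declarations assumed by the sample above; the
        // real values come from the Mobius sample configuration, not these.
        private static int count;                          // batches seen so far
        private static string brokers = "localhost:9092";  // illustrative broker list
        private static string topic = "test";              // illustrative topic name
        private static int partitions = 10;                // target partition count
        private static long messages = 100;                // messages produced into the first batch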
Example #2
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);

            Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

            ssc.Start();
            ssc.Remember(1);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
                new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
                new Dictionary<string, string>(), new Dictionary<string, long>());
            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStreamWithRepartition(ssc,
                new List<string> { "testTopic3" }, new Dictionary<string, string>(),
                new Dictionary<string, long>(), 10);
            Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

            var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStreamWithRepartitionAndReadFunc(
                ssc,
                new List<string> { "testTopic3" },
                new Dictionary<string, string>(), new Dictionary<string, long>(),
                10,
                (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => input);
            Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
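The last stream in this test supplies a per-partition read function that sees the raw Kafka key/value byte pairs before any DStream transformation runs. A minimal sketch of a read function that does something beyond passing its input through (the empty-payload filter is an illustrative assumption, not part of the test):

        // Hypothetical per-partition read function: drops messages with empty
        // payloads before they reach downstream transformations.
        private static IEnumerable<KeyValuePair<byte[], byte[]>> SkipEmptyMessages(
            int partitionId, IEnumerable<KeyValuePair<byte[], byte[]>> messages)
        {
            foreach (var message in messages)
            {
                if (message.Value != null && message.Value.Length > 0)
                {
                    yield return message;
                }
            }
        }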
Example #3
        static void Main(string[] args)
        {
            var sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLRKafka Example"));
            const string topicName = "<topicName>";
            var topicList = new List<string> { topicName };
            // refer to http://kafka.apache.org/documentation.html#configuration
            var kafkaParams = new Dictionary<string, string>
            {
                { "metadata.broker.list", "<kafka brokers list>" },
                { "auto.offset.reset", "smallest" }
            };
            var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
            const int windowDurationInSecs = 5;
            const int slideDurationInSecs = 5;
            const string checkpointPath = "<hdfs path to spark checkpoint directory>";
            const string appOutputPath = "<hdfs path to app output directory>";

            const long slideDurationInMillis = 5000;
            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
                ssc.Checkpoint(checkpointPath);

                var stream = KafkaUtils.CreateDirectStream(ssc, topicList,
                    kafkaParams.Select(v => new Tuple<string, string>(v.Key, v.Value)),
                    perTopicPartitionKafkaOffsets.Select(v => new Tuple<string, long>(v.Key, v.Value)));
                var countByLogLevelAndTime = stream
                    .Map(tuple => Encoding.UTF8.GetString(tuple.Item2))
                    .Filter(line => line.Contains(","))
                    .Map(line => line.Split(','))
                    .Map(columns => new Tuple<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                    .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                    .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));

                countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
                {
                    countByLogLevel.SaveAsTextFile(string.Format("{0}/{1}", appOutputPath, Guid.NewGuid()));
                    foreach (var logCount in countByLogLevel.Collect())
                    {
                        Console.WriteLine(logCount);
                    }
                });

                return ssc;
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
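ReduceByKeyAndWindow here is given both a reduce function ((x, y) => x + y) and its inverse ((x, y) => x - y), which lets Spark update each window incrementally instead of re-reducing every batch in the window. A toy illustration of that arithmetic, outside of Spark:

        // Incremental windowing over plain integers, assuming a 3-batch window:
        // rather than re-summing the whole window on each slide, the previous
        // total is adjusted by the entering and the leaving batch.
        int[] batchCounts = { 4, 7, 2, 9 };
        int windowTotal = batchCounts[0] + batchCounts[1] + batchCounts[2]; // 13
        // Window slides by one batch: add batchCounts[3], subtract batchCounts[0].
        windowTotal = windowTotal + batchCounts[3] - batchCounts[0];        // 18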
Example #4
        public static void Process(string appName, string checkpointPath, Dictionary<string, string> kafkaParams)
        {
            var sparkContext = new SparkContext(new SparkConf().SetAppName(appName));
            var topicList = new List<string> { kafkaParams["topic"] };
            var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
            const long slideDurationInMillis = 1000;

            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDurationInMillis);

                var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);

                stream
                    .Map(kvp => kvp.Value != null ? Encoding.UTF8.GetString(kvp.Value) : null)
                    .ForeachRDD(rdd =>
                    {
                        foreach (string line in rdd.Collect())
                        {
                            // Skip messages with null payloads; JObject.Parse would throw on them.
                            if (string.IsNullOrEmpty(line))
                            {
                                continue;
                            }

                            var message = JObject.Parse(line);
                            var _id = message.SelectToken("docid").ToString();
                            // =======================
                            //  TODO: Process message
                            // =======================
                        }
                    });
                ssc.Checkpoint(checkpointPath);

                return ssc;
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
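A sketch of what the TODO block might become, assuming messages shaped like {"docid": "...", "body": "..."} (the body field and the handler itself are hypothetical, not part of the original example):

        // Hypothetical message handling for the TODO above; replace with
        // real processing for the actual message schema.
        private static void ProcessMessage(JObject message)
        {
            var id = message.SelectToken("docid")?.ToString();
            var body = message.SelectToken("body")?.ToString();
            Console.WriteLine("doc {0}: {1} characters", id, body?.Length ?? 0);
        }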
Example #5
        public override void Run(Lazy<SparkContext> sparkContext, int currentTimes, int totalTimes)
        {
            DeleteCheckPointDirectory(currentTimes);

            var options = Options as UnionTopicTestOptions;

            var streamingContext = StreamingContext.GetOrCreate(options.CheckPointDirectory, () =>
            {
                var ssc = new StreamingContext(sparkContext.Value, options.BatchSeconds * 1000L);
                ssc.Checkpoint(options.CheckPointDirectory);

                var stream1 = KafkaUtils.CreateDirectStream(ssc, new List<string> { options.Topic1 }, kafkaParams, offsetsRange)
                    .Map(line => new RowIdCountTime().Deserialize(line.Value));
                var stream2 = KafkaUtils.CreateDirectStream(ssc, new List<string> { options.Topic2 }, kafkaParams, offsetsRange)
                    .Map(line => new RowIdCountTime().Deserialize(line.Value));
                var stream = stream1.Union(stream2);

                if (options.RePartition > 0)
                {
                    stream = stream.Repartition(options.RePartition);
                }

                stream.ForeachRDD(rdd =>
                {
                    rdd.Foreach(idCount =>
                    {
                        Console.WriteLine($"{NowMilli} {this.GetType().Name} : {idCount}");
                    });
                });

                SaveStreamToFile(stream.Map(it => it.ToString()));
                return ssc;
            });

            streamingContext.Start();

            WaitTerminationOrTimeout(streamingContext);
        }
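This example depends on members of its test base class that are not shown (kafkaParams, offsetsRange, NowMilli, SaveStreamToFile, WaitTerminationOrTimeout). A sketch of plausible shapes for the first two, purely an assumption to make the snippet readable (the example's use of line.Value suggests the Dictionary/KeyValuePair-based CreateDirectStream overload):

        // Hypothetical base-class members assumed by the example above;
        // values are illustrative only.
        protected Dictionary<string, string> kafkaParams = new Dictionary<string, string>
        {
            { "metadata.broker.list", "localhost:9092" },
            { "auto.offset.reset", "smallest" }
        };
        protected Dictionary<string, long> offsetsRange = new Dictionary<string, long>(); // start from defaults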
Example #6
        public override void Run(Lazy<SparkContext> sparkContext, int currentTimes, int totalTimes)
        {
            DeleteCheckPointDirectory(currentTimes);

            var options = Options as WindowSlideTestOptions;
            var allBeginTime = DateTime.Now;

            var topicList = new List<string>(options.Topics.Split(";,".ToArray()));

            ParseKafkaParameters();

            for (var k = 0; options.TestTimes <= 0 || k < options.TestTimes; k++)
            {
                var beginTime = DateTime.Now;
                //Logger.LogInfo("begin test[{0}]-{1} , sparkContext = {2}", k + 1, options.TestTimes > 0 ? options.TestTimes.ToString() : "infinite", sparkContext.Value);
                var streamingContext = StreamingContext.GetOrCreate(options.CheckPointDirectory, () =>
                {
                    var ssc = new StreamingContext(sparkContext.Value, options.BatchSeconds * 1000L);
                    ssc.Checkpoint(options.CheckPointDirectory);

                    var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, offsetsRange)
                        .Map(line => Encoding.UTF8.GetString(line.Value));

                    var pairs = stream.Map(new ParseKeyValueArray(options.ElementCount, options.ShowReceivedLines).Parse);

                    var reducedStream = pairs.ReduceByKeyAndWindow(
                        new ReduceHelper(options.CheckArrayAtFirst).Sum,
                        new ReduceHelper(options.CheckArrayAtFirst).InverseSum,
                        options.WindowSeconds,
                        options.SlideSeconds);

                    reducedStream.ForeachRDD(new SumCountStatic().ForeachRDD<int[]>);
                    SaveStreamToFile(reducedStream);
                    return ssc;
                });

                streamingContext.Start();
                WaitTerminationOrTimeout(streamingContext);
            }
        }
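One small idiom worth noting: options.Topics.Split(";,".ToArray()) accepts either ; or , as a topic separator, because a string enumerates to its characters and LINQ's ToArray turns it into the char[] that String.Split expects:

        // ";,".ToArray() yields the char[] { ';', ',' }, so both separators work:
        var topics = "logs;metrics,traces".Split(";,".ToArray());
        // topics == ["logs", "metrics", "traces"]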
Example #7
        static void Main(string[] args)
        {
            var checkpointPath = "";
            var sparkContext = new SparkContext(new SparkConf());
            var slideDurationInMillis = 10;
            var topics = new List<string>();
            var kafkaParams = new List<Tuple<string, string>>();
            var perTopicPartitionKafkaOffsets = new List<Tuple<string, long>>();
            var windowDurationInSecs = 10;
            var slideDurationInSecs = 10;

            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
                ssc.Checkpoint(checkpointPath);
                var stream = KafkaUtils.CreateDirectStream(ssc, topics, kafkaParams, perTopicPartitionKafkaOffsets);

                var countByLogLevelAndTime = stream
                    .Map(kvp => Encoding.UTF8.GetString(kvp.Item2))
                    .Filter(line => line.Contains(","))
                    .Map(line => line.Split(','))
                    .Map(columns => new Tuple<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                    .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                    .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));
                countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
                {
                    foreach (var logCount in countByLogLevel.Collect())
                    {
                        Console.WriteLine(logCount);
                    }
                });
                return ssc;
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();

            // AwaitTermination blocks, so this line runs only after the context stops.
            Console.WriteLine("Hello World!");
        }
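All of these samples funnel their setup through StreamingContext.GetOrCreate. A hedged summary of the checkpoint-recovery contract as these examples rely on it (a behavioral sketch, not the Mobius source):

        // Behavioral sketch of StreamingContext.GetOrCreate:
        //   - if checkpoint data exists at checkpointPath, the context and its
        //     DStream graph are rebuilt from it and the factory lambda is skipped;
        //   - otherwise the factory lambda runs and builds a fresh graph.
        // This is why every example wires its streams inside the lambda: wiring
        // done outside it would be missing after a restart from a checkpoint.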
Example #8
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1000);

            Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

            ssc.Start();
            ssc.Remember(1000);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
                new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
                new Dictionary<string, string>(), new Dictionary<string, long>());
            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #9
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1000L);

            Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

            ssc.Start();
            ssc.Remember(1000L);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
                new[] { Tuple.Create("testTopic1", 1) }, null);
            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
                new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");

            var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" },
                new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
            Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

            var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
                ssc,
                new List<string> { "testTopic3" },
                new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
                (int pid, IEnumerable<Tuple<byte[], byte[]>> input) => input);
            Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc.DStreamProxy);

            ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");

            var directKafkaReceiver = KafkaUtils.CreateDirectStream(
                ssc,
                new List<string> { "testTopic3" },
                new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
                (int pid, IEnumerable<Tuple<byte[], byte[]>> input) => input);
            Assert.IsNotNull(directKafkaReceiver.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
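Judging by these examples, the newer Mobius API shown here drops the dedicated CreateDirectStreamWithRepartition overload of Example #2 in favor of a SparkConf key read by plain CreateDirectStream. A side-by-side sketch of the two shapes (the topic name is illustrative):

        // Newer shape: target partition count comes from configuration.
        ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.myTopic", "10");
        var stream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "myTopic" },
            new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
        // Older shape: partition count was the final argument.
        // KafkaUtils.CreateDirectStreamWithRepartition(ssc, topics, kafkaParams, offsets, 10);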