Example #1
        private static void DStreamReduceByKeyAndWindowSample()
        {
            count = 0;

            const int batchInterval  = 2;
            const int windowDuration = 26;
            const int numPartitions  = 2;

            var sc  = SparkCLRSamples.SparkContext;
            var ssc = new StreamingContext(sc, batchInterval);

            // create the RDD
            var seedRDD = sc.Parallelize(Enumerable.Range(0, 100), numPartitions);
            var numbers = new ConstantInputDStream <int>(seedRDD, ssc);
            var pairs   = numbers.Map(n => new KeyValuePair <int, int>(n % numPartitions, n));
            var reduced = pairs.ReduceByKeyAndWindow(
                (int x, int y) => (x + y),
                (int x, int y) => (x - y),
                windowDuration,
                slideDuration,
                numPartitions
                );

            reduced.ForeachRDD((time, rdd) =>
            {
                count++;
                var taken      = rdd.Collect();
                int partitions = rdd.GetNumPartitions();

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Batch: " + count);
                Console.WriteLine("Count: " + taken.Length);
                Console.WriteLine("Partitions: " + partitions);

                Assert.AreEqual(taken.Length, 2);
                Assert.AreEqual(partitions, numPartitions);

                foreach (object record in taken)
                {
                    KeyValuePair <int, int> sum = (KeyValuePair <int, int>)record;
                    Console.WriteLine("Key: {0}, Value: {1}", sum.Key, sum.Value);
                    // once enough batches have accumulated to fill the window, the sums for the even/odd keys
                    // hold steady at (windowDuration / batchInterval) * 2450 and * 2500 respectively
                    Assert.AreEqual(sum.Value, (count > windowDuration / slideDuration ? windowDuration : count * slideDuration) / batchInterval * (sum.Key == 0 ? 2450 : 2500));
                }
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
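The sample above references count and slideDuration without declaring them; in the Mobius samples they are fields of the enclosing class. A minimal sketch of what those declarations could look like is shown below; the slideDuration value here is an assumption (any slide that evenly divides windowDuration keeps the assertion consistent), not necessarily the value used by the original sample.

        // Hypothetical class-level members assumed by DStreamReduceByKeyAndWindowSample;
        // the actual declarations live in the enclosing Mobius samples class.
        private static int count;             // number of batches processed so far
        private const int slideDuration = 2;  // assumed slide duration, same unit as windowDuration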
Example #2
        static void Main(string[] args)
        {
            var          sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLRKafka Example"));
            const string topicName    = "<topicName>";
            var          topicList    = new List <string> {
                topicName
            };
            var kafkaParams = new Dictionary <string, string> // refer to http://kafka.apache.org/documentation.html#configuration
            {
                { "metadata.broker.list", "<kafka brokers list>" },
                { "auto.offset.reset", "smallest" }
            };
            var          perTopicPartitionKafkaOffsets = new Dictionary <string, long>();
            const int    windowDurationInSecs          = 5;
            const int    slideDurationInSecs           = 5;
            const string checkpointPath = "<hdfs path to spark checkpoint directory>";
            const string appOutputPath  = "<hdfs path to app output directory>";


            const long       slideDurationInMillis = 5000;
            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath,
                                                                                  () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
                ssc.Checkpoint(checkpointPath);

                var stream = KafkaUtils.CreateDirectStream(
                    ssc,
                    topicList,
                    kafkaParams.Select(v => new Tuple <string, string>(v.Key, v.Value)),
                    perTopicPartitionKafkaOffsets.Select(v => new Tuple <string, long>(v.Key, v.Value)));
                var countByLogLevelAndTime = stream
                                             .Map(tuple => Encoding.UTF8.GetString(tuple.Item2))
                                             .Filter(line => line.Contains(","))
                                             .Map(line => line.Split(','))
                                             .Map(columns => new Tuple <string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                                             .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                                             .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));

                countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
                {
                    countByLogLevel.SaveAsTextFile(string.Format("{0}/{1}", appOutputPath, Guid.NewGuid()));
                    foreach (var logCount in countByLogLevel.Collect())
                    {
                        Console.WriteLine(logCount);
                    }
                });

                return(ssc);
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
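The pipeline above assumes each Kafka message is a comma-separated log line whose first two columns (for example a time bucket and a log level) form the aggregation key. The helper below is purely illustrative and not part of the sample; it only shows the shape of message the job expects.

        // Hypothetical producer-side helper illustrating the message format consumed above;
        // the job keys its windowed counts on the first two comma-separated columns.
        static string BuildLogLine(DateTime timestamp, string level, string message)
        {
            // e.g. "2016-05-01 10:00,ERROR,Request to /api/orders failed"
            return string.Format("{0:yyyy-MM-dd HH:mm},{1},{2}", timestamp, level, message);
        }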
Example #3
        static void Main(string[] args)
        {
            var sparkContext    = new SparkContext(new SparkConf().SetAppName("SparkCLREventHub Example"));
            var eventhubsParams = new Dictionary <string, string>()
            {
                { "eventhubs.policyname", "<policyname>" },
                { "eventhubs.policykey", "<policykey>" },
                { "eventhubs.namespace", "<namespace>" },
                { "eventhubs.name", "<name>" },
                { "eventhubs.partition.count", "<partitioncount>" },
                { "eventhubs.consumergroup", "$default" },
                { "eventhubs.checkpoint.dir", "<hdfs path to eventhub checkpoint dir>" },
                { "eventhubs.checkpoint.interval", "<interval>" },
            };

            const string checkpointPath = "<hdfs path to spark checkpoint dir>";
            //const string outputPath = "<hdfs path to output dir>";

            const long       slideDuration         = 5000;
            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath,
                                                                                  () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDuration);
                ssc.Checkpoint(checkpointPath);

                var stream = EventHubsUtils.CreateUnionStream(ssc, eventhubsParams);
                var countByLogLevelAndTime = stream
                                             .Map(bytes => Encoding.UTF8.GetString(bytes))
                                             .Filter(s => s.Contains(","))
                                             .Map(line => line.Split(','))
                                             .Map(columns => new KeyValuePair <string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                                             .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, 5, 5, 3)
                                             .Map(kvp => string.Format("{0},{1}", kvp.Key, kvp.Value));

                countByLogLevelAndTime.ForeachRDD(dimensionalCount =>
                {
                    //dimensionalCount.SaveAsTextFile(string.Format("{0}/{1}", outputPath, Guid.NewGuid()));
                    var dimensionalCountCollection = dimensionalCount.Collect();
                    foreach (var dimensionalCountItem in dimensionalCountCollection)
                    {
                        Console.WriteLine(dimensionalCountItem);
                    }
                });

                return(ssc);
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
Example #4
        public static void Process(string AppName, string CheckpointPath, Dictionary <string, string> kafkaParams)
        {
            var sparkContext = new SparkContext(new SparkConf().SetAppName(AppName));
            var topicList    = new List <string> {
                kafkaParams["topic"]
            };
            var        perTopicPartitionKafkaOffsets = new Dictionary <string, long>();
            const long slideDurationInMillis         = 1000;

            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(CheckpointPath,
                                                                                  () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDurationInMillis);

                var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);

                stream.Map(kvp => kvp.Value != null ? Encoding.UTF8.GetString(kvp.Value) : null)
                      .ForeachRDD(rdd =>
                {
                    foreach (string line in rdd.Collect())
                    {
                        var message = JObject.Parse(line);
                        var _id     = message.SelectToken("docid").ToString();
                        // =======================
                        //  TODO: Process message
                        // =======================
                    }
                });
                ssc.Checkpoint(CheckpointPath);

                return(ssc);
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
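A hypothetical call site for the Process method above; every value is a placeholder. Note that Process reads the topic to subscribe to from the "topic" entry of the same dictionary that carries the Kafka configuration, so that entry must be present.

        static void Main(string[] args)
        {
            // placeholders only; Process expects the broker list and the topic name in one dictionary
            var kafkaParams = new Dictionary <string, string>
            {
                { "metadata.broker.list", "<kafka brokers list>" },
                { "auto.offset.reset", "smallest" },
                { "topic", "<topicName>" }
            };
            Process("<appName>", "<hdfs path to spark checkpoint directory>", kafkaParams);
        }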
Example #5
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
                return;
            }

            string checkpointPath = args[0];
            string inputDir       = args[1];

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                                                                () =>
            {
                var sparkConf = new SparkConf();
                sparkConf.SetAppName("HdfsWordCount");
                var sc = new SparkContext(sparkConf);
                StreamingContext context = new StreamingContext(sc, 30000);
                context.Checkpoint(checkpointPath);

                var lines      = context.TextFileStream(inputDir);
                var words      = lines.FlatMap(l => l.Split(' '));
                var pairs      = words.Map(w => new KeyValuePair <string, int>(w, 1));
                var wordCounts = pairs.ReduceByKey((x, y) => x + y);

                wordCounts.ForeachRDD((time, rdd) =>
                {
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");
                    object[] taken = rdd.Take(10);
                    foreach (object record in taken)
                    {
                        Console.WriteLine(record);
                    }
                    Console.WriteLine();
                });

                return(context);
            });

            ssc.Start();
            ssc.AwaitTermination();
            ssc.Stop();
        }
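Example #6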
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);

            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());

            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);

            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new Dictionary <string, int> {
                { "testTopic1", 1 }
            }, new Dictionary <string, string>());

            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List <string> {
                "testTopic2"
            }, new Dictionary <string, string>(), new Dictionary <string, long>());

            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStreamWithRepartition(ssc, new List <string> {
                "testTopic3"
            }, new Dictionary <string, string>(), new Dictionary <string, long>(), 10);

            Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);

            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #7
        internal static void DStreamTextFileSamples()
        {
            SparkContext sc        = SparkCLRSamples.SparkContext;
            string       directory = SparkCLRSamples.Configuration.SampleDataLocation;

            sc.SetCheckpointDir(directory);
            StreamingContext ssc = new StreamingContext(sc, 2000);

            var lines      = ssc.TextFileStream(Path.Combine(directory, "test"));
            var words      = lines.FlatMap(l => l.Split(' '));
            var pairs      = words.Map(w => new KeyValuePair <string, int>(w, 1));
            var wordCounts = pairs.ReduceByKey((x, y) => x + y);
            var join       = wordCounts.Join(wordCounts, 2);
            var state      = join.UpdateStateByKey <string, Tuple <int, int>, int>((vs, s) => vs.Sum(x => x.Item1 + x.Item2) + s);

            state.ForeachRDD((time, rdd) =>
            {
                // there's a chance that rdd.Take conflicts with ssc.Stop
                if (stopFileServer)
                {
                    return;
                }

                object[] taken = rdd.Take(10);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                foreach (object record in taken)
                {
                    Console.WriteLine(record);
                }
                Console.WriteLine();

                stopFileServer = count++ > 3;
            });

            ssc.Start();

            StartFileServer(directory, "words.txt", 100);

            ssc.AwaitTermination();
            ssc.Stop();
        }
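Example #8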
        static void Main(string[] args)
        {
            var checkpointPath        = "";
            var sparkContext          = new SparkContext(new SparkConf());
            var slideDurationInMillis = 10;
            var topics      = new List <string>();
            var kafkaParams = new List <Tuple <string, string> >();
            var perTopicPartitionKafkaOffsets = new List <Tuple <string, long> >();
            var windowDurationInSecs          = 10;
            var slideDurationInSecs           = 10;

            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
            {
                var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
                ssc.Checkpoint(checkpointPath);
                var stream = KafkaUtils.CreateDirectStream(ssc, topics, kafkaParams, perTopicPartitionKafkaOffsets);

                var countByLogLevelAndTime = stream
                                             .Map(kvp => Encoding.UTF8.GetString(kvp.Item2))
                                             .Filter(line => line.Contains(","))
                                             .Map(line => line.Split(','))
                                             .Map(columns => new Tuple <string, int>(
                                                      string.Format("{0},{1}", columns[0], columns[1]), 1))
                                             .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y,
                                                                   windowDurationInSecs, slideDurationInSecs, 3)
                                             .Map(logLevelCountPair => string.Format("{0},{1}",
                                                                                     logLevelCountPair.Item1, logLevelCountPair.Item2));
                countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
                {
                    foreach (var logCount in countByLogLevel.Collect())
                    {
                        Console.WriteLine(logCount);
                    }
                });
                return(ssc);
            });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();

            Console.WriteLine("Hello World!");
        }
Example #9
        internal static void DStreamCSharpInputSample()
        {
            const int numPartitions = 5;

            var sc  = SparkCLRSamples.SparkContext;
            var ssc = new StreamingContext(sc, 2000L); // batch interval is in milliseconds

            var inputDStream = CSharpInputDStreamUtils.CreateStream <string>(
                ssc,
                numPartitions,
                (double time, int pid) =>
            {
                var list = new List <string>()
                {
                    string.Format("PluggableInputDStream-{0}-{1}", pid, time)
                };
                return(list.AsEnumerable());
            });

            inputDStream.ForeachRDD((time, rdd) =>
            {
                var taken      = rdd.Collect();
                int partitions = rdd.GetNumPartitions();

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Count: " + taken.Length);
                Console.WriteLine("Partitions: " + partitions);

                foreach (object record in taken)
                {
                    Console.WriteLine(record);
                }
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
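Example #10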
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext(Env.SPARK_MASTER_URL, "xxxx"), 1000);

            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1000);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());

            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream("127.0.0.1", 12345);

            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = ssc.KafkaStream("127.0.0.1:2181", "testGroupId", new Dictionary <string, int> {
                { "testTopic1", 1 }
            }, new Dictionary <string, string>());

            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = ssc.DirectKafkaStream(new List <string> {
                "testTopic2"
            }, new Dictionary <string, string>(), new Dictionary <string, long>());

            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);

            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #11
        internal static void DStreamTextFileSample()
        {
            count = 0;

            string directory      = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            SparkContext sc = SparkCLRSamples.SparkContext;
            var          b  = sc.Broadcast <int>(0);

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                                                                () =>
            {
                StreamingContext context = new StreamingContext(sc, 2);
                context.Checkpoint(checkpointPath);

                var lines = context.TextFileStream(Path.Combine(directory, "test"));
                lines     = context.Union(lines, lines);
                var words = lines.FlatMap(l => l.Split(' '));
                var pairs = words.Map(w => new KeyValuePair <string, int>(w, 1));

                // since operations like ReduceByKey, Join and UpdateStateByKey are
                // separate DStream transformations defined in CSharpDStream.scala,
                // an extra CSharpRDD is introduced between these operations
                var wordCounts = pairs.ReduceByKey((x, y) => x + y);
                var join       = wordCounts.Window(2, 2).Join(wordCounts, 2);
                var state      = join.UpdateStateByKey <string, Tuple <int, int>, int>(new UpdateStateHelper(b).Execute);

                state.ForeachRDD((time, rdd) =>
                {
                    // there's a chance that rdd.Take conflicts with ssc.Stop
                    if (stopFileServer)
                    {
                        return;
                    }

                    object[] taken = rdd.Take(10);
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");
                    foreach (object record in taken)
                    {
                        Console.WriteLine(record);

                        var countByWord = (KeyValuePair <string, int>)record;
                        Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "lazy" || countByWord.Key == "dog" ? 92 : 88);
                    }
                    Console.WriteLine();

                    stopFileServer = true;
                });

                return(context);
            });

            StartFileServer(ssc, directory, "words.txt");

            ssc.Start();

            ssc.AwaitTermination();
        }
Example #12
        internal static void DStreamMapWithStateSample()
        {
            string directory      = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                                                                () =>
            {
                SparkContext sc          = SparkCLRSamples.SparkContext;
                StreamingContext context = new StreamingContext(sc, 10000L);     // batch interval is in milliseconds
                context.Checkpoint(checkpointPath);

                var lines = context.TextFileStream(Path.Combine(directory, "test1"));
                lines     = context.Union(lines, lines);
                var words = lines.FlatMap(l => l.Split(' '));
                var pairs = words.Map(w => new KeyValuePair <string, int>(w, 1));

                var wordCounts   = pairs.ReduceByKey((x, y) => x + y);
                var initialState = sc.Parallelize(new[] { new KeyValuePair <string, int>("NOT_A_WORD", 1024), new KeyValuePair <string, int>("dog", 10000), }, 1);
                var stateSpec    = new StateSpec <string, int, int, KeyValuePair <string, int> >((word, count, state) =>
                {
                    if (state.IsTimingOut())
                    {
                        Console.WriteLine("Found timing out word: {0}", word);
                        return(new KeyValuePair <string, int>(word, state.Get()));
                    }

                    var sum = 0;
                    if (state.Exists())
                    {
                        sum = state.Get();
                    }
                    state.Update(sum + count);
                    Console.WriteLine("word: {0}, count: {1}", word, sum + count);
                    return(new KeyValuePair <string, int>(word, sum + count));
                }).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));

                var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
                snapshots.ForeachRDD((double time, RDD <dynamic> rdd) =>
                {
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Snapshots @ Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");

                    foreach (KeyValuePair <string, int> record in rdd.Collect())
                    {
                        Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
                    }
                    Console.WriteLine();
                });

                return(context);
            });

            ssc.Start();

            StartFileServer(directory, "words.txt", 100);

            ssc.AwaitTermination();
            ssc.Stop();
        }
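StartFileServer is referenced by this sample (and, with slightly different arguments, by Examples #7 and #11) but is not shown. Below is a rough sketch of what such a helper could look like, assuming it simply copies the named sample file into the folder watched by TextFileStream on a background thread; the real Mobius helper may differ.

        // Rough sketch (assumed behaviour) of StartFileServer(directory, fileName, loops):
        // drop uniquely named copies of the sample file into the watched sub-folder so that
        // successive streaming batches pick up new input files.
        // (requires System, System.IO, System.Threading and System.Threading.Tasks)
        private static void StartFileServer(string directory, string fileName, int loops)
        {
            string sourceFile = Path.Combine(directory, fileName);
            string watchedDir = Path.Combine(directory, "test1");   // the sub-folder this sample monitors
            Directory.CreateDirectory(watchedDir);

            Task.Run(() =>
            {
                for (int i = 0; i < loops; i++)
                {
                    File.Copy(sourceFile, Path.Combine(watchedDir, Guid.NewGuid() + ".txt"));
                    Thread.Sleep(200);    // pace the copies so they spread across streaming batches
                }
            });
        }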
Example #13
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1000L);

            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1000L);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());

            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);

            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new Dictionary <string, int> {
                { "testTopic1", 1 }
            }, new Dictionary <string, string>());

            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List <string> {
                "testTopic2"
            }, new Dictionary <string, string>(), new Dictionary <string, long>());

            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");

            var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List <string> {
                "testTopic3"
            }, new Dictionary <string, string>(), new Dictionary <string, long>());

            Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

            var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
                ssc,
                new List <string> {
                "testTopic3"
            },
                new Dictionary <string, string>(), new Dictionary <string, long>(),
                (int pid, IEnumerable <KeyValuePair <byte[], byte[]> > input) => { return(input); });

            Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc);

            ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");

            var directKafkaReceiver = KafkaUtils.CreateDirectStream(
                ssc,
                new List <string> {
                "testTopic3"
            },
                new Dictionary <string, string>(), new Dictionary <string, long>(),
                (int pid, IEnumerable <KeyValuePair <byte[], byte[]> > input) => { return(input); });

            Assert.IsNotNull(directKafkaReceiver.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);

            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }