Example #1
        private static void StartFileServer(StreamingContext ssc, string directory, string pattern, int loops = 1)
        {
            string testDir = Path.Combine(directory, "test");
            if (!Directory.Exists(testDir))
                Directory.CreateDirectory(testDir);

            stopFileServer = false;

            string[] files = Directory.GetFiles(directory, pattern);

            Task.Run(() =>
            {
                int loop = 0;
                while (!stopFileServer)
                {
                    if (loop++ < loops)
                    {
                        DateTime now = DateTime.Now;
                        foreach (string path in files)
                        {
                            string text = File.ReadAllText(path);
                            File.WriteAllText(Path.Combine(testDir, now.ToBinary() + "_" + Path.GetFileName(path)), text);
                        }
                    }
                    System.Threading.Thread.Sleep(200);
                }

                ssc.Stop();
            });
            
            System.Threading.Thread.Sleep(1);
        }
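Note: the later samples in this listing call a three-argument overload, StartFileServer(directory, pattern, loops), that takes no StreamingContext. A minimal sketch of what such an overload could look like, assuming the same timestamped copy loop but leaving the streaming context untouched:

        // Hypothetical overload, inferred from the StartFileServer(directory, "words.txt", 100)
        // call sites in the later samples; it reuses the copy loop above but never stops
        // a StreamingContext.
        private static void StartFileServer(string directory, string pattern, int loops = 1)
        {
            string testDir = Path.Combine(directory, "test");
            if (!Directory.Exists(testDir))
                Directory.CreateDirectory(testDir);

            stopFileServer = false;
            string[] files = Directory.GetFiles(directory, pattern);

            Task.Run(() =>
            {
                // copy the sample files into the watched "test" directory once per loop,
                // prefixing each copy with a timestamp so that file names stay unique
                for (int loop = 0; loop < loops && !stopFileServer; loop++)
                {
                    DateTime now = DateTime.Now;
                    foreach (string path in files)
                    {
                        string text = File.ReadAllText(path);
                        File.WriteAllText(Path.Combine(testDir, now.ToBinary() + "_" + Path.GetFileName(path)), text);
                    }
                    System.Threading.Thread.Sleep(200);
                }
            });
        }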
Example #2
        public void TestDStreamMapReduce()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            var lines = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(lines.DStreamProxy);

            var words = lines.FlatMap(l => l.Split(' ')).Filter(w => w != "The").Repartition(1);
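            // Expected values below assume the input that the mocked CSharpRDD serves in
            // TestInitialize (shown later in this listing): 22 repetitions of the 9-word
            // sentence "The quick brown fox jumps over the lazy dog" plus "The dog lazy"
            // yield 201 words; filtering out the 23 occurrences of "The" leaves 178 words
            // across 8 distinct values.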

            words.Slice(DateTime.MinValue, DateTime.MaxValue);
            words.Cache();
            words.Checkpoint(1);
            words.Window(1, 1);

            words.Count().ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 1);
                Assert.AreEqual((int)taken[0], 178);
            });

            words.CountByValue().ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 8);

                foreach (object record in taken)
                {
                    KeyValuePair<string, long> countByWord = (KeyValuePair<string, long>)record;
                    Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
                }
            });

            words.CountByValueAndWindow(1, 1).ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken[0], 8);
            });

            words.CountByWindow(1).ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 1);
                Assert.AreEqual((int)taken[0], 356);
            });

            words.Union(words).ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 356);
            });

            words.Glom().ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 1);
                Assert.AreEqual((taken[0] as string[]).Length, 178);
            });
        }
Example #3
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" }, new Dictionary<string, string>(), new Dictionary<string, long>());
            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStreamWithRepartition(ssc, new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>(), 10);
            Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #4
        internal static void DStreamTextFileSample()
        {
            count = 0;

            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            SparkContext sc = SparkCLRSamples.SparkContext;
            var b = sc.Broadcast<int>(0);

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {

                    StreamingContext context = new StreamingContext(sc, 2);
                    context.Checkpoint(checkpointPath);

                    var lines = context.TextFileStream(Path.Combine(directory, "test"));
                    lines = context.Union(lines, lines);
                    var words = lines.FlatMap(l => l.Split(' '));
                    var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

                    // since operations like ReduceByKey, Join and UpdateStateByKey are
                    // separate DStream transformations defined in CSharpDStream.scala,
                    // an extra CSharpRDD is introduced between these operations
                    var wordCounts = pairs.ReduceByKey((x, y) => x + y);
                    var join = wordCounts.Window(2, 2).Join(wordCounts, 2);
                    var state = join.UpdateStateByKey<string, Tuple<int, int>, int>(new UpdateStateHelper(b).Execute);

                    state.ForeachRDD((time, rdd) =>
                    {
                        // there's a chance rdd.Take conflicts with ssc.Stop
                        if (stopFileServer)
                            return;

                        object[] taken = rdd.Take(10);
                        Console.WriteLine("-------------------------------------------");
                        Console.WriteLine("Time: {0}", time);
                        Console.WriteLine("-------------------------------------------");
                        foreach (object record in taken)
                        {
                            Console.WriteLine(record);
                            
                            var countByWord = (KeyValuePair<string, int>)record;
                            Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "lazy" || countByWord.Key == "dog" ? 92 : 88);
                        }
                        Console.WriteLine();

                        stopFileServer = true;
                    });

                    return context;
                });

            StartFileServer(ssc, directory, "words.txt");

            ssc.Start();

            ssc.AwaitTermination();
        }
Example #5
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1000);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1000);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream("127.0.0.1", 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = ssc.KafkaStream("127.0.0.1:2181", "testGroupId", new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = ssc.DirectKafkaStream(new List<string> { "testTopic2" }, new Dictionary<string, string>(), new Dictionary<string, long>());
            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #6
        public void TestCSharpInputDStream()
        {
            // test create CSharpInputDStream
            var sc = new SparkContext("", "");
            var ssc = new StreamingContext(sc, 1000L);
            Func<double, int, IEnumerable<string>> func =
                (double time, int pid) =>
                {
                    var list = new List<string>() { string.Format("PluggableInputDStream-{0}-{1}", pid, time) };
                    return list.AsEnumerable();
                };
            const int numPartitions = 5;
            var inputDStream = CSharpInputDStreamUtils.CreateStream<string>(
                ssc,
                numPartitions,
                func);
            Assert.IsNotNull(inputDStream);
            Assert.AreEqual(ssc, inputDStream.streamingContext);

            // test CSharpInputDStreamMapPartitionWithIndexHelper
            int[] array = new int[numPartitions];
            int partitionIndex = 0;
            new CSharpInputDStreamMapPartitionWithIndexHelper<string>(0.0, func).Execute(partitionIndex, array.AsEnumerable());

            // test CSharpInputDStreamGenerateRDDHelper
            new CSharpInputDStreamGenerateRDDHelper<string>(numPartitions, func).Execute(0.0);
        }
Example #7
        internal static void DStreamConstantDStreamSample()
        {
            var sc = SparkCLRSamples.SparkContext;
            var ssc = new StreamingContext(sc, 2000L);

            const int count = 100;
            const int partitions = 2;

            // create the seed RDD using the declared count and partitions constants
            var seedRDD = sc.Parallelize(Enumerable.Range(0, count), partitions);
            var dstream = new ConstantInputDStream<int>(seedRDD, ssc);

            dstream.ForeachRDD((time, rdd) =>
            {
                long batchCount = rdd.Count();
                int numPartitions = rdd.GetNumPartitions();

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Count: " + batchCount);
                Console.WriteLine("Partitions: " + numPartitions);
                Assert.AreEqual(count, batchCount);
                Assert.AreEqual(partitions, numPartitions);
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
Example #8
        internal static void DStreamTextFileSamples()
        {
            count = 0;

            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    SparkContext sc = SparkCLRSamples.SparkContext;
                    StreamingContext context = new StreamingContext(sc, 2000);
                    context.Checkpoint(checkpointPath);

                    var lines = context.TextFileStream(Path.Combine(directory, "test"));
                    lines = context.Union(lines, lines);
                    var words = lines.FlatMap(l => l.Split(' '));
                    var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

                    // since operations like ReduceByKey, Join and UpdateStateByKey are
                    // separate DStream transformations defined in CSharpDStream.scala,
                    // an extra CSharpRDD is introduced between these operations
                    var wordCounts = pairs.ReduceByKey((x, y) => x + y);
                    var join = wordCounts.Join(wordCounts, 2);
                    var state = join.UpdateStateByKey<string, Tuple<int, int>, int>((vs, s) => vs.Sum(x => x.Item1 + x.Item2) + s);

                    state.ForeachRDD((time, rdd) =>
                    {
                        // there's a chance rdd.Take conflicts with ssc.Stop
                        if (stopFileServer)
                            return;

                        object[] taken = rdd.Take(10);
                        Console.WriteLine("-------------------------------------------");
                        Console.WriteLine("Time: {0}", time);
                        Console.WriteLine("-------------------------------------------");
                        foreach (object record in taken)
                        {
                            Console.WriteLine(record);
                        }
                        Console.WriteLine();

                        stopFileServer = count++ > 100;
                    });

                    return context;
                });

            ssc.Start();

            StartFileServer(directory, "words.txt", 100);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #9
        public void TestDStreamTransform()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            var lines = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(lines.DStreamProxy);

            var words = lines.FlatMap(l => l.Split(' '));

            var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

            var wordCounts = pairs.PartitionBy().ReduceByKey((x, y) => x + y);

            wordCounts.ForeachRDD((time, rdd) => 
                {
                    var taken = rdd.Collect();
                    Assert.AreEqual(taken.Length, 9);

                    foreach (object record in taken)
                    {
                        KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
                        Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
                    }
                });

            var wordLists = pairs.GroupByKey();

            wordLists.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 9);

                foreach (object record in taken)
                {
                    KeyValuePair<string, List<int>> countByWord = (KeyValuePair<string, List<int>>)record;
                    Assert.AreEqual(countByWord.Value.Count, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
                }
            });

            var wordCountsByWindow = pairs.ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, 1);
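            // with the same mock batch counted twice by the window, the windowed counts come
            // out at double the per-batch counts (46 = 2 * 23 and 44 = 2 * 22), matching the
            // asserts below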

            wordCountsByWindow.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 9);

                foreach (object record in taken)
                {
                    KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
                    Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 46 : 44);
                }
            });
        }
Example #10
        public void TestConstantInputDStream()
        {
            var sc = new SparkContext("", "");
            var rdd = sc.Parallelize(Enumerable.Range(0, 10), 1);
            var ssc = new StreamingContext(sc, 1000L);

            // test when rdd is null
            Assert.Throws<ArgumentNullException>(() => new ConstantInputDStream<int>(null, ssc));

            var constantInputDStream = new ConstantInputDStream<int>(rdd, ssc);
            Assert.IsNotNull(constantInputDStream);
            Assert.AreEqual(ssc, constantInputDStream.streamingContext);
        }
Example #11
        static void Main(string[] args)
        {
            var sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLREventHub Example"));
            var eventhubsParams = new Dictionary<string, string>()
            {
                {"eventhubs.policyname", "<policyname>"},
                {"eventhubs.policykey", "<policykey>"},
                {"eventhubs.namespace", "<namespace>"},
                {"eventhubs.name", "<name>"},
                {"eventhubs.partition.count", "<partitioncount>"},
                {"eventhubs.consumergroup", "$default"},
                {"eventhubs.checkpoint.dir", "<hdfs path to eventhub checkpoint dir>"},
                {"eventhubs.checkpoint.interval", "<interval>"},
            };
            const int windowDurationInSecs = 5;
            const int slideDurationInSecs = 5;
            const string checkpointPath = "<hdfs path to spark checkpoint dir>";
            //const string outputPath = "<hdfs path to output dir>";

            const long slideDurationInMillis = 5000;
            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
                    ssc.Checkpoint(checkpointPath);

                    var stream = EventHubsUtils.CreateUnionStream(ssc, eventhubsParams);
                    var countByLogLevelAndTime = stream
                                                    .Map(bytes => Encoding.UTF8.GetString(bytes))
                                                    .Filter(line => line.Contains(","))
                                                    .Map(line => line.Split(','))
                                                    .Map(columns => new KeyValuePair<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                                                    .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                                                    .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Key, logLevelCountPair.Value));

                    countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
                    {
                        //dimensionalCount.SaveAsTextFile(string.Format("{0}/{1}", outputPath, Guid.NewGuid()));
                        var dimensionalCountCollection = countByLogLevel.Collect();
                        foreach (var dimensionalCountItem in dimensionalCountCollection)
                        {
                            Console.WriteLine(dimensionalCountItem);
                        }
                    });

                    return ssc;
                });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
Example #12
        /// <summary>
        /// Create an input stream that pulls messages from a Kafka Broker.
        /// </summary>
        /// <param name="zkQuorum">Zookeeper quorum (hostname:port,hostname:port,..).</param>
        /// <param name="groupId">The group id for this consumer.</param>
        /// <param name="topics">Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.</param>
        /// <param name="kafkaParams">Additional params for Kafka</param>
        /// <param name="storageLevelType">RDD storage level.</param>
        /// <returns>A DStream object</returns>
        public static DStream<KeyValuePair<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
        {
            if (kafkaParams == null)
                kafkaParams = new Dictionary<string, string>();

            if (!string.IsNullOrEmpty(zkQuorum))
                kafkaParams["zookeeper.connect"] = zkQuorum;
            if (groupId != null)
                kafkaParams["group.id"] = groupId;
            // default the Zookeeper connection timeout if the caller didn't provide one
            if (!kafkaParams.ContainsKey("zookeeper.connection.timeout.ms"))
                kafkaParams["zookeeper.connection.timeout.ms"] = "10000";

            return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.KafkaStream(topics, kafkaParams, storageLevelType), ssc);
        }
Example #13
        public void TestStreamingContext()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1000L);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1000L);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId", new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
            Assert.IsNotNull(kafkaStream.DStreamProxy);

            var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" }, new Dictionary<string, string>(), new Dictionary<string, long>());
            Assert.IsNotNull(directKafkaStream.DStreamProxy);

            ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");

            var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>());
            Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

            var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
                ssc,
                new List<string> { "testTopic3" },
                new Dictionary<string, string>(), new Dictionary<string, long>(),
                (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => { return input; });
            Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc);

            ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");

            var directKafkaReceiver = KafkaUtils.CreateDirectStream(
                ssc,
                new List<string> { "testTopic3" },
                new Dictionary<string, string>(), new Dictionary<string, long>(),
                (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => { return input; });
            Assert.IsNotNull(directKafkaReceiver.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #14
        static void Main(string[] args)
        {
            var sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLRKafka Example"));
            const string topicName = "<topicName>";
            var topicList = new List<string> {topicName};
            var kafkaParams = new Dictionary<string, string> //refer to http://kafka.apache.org/documentation.html#configuration
            {
                {"metadata.broker.list", "<kafka brokers list>"},
                {"auto.offset.reset", "smallest"}
            };
            var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
            const int windowDurationInSecs = 5;
            const int slideDurationInSecs = 5;
            const string checkpointPath = "<hdfs path to spark checkpoint directory>";
            const string appOutputPath = "<hdfs path to app output directory>";


            const long slideDurationInMillis = 5000;
            StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
                    ssc.Checkpoint(checkpointPath);

                    var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);
                    var countByLogLevelAndTime = stream
                                                    .Map(kvp => Encoding.UTF8.GetString(kvp.Value))
                                                    .Filter(line => line.Contains(","))
                                                    .Map(line => line.Split(','))
                                                    .Map(columns => new KeyValuePair<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                                                    .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                                                    .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Key, logLevelCountPair.Value));

                    countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
                    {
                        countByLogLevel.SaveAsTextFile(string.Format("{0}/{1}", appOutputPath, Guid.NewGuid()));
                        foreach (var logCount in countByLogLevel.Collect())
                        {
                            Console.WriteLine(logCount);
                        }
                    });

                    return ssc;
                });

            sparkStreamingContext.Start();
            sparkStreamingContext.AwaitTermination();
        }
Example #15
        internal static void DStreamTextFileSamples()
        {
            SparkContext sc = SparkCLRSamples.SparkContext;
            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            sc.SetCheckpointDir(directory);
            StreamingContext ssc = new StreamingContext(sc, 2000);

            var lines = ssc.TextFileStream(Path.Combine(directory, "test"));
            var words = lines.FlatMap(l => l.Split(' '));
            var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
            var wordCounts = pairs.ReduceByKey((x, y) => x + y);
            var join = wordCounts.Join(wordCounts, 2);
            var state = join.UpdateStateByKey<string, Tuple<int, int>, int>((vs, s) => vs.Sum(x => x.Item1 + x.Item2) + s);

            state.ForeachRDD((time, rdd) =>
            {
                // there's a chance rdd.Take conflicts with ssc.Stop
                if (stopFileServer)
                    return;

                object[] taken = rdd.Take(10);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                foreach (object record in taken)
                {
                    Console.WriteLine(record);
                }
                Console.WriteLine();

                stopFileServer = count++ > 3;
            });

            ssc.Start();

            StartFileServer(directory, "words.txt", 100);
            while (!stopFileServer)
            {
                System.Threading.Thread.Sleep(1000);
            }

            // wait for ForeachRDD to complete so that ssc.Stop() can shut down gracefully
            System.Threading.Thread.Sleep(2000);

            ssc.Stop();
        }
Example #16
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
                return;
            }

            string checkpointPath = args[0];
            string inputDir = args[1];

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    var sparkConf = new SparkConf();
                    sparkConf.SetAppName("HdfsWordCount");
                    var sc = new SparkContext(sparkConf);
                    StreamingContext context = new StreamingContext(sc, 30000);
                    context.Checkpoint(checkpointPath);

                    var lines = context.TextFileStream(inputDir);
                    var words = lines.FlatMap(l => l.Split(' '));
                    var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
                    var wordCounts = pairs.ReduceByKey((x, y) => x + y);

                    wordCounts.ForeachRDD((time, rdd) =>
                    {
                        Console.WriteLine("-------------------------------------------");
                        Console.WriteLine("Time: {0}", time);
                        Console.WriteLine("-------------------------------------------");
                        object[] taken = rdd.Take(10);
                        foreach (object record in taken)
                        {
                            Console.WriteLine(record);
                        }
                        Console.WriteLine();
                    });

                    return context;
                });

            ssc.Start();
            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #17
        public void TestCreateUnionStream()
        {
            var streamingContextProxy = new Mock<IStreamingContextProxy>();
            var mockDstreamProxy = new Mock<IDStreamProxy>().Object;
            streamingContextProxy.Setup(
                                    m => m.EventHubsUnionStream(It.IsAny<Dictionary<string, string>>(), It.IsAny<StorageLevelType>()))
                                .Returns(mockDstreamProxy);

            var mockSparkClrProxy = new Mock<ISparkCLRProxy>();
            mockSparkClrProxy.Setup(m => m.CreateStreamingContext(It.IsAny<SparkContext>(), It.IsAny<long>()))
                .Returns(streamingContextProxy.Object);
            SparkCLREnvironment.SparkCLRProxy = mockSparkClrProxy.Object;

            var sparkContext = new SparkContext(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy, new SparkConf(new Mock<ISparkConfProxy>().Object));
            var streamingContext = new StreamingContext(sparkContext, 123L);
            var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new Dictionary<string, string>());
            Assert.AreEqual(mockDstreamProxy, dstream.DStreamProxy);
        }
Example #18
        public void TestStreamingAwaitTimeout()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1000L);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            ssc.Start();
            ssc.Remember(1000L);
            ssc.Checkpoint(Path.GetTempPath());

            var textFile = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(textFile.DStreamProxy);

            var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
            Assert.IsNotNull(socketStream.DStreamProxy);

            var union = ssc.Union(textFile, socketStream);
            Assert.IsNotNull(union.DStreamProxy);

            ssc.AwaitTerminationOrTimeout(3000);
            ssc.Stop();
        }
Example #19
        public void TestInitialize()
        {
            result = null;

            // Create Mock object to mock implementation of T by new Mock<T>();
            _mockSparkCLRProxy         = new Mock <ISparkCLRProxy>();
            _mockSparkContextProxy     = new Mock <ISparkContextProxy>();
            _mockStreamingContextProxy = new Mock <IStreamingContextProxy>();
            _mockRddProxy = new Mock <IRDDProxy>();

            SparkCLREnvironment.SparkCLRProxy = _mockSparkCLRProxy.Object;

            // Mock a method of T with Mock<T>.Setup(). For a method with parameters, you can provide different mock implementations
            // for different parameter values: to mock a method regardless of its parameter values, use It.IsAny<T>() for each parameter;
            // to mock the method only for parameters matching certain criteria, use It.Is<T>(Func<T, bool>). The same method can be
            // mocked multiple times for different parameter criteria.

            // If the mocked method has a return value and you only want to stub that value, use Returns(TReturnValue); to add logic
            // before returning, use Returns<T1, T2, ...>(Func<T1, T2, ..., TReturnValue>). If the method is void, use Callback<T1, T2, ...>(Action<T1, T2, ...>).

            // for more info please visit https://github.com/Moq/moq4/wiki/Quickstart
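            // A one-line illustration of the criteria-based form described above (Foo is a
            // hypothetical method, not part of these proxies):
            //     mock.Setup(m => m.Foo(It.Is<int>(x => x > 0))).Returns(true);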
            _mockSparkCLRProxy.Setup(m => m.CreateSparkConf(It.IsAny <bool>())).Returns(new MockSparkConfProxy()); // mocks that rarely change can be set up once and kept

            _mockSparkCLRProxy.Setup(m => m.CreateSparkContext(It.IsAny <ISparkConfProxy>())).Returns(_mockSparkContextProxy.Object);
            _mockSparkCLRProxy.Setup(m => m.CreateStreamingContext(It.IsAny <SparkContext>(), It.IsAny <long>())).Returns(_mockStreamingContextProxy.Object);
            _mockRddProxy.Setup(m => m.CollectAndServe()).Returns(() =>
            {
                var listener = SocketFactory.CreateSocket();
                listener.Listen();

                Task.Run(() =>
                {
                    using (var socket = listener.Accept())
                        using (var ns = socket.GetStream())
                        {
                            foreach (var item in result)
                            {
                                var ms = new MemoryStream();
                                new BinaryFormatter().Serialize(ms, item);
                                byte[] buffer = ms.ToArray();
                                SerDe.Write(ns, buffer.Length);
                                SerDe.Write(ns, buffer);
                            }
                            ns.Flush();
                        }
                });
                return((listener.LocalEndPoint as IPEndPoint).Port);
            });
            _mockRddProxy.Setup(m => m.RDDCollector).Returns(new RDDCollector());

            _mockSparkContextProxy.Setup(m => m.CreateCSharpRdd(It.IsAny <IRDDProxy>(), It.IsAny <byte[]>(), It.IsAny <Dictionary <string, string> >(),
                                                                It.IsAny <List <string> >(), It.IsAny <bool>(), It.IsAny <List <Broadcast> >(), It.IsAny <List <byte[]> >()))
            .Returns <IRDDProxy, byte[], Dictionary <string, string>, List <string>, bool, List <Broadcast>, List <byte[]> >(
                (prefvJavaRddReference, command, environmentVariables, cSharpIncludes, preservePartitioning, broadcastVariables, accumulator) =>
            {
                IEnumerable <dynamic> input = result ?? (new[] {
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The dog lazy"
                }).AsEnumerable();

                var formatter = new BinaryFormatter();
                using (MemoryStream s = new MemoryStream(command))
                {
                    int rddId       = SerDe.ReadInt(s);
                    int stageId     = SerDe.ReadInt(s);
                    int partitionId = SerDe.ReadInt(s);

                    SerDe.ReadString(s);
                    SerDe.ReadString(s);

                    string runMode = SerDe.ReadString(s);
                    if ("R".Equals(runMode, StringComparison.InvariantCultureIgnoreCase))
                    {
                        string compilationDumpDir = SerDe.ReadString(s);
                    }

                    CSharpWorkerFunc workerFunc = (CSharpWorkerFunc)formatter.Deserialize(new MemoryStream(SerDe.ReadBytes(s)));
                    var func = workerFunc.Func;
                    result   = func(default(int), input);
                }

                if (result.FirstOrDefault() is byte[] && (result.First() as byte[]).Length == 8)
                {
                    result = result.Where(e => (e as byte[]).Length != 8).Select(e => formatter.Deserialize(new MemoryStream(e as byte[])));
                }

                return(_mockRddProxy.Object);
            });

            _streamingContext = new StreamingContext(new SparkContext("", ""), 1000L);
        }
Example #20
        internal static void DStreamCSharpInputSample()
        {
            const int numPartitions = 5;

            var sc = SparkCLRSamples.SparkContext;
            var ssc = new StreamingContext(sc, 2000L); // batch interval is in milliseconds

            var inputDStream = CSharpInputDStreamUtils.CreateStream<string>(
                ssc,
                numPartitions,
                (double time, int pid) =>
                {
                    var list = new List<string>() { string.Format("PluggableInputDStream-{0}-{1}", pid, time) };
                    return list.AsEnumerable();
                });

            inputDStream.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                int partitions = rdd.GetNumPartitions();

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Count: " + taken.Length);
                Console.WriteLine("Partitions: " + partitions);

                foreach (object record in taken)
                {
                    Console.WriteLine(record);
                }
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
Example #21
        internal static void DStreamMapWithStateSample()
        {
            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    SparkContext sc = SparkCLRSamples.SparkContext;
                    StreamingContext context = new StreamingContext(sc, 10000);
                    context.Checkpoint(checkpointPath);

                    var lines = context.TextFileStream(Path.Combine(directory, "test1"));
                    lines = context.Union(lines, lines);
                    var words = lines.FlatMap(l => l.Split(' '));
                    var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

                    var wordCounts = pairs.ReduceByKey((x, y) => x + y);
                    var initialState = sc.Parallelize(new[] { new KeyValuePair<string, int>("NOT_A_WORD", 1024), new KeyValuePair<string, int>("dog", 10000), }, 1);
                    var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
                    {
                        if (state.IsTimingOut())
                        {
                            Console.WriteLine("Found timing out word: {0}", word);
                            return new KeyValuePair<string, int>(word, state.Get());
                        }

                        var sum = 0;
                        if (state.Exists())
                        {
                            sum = state.Get();
                        }
                        state.Update(sum + count);
                        Console.WriteLine("word: {0}, count: {1}", word, sum + count);
                        return new KeyValuePair<string, int>(word, sum + count);
                    }).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));
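                    // Entries that receive no update within the 30-second timeout are handed to
                    // the mapping function one final time with state.IsTimingOut() == true, which
                    // the function above handles by emitting the last known count without updating state.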

                    var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
                    snapshots.ForeachRDD((double time, RDD<dynamic> rdd) =>
                    {
                        Console.WriteLine("-------------------------------------------");
                        Console.WriteLine("Snapshots @ Time: {0}", time);
                        Console.WriteLine("-------------------------------------------");

                        foreach (KeyValuePair<string, int> record in rdd.Collect())
                        {
                            Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
                        }
                        Console.WriteLine();
                    });

                    return context;
                });

            ssc.Start();

            StartFileServer(directory, "words.txt", 100);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #22
        public void TestInitialize()
        {
            result = null;

            // Create Mock object to mock implementation of T by new Mock<T>();
            _mockSparkCLRProxy         = new Mock <ISparkCLRProxy>();
            _mockSparkContextProxy     = new Mock <ISparkContextProxy>();
            _mockStreamingContextProxy = new Mock <IStreamingContextProxy>();
            _mockRddProxy = new Mock <IRDDProxy>();

            SparkCLREnvironment.SparkCLRProxy = _mockSparkCLRProxy.Object;

            // Mock a method of T with Mock<T>.Setup(). For a method with parameters, you can provide different mock implementations
            // for different parameter values: to mock a method regardless of its parameter values, use It.IsAny<T>() for each parameter;
            // to mock the method only for parameters matching certain criteria, use It.Is<T>(Func<T, bool>). The same method can be
            // mocked multiple times for different parameter criteria.

            // If the mocked method has a return value and you only want to stub that value, use Returns(TReturnValue); to add logic
            // before returning, use Returns<T1, T2, ...>(Func<T1, T2, ..., TReturnValue>). If the method is void, use Callback<T1, T2, ...>(Action<T1, T2, ...>).
            _mockSparkCLRProxy.Setup(m => m.CreateSparkConf(It.IsAny <bool>())).Returns(new MockSparkConfProxy()); // mocks that rarely change can be set up once and kept

            _mockSparkCLRProxy.Setup(m => m.CreateSparkContext(It.IsAny <ISparkConfProxy>())).Returns(_mockSparkContextProxy.Object);
            _mockSparkContextProxy.Setup(m => m.CreateStreamingContext(It.IsAny <SparkContext>(), It.IsAny <long>())).Returns(_mockStreamingContextProxy.Object);
            _mockRddProxy.Setup(m => m.CollectAndServe()).Returns(() =>
            {
                TcpListener listener = new TcpListener(IPAddress.Parse("127.0.0.1"), 0);
                listener.Start();

                Task.Run(() =>
                {
                    using (Socket socket = listener.AcceptSocket())
                        using (Stream ns = new NetworkStream(socket))
                        {
                            foreach (var item in result)
                            {
                                var ms = new MemoryStream();
                                new BinaryFormatter().Serialize(ms, item);
                                byte[] buffer = ms.ToArray();
                                SerDe.Write(ns, buffer.Length);
                                SerDe.Write(ns, buffer);
                            }
                        }
                });
                return((listener.LocalEndpoint as IPEndPoint).Port);
            });

            _mockSparkContextProxy.Setup(m => m.CreateCSharpRdd(It.IsAny <IRDDProxy>(), It.IsAny <byte[]>(), It.IsAny <Dictionary <string, string> >(),
                                                                It.IsAny <List <string> >(), It.IsAny <bool>(), It.IsAny <List <Broadcast> >(), It.IsAny <List <byte[]> >()))
            .Returns <IRDDProxy, byte[], Dictionary <string, string>, List <string>, bool, List <Broadcast>, List <byte[]> >(
                (prefvJavaRddReference, command, environmentVariables, cSharpIncludes, preservePartitioning, broadcastVariables, accumulator) =>
            {
                IEnumerable <dynamic> input = result ?? (new[] {
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The dog lazy"
                }).AsEnumerable();

                var formatter = new BinaryFormatter();
                using (MemoryStream s = new MemoryStream(command))
                {
                    SerDe.ReadString(s);
                    SerDe.ReadString(s);
                    var func = (Func <int, IEnumerable <dynamic>, IEnumerable <dynamic> >)formatter.Deserialize(new MemoryStream(SerDe.ReadBytes(s)));
                    result   = func(default(int), input);
                }

                if (result.FirstOrDefault() is byte[] && (result.First() as byte[]).Length == 8)
                {
                    result = result.Where(e => (e as byte[]).Length != 8).Select(e => formatter.Deserialize(new MemoryStream(e as byte[])));
                }

                return(_mockRddProxy.Object);
            });

            _streamingContext = new StreamingContext(new SparkContext("", ""), 1000);
        }
Example #23
        internal static void DStreamDirectKafkaWithRepartitionSample()
        {
            count = 0;

            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");
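            // brokers, topic, partitions and messages are fields of the enclosing sample class,
            // configured outside this snippet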

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    var conf = new SparkConf();
                    SparkContext sc = new SparkContext(conf);
                    StreamingContext context = new StreamingContext(sc, 2000L);
                    context.Checkpoint(checkpointPath);

                    var kafkaParams = new Dictionary<string, string> {
                        {"metadata.broker.list", brokers},
                        {"auto.offset.reset", "smallest"}
                    };

                    conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
                    var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, new Dictionary<string, long>());

                    dstream.ForeachRDD((time, rdd) =>
                        {
                            long batchCount = rdd.Count();
                            int numPartitions = rdd.GetNumPartitions();

                            Console.WriteLine("-------------------------------------------");
                            Console.WriteLine("Time: {0}", time);
                            Console.WriteLine("-------------------------------------------");
                            Console.WriteLine("Count: " + batchCount);
                            Console.WriteLine("Partitions: " + numPartitions);

                            // only first batch has data and is repartitioned into 10 partitions
                            if (count++ == 0)
                            {
                                Assert.AreEqual(messages, batchCount);
                                Assert.IsTrue(numPartitions >= partitions);
                            }
                            else
                            {
                                Assert.AreEqual(0, batchCount);
                                Assert.IsTrue(numPartitions == 0);
                            }
                        });

                    return context;
                });

            ssc.Start();
            ssc.AwaitTermination();
        }
Example #24
        public void TestInitialize()
        {
            result = null;

            // Create Mock object to mock implementation of T by new Mock<T>();
            _mockSparkCLRProxy = new Mock<ISparkCLRProxy>();
            _mockSparkContextProxy = new Mock<ISparkContextProxy>();
            _mockStreamingContextProxy = new Mock<IStreamingContextProxy>();
            _mockRddProxy = new Mock<IRDDProxy>();

            SparkCLREnvironment.SparkCLRProxy = _mockSparkCLRProxy.Object;

            // Mock a method of T with Mock<T>.Setup(). For a method with parameters, you can provide different mock implementations
            // for different parameter values: to mock a method regardless of its parameter values, use It.IsAny<T>() for each parameter;
            // to mock the method only for parameters matching certain criteria, use It.Is<T>(Func<T, bool>). The same method can be
            // mocked multiple times for different parameter criteria.

            // If the mocked method has a return value and you only want to stub that value, use Returns(TReturnValue); to add logic
            // before returning, use Returns<T1, T2, ...>(Func<T1, T2, ..., TReturnValue>). If the method is void, use Callback<T1, T2, ...>(Action<T1, T2, ...>).

            // for more info please visit https://github.com/Moq/moq4/wiki/Quickstart
            _mockSparkCLRProxy.Setup(m => m.CreateSparkConf(It.IsAny<bool>())).Returns(new MockSparkConfProxy()); // mocks that rarely change can be set up once and kept

            _mockSparkCLRProxy.Setup(m => m.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(_mockSparkContextProxy.Object);
            _mockSparkCLRProxy.Setup(m => m.CreateStreamingContext(It.IsAny<SparkContext>(), It.IsAny<long>())).Returns(_mockStreamingContextProxy.Object);
            _mockRddProxy.Setup(m => m.CollectAndServe()).Returns(() =>
            {
                TcpListener listener = new TcpListener(IPAddress.Loopback, 0);
                listener.Start();

                Task.Run(() =>
                {
                    using (Socket socket = listener.AcceptSocket())
                    using (Stream ns = new NetworkStream(socket))
                    {
                        foreach (var item in result)
                        {
                            var ms = new MemoryStream();
                            new BinaryFormatter().Serialize(ms, item);
                            byte[] buffer = ms.ToArray();
                            SerDe.Write(ns, buffer.Length);
                            SerDe.Write(ns, buffer);
                        }
                    }
                });
                return (listener.LocalEndpoint as IPEndPoint).Port;
            });
            _mockRddProxy.Setup(m => m.RDDCollector).Returns(new RDDCollector());

            _mockSparkContextProxy.Setup(m => m.CreateCSharpRdd(It.IsAny<IRDDProxy>(), It.IsAny<byte[]>(), It.IsAny<Dictionary<string, string>>(),
                It.IsAny<List<string>>(), It.IsAny<bool>(), It.IsAny<List<Broadcast>>(), It.IsAny<List<byte[]>>()))
                .Returns<IRDDProxy, byte[], Dictionary<string, string>, List<string>, bool, List<Broadcast>, List<byte[]>>(
                (prefvJavaRddReference, command, environmentVariables, cSharpIncludes, preservePartitioning, broadcastVariables, accumulator) =>
                {
                    IEnumerable<dynamic> input = result ?? (new[] {
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The quick brown fox jumps over the lazy dog",
                    "The dog lazy"
                    }).AsEnumerable();

                    var formatter = new BinaryFormatter();
                    using (MemoryStream s = new MemoryStream(command))
                    {
                        int rddId = SerDe.ReadInt(s);
                        int stageId = SerDe.ReadInt(s);
                        int partitionId = SerDe.ReadInt(s);

                        SerDe.ReadString(s);
                        SerDe.ReadString(s);
                        CSharpWorkerFunc workerFunc = (CSharpWorkerFunc)formatter.Deserialize(new MemoryStream(SerDe.ReadBytes(s)));
                        var func = workerFunc.Func;
                        result = func(default(int), input);
                    }

                    if (result.FirstOrDefault() is byte[] && (result.First() as byte[]).Length == 8)
                    {
                        result = result.Where(e => (e as byte[]).Length != 8).Select(e => formatter.Deserialize(new MemoryStream(e as byte[])));
                    }

                    return _mockRddProxy.Object;
                });

            _streamingContext = new StreamingContext(new SparkContext("", ""), 1000);

        }
Example #25
        public void TestDStreamUpdateStateByKey()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            var lines = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(lines.DStreamProxy);

            var words = lines.FlatMap(l => l.Split(' '));

            var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

            var doubleCounts = pairs.GroupByKey().FlatMapValues(vs => vs).MapValues(v => 2 * v).ReduceByKey((x, y) => x + y);
            doubleCounts.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 9);

                foreach (object record in taken)
                {
                    KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
                    Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 2 * 23 : 2 * 22);
                }
            });

            // Cache to break pipelining into UpdateStateByKey, which relies on checkpointing that the mock proxy doesn't support
            pairs.Cache();

            var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count);
            state.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 9);

                foreach (object record in taken)
                {
                    KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
                    Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 24 : 23);
                }
            });
        }
Example #26
        public void TestDStreamMapWithState()
        {
            var mapwithStateDStreamProxy = new Mock<IDStreamProxy>();
            var streamingContextProxy = new Mock<IStreamingContextProxy>();
            streamingContextProxy.Setup(p =>
                p.CreateCSharpStateDStream(It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
                .Returns(mapwithStateDStreamProxy.Object);

            var sparkContextProxy = new Mock<ISparkContextProxy>();

            var sparkConfProxy = new Mock<ISparkConfProxy>();

            var sparkClrProxy = new Mock<ISparkCLRProxy>();
            sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
            sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
            sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(sparkContextProxy.Object);
            sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(sparkConfProxy.Object);

            // reset sparkCLRProxy for after test completes
            var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;
            try
            {
                SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;

                var sparkConf = new SparkConf(false);
                var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);

                var dstreamProxy = new Mock<IDStreamProxy>();
                var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);

                var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
                var stateDStream = pairDStream.MapWithState(stateSpec);
                var snapshotDStream = stateDStream.StateSnapshots();

                Assert.IsNotNull(stateDStream);
                Assert.IsNotNull(snapshotDStream);
            }
            finally
            {
                SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
            }
        }
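For reference, a minimal, hedged sketch of a stateful mapping function one might pass to MapWithState, assuming the State&lt;int&gt; helper exposes Exists/Get/Update in the style of Spark's State API (the running word count below is illustrative and not part of the test above):

 // Maintains a running count per word; each incoming (word, 1) pair updates the keyed state.
 // Assumes State<int>.Exists()/Get()/Update() behave like Spark's State API.
 var runningCount = new StateSpec<string, int, int, int>((word, one, state) =>
 {
     int sum = one + (state.Exists() ? state.Get() : 0); // fold the new value into prior state
     state.Update(sum);                                  // persist the new total for this key
     return sum;                                         // emit the updated count as the mapped value
 });
 var counts = pairDStream.MapWithState(runningCount);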
Example #27
        private static void DStreamReduceByKeyAndWindowSample()
        {
            count = 0; // batch counter; a field defined elsewhere in the sample class

            const long batchIntervalMs = 2000; // batch interval is in milliseconds
            const int windowDuration = 26;     // window duration in seconds
            const int numPartitions = 2;
            // slideDuration (in seconds) is also a field defined elsewhere in the sample class

            var sc = SparkCLRSamples.SparkContext;
            var ssc = new StreamingContext(sc, batchIntervalMs);

            // create the RDD
            var seedRDD = sc.Parallelize(Enumerable.Range(0, 100), numPartitions);
            var numbers = new ConstantInputDStream<int>(seedRDD, ssc);
            var pairs = numbers.Map(n => new KeyValuePair<int, int>(n % numPartitions, n));
            var reduced = pairs.ReduceByKeyAndWindow(
                    (int x, int y) => (x + y),
                    (int x, int y) => (x - y),
                    windowDuration,
                    slideDuration,
                    numPartitions
                );

            reduced.ForeachRDD((time, rdd) =>
            {
                count++;
                var taken = rdd.Collect();
                int partitions = rdd.GetNumPartitions();

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Batch: " + count);
                Console.WriteLine("Count: " + taken.Length);
                Console.WriteLine("Partitions: " + partitions);

                Assert.AreEqual(taken.Length, 2);
                Assert.AreEqual(partitions, numPartitions);

                foreach (object record in taken)
                {
                    KeyValuePair<int, int> sum = (KeyValuePair<int, int>)record;
                    Console.WriteLine("Key: {0}, Value: {1}", sum.Key, sum.Value);
                    // Once the batch count exceeds windowDuration / slideDuration, the per-key sums settle at
                    // windowDuration / (batchIntervalMs / 1000) * 2450 for the even key and * 2500 for the odd key
                    Assert.AreEqual(sum.Value, (count > windowDuration / slideDuration ? windowDuration : count * slideDuration) / (batchIntervalMs / 1000) * (sum.Key == 0 ? 2450 : 2500));
                }
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
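As a sanity check on the constants asserted above: the seed RDD holds 0..99, so key 0 collects the even numbers (sum 2450) and key 1 the odd numbers (sum 2500), and each batch in the window contributes one copy of those sums. A standalone check:

 var evenSum = Enumerable.Range(0, 100).Where(n => n % 2 == 0).Sum(); // 2450
 var oddSum = Enumerable.Range(0, 100).Where(n => n % 2 == 1).Sum();  // 2500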
Example #28
        public void TestDStreamJoin()
        {
            var ssc = new StreamingContext(new SparkContext("", ""), 1);
            Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));

            var lines = ssc.TextFileStream(Path.GetTempPath());
            Assert.IsNotNull(lines.DStreamProxy);

            var words = lines.FlatMap(l => l.Split(' '));

            var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

            var wordCounts = pairs.ReduceByKey((x, y) => x + y);

            var left = wordCounts.Filter(x => x.Key != "quick" && x.Key != "lazy");
            var right = wordCounts.Filter(x => x.Key != "brown");

            var groupWith = left.GroupWith(right);
            groupWith.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 9);

                foreach (object record in taken)
                {
                    KeyValuePair<string, Tuple<List<int>, List<int>>> countByWord = (KeyValuePair<string, Tuple<List<int>, List<int>>>)record;
                    if (countByWord.Key == "quick" || countByWord.Key == "lazy")
                        Assert.AreEqual(countByWord.Value.Item1.Count, 0);
                    else if (countByWord.Key == "brown")
                        Assert.AreEqual(countByWord.Value.Item2.Count, 0);
                    else
                    {
                        Assert.AreEqual(countByWord.Value.Item1[0], countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
                        Assert.AreEqual(countByWord.Value.Item2[0], countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
                    }
                }
            });

            var innerJoin = left.Join(right);
            innerJoin.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 6);

                foreach (object record in taken)
                {
                    KeyValuePair<string, Tuple<int, int>> countByWord = (KeyValuePair<string, Tuple<int, int>>)record;
                    Assert.AreEqual(countByWord.Value.Item1, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
                    Assert.AreEqual(countByWord.Value.Item2, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
                }
            });

            var leftOuterJoin = left.LeftOuterJoin(right);
            leftOuterJoin.ForeachRDD((time, rdd) =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 7);

                foreach (object record in taken)
                {
                    KeyValuePair<string, Tuple<int, Option<int>>> countByWord = (KeyValuePair<string, Tuple<int, Option<int>>>)record;
                    Assert.AreEqual(countByWord.Value.Item1, countByWord.Key == "The" || countByWord.Key == "dog" ? 23 : 22);
                    Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
                        countByWord.Value.Item2.IsDefined && countByWord.Value.Item2.GetValue() == 23 : (countByWord.Key == "brown" ?
                        !countByWord.Value.Item2.IsDefined : countByWord.Value.Item2.IsDefined && countByWord.Value.Item2.GetValue() == 22));
                }
            });

            var rightOuterJoin = left.RightOuterJoin(right);
            rightOuterJoin.ForeachRDD(rdd =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 8);

                foreach (object record in taken)
                {
                    KeyValuePair<string, Tuple<Option<int>, int>> countByWord = (KeyValuePair<string, Tuple<Option<int>, int>>)record;
                    Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
                        countByWord.Value.Item1.IsDefined && countByWord.Value.Item1.GetValue() == 23 :
                        (countByWord.Key == "quick" || countByWord.Key == "lazy" ? !countByWord.Value.Item1.IsDefined :
                        countByWord.Value.Item1.IsDefined && countByWord.Value.Item1.GetValue() == 22));
                    Assert.AreEqual(countByWord.Value.Item2, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22);
                }
            });
            
            var fullOuterJoin = left.FullOuterJoin(right);
            fullOuterJoin.ForeachRDD(rdd =>
            {
                var taken = rdd.Collect();
                Assert.AreEqual(taken.Length, 9);

                foreach (object record in taken)
                {
                    KeyValuePair<string, Tuple<Option<int>, Option<int>>> countByWord = (KeyValuePair<string, Tuple<Option<int>, Option<int>>>)record;
                    Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" ?
                        countByWord.Value.Item1.IsDefined && countByWord.Value.Item1.GetValue() == 23 :
                        (countByWord.Key == "quick" || countByWord.Key == "lazy" ? !countByWord.Value.Item1.IsDefined :
                        countByWord.Value.Item1.IsDefined && countByWord.Value.Item1.GetValue() == 22));

                    Assert.IsTrue(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ?
                        countByWord.Value.Item2.IsDefined && countByWord.Value.Item2.GetValue() == 23 :
                        (countByWord.Key == "brown" ? !countByWord.Value.Item2.IsDefined : countByWord.Value.Item2.IsDefined && countByWord.Value.Item2.GetValue() == 22));
                }
            });
        }
Example #29
 /// <summary>
 /// Create an input stream that directly pulls messages from Kafka brokers, starting at specific offsets.
 /// 
 /// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
 /// in each batch duration and processes them without storing them.
 /// 
 /// This does not use Zookeeper to store offsets. The consumed offsets are tracked
 /// by the stream itself. For interoperability with Kafka monitoring tools that depend on
 /// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
 /// You can access the offsets used in each batch from the generated RDDs (see
 /// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
 /// To recover from driver failures, you have to enable checkpointing in the StreamingContext;
 /// the consumed offsets can then be recovered from the checkpoint.
 /// See the programming guide for details (constraints, etc.).
 /// 
 /// </summary>
 /// <param name="ssc">Spark Streaming Context</param>
 /// <param name="topics">list of topic_name to consume.</param>
 /// <param name="kafkaParams">
 ///     Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
 ///     with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.        
 /// </param>
 /// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
 /// <returns>A DStream object</returns>
 public static DStream<KeyValuePair<byte[], byte[]>> CreateDirectStream(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
 {
     return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
 }
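A minimal call sketch for the method above, assuming a StreamingContext ssc is in scope; the broker list and topic name are placeholders, and the empty offsets dictionary is assumed to fall back to the default starting offsets (mirroring the PySpark API):

 var kafkaParams = new Dictionary<string, string>
 {
     { "metadata.broker.list", "broker1:9092,broker2:9092" } // Kafka brokers, NOT zookeeper servers
 };
 var topics = new List<string> { "sample-topic" };           // placeholder topic name
 var directStream = CreateDirectStream(ssc, topics, kafkaParams, new Dictionary<string, long>());
 directStream.ForeachRDD(rdd => Console.WriteLine("Messages in batch: " + rdd.Count()));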
Example #30
 /// <summary>
 /// Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
 /// The unioned stream will receive messages from all partitions of the EventHubs
 /// </summary>
 /// <param name="ssc">Streaming context</param>
 /// <param name="eventhubsParams"> Parameters for EventHubs.
 ///  Required parameters are:
 ///  "eventhubs.policyname": EventHubs policy name
 ///  "eventhubs.policykey": EventHubs policy key
 ///  "eventhubs.namespace": EventHubs namespace
 ///  "eventhubs.name": EventHubs name
 ///  "eventhubs.partition.count": Number of partitions
 ///  "eventhubs.checkpoint.dir": checkpoint directory on HDFS
 ///
 ///  Optional parameters are:
 ///  "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
 ///  "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
 ///  "eventhubs.filter.enqueuetime": Unix time, millisecond since epoch, default to "0"
 ///  "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
 ///  "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
 /// </param>
 /// <param name="storageLevelType">Storage level, by default it is MEMORY_ONLY</param>
 /// <returns>DStream with byte[] representing events from EventHub</returns>
 public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, Dictionary<string, string> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
 {
     return new DStream<byte[]>(ssc.streamingContextProxy.EventHubsUnionStream(eventhubsParams, storageLevelType), ssc, SerializedMode.None);
 }
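A hedged usage sketch built only from the required parameters documented above, assuming a StreamingContext ssc is in scope; all values are placeholders:

 var eventhubsParams = new Dictionary<string, string>
 {
     { "eventhubs.policyname", "listen-policy" },      // placeholder policy name
     { "eventhubs.policykey", "<policy-key>" },        // placeholder policy key
     { "eventhubs.namespace", "sample-namespace" },    // placeholder namespace
     { "eventhubs.name", "sample-eventhub" },          // placeholder EventHubs name
     { "eventhubs.partition.count", "4" },
     { "eventhubs.checkpoint.dir", "/checkpoints/eventhubs" }
 };
 var unionStream = CreateUnionStream(ssc, eventhubsParams); // storage level defaults to MEMORY_ONLY
 unionStream.ForeachRDD(rdd => Console.WriteLine("Events in batch: " + rdd.Count()));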
Example #31
 /// <summary>
 /// Create an input stream that pulls messages from a Kafka Broker.
 /// </summary>
 /// <param name="ssc">Spark Streaming Context</param>
 /// <param name="zkQuorum">Zookeeper quorum (hostname:port,hostname:port,..).</param>
 /// <param name="groupId">The group id for this consumer.</param>
 /// <param name="topics">Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.</param>
 /// <param name="kafkaParams">Additional params for Kafka</param>
 /// <returns>A DStream object</returns>
 public static DStream<KeyValuePair<byte[], byte[]>> CreateStream(StreamingContext ssc, string zkQuorum, string groupId, Dictionary<string, int> topics, Dictionary<string, string> kafkaParams)
 {
     return CreateStream(ssc, zkQuorum, groupId, topics, kafkaParams, StorageLevelType.MEMORY_AND_DISK_SER_2);
 }
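For comparison with the direct stream in Example #29, a receiver-based call sketch, again assuming a StreamingContext ssc is in scope; the Zookeeper quorum, group id, and topic are placeholders:

 var topicThreads = new Dictionary<string, int> { { "sample-topic", 1 } }; // topic -> reader threads
 var receiverStream = CreateStream(ssc, "zk1:2181,zk2:2181", "sample-group", topicThreads,
     new Dictionary<string, string>()); // no extra Kafka params; storage level defaults to MEMORY_AND_DISK_SER_2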