Beispiel #1
0
        /// <summary>
        /// Verifies that <c>MapWithState</c> on a pair DStream, and <c>StateSnapshots</c> on its
        /// result, both yield non-null objects when every proxy involved is a Moq mock.
        /// </summary>
        public void TestDStreamMapWithState()
        {
            // The streaming context proxy hands back this mocked DStream proxy for any
            // CreateCSharpStateDStream call.
            Mock<IDStreamProxy> stateStreamProxyMock = new Mock<IDStreamProxy>();
            Mock<IStreamingContextProxy> streamingProxyMock = new Mock<IStreamingContextProxy>();
            streamingProxyMock
                .Setup(p => p.CreateCSharpStateDStream(
                    It.IsAny<IDStreamProxy>(),
                    It.IsAny<byte[]>(),
                    It.IsAny<string>(),
                    It.IsAny<string>(),
                    It.IsAny<string>()))
                .Returns(stateStreamProxyMock.Object);

            Mock<ISparkContextProxy> contextProxyMock = new Mock<ISparkContextProxy>();
            Mock<ISparkConfProxy> confProxyMock = new Mock<ISparkConfProxy>();

            // Root proxy wired to return the mocks created above.
            Mock<ISparkCLRProxy> clrProxyMock = new Mock<ISparkCLRProxy>();
            clrProxyMock.Setup(p => p.StreamingContextProxy).Returns(streamingProxyMock.Object);
            clrProxyMock.Setup(p => p.SparkContextProxy).Returns(contextProxyMock.Object);
            clrProxyMock.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(contextProxyMock.Object);
            clrProxyMock.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(confProxyMock.Object);

            // SparkCLREnvironment.SparkCLRProxy is static; remember the current value so the
            // finally block can put it back whatever happens in the test body.
            var previousClrProxy = SparkCLREnvironment.SparkCLRProxy;

            try
            {
                SparkCLREnvironment.SparkCLRProxy = clrProxyMock.Object;

                SparkConf conf = new SparkConf(false);
                SparkContext sparkContext = new SparkContext(contextProxyMock.Object, conf);
                StreamingContext streamingContext = new StreamingContext(sparkContext, 10);

                Mock<IDStreamProxy> inputProxyMock = new Mock<IDStreamProxy>();
                var keyedStream = new DStream<KeyValuePair<string, int>>(inputProxyMock.Object, streamingContext);

                // Mapping function simply returns the incoming value.
                var spec = new StateSpec<string, int, int, int>((k, v, s) => v);
                var mappedStream = keyedStream.MapWithState(spec);
                var snapshotStream = mappedStream.StateSnapshots();

                Assert.IsNotNull(mappedStream);
                Assert.IsNotNull(snapshotStream);
            }
            finally
            {
                SparkCLREnvironment.SparkCLRProxy = previousClrProxy;
            }
        }
Beispiel #2
0
        /// <summary>
        /// Runs <c>MapWithStateHelper.Execute</c> once with a state spec that has no initial
        /// state and once with a spec that has one, and checks that a non-null RDD is
        /// returned in both cases. All RDD and context proxies are Moq mocks.
        /// </summary>
        public void TestDStreamMapWithStateMapWithStateHelper()
        {
            // Fixed batch time. The previous DateTime.UtcNow.Millisecond is only the 0-999
            // millisecond component of the wall clock, so it made the input differ from run to
            // run. 500 lies inside the range the old expression could produce, and the local is
            // deliberately a plain int so the argument type seen by Execute is unchanged.
            int batchTime = 500;

            // test when initialStateRdd is null
            var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v).NumPartitions(2).Timeout(TimeSpan.FromSeconds(100));
            var helper = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec);

            var sparkContextProxy = new Mock<ISparkContextProxy>();
            var sc = new SparkContext(sparkContextProxy.Object, null);

            // Any pairwise-RDD request on the context proxy is answered with a mock proxy.
            var pairwiseRddProxy = new Mock<IRDDProxy>();
            sparkContextProxy.Setup(p => p.CreatePairwiseRDD(It.IsAny<IRDDProxy>(), It.IsAny<int>(), It.IsAny<long>())).Returns(pairwiseRddProxy.Object);

            // Any C# RDD request is answered with a proxy whose Union also returns a mock.
            var pipelinedRddProxy = new Mock<IRDDProxy>();
            pipelinedRddProxy.Setup(p => p.Union(It.IsAny<IRDDProxy>())).Returns(new Mock<IRDDProxy>().Object);

            sparkContextProxy.Setup(p =>
                p.CreateCSharpRdd(It.IsAny<IRDDProxy>(), It.IsAny<byte[]>(), It.IsAny<Dictionary<string, string>>(), It.IsAny<List<string>>(), It.IsAny<bool>(), It.IsAny<List<Broadcast>>(), It.IsAny<List<byte[]>>()))
                .Returns(pipelinedRddProxy.Object);

            var valueRddProxy = new Mock<IRDDProxy>();
            var valuesRdd = new RDD<dynamic>(valueRddProxy.Object, sc);

            var resultRdd = helper.Execute(batchTime, null, valuesRdd);

            Assert.IsNotNull(resultRdd);

            // test when initialStateRdd is not null
            var initialStateRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
            var stateSpec2 = new StateSpec<string, int, int, int>((k, v, s) => v).InitialState(initialStateRdd).NumPartitions(2);
            var helper2 = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec2);

            var resultRdd2 = helper2.Execute(batchTime, null, valuesRdd);

            Assert.IsNotNull(resultRdd2);
        }
Beispiel #3
0
        /// <summary>
        /// Sample that counts words from a text-file stream, feeds the per-batch counts through
        /// <c>MapWithState</c> to keep a running total per word, and prints every state snapshot
        /// to the console. The streaming context is obtained through
        /// <c>StreamingContext.GetOrCreate</c> using a "checkpoint" folder under the sample data
        /// location.
        /// </summary>
        internal static void DStreamMapWithStateSample()
        {
            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    SparkContext sparkContext = SparkCLRSamples.SparkContext;
                    var streamingContext = new StreamingContext(sparkContext, 10000);
                    streamingContext.Checkpoint(checkpointPath);

                    // The file stream is unioned with itself before being split into words.
                    var textLines = streamingContext.TextFileStream(Path.Combine(directory, "test1"));
                    textLines = streamingContext.Union(textLines, textLines);

                    // line -> words -> (word, 1) -> (word, count within the batch)
                    var wordCounts = textLines
                        .FlatMap(l => l.Split(' '))
                        .Map(w => new KeyValuePair<string, int>(w, 1))
                        .ReduceByKey((x, y) => x + y);

                    // Counts the state starts with.
                    var seedCounts = new[]
                    {
                        new KeyValuePair<string, int>("NOT_A_WORD", 1024),
                        new KeyValuePair<string, int>("dog", 10000),
                    };
                    var initialState = sparkContext.Parallelize(seedCounts, 1);

                    var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
                    {
                        // A timing-out entry is reported with its stored count and not updated.
                        if (state.IsTimingOut())
                        {
                            Console.WriteLine("Found timing out word: {0}", word);
                            return new KeyValuePair<string, int>(word, state.Get());
                        }

                        // Add this batch's count to whatever is already stored (0 when nothing is).
                        int previous = state.Exists() ? state.Get() : 0;
                        int total = previous + count;
                        state.Update(total);
                        Console.WriteLine("word: {0}, count: {1}", word, total);
                        return new KeyValuePair<string, int>(word, total);
                    }).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));

                    var stateSnapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
                    stateSnapshots.ForeachRDD((double batchTime, RDD<dynamic> snapshotRdd) =>
                    {
                        const string separator = "-------------------------------------------";

                        Console.WriteLine(separator);
                        Console.WriteLine("Snapshots @ Time: {0}", batchTime);
                        Console.WriteLine(separator);

                        foreach (KeyValuePair<string, int> entry in snapshotRdd.Collect())
                        {
                            Console.WriteLine("[{0}, {1}]", entry.Key, entry.Value);
                        }
                        Console.WriteLine();
                    });

                    return streamingContext;
                });

            ssc.Start();

            // NOTE(review): presumably feeds words.txt into the watched folder — confirm against StartFileServer.
            StartFileServer(directory, "words.txt", 100);

            ssc.AwaitTermination();
            ssc.Stop();
        }
Beispiel #4
0
        /// <summary>
        /// Calls <c>MapWithStateHelper.Execute</c> twice against mocked proxies: first with a
        /// state spec that has no initial state, then with one built via <c>InitialState</c>.
        /// Each call is expected to return a non-null RDD.
        /// </summary>
        public void TestDStreamMapWithStateMapWithStateHelper()
        {
            // Deterministic stand-in for DateTime.UtcNow.Millisecond. That property yields just
            // the 0-999 millisecond part of the current time, so the test input used to change on
            // every run. The value stays within that range and stays an int, so Execute receives
            // the same argument type as before.
            int fixedBatchTime = 500;

            // test when initialStateRdd is null
            var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v).NumPartitions(2).Timeout(TimeSpan.FromSeconds(100));
            var helper = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec);

            var sparkContextProxy = new Mock<ISparkContextProxy>();
            var sc = new SparkContext(sparkContextProxy.Object, null);

            // CreatePairwiseRDD returns a mock proxy regardless of arguments.
            var pairwiseRddProxy = new Mock<IRDDProxy>();
            sparkContextProxy.Setup(p => p.CreatePairwiseRDD(It.IsAny<IRDDProxy>(), It.IsAny<int>(), It.IsAny<long>())).Returns(pairwiseRddProxy.Object);

            // CreateCSharpRdd returns a proxy whose Union is itself mocked.
            var pipelinedRddProxy = new Mock<IRDDProxy>();
            pipelinedRddProxy.Setup(p => p.Union(It.IsAny<IRDDProxy>())).Returns(new Mock<IRDDProxy>().Object);

            sparkContextProxy.Setup(p =>
                p.CreateCSharpRdd(It.IsAny<IRDDProxy>(), It.IsAny<byte[]>(), It.IsAny<Dictionary<string, string>>(), It.IsAny<List<string>>(), It.IsAny<bool>(), It.IsAny<List<Broadcast>>(), It.IsAny<List<byte[]>>()))
                .Returns(pipelinedRddProxy.Object);

            var valueRddProxy = new Mock<IRDDProxy>();
            var valuesRdd = new RDD<dynamic>(valueRddProxy.Object, sc);

            var resultRdd = helper.Execute(fixedBatchTime, null, valuesRdd);

            Assert.IsNotNull(resultRdd);

            // test when initialStateRdd is not null
            var initialStateRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
            var stateSpec2 = new StateSpec<string, int, int, int>((k, v, s) => v).InitialState(initialStateRdd).NumPartitions(2);
            var helper2 = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec2);

            var resultRdd2 = helper2.Execute(fixedBatchTime, null, valuesRdd);

            Assert.IsNotNull(resultRdd2);
        }
Beispiel #5
0
        /// <summary>
        /// With all Spark proxies mocked, builds a pair DStream, applies <c>MapWithState</c>
        /// and then <c>StateSnapshots</c>, and asserts that neither result is null.
        /// </summary>
        public void TestDStreamMapWithState()
        {
            // Arrange: mocked proxy graph.
            var mockedStateDStream = new Mock<IDStreamProxy>();
            var mockedStreamingContext = new Mock<IStreamingContextProxy>();
            mockedStreamingContext
                .Setup(proxy => proxy.CreateCSharpStateDStream(
                    It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
                .Returns(mockedStateDStream.Object);

            var mockedSparkContext = new Mock<ISparkContextProxy>();
            var mockedSparkConf = new Mock<ISparkConfProxy>();

            var mockedClr = new Mock<ISparkCLRProxy>();
            mockedClr.Setup(proxy => proxy.StreamingContextProxy).Returns(mockedStreamingContext.Object);
            mockedClr.Setup(proxy => proxy.SparkContextProxy).Returns(mockedSparkContext.Object);
            mockedClr.Setup(proxy => proxy.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(mockedSparkContext.Object);
            mockedClr.Setup(proxy => proxy.CreateSparkConf(It.IsAny<bool>())).Returns(mockedSparkConf.Object);

            // The static proxy is swapped for the duration of the test and restored in finally.
            var savedProxy = SparkCLREnvironment.SparkCLRProxy;
            try
            {
                SparkCLREnvironment.SparkCLRProxy = mockedClr.Object;

                var conf = new SparkConf(false);
                var sparkContext = new SparkContext(mockedSparkContext.Object, conf);
                var streamingContext = new StreamingContext(sparkContext, 10);

                var sourceProxy = new Mock<IDStreamProxy>();
                var source = new DStream<KeyValuePair<string, int>>(sourceProxy.Object, streamingContext);

                // Act: the mapping function passes the value straight through.
                var passThroughSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
                var withState = source.MapWithState(passThroughSpec);
                var snapshots = withState.StateSnapshots();

                // Assert
                Assert.IsNotNull(withState);
                Assert.IsNotNull(snapshots);
            }
            finally
            {
                SparkCLREnvironment.SparkCLRProxy = savedProxy;
            }
        }
Beispiel #6
0
        /// <summary>
        /// Streaming word-count sample built on <c>MapWithState</c>: per-batch word counts are
        /// merged into a per-word running total held in state, and each state snapshot RDD is
        /// written to the console. The context comes from <c>StreamingContext.GetOrCreate</c>
        /// with a "checkpoint" directory below the configured sample data location.
        /// </summary>
        internal static void DStreamMapWithStateSample()
        {
            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
            {
                SparkContext sc = SparkCLRSamples.SparkContext;

                // batch interval is in milliseconds
                StreamingContext created = new StreamingContext(sc, 10000L);
                created.Checkpoint(checkpointPath);

                string watchedFolder = Path.Combine(directory, "test1");
                var input = created.TextFileStream(watchedFolder);
                input = created.Union(input, input);

                var tokens = input.FlatMap(l => l.Split(' '));
                var ones = tokens.Map(w => new KeyValuePair<string, int>(w, 1));
                var perBatchCounts = ones.ReduceByKey((x, y) => x + y);

                var startingState = sc.Parallelize(
                    new[]
                    {
                        new KeyValuePair<string, int>("NOT_A_WORD", 1024),
                        new KeyValuePair<string, int>("dog", 10000),
                    },
                    1);

                var spec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
                {
                    if (!state.IsTimingOut())
                    {
                        // Normal path: fold the batch count into the stored total (0 if absent).
                        int runningTotal = count;
                        if (state.Exists())
                        {
                            runningTotal = state.Get() + count;
                        }

                        state.Update(runningTotal);
                        Console.WriteLine("word: {0}, count: {1}", word, runningTotal);
                        return new KeyValuePair<string, int>(word, runningTotal);
                    }

                    // Timing-out entries are only reported; their state is left untouched.
                    Console.WriteLine("Found timing out word: {0}", word);
                    return new KeyValuePair<string, int>(word, state.Get());
                }).NumPartitions(1).InitialState(startingState).Timeout(TimeSpan.FromSeconds(30));

                var snapshotStream = perBatchCounts.MapWithState(spec).StateSnapshots();
                snapshotStream.ForeachRDD((double time, RDD<dynamic> rdd) =>
                {
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Snapshots @ Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");

                    // Print every (word, count) pair held in this snapshot.
                    foreach (KeyValuePair<string, int> pair in rdd.Collect())
                    {
                        Console.WriteLine("[{0}, {1}]", pair.Key, pair.Value);
                    }

                    Console.WriteLine();
                });

                return created;
            });

            ssc.Start();

            // NOTE(review): StartFileServer is defined elsewhere; presumably it drops words.txt into the watched folder — confirm.
            StartFileServer(directory, "words.txt", 100);

            ssc.AwaitTermination();
            ssc.Stop();
        }