public void TestDStreamMapWithState()
{
    // The streaming-context proxy is rigged so that any CreateCSharpStateDStream
    // call hands back the mocked state-stream proxy.
    var stateStreamProxyMock = new Mock<IDStreamProxy>();
    var streamingProxyMock = new Mock<IStreamingContextProxy>();
    streamingProxyMock
        .Setup(m => m.CreateCSharpStateDStream(
            It.IsAny<IDStreamProxy>(),
            It.IsAny<byte[]>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>()))
        .Returns(stateStreamProxyMock.Object);

    var contextProxyMock = new Mock<ISparkContextProxy>();
    var confProxyMock = new Mock<ISparkConfProxy>();

    // Root CLR proxy resolves every sub-proxy to the mocks above.
    var clrProxyMock = new Mock<ISparkCLRProxy>();
    clrProxyMock.Setup(m => m.StreamingContextProxy).Returns(streamingProxyMock.Object);
    clrProxyMock.Setup(m => m.SparkContextProxy).Returns(contextProxyMock.Object);
    clrProxyMock.Setup(m => m.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(contextProxyMock.Object);
    clrProxyMock.Setup(m => m.CreateSparkConf(It.IsAny<bool>())).Returns(confProxyMock.Object);

    // Swap the mocked environment in; the finally block restores the real
    // proxy so other tests are not affected.
    var savedClrProxy = SparkCLREnvironment.SparkCLRProxy;
    try
    {
        SparkCLREnvironment.SparkCLRProxy = clrProxyMock.Object;

        var conf = new SparkConf(false);
        var streamingContext = new StreamingContext(new SparkContext(contextProxyMock.Object, conf), 10);

        var sourceProxyMock = new Mock<IDStreamProxy>();
        var pairStream = new DStream<KeyValuePair<string, int>>(sourceProxyMock.Object, streamingContext);

        // Identity mapping function: the mapped value is just the input value.
        var spec = new StateSpec<string, int, int, int>((k, v, s) => v);
        var mappedStream = pairStream.MapWithState(spec);
        var snapshotStream = mappedStream.StateSnapshots();

        Assert.IsNotNull(mappedStream);
        Assert.IsNotNull(snapshotStream);
    }
    finally
    {
        SparkCLREnvironment.SparkCLRProxy = savedClrProxy;
    }
}
public void TestDStreamMapWithStateMapWithStateHelper()
{
    // Case 1: no initial-state RDD configured on the StateSpec.
    var specWithoutInitialState = new StateSpec<string, int, int, int>((k, v, s) => v)
        .NumPartitions(2)
        .Timeout(TimeSpan.FromSeconds(100));
    var helperWithoutInitialState =
        new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, specWithoutInitialState);

    var contextProxyMock = new Mock<ISparkContextProxy>();
    var context = new SparkContext(contextProxyMock.Object, null);

    // Pairwise and pipelined RDD proxies are stubbed so Execute can build
    // its intermediate RDDs without a real backend.
    var pairwiseProxyMock = new Mock<IRDDProxy>();
    contextProxyMock
        .Setup(m => m.CreatePairwiseRDD(It.IsAny<IRDDProxy>(), It.IsAny<int>(), It.IsAny<long>()))
        .Returns(pairwiseProxyMock.Object);

    var pipelinedProxyMock = new Mock<IRDDProxy>();
    pipelinedProxyMock
        .Setup(m => m.Union(It.IsAny<IRDDProxy>()))
        .Returns(new Mock<IRDDProxy>().Object);
    contextProxyMock
        .Setup(m => m.CreateCSharpRdd(
            It.IsAny<IRDDProxy>(),
            It.IsAny<byte[]>(),
            It.IsAny<Dictionary<string, string>>(),
            It.IsAny<List<string>>(),
            It.IsAny<bool>(),
            It.IsAny<List<Broadcast>>(),
            It.IsAny<List<byte[]>>()))
        .Returns(pipelinedProxyMock.Object);

    var inputProxyMock = new Mock<IRDDProxy>();
    var inputRdd = new RDD<dynamic>(inputProxyMock.Object, context);

    var firstResult = helperWithoutInitialState.Execute(DateTime.UtcNow.Millisecond, null, inputRdd);
    Assert.IsNotNull(firstResult);

    // Case 2: a non-null initial-state RDD is supplied.
    var seedRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
    var specWithInitialState = new StateSpec<string, int, int, int>((k, v, s) => v)
        .InitialState(seedRdd)
        .NumPartitions(2);
    var helperWithInitialState =
        new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, specWithInitialState);

    var secondResult = helperWithInitialState.Execute(DateTime.UtcNow.Millisecond, null, inputRdd);
    Assert.IsNotNull(secondResult);
}
internal static void DStreamMapWithStateSample()
{
    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    // Recover the streaming context from the checkpoint when one exists;
    // otherwise build it fresh via the factory lambda below.
    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        SparkContext sc = SparkCLRSamples.SparkContext;
        StreamingContext context = new StreamingContext(sc, 10000);
        context.Checkpoint(checkpointPath);

        // Word-count pipeline over a monitored text-file directory.
        var lines = context.TextFileStream(Path.Combine(directory, "test1"));
        lines = context.Union(lines, lines);
        var words = lines.FlatMap(l => l.Split(' '));
        var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
        var wordCounts = pairs.ReduceByKey((x, y) => x + y);

        // Seed state so the snapshot output starts non-empty.
        var initialState = sc.Parallelize(
            new[]
            {
                new KeyValuePair<string, int>("NOT_A_WORD", 1024),
                new KeyValuePair<string, int>("dog", 10000),
            }, 1);

        var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((key, delta, keyState) =>
        {
            // A timing-out key cannot be updated; emit its last known count.
            if (keyState.IsTimingOut())
            {
                Console.WriteLine("Found timing out word: {0}", key);
                return new KeyValuePair<string, int>(key, keyState.Get());
            }

            var runningTotal = keyState.Exists() ? keyState.Get() : 0;
            keyState.Update(runningTotal + delta);
            Console.WriteLine("word: {0}, count: {1}", key, runningTotal + delta);
            return new KeyValuePair<string, int>(key, runningTotal + delta);
        }).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));

        var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
        snapshots.ForeachRDD((double time, RDD<dynamic> rdd) =>
        {
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Snapshots @ Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            foreach (KeyValuePair<string, int> record in rdd.Collect())
            {
                Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
            }

            Console.WriteLine();
        });

        return context;
    });

    ssc.Start();
    StartFileServer(directory, "words.txt", 100);
    ssc.AwaitTermination();
    ssc.Stop();
}
public void TestDStreamMapWithStateMapWithStateHelper()
{
    // First scenario: the StateSpec carries no initial-state RDD.
    var plainSpec = new StateSpec<string, int, int, int>((k, v, s) => v)
        .NumPartitions(2)
        .Timeout(TimeSpan.FromSeconds(100));
    var plainHelper = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, plainSpec);

    var scProxy = new Mock<ISparkContextProxy>();
    var spark = new SparkContext(scProxy.Object, null);

    // Stub the RDD factory calls Execute makes on the context proxy.
    var pairwiseRdd = new Mock<IRDDProxy>();
    scProxy
        .Setup(m => m.CreatePairwiseRDD(It.IsAny<IRDDProxy>(), It.IsAny<int>(), It.IsAny<long>()))
        .Returns(pairwiseRdd.Object);

    var pipelinedRdd = new Mock<IRDDProxy>();
    pipelinedRdd.Setup(m => m.Union(It.IsAny<IRDDProxy>())).Returns(new Mock<IRDDProxy>().Object);
    scProxy
        .Setup(m => m.CreateCSharpRdd(
            It.IsAny<IRDDProxy>(),
            It.IsAny<byte[]>(),
            It.IsAny<Dictionary<string, string>>(),
            It.IsAny<List<string>>(),
            It.IsAny<bool>(),
            It.IsAny<List<Broadcast>>(),
            It.IsAny<List<byte[]>>()))
        .Returns(pipelinedRdd.Object);

    var sourceRdd = new RDD<dynamic>(new Mock<IRDDProxy>().Object, spark);

    var outcome = plainHelper.Execute(DateTime.UtcNow.Millisecond, null, sourceRdd);
    Assert.IsNotNull(outcome);

    // Second scenario: InitialState supplies a seed RDD.
    var seed = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
    var seededSpec = new StateSpec<string, int, int, int>((k, v, s) => v)
        .InitialState(seed)
        .NumPartitions(2);
    var seededHelper = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, seededSpec);

    var seededOutcome = seededHelper.Execute(DateTime.UtcNow.Millisecond, null, sourceRdd);
    Assert.IsNotNull(seededOutcome);
}
public void TestDStreamMapWithState()
{
    // Arrange the proxy mocks: CreateCSharpStateDStream always yields the
    // canned state-stream proxy.
    var stateProxy = new Mock<IDStreamProxy>();
    var sscProxy = new Mock<IStreamingContextProxy>();
    sscProxy
        .Setup(m => m.CreateCSharpStateDStream(
            It.IsAny<IDStreamProxy>(),
            It.IsAny<byte[]>(),
            It.IsAny<string>(),
            It.IsAny<string>(),
            It.IsAny<string>()))
        .Returns(stateProxy.Object);

    var scProxy = new Mock<ISparkContextProxy>();
    var confProxy = new Mock<ISparkConfProxy>();

    var environmentProxy = new Mock<ISparkCLRProxy>();
    environmentProxy.Setup(m => m.StreamingContextProxy).Returns(sscProxy.Object);
    environmentProxy.Setup(m => m.SparkContextProxy).Returns(scProxy.Object);
    environmentProxy.Setup(m => m.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(scProxy.Object);
    environmentProxy.Setup(m => m.CreateSparkConf(It.IsAny<bool>())).Returns(confProxy.Object);

    // Remember the real proxy so it can be reinstated after the assertions.
    var previousProxy = SparkCLREnvironment.SparkCLRProxy;
    try
    {
        SparkCLREnvironment.SparkCLRProxy = environmentProxy.Object;

        var conf = new SparkConf(false);
        var ssc = new StreamingContext(new SparkContext(scProxy.Object, conf), 10);

        var inputProxy = new Mock<IDStreamProxy>();
        var keyedStream = new DStream<KeyValuePair<string, int>>(inputProxy.Object, ssc);

        // StateSpec whose mapping simply echoes the incoming value.
        var spec = new StateSpec<string, int, int, int>((k, v, s) => v);
        var stateful = keyedStream.MapWithState(spec);
        var snapshots = stateful.StateSnapshots();

        Assert.IsNotNull(stateful);
        Assert.IsNotNull(snapshots);
    }
    finally
    {
        SparkCLREnvironment.SparkCLRProxy = previousProxy;
    }
}
internal static void DStreamMapWithStateSample()
{
    string dataDir = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointDir = Path.Combine(dataDir, "checkpoint");

    // Rebuild from the checkpoint if present; otherwise the lambda wires up
    // a brand-new streaming pipeline.
    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointDir, () =>
    {
        SparkContext sc = SparkCLRSamples.SparkContext;
        // Batch interval is expressed in milliseconds.
        StreamingContext context = new StreamingContext(sc, 10000L);
        context.Checkpoint(checkpointDir);

        // Classic word count over files appearing in the watched directory.
        var lines = context.TextFileStream(Path.Combine(dataDir, "test1"));
        lines = context.Union(lines, lines);
        var tokens = lines.FlatMap(line => line.Split(' '));
        var ones = tokens.Map(token => new KeyValuePair<string, int>(token, 1));
        var counts = ones.ReduceByKey((a, b) => a + b);

        // Pre-populated state so snapshots show data immediately.
        var seededCounts = sc.Parallelize(
            new[]
            {
                new KeyValuePair<string, int>("NOT_A_WORD", 1024),
                new KeyValuePair<string, int>("dog", 10000),
            }, 1);

        var spec = new StateSpec<string, int, int, KeyValuePair<string, int>>((token, increment, tokenState) =>
        {
            // Keys that are timing out may not be updated; report the stored count.
            if (tokenState.IsTimingOut())
            {
                Console.WriteLine("Found timing out word: {0}", token);
                return new KeyValuePair<string, int>(token, tokenState.Get());
            }

            var total = tokenState.Exists() ? tokenState.Get() : 0;
            tokenState.Update(total + increment);
            Console.WriteLine("word: {0}, count: {1}", token, total + increment);
            return new KeyValuePair<string, int>(token, total + increment);
        }).NumPartitions(1).InitialState(seededCounts).Timeout(TimeSpan.FromSeconds(30));

        var snapshotStream = counts.MapWithState(spec).StateSnapshots();
        snapshotStream.ForeachRDD((double time, RDD<dynamic> rdd) =>
        {
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Snapshots @ Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            foreach (KeyValuePair<string, int> entry in rdd.Collect())
            {
                Console.WriteLine("[{0}, {1}]", entry.Key, entry.Value);
            }

            Console.WriteLine();
        });

        return context;
    });

    ssc.Start();
    StartFileServer(dataDir, "words.txt", 100);
    ssc.AwaitTermination();
    ssc.Stop();
}