private static void DStreamReduceByKeyAndWindowSample()
{
    count = 0;

    const int batchInterval = 2;
    const int windowDuration = 26;
    const int slideDuration = 6; // missing from the original listing; value assumed (must be a multiple of the batch interval)
    const int numPartitions = 2;

    var sc = SparkCLRSamples.SparkContext;
    var ssc = new StreamingContext(sc, batchInterval);

    // create the seed RDD that the ConstantInputDStream replays on every batch
    var seedRDD = sc.Parallelize(Enumerable.Range(0, 100), numPartitions);
    var numbers = new ConstantInputDStream<int>(seedRDD, ssc);
    var pairs = numbers.Map(n => new KeyValuePair<int, int>(n % numPartitions, n));
    var reduced = pairs.ReduceByKeyAndWindow(
            (int x, int y) => (x + y),   // reduce: add values entering the window
            (int x, int y) => (x - y),   // inverse reduce: subtract values leaving the window
            windowDuration,
            slideDuration,
            numPartitions
        );

    reduced.ForeachRDD((time, rdd) =>
    {
        count++;
        var taken = rdd.Collect();
        int partitions = rdd.GetNumPartitions();

        Console.WriteLine("-------------------------------------------");
        Console.WriteLine("Time: {0}", time);
        Console.WriteLine("-------------------------------------------");
        Console.WriteLine("Batch: " + count);
        Console.WriteLine("Count: " + taken.Length);
        Console.WriteLine("Partitions: " + partitions);

        Assert.AreEqual(taken.Length, 2);
        Assert.AreEqual(partitions, numPartitions);

        foreach (object record in taken)
        {
            var sum = (KeyValuePair<int, int>)record;
            Console.WriteLine("Key: {0}, Value: {1}", sum.Key, sum.Value);

            // each batch contributes sum(evens 0..98) = 2450 for key 0 and sum(odds 1..99) = 2500 for key 1;
            // once the window is full, the sums stabilize at (windowDuration / batchInterval) * 2450 and * 2500 respectively
            Assert.AreEqual(sum.Value,
                (count > windowDuration / slideDuration ? windowDuration : count * slideDuration)
                / batchInterval * (sum.Key == 0 ? 2450 : 2500));
        }
    });

    ssc.Start();
    ssc.AwaitTermination();
}
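Note: because the sample above supplies an inverse reduce function, Spark Streaming must age old data out of the window state, which requires a checkpoint directory to be configured. A minimal sketch of that extra setup (the path is a placeholder; in the samples project the checkpoint directory is typically configured elsewhere):

    var sc = SparkCLRSamples.SparkContext;
    var ssc = new StreamingContext(sc, batchInterval);
    // required whenever ReduceByKeyAndWindow is given an inverse reduce function
    ssc.Checkpoint("<path to spark checkpoint directory>");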
static void Main(string[] args)
{
    var sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLRKafka Example"));
    const string topicName = "<topicName>";
    var topicList = new List<string> { topicName };
    // refer to http://kafka.apache.org/documentation.html#configuration
    var kafkaParams = new Dictionary<string, string>
    {
        { "metadata.broker.list", "<kafka brokers list>" },
        { "auto.offset.reset", "smallest" }
    };
    var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
    const int windowDurationInSecs = 5;
    const int slideDurationInSecs = 5;
    const string checkpointPath = "<hdfs path to spark checkpoint directory>";
    const string appOutputPath = "<hdfs path to app output directory>";
    const long slideDurationInMillis = 5000;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
        ssc.Checkpoint(checkpointPath);

        var stream = KafkaUtils.CreateDirectStream(
            ssc,
            topicList,
            kafkaParams.Select(v => new Tuple<string, string>(v.Key, v.Value)),
            perTopicPartitionKafkaOffsets.Select(v => new Tuple<string, long>(v.Key, v.Value)));

        var countByLogLevelAndTime = stream
            .Map(tuple => Encoding.UTF8.GetString(tuple.Item2))
            .Filter(line => line.Contains(","))
            .Map(line => line.Split(','))
            .Map(columns => new Tuple<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
            .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
            .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));

        countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
        {
            countByLogLevel.SaveAsTextFile(string.Format("{0}/{1}", appOutputPath, Guid.NewGuid()));
            foreach (var logCount in countByLogLevel.Collect())
            {
                Console.WriteLine(logCount);
            }
        });

        return ssc;
    });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
}
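The "auto.offset.reset" value of "smallest" makes a consumer with no stored offsets start from the earliest available message (old-consumer config names; see the Kafka configuration page linked above). A sketch of the alternative, consuming only new messages:

    var kafkaParamsLatest = new Dictionary<string, string>
    {
        { "metadata.broker.list", "<kafka brokers list>" },
        { "auto.offset.reset", "largest" } // "smallest" = earliest offsets, "largest" = latest (Kafka 0.8-style config)
    };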
static void Main(string[] args)
{
    var sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLREventHub Example"));
    var eventhubsParams = new Dictionary<string, string>
    {
        { "eventhubs.policyname", "<policyname>" },
        { "eventhubs.policykey", "<policykey>" },
        { "eventhubs.namespace", "<namespace>" },
        { "eventhubs.name", "<name>" },
        { "eventhubs.partition.count", "<partitioncount>" },
        { "eventhubs.consumergroup", "$default" },
        { "eventhubs.checkpoint.dir", "<hdfs path to eventhub checkpoint dir>" },
        { "eventhubs.checkpoint.interval", "<interval>" },
    };
    const string checkpointPath = "<hdfs path to spark checkpoint dir>";
    //const string outputPath = "<hdfs path to output dir>";
    const long slideDuration = 5000;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var ssc = new StreamingContext(sparkContext, slideDuration);
        ssc.Checkpoint(checkpointPath);

        var stream = EventHubsUtils.CreateUnionStream(ssc, eventhubsParams);
        var countByLogLevelAndTime = stream
            .Map(bytes => Encoding.UTF8.GetString(bytes))
            .Filter(s => s.Contains(","))
            .Map(line => line.Split(','))
            .Map(columns => new KeyValuePair<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
            .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, 5, 5, 3)
            .Map(kvp => string.Format("{0},{1}", kvp.Key, kvp.Value));

        countByLogLevelAndTime.ForeachRDD(dimensionalCount =>
        {
            //dimensionalCount.SaveAsTextFile(string.Format("{0}/{1}", outputPath, Guid.NewGuid()));
            foreach (var dimensionalCountItem in dimensionalCount.Collect())
            {
                Console.WriteLine(dimensionalCountItem);
            }
        });

        return ssc;
    });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
}
public static void Process(string appName, string checkpointPath, Dictionary<string, string> kafkaParams)
{
    var sparkContext = new SparkContext(new SparkConf().SetAppName(appName));
    var topicList = new List<string> { kafkaParams["topic"] };
    var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
    const long slideDurationInMillis = 1000;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
        var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);

        stream
            .Map(kvp => kvp.Value != null ? Encoding.UTF8.GetString(kvp.Value) : null)
            .ForeachRDD(rdd =>
            {
                foreach (string line in rdd.Collect())
                {
                    var message = JObject.Parse(line);
                    var docId = message.SelectToken("docid").ToString();
                    // =======================
                    // TODO: Process message
                    // =======================
                }
            });

        ssc.Checkpoint(checkpointPath);
        return ssc;
    });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
}
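JObject.Parse will throw on the null lines produced for Kafka messages with null payloads (e.g., tombstones), so it can be safer to drop those before parsing. A minimal sketch using the same stream variable and operators already shown in this section:

    stream
        .Map(kvp => kvp.Value != null ? Encoding.UTF8.GetString(kvp.Value) : null)
        .Filter(line => line != null) // discard null payloads before JSON parsing
        .ForeachRDD(rdd =>
        {
            foreach (string line in rdd.Collect())
            {
                var message = JObject.Parse(line);
                // process message
            }
        });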
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
        return;
    }

    string checkpointPath = args[0];
    string inputDir = args[1];

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var sparkConf = new SparkConf();
        sparkConf.SetAppName("HdfsWordCount");
        var sc = new SparkContext(sparkConf);
        StreamingContext context = new StreamingContext(sc, 30000); // batch interval is in milliseconds
        context.Checkpoint(checkpointPath);

        var lines = context.TextFileStream(inputDir);
        var words = lines.FlatMap(l => l.Split(' '));
        var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
        var wordCounts = pairs.ReduceByKey((x, y) => x + y);

        wordCounts.ForeachRDD((time, rdd) =>
        {
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            object[] taken = rdd.Take(10);
            foreach (object record in taken)
            {
                Console.WriteLine(record);
            }
            Console.WriteLine();
        });

        return context;
    });

    ssc.Start();
    ssc.AwaitTermination();
    ssc.Stop();
}
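TextFileStream only picks up files that appear in the monitored directory after the stream starts, so input is best written elsewhere and then moved in so it appears atomically. A minimal sketch using System.IO (file names are illustrative; Hadoop's input format ignores names starting with "_" or ".", so the temp file stays invisible until the rename):

    var tmp = Path.Combine(inputDir, "_words.txt.tmp");
    File.WriteAllText(tmp, "the quick brown fox jumps over the lazy dog");
    // rename within the same file system so the file appears atomically
    File.Move(tmp, Path.Combine(inputDir, "words.txt"));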
public void TestStreamingContext()
{
    var ssc = new StreamingContext(new SparkContext("", ""), 1);
    Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

    ssc.Start();
    ssc.Remember(1);
    ssc.Checkpoint(Path.GetTempPath());

    var textFile = ssc.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(textFile.DStreamProxy);

    var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(socketStream.DStreamProxy);

    var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
    Assert.IsNotNull(kafkaStream.DStreamProxy);

    var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
        new Dictionary<string, string>(), new Dictionary<string, long>());
    Assert.IsNotNull(directKafkaStream.DStreamProxy);

    var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStreamWithRepartition(ssc,
        new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>(), 10);
    Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

    var union = ssc.Union(textFile, socketStream);
    Assert.IsNotNull(union.DStreamProxy);

    ssc.AwaitTermination();
    ssc.Stop();
}
internal static void DStreamTextFileSamples()
{
    SparkContext sc = SparkCLRSamples.SparkContext;
    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    sc.SetCheckpointDir(directory);

    StreamingContext ssc = new StreamingContext(sc, 2000);

    var lines = ssc.TextFileStream(Path.Combine(directory, "test"));
    var words = lines.FlatMap(l => l.Split(' '));
    var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
    var wordCounts = pairs.ReduceByKey((x, y) => x + y);
    var join = wordCounts.Join(wordCounts, 2);
    var state = join.UpdateStateByKey<string, Tuple<int, int>, int>((vs, s) => vs.Sum(x => x.Item1 + x.Item2) + s);

    state.ForeachRDD((time, rdd) =>
    {
        // there's a chance rdd.Take conflicts with ssc.Stop
        if (stopFileServer)
        {
            return;
        }

        object[] taken = rdd.Take(10);
        Console.WriteLine("-------------------------------------------");
        Console.WriteLine("Time: {0}", time);
        Console.WriteLine("-------------------------------------------");
        foreach (object record in taken)
        {
            Console.WriteLine(record);
        }
        Console.WriteLine();

        stopFileServer = count++ > 3;
    });

    ssc.Start();
    StartFileServer(directory, "words.txt", 100);
    ssc.AwaitTermination();
    ssc.Stop();
}
static void Main(string[] args)
{
    var checkpointPath = "";
    var sparkContext = new SparkContext(new SparkConf());
    var slideDurationInMillis = 10;
    var topics = new List<string>();
    var kafkaParams = new List<Tuple<string, string>>();
    var perTopicPartitionKafkaOffsets = new List<Tuple<string, long>>();
    var windowDurationInSecs = 10;
    var slideDurationInSecs = 10;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
        ssc.Checkpoint(checkpointPath);

        var stream = KafkaUtils.CreateDirectStream(ssc, topics, kafkaParams, perTopicPartitionKafkaOffsets);
        var countByLogLevelAndTime = stream
            .Map(tuple => Encoding.UTF8.GetString(tuple.Item2)) // Tuple-based API: Item2 is the message payload
            .Filter(line => line.Contains(","))
            .Map(line => line.Split(','))
            .Map(columns => new Tuple<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
            .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
            .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));

        countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
        {
            foreach (var logCount in countByLogLevel.Collect())
            {
                Console.WriteLine(logCount);
            }
        });

        return ssc;
    });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
    Console.WriteLine("Hello World!");
}
internal static void DStreamCSharpInputSample()
{
    const int numPartitions = 5;

    var sc = SparkCLRSamples.SparkContext;
    var ssc = new StreamingContext(sc, 2000L); // batch interval is in milliseconds

    var inputDStream = CSharpInputDStreamUtils.CreateStream<string>(
        ssc,
        numPartitions,
        (double time, int pid) =>
        {
            var list = new List<string> { string.Format("PluggableInputDStream-{0}-{1}", pid, time) };
            return list.AsEnumerable();
        });

    inputDStream.ForeachRDD((time, rdd) =>
    {
        var taken = rdd.Collect();
        int partitions = rdd.GetNumPartitions();

        Console.WriteLine("-------------------------------------------");
        Console.WriteLine("Time: {0}", time);
        Console.WriteLine("-------------------------------------------");
        Console.WriteLine("Count: " + taken.Length);
        Console.WriteLine("Partitions: " + partitions);

        foreach (object record in taken)
        {
            Console.WriteLine(record);
        }
    });

    ssc.Start();
    ssc.AwaitTermination();
}
public void TestStreamingContext()
{
    var ssc = new StreamingContext(new SparkContext(Env.SPARK_MASTER_URL, "xxxx"), 1000);
    Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

    ssc.Start();
    ssc.Remember(1000);
    ssc.Checkpoint(Path.GetTempPath());

    var textFile = ssc.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(textFile.DStreamProxy);

    var socketStream = ssc.SocketTextStream("127.0.0.1", 12345);
    Assert.IsNotNull(socketStream.DStreamProxy);

    var kafkaStream = ssc.KafkaStream("127.0.0.1:2181", "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
    Assert.IsNotNull(kafkaStream.DStreamProxy);

    var directKafkaStream = ssc.DirectKafkaStream(new List<string> { "testTopic2" },
        new Dictionary<string, string>(), new Dictionary<string, long>());
    Assert.IsNotNull(directKafkaStream.DStreamProxy);

    var union = ssc.Union(textFile, socketStream);
    Assert.IsNotNull(union.DStreamProxy);

    ssc.AwaitTermination();
    ssc.Stop();
}
internal static void DStreamTextFileSample()
{
    count = 0;

    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    SparkContext sc = SparkCLRSamples.SparkContext;
    var b = sc.Broadcast<int>(0);

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        StreamingContext context = new StreamingContext(sc, 2);
        context.Checkpoint(checkpointPath);

        var lines = context.TextFileStream(Path.Combine(directory, "test"));
        lines = context.Union(lines, lines);
        var words = lines.FlatMap(l => l.Split(' '));
        var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

        // since operations like ReduceByKey, Join and UpdateStateByKey are
        // separate dstream transformations defined in CSharpDStream.scala,
        // an extra CSharpRDD is introduced in between these operations
        var wordCounts = pairs.ReduceByKey((x, y) => x + y);
        var join = wordCounts.Window(2, 2).Join(wordCounts, 2);
        var state = join.UpdateStateByKey<string, Tuple<int, int>, int>(new UpdateStateHelper(b).Execute);

        state.ForeachRDD((time, rdd) =>
        {
            // there's a chance rdd.Take conflicts with ssc.Stop
            if (stopFileServer)
            {
                return;
            }

            object[] taken = rdd.Take(10);
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Time: {0}", time);
            Console.WriteLine("-------------------------------------------");

            foreach (object record in taken)
            {
                Console.WriteLine(record);
                var countByWord = (KeyValuePair<string, int>)record;
                Assert.AreEqual(countByWord.Value,
                    countByWord.Key == "The" || countByWord.Key == "lazy" || countByWord.Key == "dog" ? 92 : 88);
            }
            Console.WriteLine();

            stopFileServer = true;
        });

        return context;
    });

    StartFileServer(ssc, directory, "words.txt");
    ssc.Start();
    ssc.AwaitTermination();
}
internal static void DStreamMapWithStateSample()
{
    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        SparkContext sc = SparkCLRSamples.SparkContext;
        StreamingContext context = new StreamingContext(sc, 10000L); // batch interval is in milliseconds
        context.Checkpoint(checkpointPath);

        var lines = context.TextFileStream(Path.Combine(directory, "test1"));
        lines = context.Union(lines, lines);
        var words = lines.FlatMap(l => l.Split(' '));
        var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
        var wordCounts = pairs.ReduceByKey((x, y) => x + y);

        var initialState = sc.Parallelize(new[]
        {
            new KeyValuePair<string, int>("NOT_A_WORD", 1024),
            new KeyValuePair<string, int>("dog", 10000),
        }, 1);

        var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
        {
            if (state.IsTimingOut())
            {
                Console.WriteLine("Found timing out word: {0}", word);
                return new KeyValuePair<string, int>(word, state.Get());
            }

            var sum = 0;
            if (state.Exists())
            {
                sum = state.Get();
            }
            state.Update(sum + count);
            Console.WriteLine("word: {0}, count: {1}", word, sum + count);
            return new KeyValuePair<string, int>(word, sum + count);
        }).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));

        var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
        snapshots.ForeachRDD((double time, RDD<dynamic> rdd) =>
        {
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Snapshots @ Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            foreach (KeyValuePair<string, int> record in rdd.Collect())
            {
                Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
            }
            Console.WriteLine();
        });

        return context;
    });

    ssc.Start();
    StartFileServer(directory, "words.txt", 100);
    ssc.AwaitTermination();
    ssc.Stop();
}
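StateSnapshots() returns the full key/value state on every batch; mapping without it should yield only the records produced by the state-mapping function for keys touched in that batch. A sketch reusing the stateSpec defined above (semantics assumed from Spark's mapWithState; the single-argument ForeachRDD matches the other samples in this section):

    var updates = wordCounts.MapWithState(stateSpec);
    updates.ForeachRDD(rdd =>
    {
        // each RDD holds only the KeyValuePair results emitted in this batch
        foreach (var record in rdd.Collect())
        {
            Console.WriteLine(record);
        }
    });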
public void TestStreamingContext()
{
    var ssc = new StreamingContext(new SparkContext("", ""), 1000L);
    Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

    ssc.Start();
    ssc.Remember(1000L);
    ssc.Checkpoint(Path.GetTempPath());

    var textFile = ssc.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(textFile.DStreamProxy);

    var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(socketStream.DStreamProxy);

    var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
    Assert.IsNotNull(kafkaStream.DStreamProxy);

    var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
        new Dictionary<string, string>(), new Dictionary<string, long>());
    Assert.IsNotNull(directKafkaStream.DStreamProxy);

    ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");

    var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" },
        new Dictionary<string, string>(), new Dictionary<string, long>());
    Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

    var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
        ssc, new List<string> { "testTopic3" },
        new Dictionary<string, string>(), new Dictionary<string, long>(),
        (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => input);
    Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc);

    ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");

    var directKafkaReceiver = KafkaUtils.CreateDirectStream(
        ssc, new List<string> { "testTopic3" },
        new Dictionary<string, string>(), new Dictionary<string, long>(),
        (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => input);
    Assert.IsNotNull(directKafkaReceiver.DStreamProxy);

    var union = ssc.Union(textFile, socketStream);
    Assert.IsNotNull(union.DStreamProxy);

    ssc.AwaitTermination();
    ssc.Stop();
}