internal static void DStreamDirectKafkaWithRepartitionSample()
{
    count = 0;

    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
        () =>
        {
            var conf = new SparkConf();
            SparkContext sc = new SparkContext(conf);
            StreamingContext context = new StreamingContext(sc, 2000L); // 2-second batches
            context.Checkpoint(checkpointPath);

            var kafkaParams = new List<Tuple<string, string>>
            {
                new Tuple<string, string>("metadata.broker.list", brokers),
                new Tuple<string, string>("auto.offset.reset", "smallest")
            };

            // Ask Mobius to repartition the direct Kafka stream for this topic.
            conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());

            var dstream = KafkaUtils.CreateDirectStream(
                context, new List<string> { topic }, kafkaParams, Enumerable.Empty<Tuple<string, long>>());

            dstream.ForeachRDD((time, rdd) =>
            {
                long batchCount = rdd.Count();
                int numPartitions = rdd.GetNumPartitions();

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Count: " + batchCount);
                Console.WriteLine("Partitions: " + numPartitions);

                // Only the first batch has data and is repartitioned into 10 partitions.
                if (count++ == 0)
                {
                    Assert.AreEqual(messages, batchCount);
                    Assert.IsTrue(numPartitions >= partitions);
                }
                else
                {
                    Assert.AreEqual(0, batchCount);
                    Assert.IsTrue(numPartitions == 0);
                }
            });

            return context;
        });

    ssc.Start();
    ssc.AwaitTermination();
}
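The sample above only passes if the topic already holds `messages` records before the first batch fires. A minimal sketch of a test producer that could seed the topic, assuming the Confluent.Kafka client package is available; the broker address and topic name are placeholders, not values from the sample:

using System;
using Confluent.Kafka;

class TestProducerSketch
{
    static void Main()
    {
        var config = new ProducerConfig { BootstrapServers = "localhost:9092" }; // placeholder broker
        using (var producer = new ProducerBuilder<Null, string>(config).Build())
        {
            for (int i = 0; i < 100; i++)
            {
                // Fire-and-forget; the sample only asserts on the per-batch count.
                producer.Produce("testTopic", new Message<Null, string> { Value = "message-" + i });
            }
            producer.Flush(TimeSpan.FromSeconds(10)); // wait for delivery before exiting
        }
    }
}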
public void TestStreamingContext()
{
    var ssc = new StreamingContext(new SparkContext("", ""), 1);
    Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

    ssc.Start();
    ssc.Remember(1);
    ssc.Checkpoint(Path.GetTempPath());

    var textFile = ssc.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(textFile.DStreamProxy);

    var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(socketStream.DStreamProxy);

    var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
    Assert.IsNotNull(kafkaStream.DStreamProxy);

    var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
        new Dictionary<string, string>(), new Dictionary<string, long>());
    Assert.IsNotNull(directKafkaStream.DStreamProxy);

    var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStreamWithRepartition(ssc,
        new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>(), 10);
    Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

    var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStreamWithRepartitionAndReadFunc(
        ssc, new List<string> { "testTopic3" }, new Dictionary<string, string>(), new Dictionary<string, long>(),
        10, (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => input);
    Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc.DStreamProxy);

    var union = ssc.Union(textFile, socketStream);
    Assert.IsNotNull(union.DStreamProxy);

    ssc.AwaitTermination();
    ssc.Stop();
}
static void Main(string[] args)
{
    var sparkContext = new SparkContext(new SparkConf().SetAppName("SparkCLRKafka Example"));
    const string topicName = "<topicName>";
    var topicList = new List<string> { topicName };
    var kafkaParams = new Dictionary<string, string> // refer to http://kafka.apache.org/documentation.html#configuration
    {
        { "metadata.broker.list", "<kafka brokers list>" },
        { "auto.offset.reset", "smallest" }
    };
    var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
    const int windowDurationInSecs = 5;
    const int slideDurationInSecs = 5;
    const string checkpointPath = "<hdfs path to spark checkpoint directory>";
    const string appOutputPath = "<hdfs path to app output directory>";
    const long slideDurationInMillis = 5000;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath,
        () =>
        {
            var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
            ssc.Checkpoint(checkpointPath);

            var stream = KafkaUtils.CreateDirectStream(ssc, topicList,
                kafkaParams.Select(v => new Tuple<string, string>(v.Key, v.Value)),
                perTopicPartitionKafkaOffsets.Select(v => new Tuple<string, long>(v.Key, v.Value)));

            // Count occurrences per composite key built from the first two CSV columns,
            // over a sliding window; the second lambda is the inverse-reduce function.
            var countByLogLevelAndTime = stream
                .Map(tuple => Encoding.UTF8.GetString(tuple.Item2))
                .Filter(line => line.Contains(","))
                .Map(line => line.Split(','))
                .Map(columns => new Tuple<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));

            countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
            {
                countByLogLevel.SaveAsTextFile(string.Format("{0}/{1}", appOutputPath, Guid.NewGuid()));
                foreach (var logCount in countByLogLevel.Collect())
                {
                    Console.WriteLine(logCount);
                }
            });

            return ssc;
        });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
}
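The reduce/inverse-reduce pair passed to ReduceByKeyAndWindow is what makes the sliding window incremental: when the window slides, the previous aggregate is updated by adding the entering batch and subtracting the expired one, instead of re-reducing the whole window. A minimal, Spark-free sketch of that arithmetic; all names here are illustrative, not part of the Mobius API:

using System;

static class WindowedSumSketch
{
    static int SlideWindow(int previousWindowSum, int enteringBatchSum, int leavingBatchSum)
    {
        // reduce:         (x, y) => x + y  folds the entering batch in
        // inverse-reduce: (x, y) => x - y  folds the leaving batch out
        return previousWindowSum + enteringBatchSum - leavingBatchSum;
    }

    static void Main()
    {
        // Per-batch partial sums arriving at each slide interval.
        int[] batchSums = { 4, 7, 2, 9 };
        int windowInBatches = 2;
        int windowSum = 0;
        for (int i = 0; i < batchSums.Length; i++)
        {
            int leaving = i >= windowInBatches ? batchSums[i - windowInBatches] : 0;
            windowSum = SlideWindow(windowSum, batchSums[i], leaving);
            Console.WriteLine($"window ending at batch {i}: {windowSum}");
        }
    }
}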
public static void Process(string AppName, string CheckpointPath, Dictionary<string, string> kafkaParams)
{
    var sparkContext = new SparkContext(new SparkConf().SetAppName(AppName));
    var topicList = new List<string> { kafkaParams["topic"] };
    var perTopicPartitionKafkaOffsets = new Dictionary<string, long>();
    const long slideDurationInMillis = 1000;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(CheckpointPath,
        () =>
        {
            var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
            var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);

            stream
                .Map(kvp => kvp.Value != null ? Encoding.UTF8.GetString(kvp.Value) : null)
                .ForeachRDD(rdd =>
                {
                    foreach (string line in rdd.Collect())
                    {
                        if (line == null)
                        {
                            continue; // guard against null-valued Kafka records before parsing
                        }
                        var message = JObject.Parse(line);
                        var _id = message.SelectToken("docid").ToString();
                        // =======================
                        // TODO: Process message
                        // =======================
                    }
                });

            ssc.Checkpoint(CheckpointPath);
            return ssc;
        });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
}
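The TODO above is where message-specific handling would go. One hypothetical shape for it, using the same Json.NET types the snippet already relies on; ProcessMessage and the "payload" field are made up for illustration:

using System;
using Newtonsoft.Json.Linq;

static class MessageHandlerSketch
{
    // Hypothetical handler; "docid" matches the snippet, "payload" is an assumed field.
    static void ProcessMessage(JObject message)
    {
        var id = message.SelectToken("docid")?.ToString();
        if (id == null)
        {
            return; // skip malformed records rather than failing the batch
        }
        var payload = message.SelectToken("payload")?.ToString() ?? string.Empty;
        Console.WriteLine($"{id}: {payload.Length} chars");
    }
}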
public override void Run(Lazy<SparkContext> sparkContext, int currentTimes, int totalTimes)
{
    DeleteCheckPointDirectory(currentTimes);
    var options = Options as UnionTopicTestOptions;

    var streamingContext = StreamingContext.GetOrCreate(options.CheckPointDirectory,
        () =>
        {
            var ssc = new StreamingContext(sparkContext.Value, options.BatchSeconds * 1000L);
            ssc.Checkpoint(options.CheckPointDirectory);

            var stream1 = KafkaUtils.CreateDirectStream(ssc, new List<string> { options.Topic1 }, kafkaParams, offsetsRange)
                .Map(line => new RowIdCountTime().Deserialize(line.Value));

            var stream2 = KafkaUtils.CreateDirectStream(ssc, new List<string> { options.Topic2 }, kafkaParams, offsetsRange)
                .Map(line => new RowIdCountTime().Deserialize(line.Value));

            var stream = stream1.Union(stream2);
            if (options.RePartition > 0)
            {
                stream = stream.Repartition(options.RePartition);
            }

            stream.ForeachRDD(rdd =>
            {
                rdd.Foreach(idCount => Console.WriteLine($"{NowMilli} {this.GetType().Name} : {idCount}"));
            });

            SaveStreamToFile(stream.Map(it => it.ToString()));
            return ssc;
        });

    streamingContext.Start();
    WaitTerminationOrTimeout(streamingContext);
}
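The snippet assumes a RowIdCountTime type from the surrounding test harness. A hypothetical sketch of what such a type might look like; the wire format (a delimited UTF-8 string) and field names are guesses, and the type must be [Serializable] so Mobius can ship closures that reference it to the workers:

using System;

[Serializable]
public class RowIdCountTime
{
    public string RowId { get; set; }
    public long Count { get; set; }
    public DateTime Time { get; set; }

    // Assumed wire format: "<rowId>,<count>,<ticks>" encoded as UTF-8 bytes.
    public RowIdCountTime Deserialize(byte[] bytes)
    {
        var parts = System.Text.Encoding.UTF8.GetString(bytes).Split(',');
        return new RowIdCountTime
        {
            RowId = parts[0],
            Count = long.Parse(parts[1]),
            Time = new DateTime(long.Parse(parts[2]))
        };
    }

    public override string ToString() => $"{RowId},{Count},{Time:O}";
}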
public override void Run(Lazy<SparkContext> sparkContext, int currentTimes, int totalTimes)
{
    DeleteCheckPointDirectory(currentTimes);
    var options = Options as WindowSlideTestOptions;
    var allBeginTime = DateTime.Now;
    var topicList = new List<string>(options.Topics.Split(";,".ToArray()));
    ParseKafkaParameters();

    for (var k = 0; options.TestTimes <= 0 || k < options.TestTimes; k++)
    {
        var beginTime = DateTime.Now;
        //Logger.LogInfo("begin test[{0}]-{1} , sparkContext = {2}", k + 1, options.TestTimes > 0 ? options.TestTimes.ToString() : "infinite", sparkContext.Value);

        var streamingContext = StreamingContext.GetOrCreate(options.CheckPointDirectory,
            () =>
            {
                var ssc = new StreamingContext(sparkContext.Value, options.BatchSeconds * 1000L);
                ssc.Checkpoint(options.CheckPointDirectory);

                var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, offsetsRange)
                    .Map(line => Encoding.UTF8.GetString(line.Value));

                var pairs = stream.Map(new ParseKeyValueArray(options.ElementCount, options.ShowReceivedLines).Parse);

                var reducedStream = pairs.ReduceByKeyAndWindow(
                    new ReduceHelper(options.CheckArrayAtFirst).Sum,
                    new ReduceHelper(options.CheckArrayAtFirst).InverseSum,
                    options.WindowSeconds,
                    options.SlideSeconds);

                reducedStream.ForeachRDD(new SumCountStatic().ForeachRDD<int[]>);
                SaveStreamToFile(reducedStream);
                return ssc;
            });

        streamingContext.Start();
        WaitTerminationOrTimeout(streamingContext);
    }
}
static void Main(string[] args)
{
    var checkpointPath = "";
    var sparkContext = new SparkContext(new SparkConf());
    var slideDurationInMillis = 10;
    var topics = new List<string>();
    var kafkaParams = new List<Tuple<string, string>>();
    var perTopicPartitionKafkaOffsets = new List<Tuple<string, long>>();
    var windowDurationInSecs = 10;
    var slideDurationInSecs = 10;

    StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath,
        () =>
        {
            var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
            ssc.Checkpoint(checkpointPath);

            var stream = KafkaUtils.CreateDirectStream(ssc, topics, kafkaParams, perTopicPartitionKafkaOffsets);

            var countByLogLevelAndTime = stream
                .Map(kvp => Encoding.UTF8.GetString(kvp.Item2))
                .Filter(line => line.Contains(","))
                .Map(line => line.Split(','))
                .Map(columns => new Tuple<string, int>(string.Format("{0},{1}", columns[0], columns[1]), 1))
                .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y, windowDurationInSecs, slideDurationInSecs, 3)
                .Map(logLevelCountPair => string.Format("{0},{1}", logLevelCountPair.Item1, logLevelCountPair.Item2));

            countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
            {
                foreach (var logCount in countByLogLevel.Collect())
                {
                    Console.WriteLine(logCount);
                }
            });

            return ssc;
        });

    sparkStreamingContext.Start();
    sparkStreamingContext.AwaitTermination();
    Console.WriteLine("Hello World!");
}
public void TestStreamingContext()
{
    var ssc = new StreamingContext(new SparkContext("", ""), 1000);
    Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

    ssc.Start();
    ssc.Remember(1000);
    ssc.Checkpoint(Path.GetTempPath());

    var textFile = ssc.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(textFile.DStreamProxy);

    var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(socketStream.DStreamProxy);

    var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } }, new Dictionary<string, string>());
    Assert.IsNotNull(kafkaStream.DStreamProxy);

    var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
        new Dictionary<string, string>(), new Dictionary<string, long>());
    Assert.IsNotNull(directKafkaStream.DStreamProxy);

    var union = ssc.Union(textFile, socketStream);
    Assert.IsNotNull(union.DStreamProxy);

    ssc.AwaitTermination();
    ssc.Stop();
}
public void TestStreamingContext()
{
    var ssc = new StreamingContext(new SparkContext("", ""), 1000L);
    Assert.IsNotNull(ssc.streamingContextProxy as MockStreamingContextProxy);

    ssc.Start();
    ssc.Remember(1000L);
    ssc.Checkpoint(Path.GetTempPath());

    var textFile = ssc.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(textFile.DStreamProxy);

    var socketStream = ssc.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(socketStream.DStreamProxy);

    var kafkaStream = KafkaUtils.CreateStream(ssc, IPAddress.Loopback + ":2181", "testGroupId",
        new[] { Tuple.Create("testTopic1", 1) }, null);
    Assert.IsNotNull(kafkaStream.DStreamProxy);

    var directKafkaStream = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic2" },
        new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
    Assert.IsNotNull(directKafkaStream.DStreamProxy);

    // In this API version, repartitioning of the direct stream is driven by configuration
    // rather than a dedicated CreateDirectStreamWithRepartition overload.
    ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");
    var directKafkaStreamWithRepartition = KafkaUtils.CreateDirectStream(ssc, new List<string> { "testTopic3" },
        new List<Tuple<string, string>>(), new List<Tuple<string, long>>());
    Assert.IsNotNull(directKafkaStreamWithRepartition.DStreamProxy);

    var directKafkaStreamWithRepartitionAndReadFunc = KafkaUtils.CreateDirectStream(
        ssc, new List<string> { "testTopic3" },
        new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
        (int pid, IEnumerable<Tuple<byte[], byte[]>> input) => input);
    Assert.IsNotNull(directKafkaStreamWithRepartitionAndReadFunc.DStreamProxy);

    ssc.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");
    var directKafkaReceiver = KafkaUtils.CreateDirectStream(
        ssc, new List<string> { "testTopic3" },
        new List<Tuple<string, string>>(), new List<Tuple<string, long>>(),
        (int pid, IEnumerable<Tuple<byte[], byte[]>> input) => input);
    Assert.IsNotNull(directKafkaReceiver.DStreamProxy);

    var union = ssc.Union(textFile, socketStream);
    Assert.IsNotNull(union.DStreamProxy);

    ssc.AwaitTermination();
    ssc.Stop();
}