/// <summary>
/// Simulates a file source for the streaming samples: a background task re-writes the files
/// matching <paramref name="pattern"/> from <paramref name="directory"/> into its "test"
/// sub-directory, and stops <paramref name="ssc"/> once the shared <c>stopFileServer</c>
/// flag is raised.
/// </summary>
/// <param name="ssc">Streaming context that is stopped when the background loop exits.</param>
/// <param name="directory">Directory holding the source files; a "test" sub-directory is created under it if missing.</param>
/// <param name="pattern">Search pattern handed to <c>Directory.GetFiles</c> to select the source files.</param>
/// <param name="loops">Number of copy rounds; afterwards the task only polls the stop flag.</param>
private static void StartFileServer(StreamingContext ssc, string directory, string pattern, int loops = 1)
{
    string testDir = Path.Combine(directory, "test");
    if (!Directory.Exists(testDir))
    {
        Directory.CreateDirectory(testDir);
    }

    stopFileServer = false;

    // The list of source files is captured once, before the background task starts.
    string[] files = Directory.GetFiles(directory, pattern);

    Task.Run(() =>
    {
        int loop = 0;
        while (!stopFileServer)
        {
            if (loop++ < loops)
            {
                // One timestamp per round: every copy written in this round shares the same prefix.
                DateTime now = DateTime.Now;
                foreach (string path in files)
                {
                    string text = File.ReadAllText(path);

                    // Path.Combine instead of a hard-coded "\\": a backslash is not a directory
                    // separator on non-Windows platforms, where the file would otherwise be
                    // created next to (not inside) the "test" directory.
                    string target = Path.Combine(testDir, now.ToBinary() + "_" + Path.GetFileName(path));
                    File.WriteAllText(target, text);
                }
            }

            System.Threading.Thread.Sleep(200);
        }

        ssc.Stop();
    });

    // NOTE(review): presumably a brief pause so the background task can get scheduled - confirm.
    System.Threading.Thread.Sleep(1);
}
/// <summary>
/// Drives the basic StreamingContext calls (start, remember, checkpoint, text-file stream,
/// socket stream, union) and finishes with a bounded AwaitTerminationOrTimeout wait.
/// </summary>
public void TestStreamingAwaitTimeout()
{
    var sparkContext = new SparkContext("", "");
    var streamingContext = new StreamingContext(sparkContext, 1000);

    // The test expects the context to be backed by the mock proxy.
    Assert.IsNotNull((streamingContext.streamingContextProxy as MockStreamingContextProxy));

    streamingContext.Start();
    streamingContext.Remember(1000);
    streamingContext.Checkpoint(Path.GetTempPath());

    var fileStream = streamingContext.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(fileStream.DStreamProxy);

    var loopbackSocket = streamingContext.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(loopbackSocket.DStreamProxy);

    var merged = streamingContext.Union(fileStream, loopbackSocket);
    Assert.IsNotNull(merged.DStreamProxy);

    streamingContext.AwaitTerminationOrTimeout(3000);
    streamingContext.Stop();
}
/// <summary>
/// Word-count streaming sample over a text-file stream, created through
/// StreamingContext.GetOrCreate with a "checkpoint" directory under the sample data location.
/// Each batch prints up to ten records; the shared stop flag is raised once the batch
/// counter has exceeded 100.
/// </summary>
internal static void DStreamTextFileSamples()
{
    count = 0;

    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    SparkContext sc = SparkCLRSamples.SparkContext;
    var b = sc.Broadcast<int>(0);

    StreamingContext ssc = StreamingContext.GetOrCreate(
        checkpointPath,
        () =>
        {
            StreamingContext context = new StreamingContext(sc, 2000);
            context.Checkpoint(checkpointPath);

            var lines = context.TextFileStream(Path.Combine(directory, "test"));
            lines = context.Union(lines, lines);

            var pairs = lines
                .FlatMap(l => l.Split(' '))
                .Map(w => new KeyValuePair<string, int>(w, 1));

            // since operations like ReduceByKey, Join and UpdateStateByKey are
            // separate dstream transformations defined in CSharpDStream.scala
            // an extra CSharpRDD is introduced in between these operations
            var wordCounts = pairs.ReduceByKey((x, y) => x + y);
            var join = wordCounts.Join(wordCounts, 2);
            var state = join.UpdateStateByKey<string, Tuple<int, int>, int>(new UpdateStateHelper(b).Execute);

            state.ForeachRDD((time, rdd) =>
            {
                // there's chance rdd.Take conflicts with ssc.Stop
                if (stopFileServer)
                {
                    return;
                }

                object[] taken = rdd.Take(10);

                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");
                for (int i = 0; i < taken.Length; i++)
                {
                    Console.WriteLine(taken[i]);
                }
                Console.WriteLine();

                // Raise the stop flag once the counter (before increment) is above 100.
                stopFileServer = count++ > 100;
            });

            return context;
        });

    ssc.Start();

    StartFileServer(directory, "words.txt", 100);

    ssc.AwaitTermination();
    ssc.Stop();
}
/// <summary>
/// Creates every supported stream kind (text file, socket, Kafka receiver, direct Kafka,
/// direct Kafka with repartition, direct Kafka with repartition and read function, union)
/// on a StreamingContext and checks that each one exposes a DStream proxy.
/// </summary>
public void TestStreamingContext()
{
    var sparkContext = new SparkContext("", "");
    var streamingContext = new StreamingContext(sparkContext, 1);
    Assert.IsNotNull((streamingContext.streamingContextProxy as MockStreamingContextProxy));

    streamingContext.Start();
    streamingContext.Remember(1);
    streamingContext.Checkpoint(Path.GetTempPath());

    var fileStream = streamingContext.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(fileStream.DStreamProxy);

    var loopbackSocket = streamingContext.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(loopbackSocket.DStreamProxy);

    // Receiver-based Kafka stream.
    var receiverStream = KafkaUtils.CreateStream(
        streamingContext,
        IPAddress.Loopback + ":2181",
        "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } },
        new Dictionary<string, string>());
    Assert.IsNotNull(receiverStream.DStreamProxy);

    // Direct Kafka stream.
    var directStream = KafkaUtils.CreateDirectStream(
        streamingContext,
        new List<string> { "testTopic2" },
        new Dictionary<string, string>(),
        new Dictionary<string, long>());
    Assert.IsNotNull(directStream.DStreamProxy);

    // Direct Kafka stream with an explicit partition count.
    var repartitionedStream = KafkaUtils.CreateDirectStreamWithRepartition(
        streamingContext,
        new List<string> { "testTopic3" },
        new Dictionary<string, string>(),
        new Dictionary<string, long>(),
        10);
    Assert.IsNotNull(repartitionedStream.DStreamProxy);

    // Same as above plus a pass-through read function.
    var repartitionedWithReadFunc = KafkaUtils.CreateDirectStreamWithRepartitionAndReadFunc(
        streamingContext,
        new List<string> { "testTopic3" },
        new Dictionary<string, string>(),
        new Dictionary<string, long>(),
        10,
        (int pid, IEnumerable<KeyValuePair<byte[], byte[]>> input) => input);
    Assert.IsNotNull(repartitionedWithReadFunc.DStreamProxy);

    var merged = streamingContext.Union(fileStream, loopbackSocket);
    Assert.IsNotNull(merged.DStreamProxy);

    streamingContext.AwaitTermination();
    streamingContext.Stop();
}
/// <summary>
/// Entry point of the HdfsWordCount sample: counts words in text files appearing in the
/// input directory and prints up to ten (word, count) records per batch.
/// </summary>
/// <param name="args">args[0] is the checkpoint directory, args[1] the input directory.</param>
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
        return;
    }

    string checkpointPath = args[0];
    string inputDir = args[1];

    StreamingContext ssc = StreamingContext.GetOrCreate(
        checkpointPath,
        () =>
        {
            var sparkConf = new SparkConf();
            sparkConf.SetAppName("HdfsWordCount");
            var sc = new SparkContext(sparkConf);

            StreamingContext context = new StreamingContext(sc, 30000);
            context.Checkpoint(checkpointPath);

            var wordCounts = context
                .TextFileStream(inputDir)
                .FlatMap(l => l.Split(' '))
                .Map(w => new KeyValuePair<string, int>(w, 1))
                .ReduceByKey((x, y) => x + y);

            wordCounts.ForeachRDD((time, rdd) =>
            {
                // The header is printed before the records are fetched.
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");

                object[] taken = rdd.Take(10);
                for (int i = 0; i < taken.Length; i++)
                {
                    Console.WriteLine(taken[i]);
                }
                Console.WriteLine();
            });

            return context;
        });

    ssc.Start();
    ssc.AwaitTermination();
    ssc.Stop();
}
/// <summary>
/// Word-count streaming sample over a text-file stream using a directly constructed
/// StreamingContext. Each batch prints up to ten records; the shared stop flag is raised
/// once the batch counter has exceeded 3.
/// </summary>
internal static void DStreamTextFileSamples()
{
    SparkContext sc = SparkCLRSamples.SparkContext;
    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    sc.SetCheckpointDir(directory);

    StreamingContext ssc = new StreamingContext(sc, 2000);

    var pairs = ssc
        .TextFileStream(Path.Combine(directory, "test"))
        .FlatMap(l => l.Split(' '))
        .Map(w => new KeyValuePair<string, int>(w, 1));

    var wordCounts = pairs.ReduceByKey((x, y) => x + y);
    var join = wordCounts.Join(wordCounts, 2);

    // New state = sum of both joined counts over all new values, plus the previous state.
    var state = join.UpdateStateByKey<string, Tuple<int, int>, int>(
        (vs, s) => vs.Sum(x => x.Item1 + x.Item2) + s);

    state.ForeachRDD((time, rdd) =>
    {
        // there's chance rdd.Take conflicts with ssc.Stop
        if (stopFileServer)
        {
            return;
        }

        object[] taken = rdd.Take(10);

        Console.WriteLine("-------------------------------------------");
        Console.WriteLine("Time: {0}", time);
        Console.WriteLine("-------------------------------------------");
        for (int i = 0; i < taken.Length; i++)
        {
            Console.WriteLine(taken[i]);
        }
        Console.WriteLine();

        // Raise the stop flag once the counter (before increment) is above 3.
        stopFileServer = count++ > 3;
    });

    ssc.Start();

    StartFileServer(directory, "words.txt", 100);

    ssc.AwaitTermination();
    ssc.Stop();
}
/// <summary>
/// Creates text-file, socket, Kafka and direct-Kafka streams plus a union through the
/// StreamingContext instance methods and checks that each one exposes a DStream proxy.
/// </summary>
public void TestStreamingContext()
{
    var sparkContext = new SparkContext(Env.SPARK_MASTER_URL, "xxxx");
    var streamingContext = new StreamingContext(sparkContext, 1000);
    Assert.IsNotNull((streamingContext.streamingContextProxy as MockStreamingContextProxy));

    streamingContext.Start();
    streamingContext.Remember(1000);
    streamingContext.Checkpoint(Path.GetTempPath());

    var fileStream = streamingContext.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(fileStream.DStreamProxy);

    var loopbackSocket = streamingContext.SocketTextStream("127.0.0.1", 12345);
    Assert.IsNotNull(loopbackSocket.DStreamProxy);

    // Receiver-based Kafka stream.
    var receiverStream = streamingContext.KafkaStream(
        "127.0.0.1:2181",
        "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } },
        new Dictionary<string, string>());
    Assert.IsNotNull(receiverStream.DStreamProxy);

    // Direct Kafka stream.
    var directStream = streamingContext.DirectKafkaStream(
        new List<string> { "testTopic2" },
        new Dictionary<string, string>(),
        new Dictionary<string, long>());
    Assert.IsNotNull(directStream.DStreamProxy);

    var merged = streamingContext.Union(fileStream, loopbackSocket);
    Assert.IsNotNull(merged.DStreamProxy);

    streamingContext.AwaitTermination();
    streamingContext.Stop();
}
/// <summary>
/// Creates text-file, socket, Kafka and direct-Kafka streams plus a union (Kafka streams
/// via KafkaUtils) and checks that each one exposes a DStream proxy.
/// </summary>
public void TestStreamingContext()
{
    var sparkContext = new SparkContext("", "");
    var streamingContext = new StreamingContext(sparkContext, 1000);
    Assert.IsNotNull((streamingContext.streamingContextProxy as MockStreamingContextProxy));

    streamingContext.Start();
    streamingContext.Remember(1000);
    streamingContext.Checkpoint(Path.GetTempPath());

    var fileStream = streamingContext.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(fileStream.DStreamProxy);

    var loopbackSocket = streamingContext.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(loopbackSocket.DStreamProxy);

    // Receiver-based Kafka stream.
    var receiverStream = KafkaUtils.CreateStream(
        streamingContext,
        IPAddress.Loopback + ":2181",
        "testGroupId",
        new Dictionary<string, int> { { "testTopic1", 1 } },
        new Dictionary<string, string>());
    Assert.IsNotNull(receiverStream.DStreamProxy);

    // Direct Kafka stream.
    var directStream = KafkaUtils.CreateDirectStream(
        streamingContext,
        new List<string> { "testTopic2" },
        new Dictionary<string, string>(),
        new Dictionary<string, long>());
    Assert.IsNotNull(directStream.DStreamProxy);

    var merged = streamingContext.Union(fileStream, loopbackSocket);
    Assert.IsNotNull(merged.DStreamProxy);

    streamingContext.AwaitTermination();
    streamingContext.Stop();
}
/// <summary>
/// Stateful word-count sample built on MapWithState: keeps a running count per word,
/// seeded with an initial state RDD, with a 30 second state timeout, and prints the state
/// snapshots of every batch.
/// </summary>
internal static void DStreamMapWithStateSample()
{
    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    StreamingContext ssc = StreamingContext.GetOrCreate(
        checkpointPath,
        () =>
        {
            SparkContext sc = SparkCLRSamples.SparkContext;

            // batch interval is in milliseconds
            StreamingContext context = new StreamingContext(sc, 10000L);
            context.Checkpoint(checkpointPath);

            var lines = context.TextFileStream(Path.Combine(directory, "test1"));
            lines = context.Union(lines, lines);

            var wordCounts = lines
                .FlatMap(l => l.Split(' '))
                .Map(w => new KeyValuePair<string, int>(w, 1))
                .ReduceByKey((x, y) => x + y);

            // Seed state: two pre-populated (word, count) entries in a single partition.
            var initialState = sc.Parallelize(
                new[]
                {
                    new KeyValuePair<string, int>("NOT_A_WORD", 1024),
                    new KeyValuePair<string, int>("dog", 10000),
                },
                1);

            var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
            {
                // A timing-out key is reported with its stored value and is not updated.
                if (state.IsTimingOut())
                {
                    Console.WriteLine("Found timing out word: {0}", word);
                    return new KeyValuePair<string, int>(word, state.Get());
                }

                // Start from the stored count when there is one, otherwise from zero.
                var previous = state.Exists() ? state.Get() : 0;
                var total = previous + count;

                state.Update(total);
                Console.WriteLine("word: {0}, count: {1}", word, total);
                return new KeyValuePair<string, int>(word, total);
            })
            .NumPartitions(1)
            .InitialState(initialState)
            .Timeout(TimeSpan.FromSeconds(30));

            var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();

            snapshots.ForeachRDD((double time, RDD<dynamic> rdd) =>
            {
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Snapshots @ Time: {0}", time);
                Console.WriteLine("-------------------------------------------");

                foreach (KeyValuePair<string, int> entry in rdd.Collect())
                {
                    Console.WriteLine("[{0}, {1}]", entry.Key, entry.Value);
                }
                Console.WriteLine();
            });

            return context;
        });

    ssc.Start();

    StartFileServer(directory, "words.txt", 100);

    ssc.AwaitTermination();
    ssc.Stop();
}
/// <summary>
/// Creates text-file, socket and several Kafka streams (Tuple-based KafkaUtils API) plus a
/// union on a StreamingContext and checks the results. The repartition and receiver
/// variants are selected through SparkConf settings rather than dedicated methods.
/// </summary>
public void TestStreamingContext()
{
    var sparkContext = new SparkContext("", "");
    var streamingContext = new StreamingContext(sparkContext, 1000L);
    Assert.IsNotNull((streamingContext.streamingContextProxy as MockStreamingContextProxy));

    streamingContext.Start();
    streamingContext.Remember(1000L);
    streamingContext.Checkpoint(Path.GetTempPath());

    var fileStream = streamingContext.TextFileStream(Path.GetTempPath());
    Assert.IsNotNull(fileStream.DStreamProxy);

    var loopbackSocket = streamingContext.SocketTextStream(IPAddress.Loopback.ToString(), 12345);
    Assert.IsNotNull(loopbackSocket.DStreamProxy);

    // Receiver-based Kafka stream; the last argument is passed as null.
    var receiverStream = KafkaUtils.CreateStream(
        streamingContext,
        IPAddress.Loopback + ":2181",
        "testGroupId",
        new[] { Tuple.Create("testTopic1", 1) },
        null);
    Assert.IsNotNull(receiverStream.DStreamProxy);

    // Plain direct Kafka stream.
    var directStream = KafkaUtils.CreateDirectStream(
        streamingContext,
        new List<string> { "testTopic2" },
        new List<Tuple<string, string>>(),
        new List<Tuple<string, long>>());
    Assert.IsNotNull(directStream.DStreamProxy);

    // Direct stream created after setting numPartitions for the topic in SparkConf.
    streamingContext.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numPartitions.testTopic3", "10");
    var repartitionedStream = KafkaUtils.CreateDirectStream(
        streamingContext,
        new List<string> { "testTopic3" },
        new List<Tuple<string, string>>(),
        new List<Tuple<string, long>>());
    Assert.IsNotNull(repartitionedStream.DStreamProxy);

    // Same topic with a pass-through read function; only the stream object itself is asserted here.
    var repartitionedWithReadFunc = KafkaUtils.CreateDirectStream(
        streamingContext,
        new List<string> { "testTopic3" },
        new List<Tuple<string, string>>(),
        new List<Tuple<string, long>>(),
        (int pid, IEnumerable<Tuple<byte[], byte[]>> input) => input);
    Assert.IsNotNull(repartitionedWithReadFunc);

    // Direct stream created after setting numReceivers in SparkConf.
    streamingContext.SparkContext.SparkConf.Set("spark.mobius.streaming.kafka.numReceivers", "10");
    var receiverBackedDirectStream = KafkaUtils.CreateDirectStream(
        streamingContext,
        new List<string> { "testTopic3" },
        new List<Tuple<string, string>>(),
        new List<Tuple<string, long>>(),
        (int pid, IEnumerable<Tuple<byte[], byte[]>> input) => input);
    Assert.IsNotNull(receiverBackedDirectStream.DStreamProxy);

    var merged = streamingContext.Union(fileStream, loopbackSocket);
    Assert.IsNotNull(merged.DStreamProxy);

    streamingContext.AwaitTermination();
    streamingContext.Stop();
}
/// <summary>
/// Entry point of the socket streaming test: parses the command line into <c>Options</c>,
/// optionally waits for a debugger to be attached, then runs the streaming test
/// <c>Options.TestTimes</c> times against one shared SparkContext, logging the summed
/// counts and timings of every run.
/// </summary>
/// <param name="args">Command-line arguments parsed into <c>ArgOptions</c>.</param>
public static void Main(string[] args)
{
    Logger.LogInfo(EnvironmentInfo);

    var config = AppDomain.CurrentDomain.SetupInformation.ConfigurationFile;

    var isParseOK = false;
    //Options = ParserByCommandLine.Parse(args, out isParseOK);
    Options = ArgParser.Parse<ArgOptions>(args, out isParseOK, "-Help");
    if (!isParseOK)
    {
        return;
    }

    Logger.LogDebug("{0} configuration {1}", File.Exists(config) ? "Exist" : "Not Exist", config);

    // Optional pause that gives the user time to attach a debugger to this process.
    if (Options.WaitSecondsForAttachDebug > 0)
    {
        var waitBegin = DateTime.Now;
        var waitEnd = waitBegin + TimeSpan.FromSeconds(Options.WaitSecondsForAttachDebug);
        var currentPID = Process.GetCurrentProcess().Id;
        Logger.LogWarn($"Will wait {Options.WaitSecondsForAttachDebug} seconds for you to debug this process : please attach PID {currentPID} before {waitEnd}");
        Thread.Sleep(Options.WaitSecondsForAttachDebug * 1000);
    }

    Logger.LogInfo(
        "will connect " + Options.Host + ":" + Options.Port
        + " batchSeconds = " + Options.BatchSeconds
        + " s , windowSeconds = " + Options.WindowSeconds
        + " s, slideSeconds = " + Options.SlideSeconds + " s."
        + " checkpointDirectory = " + Options.CheckPointDirectory
        + ", is-array-test = " + Options.IsArrayValue);

    // Suffix describing the value mode: "-single", "-array-even" or "-array-uneven".
    var prefix = ExeName + (Options.IsArrayValue ? "-array" + (Options.IsUnevenArray ? "-uneven" : "-even") : "-single");

    var beginTime = DateTime.Now;
    var sc = new SparkContext(new SparkConf());

    Action<long> testOneStreaming = (testTime) =>
    {
        var timesInfo = "[" + testTime + "]-" + Options.TestTimes + " ";
        Logger.LogInfo($"Begin test{timesInfo} : {GetCurrentProcessInfo()}");

        // The checkpoint directory is only wiped for the first DeleteCheckPointDirectoryTimes runs.
        if (Options.DeleteCheckPointDirectoryTimes >= testTime)
        {
            TestUtils.DeleteDirectory(Options.CheckPointDirectory);
        }

        var ssc = new StreamingContext(sc, Options.BatchSeconds * 1000L);
        ssc.Checkpoint(Options.CheckPointDirectory);

        var lines = ssc.SocketTextStream(Options.Host, Options.Port, StorageLevelType.MEMORY_AND_DISK_SER);

        var oldSum = new SumCount(SumCountStatic.GetStaticSumCount());
        StartOneTest(sc, lines, Options.ElementCount, prefix);
        var newSum = SumCountStatic.GetStaticSumCount();

        // var sum = newSum - oldSum; // newSum maybe same as oldSum
        ssc.Start();
        var startTime = DateTime.Now;
        ssc.AwaitTerminationOrTimeout(Options.RunningSeconds * 1000);
        ssc.Stop();

        // The difference is taken only after the context has stopped (see the note above).
        var sum = newSum - oldSum;
        var isSameLineCount = Options.LineCount <= 0 || Options.LineCount == sum.LineCount;

        string message;
        if (Options.LineCount <= 0)
        {
            message = string.Empty;
        }
        else if (isSameLineCount)
        {
            message = ". LineCount same";
        }
        else
        {
            message = string.Format(". LineCount different : expected = {0}, but line count = {1}", Options.LineCount, sum.LineCount);
        }

        Logger.LogInfo("oldSum = {0}, newSum = {1}, sum = {2}", oldSum, newSum, sum);
        Logger.LogInfo($"End test{timesInfo}, used time = {(DateTime.Now - startTime).TotalSeconds} s, total cost = {(DateTime.Now - beginTime).TotalSeconds} s, started at {startTime.ToString(TestUtils.MilliTimeFormat)} . Reduced final sumCount : {sum.ToString()} {message}. {GetCurrentProcessInfo()}");
    };

    var times = 1;
    while (times <= Options.TestTimes)
    {
        testOneStreaming(times);

        // Pause between runs, but not after the last one.
        if (times < Options.TestTimes)
        {
            Thread.Sleep(TimeSpan.FromSeconds(Options.TestIntervalSeconds));
        }

        times++;
    }

    Logger.LogInfo($"Finished all tests, test times = {Options.TestTimes}, used time = {(DateTime.Now - beginTime).TotalSeconds} s = {DateTime.Now - beginTime} . {GetCurrentProcessInfo(true, "Final info: ")}");
}