/// <summary>
/// Calculate Pi
/// Reference: https://github.com/apache/spark/blob/branch-1.5/examples/src/main/scala/org/apache/spark/examples/SparkPi.scala
/// </summary>
private static void Pi()
{
    const int slices = 3;
    var n = (int)Math.Min(100000L * slices, int.MaxValue);
    var values = new List<int>(n);
    for (var i = 0; i < n; i++) // i < n, so exactly n samples are drawn for the division by n below
    {
        values.Add(i);
    }

    //
    // Anonymous method approach
    //
    var count = SparkContext.Parallelize(values, slices)
        .Map(i =>
        {
            var random = new Random();
            var x = random.NextDouble() * 2 - 1;
            var y = random.NextDouble() * 2 - 1;
            return (x * x + y * y) < 1 ? 1 : 0;
        })
        .Reduce((x, y) => x + y);
    Logger.InfoFormat("(anonymous method approach) Pi is roughly {0}.", 4.0 * (int)count / n);

    //
    // Serialized class approach, an alternative to the anonymous method approach above
    //
    var countComputedUsingAnotherApproach = SparkContext.Parallelize(values, slices)
        .Map(new PiHelper().Execute)
        .Reduce((x, y) => x + y);
    var approximatePiValue = 4.0 * countComputedUsingAnotherApproach / n;
    Logger.InfoFormat("(serialized class approach) Pi is roughly {0}.", approximatePiValue);
}
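// The serialized-class approach above relies on a PiHelper type that is not shown
// in this listing. Below is a minimal sketch of what such a helper could look like:
// the class name and the Execute signature are inferred from the call site
// (Map(new PiHelper().Execute) over an RDD<int>); the body mirrors the anonymous
// lambda and is an assumption, not the original implementation. The type must be
// [Serializable] so it can be shipped to the workers.
[Serializable]
internal class PiHelper
{
    private readonly Random random = new Random();

    // Returns 1 if a random point in the 2x2 square falls inside the unit circle, else 0.
    public int Execute(int input)
    {
        var x = random.NextDouble() * 2 - 1;
        var y = random.NextDouble() * 2 - 1;
        return (x * x + y * y) < 1 ? 1 : 0;
    }
}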
public void TestParallelize()
{
    {
        RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5));
        Assert.Equal(new[] { 0, 1, 2, 3, 4 }, rdd.Collect());
    }

    {
        var strs = new string[] { "hello", "spark", "for", "dotnet" };
        RDD<string> rdd = _sc.Parallelize(strs);
        Assert.Equal(strs, rdd.Collect());
    }
}
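// This test (and TestCollect further below) assumes an _sc field supplied by the
// surrounding test fixture, which is not part of this listing. A minimal local
// stand-in could look like the following; the master/app-name values are
// assumptions, not the fixture used upstream:
private readonly SparkContext _sc =
    new SparkContext(new SparkConf().SetMaster("local[*]").SetAppName("SparkContextTests"));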
/// <summary>
/// Calculates the value of Pi
/// </summary>
private static void Pi()
{
    var sparkConf = new SparkConf();
    sparkConf.SetAppName("MobiusSimpleSamplePI");
    sparkConf.SetMaster("yarn");
    sparkContext = new SparkContext(sparkConf);
    try
    {
        const int slices = 3;
        var numberOfItems = (int)Math.Min(100000L * slices, int.MaxValue);
        var values = new List<int>(numberOfItems);
        for (var i = 0; i < numberOfItems; i++)
        {
            values.Add(i);
        }

        var rdd = sparkContext.Parallelize(values, slices);
        logger.LogInfo("Started calculating Pi");
        CalculatePiUsingAnonymousMethod(numberOfItems, rdd);
        CalculatePiUsingSerializedClassApproach(numberOfItems, rdd);
        logger.LogInfo("Completed calculating the value of Pi");
        logger.LogInfo("Executed successfully");
    }
    catch (Exception ex)
    {
        logger.LogError("Error calculating Pi");
        logger.LogException(ex);
    }
}
public static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // optional - DefaultLoggerService is used if not set
    Logger = LoggerServiceFactory.GetLogger(typeof(PiExample));

    var sparkContext = new SparkContext(new SparkConf());
    try
    {
        const int slices = 3;
        var numberOfItems = (int)Math.Min(100000L * slices, int.MaxValue);
        var values = new List<int>(numberOfItems);
        for (var i = 0; i < numberOfItems; i++)
        {
            values.Add(i);
        }

        var rdd = sparkContext.Parallelize(values, slices);
        CalculatePiUsingAnonymousMethod(numberOfItems, rdd);
        CalculatePiUsingSerializedClassApproach(numberOfItems, rdd);
        Logger.LogInfo("Completed calculating the value of Pi");
    }
    catch (Exception ex)
    {
        Logger.LogError("Error calculating Pi");
        Logger.LogException(ex);
    }

    sparkContext.Stop();
}
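// The two CalculatePi* helpers invoked above are not part of this listing. The
// sketch below shows one plausible shape for them, inferred from the first Pi
// example in this section; the method bodies are assumptions, not the original
// implementation. PiHelper is the serializable class sketched earlier.
private static void CalculatePiUsingAnonymousMethod(int n, RDD<int> rdd)
{
    var count = rdd.Map(i =>
        {
            var random = new Random();
            var x = random.NextDouble() * 2 - 1;
            var y = random.NextDouble() * 2 - 1;
            return (x * x + y * y) < 1 ? 1 : 0;
        })
        .Reduce((x, y) => x + y);
    Logger.LogInfo(string.Format("(anonymous method approach) Pi is roughly {0}.", 4.0 * count / n));
}

private static void CalculatePiUsingSerializedClassApproach(int n, RDD<int> rdd)
{
    var count = rdd.Map(new PiHelper().Execute).Reduce((x, y) => x + y);
    Logger.LogInfo(string.Format("(serialized class approach) Pi is roughly {0}.", 4.0 * count / n));
}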
public void TestCollect()
{
    RDD<Tuple<string, int>> rdd = _sc.Parallelize(new[]
    {
        new Tuple<string, int>("a", 1),
        new Tuple<string, int>("b", 2)
    });

    // Validate CollectAsMap().
    {
        var expected = new Dictionary<string, int>
        {
            ["a"] = 1,
            ["b"] = 2
        };

        Assert.Equal(expected, rdd.CollectAsMap());
    }

    // Validate Keys().
    {
        Assert.Equal(new[] { "a", "b" }, rdd.Keys().Collect());
    }

    // Validate Values().
    {
        Assert.Equal(new[] { 1, 2 }, rdd.Values().Collect());
    }
}
public void TestConstantInputDStream()
{
    var sc = new SparkContext("", "");
    var rdd = sc.Parallelize(Enumerable.Range(0, 10), 1);
    var ssc = new StreamingContext(sc, 1000L);

    // test when rdd is null
    Assert.Throws<ArgumentNullException>(() => new ConstantInputDStream<int>(null, ssc));

    var constantInputDStream = new ConstantInputDStream<int>(rdd, ssc);
    Assert.IsNotNull(constantInputDStream);
    Assert.AreEqual(ssc, constantInputDStream.streamingContext);
}
static void Main(string[] args)
{
    Console.WriteLine("sizeof(int) = " + sizeof(int)
        + ", sizeof(long) = " + sizeof(long)
        + ", Is64BitOperatingSystem = " + Environment.Is64BitOperatingSystem
        + ", Is64BitProcess = " + Environment.Is64BitProcess
        + ", OSVersion = " + Environment.OSVersion
        + ", MachineName = " + Environment.MachineName);

    var exe = Path.GetFileName(System.Reflection.Assembly.GetExecutingAssembly().CodeBase);
    if (args.Length < 1 || args[0] == "-h" || args[0] == "--help")
    {
        Console.WriteLine("Usage : {0} input-arguments", exe);
        Console.WriteLine("Example-1: {0} any-thing that you-want-to-write=input", exe);

        var mapCurrentDir = new Dictionary<PlatformID, string>
        {
            { PlatformID.Win32NT, "%CD%" },
            { PlatformID.Win32S, "%CD%" },
            { PlatformID.Win32Windows, "%CD%" },
            { PlatformID.WinCE, "%CD%" },
            { PlatformID.Unix, "$PWD" }
        };

        string currentDirectory;
        if (mapCurrentDir.TryGetValue(Environment.OSVersion.Platform, out currentDirectory))
        {
            Console.WriteLine(@"Example-2: {0} {1} arg2@*#:,+.-\/~ Pi* d:\tmp {2}",
                exe, currentDirectory, "\"jdbc:mysql://localhost:3306/lzdb?user=guest&password=abc123\"");
        }

        return;
    }

    var idx = 0;
    Log("args.Length = " + args.Length + Environment.NewLine
        + string.Join(Environment.NewLine, args.Select(arg => { idx++; return "args[" + idx + "] = " + arg; })));

    var singleValueRDD = new List<KeyValuePair<string, int>>(
        args.Select(arg => new KeyValuePair<string, int>(arg, 1)));
    idx = 0;
    singleValueRDD.ForEach(kv => Log(string.Format("src-pair[{0}] : {1} = {2}", idx++, kv.Key, kv.Value)));

    var sparkContext = new SparkContext(new SparkConf());
    var rdd = sparkContext.Parallelize(singleValueRDD);
    Log(string.Format("Main() rdd = {0}", rdd));

    var reduced = rdd.ReduceByKey((v1, v2) => v1 + v2);
    Log("reduced.count = " + reduced.Count());
    sparkContext.Stop();
}
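// The Log helper used throughout the sample above is not shown in this listing.
// A minimal stand-in could be as simple as the following (an assumption, not the
// original helper):
private static void Log(string message)
{
    // timestamped console output so driver-side progress is easy to follow
    Console.WriteLine("[{0}] {1}", DateTime.Now.ToString("o"), message);
}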
public void TestRunJob()
{
    // Arrange
    Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
    SparkContext sc = new SparkContext(sparkContextProxy.Object, null);
    RDD<int> rdd = sc.Parallelize(new int[] { 0, 1, 2, 3, 4, 5 }, 2);
    sparkContextProxy.Setup(m => m.RunJob(It.IsAny<IRDDProxy>(), It.IsAny<IEnumerable<int>>()));

    // Act
    int[] partitions = new int[] { 0, 1 };
    rdd.SparkContext.RunJob(rdd, partitions);

    // Assert
    sparkContextProxy.Verify(m => m.RunJob(rdd.RddProxy, partitions), Times.Once);
}
public void TestParallelize()
{
    // Arrange
    Mock<IRDDProxy> rddProxy = new Mock<IRDDProxy>();
    Mock<ISparkContextProxy> sparkContextProxy = new Mock<ISparkContextProxy>();
    sparkContextProxy
        .Setup(m => m.Parallelize(It.IsAny<IEnumerable<byte[]>>(), It.IsAny<int>()))
        .Returns(rddProxy.Object);
    SparkContext sc = new SparkContext(sparkContextProxy.Object, null);

    // Act
    var nums = new[] { 0, 2, 3, 4, 6 };
    RDD<int> rdd = sc.Parallelize(nums, -2);

    // Assert
    Assert.IsNotNull(rdd);
    Assert.AreEqual(rddProxy.Object, rdd.RddProxy);
    Assert.AreEqual(sc, rdd.sparkContext);
}
public void TestSparkContextProxy()
{
    var sparkContext = new SparkContext("masterUrl", "appName");
    sparkContext.AddFile(null);
    sparkContext.BinaryFiles(null, null);
    sparkContext.CancelAllJobs();
    sparkContext.CancelJobGroup(null);
    sparkContext.EmptyRDD<string>();
    sparkContext.GetLocalProperty(null);
    sparkContext.HadoopFile(null, null, null, null);
    sparkContext.HadoopRDD(null, null, null);
    sparkContext.NewAPIHadoopFile(null, null, null, null);
    sparkContext.NewAPIHadoopRDD(null, null, null);
    sparkContext.Parallelize<int>(new int[] { 1, 2, 3, 4, 5 });
    sparkContext.SequenceFile(null, null, null, null, null, null);
    sparkContext.SetCheckpointDir(null);
    sparkContext.SetJobGroup(null, null);
    sparkContext.SetLocalProperty(null, null);
    sparkContext.SetLogLevel(null);
    sparkContext.TextFile(null);
    sparkContext.WholeTextFiles(null);
    sparkContext.Stop();
    sparkContext.Union<string>(null);
}
internal static void DStreamTextFileSample()
{
    count = 0;

    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    SparkContext sc = SparkCLRSamples.SparkContext;
    var b = sc.Broadcast<int>(0);

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
        () =>
        {
            StreamingContext context = new StreamingContext(sc, 2000L); // batch interval is in milliseconds
            context.Checkpoint(checkpointPath);

            var lines = context.TextFileStream(Path.Combine(directory, "test"));
            lines = context.Union(lines, lines);
            var words = lines.FlatMap(l => l.Split(' '));
            var pairs = words.Map(w => new Tuple<string, int>(w, 1));

            // since operations like ReduceByKey, Join and UpdateStateByKey are
            // separate dstream transformations defined in CSharpDStream.scala
            // an extra CSharpRDD is introduced in between these operations
            var wordCounts = pairs.ReduceByKey((x, y) => x + y);
            var join = wordCounts.Window(2, 2).Join(wordCounts, 2);
            var initialStateRdd = sc.Parallelize(new[]
            {
                new Tuple<string, int>("AAA", 88),
                new Tuple<string, int>("BBB", 88)
            });
            var state = join.UpdateStateByKey(new UpdateStateHelper(b).Execute, initialStateRdd);

            state.ForeachRDD((time, rdd) =>
            {
                // there's a chance rdd.Take conflicts with ssc.Stop
                if (stopFileServer)
                {
                    return;
                }

                object[] taken = rdd.Take(10);
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Time: {0}", time);
                Console.WriteLine("-------------------------------------------");

                foreach (object record in taken)
                {
                    Console.WriteLine(record);
                    var countByWord = (Tuple<string, int>)record;
                    Assert.AreEqual(countByWord.Item2,
                        countByWord.Item1 == "The" || countByWord.Item1 == "lazy" || countByWord.Item1 == "dog" ? 92 : 88);
                }

                Console.WriteLine();
                stopFileServer = true;
            });

            return context;
        });

    StartFileServer(ssc, directory, "words.txt");

    ssc.Start();
    ssc.AwaitTermination();
}
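// UpdateStateHelper is referenced above but not shown in this listing. A minimal
// sketch follows: the constructor and Execute signature are inferred from the call
// site (a serializable class wrapping a Broadcast<int>, with Execute matching the
// UpdateStateByKey update-function shape for the joined stream), while the body is
// an assumption, not the original implementation.
[Serializable]
internal class UpdateStateHelper
{
    private readonly Broadcast<int> b;

    internal UpdateStateHelper(Broadcast<int> b)
    {
        this.b = b;
    }

    // Folds the new windowed counts for a key into its running state,
    // offset by the broadcast value.
    internal int Execute(IEnumerable<Tuple<int, int>> newValues, int state)
    {
        return newValues.Sum(v => v.Item1 + v.Item2) + state + b.Value;
    }
}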
internal static void DStreamMapWithStateSample()
{
    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
        () =>
        {
            SparkContext sc = SparkCLRSamples.SparkContext;
            StreamingContext context = new StreamingContext(sc, 10000L); // batch interval is in milliseconds
            context.Checkpoint(checkpointPath);

            var lines = context.TextFileStream(Path.Combine(directory, "test1"));
            lines = context.Union(lines, lines);
            var words = lines.FlatMap(l => l.Split(' '));
            var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
            var wordCounts = pairs.ReduceByKey((x, y) => x + y);

            var initialState = sc.Parallelize(new[]
            {
                new KeyValuePair<string, int>("NOT_A_WORD", 1024),
                new KeyValuePair<string, int>("dog", 10000)
            }, 1);

            var stateSpec = new StateSpec<string, int, int, KeyValuePair<string, int>>((word, count, state) =>
            {
                if (state.IsTimingOut())
                {
                    Console.WriteLine("Found timing out word: {0}", word);
                    return new KeyValuePair<string, int>(word, state.Get());
                }

                var sum = 0;
                if (state.Exists())
                {
                    sum = state.Get();
                }

                state.Update(sum + count);
                Console.WriteLine("word: {0}, count: {1}", word, sum + count);
                return new KeyValuePair<string, int>(word, sum + count);
            }).NumPartitions(1).InitialState(initialState).Timeout(TimeSpan.FromSeconds(30));

            var snapshots = wordCounts.MapWithState(stateSpec).StateSnapshots();
            snapshots.ForeachRDD((double time, RDD<dynamic> rdd) =>
            {
                Console.WriteLine("-------------------------------------------");
                Console.WriteLine("Snapshots @ Time: {0}", time);
                Console.WriteLine("-------------------------------------------");

                foreach (KeyValuePair<string, int> record in rdd.Collect())
                {
                    Console.WriteLine("[{0}, {1}]", record.Key, record.Value);
                }

                Console.WriteLine();
            });

            return context;
        });

    ssc.Start();
    StartFileServer(directory, "words.txt", 100);
    ssc.AwaitTermination();
    ssc.Stop();
}
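// Both streaming samples above depend on a StartFileServer helper that is not part
// of this listing. Its job, inferred from the call sites, is to keep dropping copies
// of the named sample file into the directory monitored by TextFileStream so that
// each batch has data to process. The sketch below is an assumption about its shape,
// matching the (directory, fileName, repeatCount) overload used in the second sample.
private static void StartFileServer(string directory, string fileName, int repeatCount)
{
    string sourceFile = Path.Combine(directory, fileName);
    string targetDirectory = Path.Combine(directory, "test1"); // directory watched by TextFileStream above

    Task.Run(() =>
    {
        Directory.CreateDirectory(targetDirectory);
        for (var i = 0; i < repeatCount; i++)
        {
            // each copy lands as a new file, so the stream treats it as fresh input
            File.Copy(sourceFile, Path.Combine(targetDirectory, Guid.NewGuid() + ".txt"), true);
            Thread.Sleep(200);
        }
    });
}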