public void TestRddUnion()
{
    var sparkContext = new SparkContext(null);
    var rdd = sparkContext.TextFile(@"c:\path\to\rddinput.txt");
    var rdd2 = sparkContext.TextFile(@"c:\path\to\rddinput2.txt");
    var unionRdd = rdd.Union(rdd2);

    var paramValuesToUnionMethod = (unionRdd.RddProxy as MockRddProxy).mockRddReference as object[];
    var paramValuesToTextFileMethodInRdd1 = (paramValuesToUnionMethod[0] as MockRddProxy).mockRddReference as object[];
    Assert.AreEqual(@"c:\path\to\rddinput.txt", paramValuesToTextFileMethodInRdd1[0]);

    var paramValuesToTextFileMethodInRdd2 = (paramValuesToUnionMethod[1] as MockRddProxy).mockRddReference as object[];
    Assert.AreEqual(@"c:\path\to\rddinput2.txt", paramValuesToTextFileMethodInRdd2[0]);
}
public static int Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // optional - DefaultLoggerService will be used if not set
    Logger = LoggerServiceFactory.GetLogger(typeof(WordCountExample));

    if (args.Length != 1)
    {
        Console.Error.WriteLine("Usage: WordCount <file>");
        return 1;
    }

    var sparkContext = new SparkContext(new SparkConf().SetAppName("MobiusWordCount"));

    try
    {
        var lines = sparkContext.TextFile(args[0]);
        var counts = lines
            .FlatMap(x => x.Split(' '))
            .Map(w => new KeyValuePair<string, int>(w, 1))
            .ReduceByKey((x, y) => x + y);

        foreach (var wordcount in counts.Collect())
        {
            Console.WriteLine("{0}: {1}", wordcount.Key, wordcount.Value);
        }
    }
    catch (Exception ex)
    {
        Logger.LogError("Error performing Word Count");
        Logger.LogException(ex);
    }

    sparkContext.Stop();
    return 0;
}
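For large inputs, a minimal variation of the same pipeline can write results back to storage instead of collecting them to the driver. This is a sketch, assuming the Mobius RDD API shown above (including SaveAsTextFile); the output path is hypothetical.

// Sketch: persist the counts instead of collecting them to the driver,
// which avoids pulling the full result set into local memory.
var countLines = sparkContext.TextFile(args[0])
    .FlatMap(x => x.Split(' '))
    .Map(w => new KeyValuePair<string, int>(w, 1))
    .ReduceByKey((x, y) => x + y)
    .Map(kv => string.Format("{0}: {1}", kv.Key, kv.Value));
countLines.SaveAsTextFile("hdfs:///output/wordcounts"); // hypothetical path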
public static void Initialize(TestContext context)
{
    var sparkContext = new SparkContext(null);
    var lines = sparkContext.TextFile(Path.GetTempFileName());
    var words = lines.FlatMap(l => l.Split(' '));
    doubles = words.Map(w => new KeyValuePair<string, int>(w, 1))
                   .ReduceByKey((x, y) => x + y)
                   .Map(kv => (double)kv.Value);
}
public static void Initialize()
{
    var sparkContext = new SparkContext(null);
    var lines = sparkContext.TextFile(Path.GetTempFileName());
    var words = lines.FlatMap(l => l.Split(' '));
    pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
}
/// <summary>
/// Calculates the word count for the HDFS file.
/// </summary>
private static void WordCount()
{
    var sparkConf = new SparkConf();
    sparkConf.SetAppName("MobiusWordCountC#");
    sparkConf.SetMaster("yarn");
    sparkContext = new SparkContext(sparkConf);

    try
    {
        var lines = sparkContext.TextFile(hdfsFile);
        var counts = lines
            .FlatMap(x => x.Split(' '))
            .Map(w => new Tuple<string, int>(w, 1))
            .ReduceByKey((x, y) => x + y);

        logger.LogInfo("**********************************************");
        foreach (var wordcount in counts.Collect())
        {
            Console.WriteLine("{0}: {1}", wordcount.Item1, wordcount.Item2);
        }
        logger.LogInfo("**********************************************");
        logger.LogInfo("Executed Successfully.................");
    }
    catch (Exception ex)
    {
        logger.LogError("Error performing Word Count");
        logger.LogException(ex);
    }
}
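Note the Tuple<string, int> pairs here, where the earlier example used KeyValuePair<string, int>; this appears to reflect a later Mobius API revision that replaced KeyValuePair with Tuple for pair RDDs. A small sketch of a driver-side follow-up, using plain LINQ on the collected counts (appropriate only for small result sets, since Collect brings everything to the driver):

// Sketch: sort the collected counts on the driver with LINQ before printing.
// Requires a using directive for System.Linq.
foreach (var wc in counts.Collect().OrderByDescending(t => t.Item2).Take(10))
{
    Console.WriteLine("{0}: {1}", wc.Item1, wc.Item2);
}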
public static void Initialize()
{
    var sparkContext = new SparkContext(null);
    var lines = sparkContext.TextFile(Path.GetTempFileName());
    words = lines.FlatMap(l => l.Split(' '));
}
public void TestSparkContextProxy()
{
    var sparkContext = new SparkContext("masterUrl", "appName");
    sparkContext.AddFile(null);
    sparkContext.BinaryFiles(null, null);
    sparkContext.CancelAllJobs();
    sparkContext.CancelJobGroup(null);
    sparkContext.EmptyRDD<string>();
    sparkContext.GetLocalProperty(null);
    sparkContext.HadoopFile(null, null, null, null);
    sparkContext.HadoopRDD(null, null, null);
    sparkContext.NewAPIHadoopFile(null, null, null, null);
    sparkContext.NewAPIHadoopRDD(null, null, null);
    sparkContext.Parallelize<int>(new int[] { 1, 2, 3, 4, 5 });
    sparkContext.SequenceFile(null, null, null, null, null, null);
    sparkContext.SetCheckpointDir(null);
    sparkContext.SetJobGroup(null, null);
    sparkContext.SetLocalProperty(null, null);
    sparkContext.SetLogLevel(null);
    sparkContext.TextFile(null);
    sparkContext.WholeTextFiles(null);
    sparkContext.Stop();
    sparkContext.Union<string>(null);
}
public IEnumerable<Model> Get()
{
    _sparkContext = Program.SparkContext;
    var crimeDataFrame = GetSqlContext()
        .TextFile(CrimeFilePath)
        .Cache();

    var tempRdd = _sparkContext.TextFile(CrimeFilePath)
        .Map(l => new object[]
        {
            int.Parse(l.Substring(0, 3)),
            int.Parse(l.Substring(4, 3)),
            int.Parse(l.Substring(8, 4)),
        });

    var data = GetSqlContext().CreateDataFrame(tempRdd, new StructType(new List<StructField>
    {
        new StructField("Field1", new IntegerType()),
        new StructField("Field2", new IntegerType()),
        new StructField("Field3", new IntegerType())
    }));

    data.Show();
    data.RegisterTempTable("data");

    return GetSqlContext().Sql("SELECT Field1, Field2, Field3 FROM data")
        .Collect()
        .Select(l => new Model
        {
            Field1 = l.Get("Field1"),
            Field2 = l.Get("Field2"),
            Field3 = l.Get("Field3"),
        }).ToList();
}
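Once the temp table is registered, the same SqlContext can run filtered queries as well. A minimal sketch against the "data" table registered above; the predicate is an arbitrary example.

// Sketch: query the registered temp table with a filter before collecting.
var filtered = GetSqlContext()
    .Sql("SELECT Field1, Field2, Field3 FROM data WHERE Field1 > 100")
    .Collect();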
public void TestRddTextFile()
{
    var sparkContext = new SparkContext(null);
    var rdd = sparkContext.TextFile(@"c:\path\to\rddinput.txt");
    var paramValuesToTextFileMethod = (rdd.RddProxy as MockRddProxy).mockRddReference as object[];
    Assert.AreEqual(@"c:\path\to\rddinput.txt", paramValuesToTextFileMethod[0]);
    Assert.AreEqual(0, int.Parse(paramValuesToTextFileMethod[1].ToString())); // checking default number of partitions
}
public static void Initialize()
{
    var sparkContext = new SparkContext(null);
    var lines = sparkContext.TextFile(Path.GetTempFileName());
    var words = lines.FlatMap(l => l.Split(' '));
    doubles = words.Map(w => new KeyValuePair<string, int>(w, 1))
                   .ReduceByKey((x, y) => x + y)
                   .Map(kv => (double)kv.Value);
}
public void TestSparkContextTextFile()
{
    var sparkContext = new SparkContext(null);
    var rdd = sparkContext.TextFile(@"c:\path\to\rddinput.txt", 8);
    var paramValuesToTextFileMethod = (rdd.RddProxy as MockRddProxy).mockRddReference as object[];
    Assert.AreEqual(@"c:\path\to\rddinput.txt", paramValuesToTextFileMethod[0]);
    Assert.AreEqual(8, paramValuesToTextFileMethod[1]);
}
public void TestGetDefaultPartitionNum()
{
    var sparkContext = new SparkContext(null);
    var lines = sparkContext.TextFile(Path.GetTempFileName(), 5);
    words = lines.FlatMap(l => l.Split(' '));
    var defaultNumPartitions = words.GetDefaultPartitionNum();
    Assert.AreEqual(2, defaultNumPartitions);
}
public void TestTextFile()
{
    RDD<string> rdd = _sc.TextFile($"{TestEnvironment.ResourceDirectory}people.txt");
    var strs = new string[] { "Michael, 29", "Andy, 30", "Justin, 19" };
    Assert.Equal(strs, rdd.Collect());

    // Test a transformation so that SerializedMode is correctly propagated.
    RDD<int> intRdd = rdd.Map(str => 0);
    Assert.Equal(new[] { 0, 0, 0 }, intRdd.Collect());
}
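A follow-up sketch against the same people.txt fixture, assuming the RDD API used in the test above: parse each "name, age" line down to its name component.

// Sketch: map each line to the text before the comma and assert on
// the collected result, mirroring the assertion style of the test above.
RDD<string> names = rdd.Map(line => line.Split(',')[0]);
Assert.Equal(new[] { "Michael", "Andy", "Justin" }, names.Collect());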
static void StartOneTest(string pathPattern, int times, int totalTimes)
{
    var beginTime = DateTime.Now;
    Logger.LogInfo($"Begin test[{times}]-{totalTimes}, will read: {pathPattern}. {GetCurrentProcessInfo()}");
    var sc = new SparkContext(new SparkConf());
    var mappingRDD = sc.TextFile(pathPattern).Map<string>(line => line).Cache();
    Logger.LogInfo("RDD count = {0}", mappingRDD.Count());
    mappingRDD.Unpersist();
    var endTime = DateTime.Now;
    Logger.LogInfo($"End test[{times}]-{totalTimes} of {typeof(TxtStreamTest)}, used time = {(endTime - beginTime).TotalSeconds} s = {endTime - beginTime}. read = {pathPattern}; {GetCurrentProcessInfo()}");
    sc.Stop();
}
public void TestRddMap()
{
    var sparkContext = new SparkContext(null);
    var rdd = sparkContext.TextFile(@"c:\path\to\rddinput.txt");
    var rdd2 = rdd.Map(s => s.ToLower() + ".com");
    Assert.IsTrue(rdd2.GetType() == typeof(PipelinedRDD<string>));

    var pipelinedRdd = rdd2 as PipelinedRDD<string>;
    var func = pipelinedRdd.func;
    var result = func(1, new string[] { "ABC" });
    var output = result.First();
    Assert.AreEqual("ABC".ToLower() + ".com", output);

    var pipelinedRdd2 = rdd2.Map(s => "HTTP://" + s) as PipelinedRDD<string>;
    var func2 = pipelinedRdd2.func;
    var result2 = func2(1, new string[] { "ABC" });
    var output2 = result2.First();
    // ToLower() and ".com" are applied first, before the "HTTP://" prefix,
    // because func2 wraps func in the implementation.
    Assert.AreEqual("HTTP://" + ("ABC".ToLower() + ".com"), output2);
}
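The composition the assertions rely on can be expressed directly in plain C#, independent of any Spark types; this illustrates why the first Map's lambda runs before the second's.

// Plain-C# illustration of the function composition the test verifies:
// the inner (first) transformation runs before the outer (second) one.
Func<string, string> first = s => s.ToLower() + ".com";
Func<string, string> second = s => "HTTP://" + s;
Func<string, string> composed = s => second(first(s));
// composed("ABC") == "HTTP://abc.com"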
private static RDD<string> getDataFromFile(SparkContext sc, string filename)
{
    // Skip comment lines that start with '#'.
    return sc.TextFile(filename).Filter(line => !line.StartsWith("#"));
}
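A hypothetical call site for the helper above; the input path is made up, and Count() is assumed available on the RDD as in the earlier examples.

// Hypothetical usage: count the non-comment lines in a (made-up) input file.
var dataLines = getDataFromFile(sc, "hdfs:///data/input.txt"); // hypothetical path
Console.WriteLine("Non-comment lines: {0}", dataLines.Count());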