public void TestSparkConfMethods()
{
    var sparkConf = new SparkConf();

    sparkConf.SetMaster("masterUrl");
    Assert.AreEqual("masterUrl", sparkConf.Get(MockSparkConfProxy.MockMasterKey, ""));

    sparkConf.SetAppName("app name ");
    Assert.AreEqual("app name ", sparkConf.Get(MockSparkConfProxy.MockAppNameKey, ""));

    sparkConf.SetSparkHome(@"c:\path\to\sparkfolder");
    Assert.AreEqual(@"c:\path\to\sparkfolder", sparkConf.Get(MockSparkConfProxy.MockHomeKey, ""));

    Assert.AreEqual("default value", sparkConf.Get("non existent key", "default value"));
    Assert.AreEqual(3, sparkConf.GetInt("non existent key", 3));
}
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(SparkXmlExample));

    var inputXmlFilePath = args[0];
    var outputXmlFilePath = args[1];

    var sparkConf = new SparkConf();
    sparkConf.SetAppName("myapp");
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var df = sqlContext.Read()
        .Format("com.databricks.spark.xml")
        .Option("rowTag", "book")
        .Load(inputXmlFilePath); // e.g. "D:\temp\books.xml", "file:/D:/temp/books.xml" or "hdfs://temp/books.xml"
    df.ShowSchema();

    var rowCount = df.Count();
    logger.LogInfo("Row count is " + rowCount);

    var selectedData = df.Select("author", "@id");
    selectedData.Write()
        .Format("com.databricks.spark.xml")
        .Option("rootTag", "books")
        .Option("rowTag", "book")
        .Save(outputXmlFilePath); // e.g. "D:\temp\booksUpdated.xml", "file:/D:/temp/booksUpdated.xml" or "hdfs://temp/booksUpdated.xml"

    sparkContext.Stop();
}
private static void InitializeSparkContext(string[] args)
{
    var sparkConf = new SparkConf();
    sparkConf.Set("spark.local.dir", args[0]);
    sparkConf.SetAppName("SparkCLR perf suite - C#");
    SparkContext = new SparkContext(sparkConf);
    SqlContext = new SqlContext(PerfBenchmark.SparkContext);
}
/// <summary>
/// This function may be used to get or instantiate a SparkContext and register it as a
/// singleton object. Because there can be only one active SparkContext per JVM,
/// this is useful when applications wish to share a SparkContext.
/// Note: This function cannot be used to create multiple SparkContext instances
/// even if multiple contexts are allowed.
/// </summary>
/// <param name="conf">The SparkConf used to create the context if none is active yet</param>
/// <returns>The active SparkContext, created from <paramref name="conf"/> if none existed</returns>
public static SparkContext GetOrCreate(SparkConf conf)
{
    if (_activeSparkContext == null)
    {
        _activeSparkContext = new SparkContext(conf);
    }

    return _activeSparkContext;
}
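A minimal usage sketch for GetOrCreate, assuming two components in the same process want to share the singleton context; the app name is illustrative, not from the source:

// Sketch: two components acquiring the same shared SparkContext.
var conf = new SparkConf();
conf.SetAppName("shared-context-demo"); // illustrative app name

var sc1 = SparkContext.GetOrCreate(conf);
// A context is now registered, so this second call ignores its conf
// and returns the same instance as sc1.
var sc2 = SparkContext.GetOrCreate(new SparkConf());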
// Creates and returns a context
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();
    if (Configuration.SparkLocalDirectoryOverride != null)
    {
        conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
    }

    return new SparkContext(conf);
}
// Creates and returns a context
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();
    conf.SetMaster(Env.SPARK_MASTER_URL);
    if (Configuration.SparkLocalDirectoryOverride != null)
    {
        conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
    }

    return new SparkContext(conf);
}
private SparkContext(string master, string appName, string sparkHome, SparkConf conf)
{
    SparkConf = conf ?? new SparkConf();
    if (master != null) SparkConf.SetMaster(master);
    if (appName != null) SparkConf.SetAppName(appName);
    if (sparkHome != null) SparkConf.SetSparkHome(sparkHome);

    SparkContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateSparkContext(SparkConf.SparkConfProxy);
}
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

    var sparkConf = new SparkConf();
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var df = sqlContext.Read()
        .Jdbc("jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;", "xyz", new Dictionary<string, string>());
    df.ShowSchema();

    var rowCount = df.Count();
    logger.LogInfo("Row count is " + rowCount);
}
public RoslynScriptEngine(SparkContext sc, SqlContext sqlContext)
{
    this.sc = sc;
    sparkConf = sc.GetConf();
    host = new SparkCLRHost
    {
        sc = sc,
        sqlContext = sqlContext
    };

    var sparkLocalDir = sparkConf.Get("spark.local.dir", Path.GetTempPath());
    compilationDumpDirectory = Path.Combine(sparkLocalDir, Path.GetRandomFileName());
    Directory.CreateDirectory(compilationDumpDirectory);

    options = new CSharpParseOptions(LanguageVersion.CSharp6, DocumentationMode.Parse, SourceCodeKind.Script);
}
static void Main(string[] args)
{
    if (args.Length < 2)
    {
        Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
        return;
    }

    string checkpointPath = args[0];
    string inputDir = args[1];

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var sparkConf = new SparkConf();
        sparkConf.SetAppName("HdfsWordCount");
        var sc = new SparkContext(sparkConf);
        StreamingContext context = new StreamingContext(sc, 30000); // batch interval in milliseconds
        context.Checkpoint(checkpointPath);

        var lines = context.TextFileStream(inputDir);
        var words = lines.FlatMap(l => l.Split(' '));
        var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
        var wordCounts = pairs.ReduceByKey((x, y) => x + y);

        wordCounts.ForeachRDD((time, rdd) =>
        {
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            object[] taken = rdd.Take(10);
            foreach (object record in taken)
            {
                Console.WriteLine(record);
            }
            Console.WriteLine();
        });

        return context;
    });

    ssc.Start();
    ssc.AwaitTermination();
    ssc.Stop();
}
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

    // For SQL Server, use connection strings in the formats below:
    // "jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;" or
    // "jdbc:sqlserver://localhost;databaseName=Temp;user=MyUserName;password=myPassword;"
    var connectionString = args[0];
    var tableName = args[1];

    var sparkConf = new SparkConf();
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    var df = sqlContext
        .Read()
        .Jdbc(connectionString, tableName, new Dictionary<string, string>());
    df.ShowSchema();

    var rowCount = df.Count();
    logger.LogInfo("Row count is " + rowCount);

    sparkContext.Stop();
}
static void Main(string[] args)
{
    LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); // this is optional - DefaultLoggerService will be used if not set
    var logger = LoggerServiceFactory.GetLogger(typeof(HiveDataFrameExample));

    var sparkConf = new SparkConf();
    var sparkContext = new SparkContext(sparkConf);
    var hiveContext = new HiveContext(sparkContext);
    var peopleDataFrame = hiveContext.Read().Json(Path.Combine(Environment.CurrentDirectory, @"data\people.json"));

    const string dbName = "SampleHiveDataBaseForMobius";
    const string tableName = "people";

    hiveContext.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if it does not exist
    hiveContext.Sql(string.Format("USE {0}", dbName));
    hiveContext.Sql(string.Format("DROP TABLE IF EXISTS {0}", tableName)); // drop table if it exists

    peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table

    var tablesDataFrame = hiveContext.Tables(dbName); // get all tables in the database
    logger.LogInfo(string.Format("table count in database {0}: {1}", dbName, tablesDataFrame.Count()));
    tablesDataFrame.Show();

    hiveContext.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
}
static void Main(string[] args)
{
    SparkConf sparkConf = new SparkConf();
    SparkContext sc = new SparkContext(sparkConf);
    SqlContext sqlContext = new SqlContext(sc);

    var scriptEngine = new RoslynScriptEngine(sc, sqlContext);
    var repl = new Repl(scriptEngine, new ConsoleIoHandler());
    repl.Init();
    repl.Run();
    scriptEngine.Close();
}
/// <summary>
/// Used when the SparkContext is created from a checkpoint.
/// </summary>
/// <param name="sparkContextProxy">Proxy to the JVM-side SparkContext</param>
/// <param name="conf">Spark configuration associated with this context</param>
internal SparkContext(ISparkContextProxy sparkContextProxy, SparkConf conf)
{
    SparkContextProxy = sparkContextProxy;
    SparkConf = conf;
}
/// <summary>
/// Initializes a SparkContext instance with a specific Spark configuration.
/// </summary>
/// <param name="conf">A SparkConf object that represents the settings for Spark</param>
public SparkContext(SparkConf conf)
    : this(null, null, null, conf)
{
}
/// <summary>
/// Gets an existing SparkSession or, if there is no existing one, creates a new
/// one based on the options set in this builder.
/// </summary>
/// <returns>The shared SparkSession</returns>
public SparkSession GetOrCreate()
{
    var sparkConf = new SparkConf();
    foreach (var option in options)
    {
        sparkConf.Set(option.Key, option.Value);
    }

    var sparkContext = SparkContext.GetOrCreate(sparkConf);
    return SqlContext.GetOrCreate(sparkContext).SparkSession;
}
/// <summary>
/// Sets a list of config options based on the given SparkConf.
/// </summary>
/// <param name="conf">The SparkConf whose settings are copied into this builder</param>
/// <returns>This Builder instance, to allow chaining</returns>
public Builder Config(SparkConf conf)
{
    foreach (var keyValuePair in conf.GetAll())
    {
        options[keyValuePair.Key] = keyValuePair.Value;
    }

    return this;
}
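A brief sketch tying the two builder members above together. How the Builder is obtained is an assumption here (SparkSession.Builder(), mirroring Spark's own API), and the app name and memory setting are illustrative:

var conf = new SparkConf();
conf.SetAppName("builder-demo");         // illustrative value
conf.Set("spark.executor.memory", "2g"); // standard Spark setting, shown for illustration

// Assumption: the Builder is obtained via SparkSession.Builder().
var session = SparkSession.Builder()
    .Config(conf)    // copies every key/value from the SparkConf into the builder
    .GetOrCreate();  // reuses the active SparkContext/SparkSession if one exists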
public void InitializeContext()
{
    var conf = new SparkConf();
    conf.SetAppName("Activity recognition");
}
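As written, InitializeContext builds a conf that is never consumed. A sketch of the likely completion, assuming the conf is meant to back a context (the variable name is hypothetical):

var conf = new SparkConf();
conf.SetAppName("Activity recognition");
// Assumption: the conf built above is intended to create the context.
var sparkContext = new SparkContext(conf);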
/// <summary>
/// Creates and returns a context
/// </summary>
/// <returns>SparkContext</returns>
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();

    // set up the local directory
    var tempDir = Environment.GetEnvironmentVariable("spark.local.dir");
    if (string.IsNullOrEmpty(tempDir))
    {
        tempDir = Path.GetTempPath();
    }

    conf.Set("spark.local.dir", tempDir);
    Logger.DebugFormat("spark.local.dir is set to {0}", tempDir);

    return new SparkContext(conf);
}
/// <summary>
/// Used when the SparkContext is created from a checkpoint.
/// </summary>
/// <param name="sparkContextProxy">Proxy to the JVM-side SparkContext</param>
/// <param name="conf">Spark configuration associated with this context</param>
internal SparkContext(ISparkContextProxy sparkContextProxy, SparkConf conf)
{
    SparkContextProxy = sparkContextProxy;
    SparkConf = conf;
    _activeSparkContext = this;
}
public void TestDStreamMapWithState()
{
    var mapwithStateDStreamProxy = new Mock<IDStreamProxy>();
    var streamingContextProxy = new Mock<IStreamingContextProxy>();
    streamingContextProxy
        .Setup(p => p.CreateCSharpStateDStream(It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
        .Returns(mapwithStateDStreamProxy.Object);

    var sparkContextProxy = new Mock<ISparkContextProxy>();
    var sparkConfProxy = new Mock<ISparkConfProxy>();

    var sparkClrProxy = new Mock<ISparkCLRProxy>();
    sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
    sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
    sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(sparkContextProxy.Object);
    sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(sparkConfProxy.Object);

    // save the original SparkCLRProxy so it can be restored after the test completes
    var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;
    try
    {
        SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;

        var sparkConf = new SparkConf(false);
        var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);

        var dstreamProxy = new Mock<IDStreamProxy>();
        var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);

        var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
        var stateDStream = pairDStream.MapWithState(stateSpec);
        var snapshotDStream = stateDStream.StateSnapshots();

        Assert.IsNotNull(stateDStream);
        Assert.IsNotNull(snapshotDStream);
    }
    finally
    {
        SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
    }
}
internal static void DStreamDirectKafkaWithRepartitionSample()
{
    count = 0;

    string directory = SparkCLRSamples.Configuration.SampleDataLocation;
    string checkpointPath = Path.Combine(directory, "checkpoint");

    StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath, () =>
    {
        var conf = new SparkConf();
        SparkContext sc = new SparkContext(conf);
        StreamingContext context = new StreamingContext(sc, 2000L);
        context.Checkpoint(checkpointPath);

        var kafkaParams = new Dictionary<string, string>
        {
            { "metadata.broker.list", brokers },
            { "auto.offset.reset", "smallest" }
        };

        conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
        var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, new Dictionary<string, long>());

        dstream.ForeachRDD((time, rdd) =>
        {
            long batchCount = rdd.Count();
            int numPartitions = rdd.GetNumPartitions();
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Time: {0}", time);
            Console.WriteLine("-------------------------------------------");
            Console.WriteLine("Count: " + batchCount);
            Console.WriteLine("Partitions: " + numPartitions);

            // only the first batch has data and is repartitioned into 10 partitions
            if (count++ == 0)
            {
                Assert.AreEqual(messages, batchCount);
                Assert.IsTrue(numPartitions >= partitions);
            }
            else
            {
                Assert.AreEqual(0, batchCount);
                Assert.IsTrue(numPartitions == 0);
            }
        });

        return context;
    });

    ssc.Start();
    ssc.AwaitTermination();
}
static void Main(string[] args)
{
    var cassandraHostName = "localhost";
    var cassandraKeySpace = "ks";
    var cassandraTableToRead = "users";
    var cassandraTableToInsert = "filteredusers";

    if (args.Length == 4)
    {
        cassandraHostName = args[0];
        cassandraKeySpace = args[1];
        cassandraTableToRead = args[2];
        cassandraTableToInsert = args[3];
    }

    /*
    ** CQL used to create the tables in Cassandra for this example **

    CREATE TABLE users (
        username VARCHAR,
        firstname VARCHAR,
        lastname VARCHAR,
        PRIMARY KEY (username)
    );

    INSERT INTO ks.users (username, firstname, lastname) VALUES ('JD123', 'John', 'Doe');
    INSERT INTO ks.users (username, firstname, lastname) VALUES ('BillJ', 'Bill', 'Jones');
    INSERT INTO ks.users (username, firstname, lastname) VALUES ('SL', 'Steve', 'Little');

    CREATE TABLE filteredusers (
        username VARCHAR,
        firstname VARCHAR,
        lastname VARCHAR,
        PRIMARY KEY (username)
    );
    */

    var sparkConf = new SparkConf().Set("spark.cassandra.connection.host", cassandraHostName);
    var sparkContext = new SparkContext(sparkConf);
    var sqlContext = new SqlContext(sparkContext);

    // read from a Cassandra table
    var usersDataFrame = sqlContext.Read()
        .Format("org.apache.spark.sql.cassandra")
        .Options(new Dictionary<string, string> { { "keyspace", cassandraKeySpace }, { "table", cassandraTableToRead } })
        .Load();

    // display rows in the console
    usersDataFrame.Show();

    var createTempTableStatement = string.Format(
        "CREATE TEMPORARY TABLE userstemp USING org.apache.spark.sql.cassandra OPTIONS(table \"{0}\", keyspace \"{1}\")",
        cassandraTableToRead,
        cassandraKeySpace);

    // create a temp table
    sqlContext.Sql(createTempTableStatement);

    // read from the temp table, filter it, and display schema and rows
    var filteredUsersDataFrame = sqlContext.Sql("SELECT * FROM userstemp").Filter("username = '******'");
    filteredUsersDataFrame.ShowSchema();
    filteredUsersDataFrame.Show();

    // write filtered rows to another table
    filteredUsersDataFrame.Write()
        .Format("org.apache.spark.sql.cassandra")
        .Options(new Dictionary<string, string> { { "keyspace", cassandraKeySpace }, { "table", cassandraTableToInsert } })
        .Save();

    // convert to RDD, apply map & filter, and collect the result
    var rddCollectedItems = usersDataFrame.ToRDD()
        .Map(r => string.Format("{0},{1},{2}",
            r.GetAs<string>("username"),
            r.GetAs<string>("firstname"),
            r.GetAs<string>("lastname")))
        .Filter(s => s.Contains("SL"))
        .Collect();

    foreach (var rddCollectedItem in rddCollectedItems)
    {
        Console.WriteLine(rddCollectedItem);
    }

    Console.WriteLine("Completed running example");
}
private static SparkContext CreateSparkContext()
{
    var conf = new SparkConf();
    conf.Set("spark.local.dir", @"C:\temp");
    return new SparkContext(conf);
}