Example #1
        public void TestSparkConfMethods()
        {
            var sparkConf = new SparkConf();
            sparkConf.SetMaster("masterUrl");
            Assert.AreEqual("masterUrl", sparkConf.Get(MockSparkConfProxy.MockMasterKey, ""));

            sparkConf.SetAppName("app name ");
            Assert.AreEqual("app name ", sparkConf.Get(MockSparkConfProxy.MockAppNameKey, ""));

            sparkConf.SetSparkHome(@"c:\path\to\sparkfolder");
            Assert.AreEqual(@"c:\path\to\sparkfolder", sparkConf.Get(MockSparkConfProxy.MockHomeKey, ""));

            Assert.AreEqual("default value", sparkConf.Get("non existent key", "default value"));
            Assert.AreEqual(3, sparkConf.GetInt("non existent key", 3));
        }
Example #2
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(SparkXmlExample));

            var inputXmlFilePath = args[0];
            var outputXmlFilePath = args[1];

            var sparkConf = new SparkConf();
            sparkConf.SetAppName("myapp");
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext = new SqlContext(sparkContext);
            var df = sqlContext.Read()
                                .Format("com.databricks.spark.xml")
                                .Option("rowTag", "book")
                                .Load(inputXmlFilePath); //"D:\temp\books.xml", "file:/D:/temp/books.xml" or "hdfs://temp/books.xml"
            df.ShowSchema();
            var rowCount = df.Count();
            logger.LogInfo("Row count is " + rowCount);

            var selectedData = df.Select("author", "@id");

            selectedData.Write()
                        .Format("com.databricks.spark.xml")
                        .Option("rootTag", "books")
                        .Option("rowTag", "book")
                        .Save(outputXmlFilePath); //"D:\temp\booksUpdated.xml", "file:/D:/temp/booksUpdated.xml" or "hdfs://temp/booksUpdated.xml"

            sparkContext.Stop();
        }
Example #3
 private static void InitializeSparkContext(string[] args)
 {
     var sparkConf = new SparkConf();
     sparkConf.Set("spark.local.dir", args[0]);
     sparkConf.SetAppName("SparkCLR perf suite - C#");
     SparkContext = new SparkContext(sparkConf);
     SqlContext = new SqlContext(PerfBenchmark.SparkContext);
 }
Example #4
        /// <summary>
        /// This function may be used to get or instantiate a SparkContext and register it as a
        /// singleton object. Because we can only have one active SparkContext per JVM,
        /// this is useful when applications may wish to share a SparkContext.
        /// Note: This function cannot be used to create multiple SparkContext instances
        /// even if multiple contexts are allowed.
        /// </summary>
        /// <param name="conf">The SparkConf to use if a new SparkContext has to be created</param>
        /// <returns>The active SparkContext, created and registered on first use</returns>
        public static SparkContext GetOrCreate(SparkConf conf)
        {
            if (_activeSparkContext == null)
            {
                _activeSparkContext = new SparkContext(conf);
            }

            return _activeSparkContext;
        }
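
A minimal usage sketch of the singleton behavior (hypothetical driver code; the app name is illustrative). Note that once a context is registered, later calls return the cached instance and the passed SparkConf is not re-applied:

            // Hypothetical driver illustrating the singleton semantics of GetOrCreate
            var conf = new SparkConf();
            conf.SetAppName("shared-context demo");

            var first = SparkContext.GetOrCreate(conf);
            var second = SparkContext.GetOrCreate(conf); // returns the cached instance; conf is ignored here

            // both variables reference the same SparkContext
            System.Diagnostics.Debug.Assert(ReferenceEquals(first, second));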
Example #5
 // Creates and returns a context
 private static SparkContext CreateSparkContext()
 {
     var conf = new SparkConf();
     if (Configuration.SparkLocalDirectoryOverride != null)
     {
         conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
     }
     return new SparkContext(conf);
 }
Example #6
 // Creates and returns a context
 private static SparkContext CreateSparkContext()
 {
     var conf = new SparkConf();
     conf.SetMaster(Env.SPARK_MASTER_URL);
     if (Configuration.SparkLocalDirectoryOverride != null)
     {
         conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
     }
     return new SparkContext(conf);
 }
Example #7
        private SparkContext(string master, string appName, string sparkHome, SparkConf conf)
        {
            SparkConf = conf ?? new SparkConf();
            if (master != null)
                SparkConf.SetMaster(master);
            if (appName != null)
                SparkConf.SetAppName(appName);
            if (sparkHome != null)
                SparkConf.SetSparkHome(sparkHome);

            SparkContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateSparkContext(SparkConf.SparkConfProxy);
        }
Example #8
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

            var sparkConf = new SparkConf();
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext = new SqlContext(sparkContext);
            var df = sqlContext.Read()
                .Jdbc("jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;", "xyz",
                    new Dictionary<string, string>());
            df.ShowSchema();
            var rowCount = df.Count();
            logger.LogInfo("Row count is " + rowCount);
            
        }
Example #9
        public RoslynScriptEngine(SparkContext sc, SqlContext sqlContext)
        {
            this.sc = sc;
            sparkConf = sc.GetConf();
            host = new SparkCLRHost
            {
                sc = sc,
                sqlContext = sqlContext
            };

            var sparkLocalDir = sparkConf.Get("spark.local.dir", Path.GetTempPath());
            compilationDumpDirectory = Path.Combine(sparkLocalDir, Path.GetRandomFileName());
            Directory.CreateDirectory(compilationDumpDirectory);

            options = new CSharpParseOptions(LanguageVersion.CSharp6, DocumentationMode.Parse, SourceCodeKind.Script);
        }
Example #10
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
                return;
            }

            string checkpointPath = args[0];
            string inputDir = args[1];

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    var sparkConf = new SparkConf();
                    sparkConf.SetAppName("HdfsWordCount");
                    var sc = new SparkContext(sparkConf);
                    StreamingContext context = new StreamingContext(sc, 30000);
                    context.Checkpoint(checkpointPath);

                    var lines = context.TextFileStream(inputDir);
                    var words = lines.FlatMap(l => l.Split(' '));
                    var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));
                    var wordCounts = pairs.ReduceByKey((x, y) => x + y);

                    wordCounts.ForeachRDD((time, rdd) =>
                    {
                        Console.WriteLine("-------------------------------------------");
                        Console.WriteLine("Time: {0}", time);
                        Console.WriteLine("-------------------------------------------");
                        object[] taken = rdd.Take(10);
                        foreach (object record in taken)
                        {
                            Console.WriteLine(record);
                        }
                        Console.WriteLine();
                    });

                    return context;
                });

            ssc.Start();
            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #11
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

            //For SQL Server use the connection string formats below
            //"jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;" or
            //"jdbc:sqlserver://localhost;databaseName=Temp;user=MyUserName;password=myPassword;"
            var connectionString = args[0];
            var tableName = args[1];

            var sparkConf = new SparkConf();
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext = new SqlContext(sparkContext);
            var df = sqlContext
                        .Read()
                        .Jdbc(connectionString, tableName, new Dictionary<string, string>());
            df.ShowSchema();
            var rowCount = df.Count();
            logger.LogInfo("Row count is " + rowCount);
            sparkContext.Stop();
        }
Example #12
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(HiveDataFrameExample));

            var sparkConf = new SparkConf();
            var sparkContext = new SparkContext(sparkConf);
            var hiveContext = new HiveContext(sparkContext);
            var peopleDataFrame = hiveContext.Read().Json(Path.Combine(Environment.CurrentDirectory, @"data\people.json"));

            const string dbName = "SampleHiveDataBaseForMobius";
            const string tableName = "people";
            
            hiveContext.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
            hiveContext.Sql(string.Format("USE {0}", dbName));
            hiveContext.Sql(string.Format("DROP TABLE IF EXISTS {0}", tableName)); // drop table if it exists

            peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
            var tablesDataFrame = hiveContext.Tables(dbName); // get all tables in database
            logger.LogInfo(string.Format("table count in database {0}: {1}", dbName, tablesDataFrame.Count()));
            tablesDataFrame.Show();

            hiveContext.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
        }
Example #13
 static void Main(string[] args)
 {
     SparkConf sparkConf = new SparkConf();
     SparkContext sc = new SparkContext(sparkConf);
     SqlContext sqlContext = new SqlContext(sc);
     var scriptEngine = new RoslynScriptEngine(sc, sqlContext);
     var repl = new Repl(scriptEngine, new ConsoleIoHandler());
     repl.Init();
     repl.Run();
     scriptEngine.Close();
 }
Example #14
 /// <summary>
 /// when created from checkpoint
 /// </summary>
 /// <param name="sparkContextProxy"></param>
 /// <param name="conf"></param>
 internal SparkContext(ISparkContextProxy sparkContextProxy, SparkConf conf)
 {
     SparkContextProxy = sparkContextProxy;
     SparkConf         = conf;
 }
Example #15
 /// <summary>
 /// Initializes a SparkContext instance with a specific spark config.
 /// </summary>
 /// <param name="conf">A SparkConf object that represents the settings for spark</param>
 public SparkContext(SparkConf conf)
     : this(null, null, null, conf)
 {
 }
Example #16
 /// <summary>
 /// Gets an existing SparkSession or, if there is no existing one, creates a new
 /// one based on the options set in this builder.
 /// </summary>
 /// <returns></returns>
 public SparkSession GetOrCreate()
 {
     var sparkConf = new SparkConf();
     foreach (var option in options)
     {
         sparkConf.Set(option.Key, option.Value);
     }
     var sparkContext = SparkContext.GetOrCreate(sparkConf);
     return SqlContext.GetOrCreate(sparkContext).SparkSession;
 }
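
A short usage sketch of the builder (an assumption: as in Mobius, a Builder is obtained via the static SparkSession.Builder() factory, and a Config(string, string) overload writes into the options dictionary iterated above):

     // Hypothetical usage: builder options are copied into a SparkConf, which then
     // backs the SparkContext/SqlContext pair wrapped by the returned session
     var session = SparkSession
         .Builder()                                 // assumed static factory returning a Builder
         .Config("spark.app.name", "builder demo")  // assumed key/value Config overload
         .GetOrCreate();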
Example #17
        /// <summary>
        /// Sets a list of config options based on the given SparkConf
        /// </summary>
        public Builder Config(SparkConf conf)
        {
            foreach (var keyValuePair in conf.GetAll())
            {
                options[keyValuePair.Key] = keyValuePair.Value;
            }

            return this;
        }
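
The Config(SparkConf) overload above merges a pre-built configuration into the builder's option map via SparkConf.GetAll(); a sketch under the same SparkSession.Builder() assumption as before:

            var conf = new SparkConf();
            conf.SetAppName("prebuilt-conf demo");
            conf.Set("spark.local.dir", System.IO.Path.GetTempPath());

            // every key/value pair from conf lands in the builder's options dictionary,
            // so the session created by GetOrCreate sees the same settings
            var session = SparkSession.Builder().Config(conf).GetOrCreate();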
Example #18
 public void InitializeContext()
 {
     var conf = new SparkConf();
     conf.SetAppName("Activity recognition");
 }
Example #19
        /// <summary>
        /// Creates and returns a context
        /// </summary>
        /// <returns>SparkContext</returns>
        private static SparkContext CreateSparkContext()
        {
            var conf = new SparkConf();

            // set up local directory
            var tempDir = Environment.GetEnvironmentVariable("spark.local.dir");
            if (string.IsNullOrEmpty(tempDir))
            {
                tempDir = Path.GetTempPath();
            }

            conf.Set("spark.local.dir", tempDir);
            Logger.DebugFormat("spark.local.dir is set to {0}", tempDir);

            return new SparkContext(conf);
        }
Example #20
 /// <summary>
 /// when created from checkpoint
 /// </summary>
 /// <param name="sparkContextProxy"></param>
 /// <param name="conf"></param>
 internal SparkContext(ISparkContextProxy sparkContextProxy, SparkConf conf)
 {
     SparkContextProxy   = sparkContextProxy;
     SparkConf           = conf;
     _activeSparkContext = this;
 }
Example #21
        public void TestDStreamMapWithState()
        {
            var mapwithStateDStreamProxy = new Mock<IDStreamProxy>();
            var streamingContextProxy = new Mock<IStreamingContextProxy>();
            streamingContextProxy.Setup(p =>
                p.CreateCSharpStateDStream(It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
                .Returns(mapwithStateDStreamProxy.Object);

            var sparkContextProxy = new Mock<ISparkContextProxy>();

            var sparkConfProxy = new Mock<ISparkConfProxy>();

            var sparkClrProxy = new Mock<ISparkCLRProxy>();
            sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
            sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
            sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(sparkContextProxy.Object);
            sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(sparkConfProxy.Object);

            // save the original SparkCLRProxy so it can be restored after the test completes
            var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;
            try
            {
                SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;

                var sparkConf = new SparkConf(false);
                var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);

                var dstreamProxy = new Mock<IDStreamProxy>();
                var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);

                var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
                var stateDStream = pairDStream.MapWithState(stateSpec);
                var snapshotDStream = stateDStream.StateSnapshots();

                Assert.IsNotNull(stateDStream);
                Assert.IsNotNull(snapshotDStream);
            }
            finally
            {
                SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
            }
        }
Example #22
        internal static void DStreamDirectKafkaWithRepartitionSample()
        {
            count = 0;

            string directory = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                () =>
                {
                    var conf = new SparkConf();
                    SparkContext sc = new SparkContext(conf);
                    StreamingContext context = new StreamingContext(sc, 2000L);
                    context.Checkpoint(checkpointPath);

                    var kafkaParams = new Dictionary<string, string> {
                        {"metadata.broker.list", brokers},
                        {"auto.offset.reset", "smallest"}
                    };

                    conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
                    var dstream = KafkaUtils.CreateDirectStream(context, new List<string> { topic }, kafkaParams, new Dictionary<string, long>());

                    dstream.ForeachRDD((time, rdd) =>
                        {
                            long batchCount = rdd.Count();
                            int numPartitions = rdd.GetNumPartitions();

                            Console.WriteLine("-------------------------------------------");
                            Console.WriteLine("Time: {0}", time);
                            Console.WriteLine("-------------------------------------------");
                            Console.WriteLine("Count: " + batchCount);
                            Console.WriteLine("Partitions: " + numPartitions);

                            // only the first batch has data and is repartitioned into at least 'partitions' partitions
                            if (count++ == 0)
                            {
                                Assert.AreEqual(messages, batchCount);
                                Assert.IsTrue(numPartitions >= partitions);
                            }
                            else
                            {
                                Assert.AreEqual(0, batchCount);
                                Assert.IsTrue(numPartitions == 0);
                            }
                        });

                    return context;
                });

            ssc.Start();
            ssc.AwaitTermination();
        }
Example #23
        static void Main(string[] args)
        {
            var cassandraHostName = "localhost";
            var cassandraKeySpace = "ks";
            var cassandraTableToRead = "users";
            var cassandraTableToInsert = "filteredusers";

            if (args.Length == 4)
            {
                cassandraHostName = args[0];
                cassandraKeySpace = args[1];
                cassandraTableToRead = args[2];
                cassandraTableToInsert = args[3];
            }

            /*
                ** CQL used to create table in Cassandra for this example **

                CREATE TABLE users (
                    username VARCHAR,
                    firstname VARCHAR,
                    lastname VARCHAR,
                PRIMARY KEY (username)
                );

                INSERT INTO ks.users (username, firstname, lastname) VALUES ('JD123', 'John', 'Doe');
                INSERT INTO ks.users (username, firstname, lastname) VALUES ('BillJ', 'Bill', 'Jones');
                INSERT INTO ks.users (username, firstname, lastname) VALUES ('SL', 'Steve', 'Little');

                CREATE TABLE filteredusers (
                    username VARCHAR,
                    firstname VARCHAR,
                    lastname VARCHAR,
                PRIMARY KEY (username)
                );
             */

            var sparkConf = new SparkConf().Set("spark.cassandra.connection.host", cassandraHostName);
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext = new SqlContext(sparkContext);

            //read from cassandra table
            var usersDataFrame =
                sqlContext.Read()
                    .Format("org.apache.spark.sql.cassandra")
                    .Options(new Dictionary<string, string> { {"keyspace", cassandraKeySpace }, { "table", cassandraTableToRead } })
                    .Load();

            //display rows in the console
            usersDataFrame.Show();

            var createTempTableStatement =
                string.Format(
                    "CREATE TEMPORARY TABLE userstemp USING org.apache.spark.sql.cassandra OPTIONS(table \"{0}\", keyspace \"{1}\")",
                    cassandraTableToRead,
                    cassandraKeySpace);

            //create a temp table
            sqlContext.Sql(createTempTableStatement);

            //read from temp table, filter it and display schema and rows
            var filteredUsersDataFrame = sqlContext.Sql("SELECT * FROM userstemp").Filter("username = '******'");
            filteredUsersDataFrame.ShowSchema();
            filteredUsersDataFrame.Show();

            //write filtered rows to another table
            filteredUsersDataFrame.Write()
                .Format("org.apache.spark.sql.cassandra")
                .Options(new Dictionary<string, string> { { "keyspace", cassandraKeySpace }, { "table", cassandraTableToInsert } })
                .Save();

            //convert to RDD, execute map & filter and collect result
            var rddCollectedItems = usersDataFrame.ToRDD()
                                    .Map(
                                        r =>
                                            string.Format("{0},{1},{2}", r.GetAs<string>("username"),
                                                                         r.GetAs<string>("firstname"),
                                                                         r.GetAs<string>("lastname")))
                                    .Filter(s => s.Contains("SL"))
                                    .Collect();

            foreach (var rddCollectedItem in rddCollectedItems)
            {
                Console.WriteLine(rddCollectedItem);
            }

            Console.WriteLine("Completed running example");
        }
Example #24
 private static SparkContext CreateSparkContext()
 {
     var conf = new SparkConf();
     conf.Set("spark.local.dir", @"C:\temp");
     return new SparkContext(conf);
 }