Example #1
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(HiveDataFrameExample));

            var sparkConf       = new SparkConf();
            var sparkContext    = new SparkContext(sparkConf);
            var hiveContext     = new HiveContext(sparkContext);
            var peopleDataFrame = hiveContext.Read().Json(Path.Combine(Environment.CurrentDirectory, @"data\people.json"));

            const string dbName    = "SampleHiveDataBaseForMobius";
            const string tableName = "people";

            hiveContext.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
            hiveContext.Sql(string.Format("USE {0}", dbName));
            hiveContext.Sql(string.Format("DROP TABLE {0}", tableName));                 // drop table if exists

            peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName);     // create table
            var tablesDataFrame = hiveContext.Tables(dbName);                            // get all tables in database

            logger.LogInfo(string.Format("table count in database {0}: {1}", dbName, tablesDataFrame.Count()));
            tablesDataFrame.Show();

            hiveContext.Sql(string.Format("SELECT * FROM {0}", tableName)).Show(); // select from table
        }
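The saved table can also be narrowed before writing by staying in the DataFrame API. A minimal sketch, assuming the people.json records carry name and age columns (not shown in the source):

            // hypothetical follow-up: keep only selected rows/columns before writing
            var adults = peopleDataFrame.Filter("age >= 18").Select("name", "age");
            adults.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName);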
Example #2
        /// <summary>
        /// Reads data from an XML file and retrieves its rows.
        /// </summary>
        private static void SparkXml()
        {
            var sparkConf = new SparkConf();

            sparkConf.SetMaster("yarn");
            sparkConf.SetAppName("SparkXmlMobius");
            sparkContext = new SparkContext(sparkConf);
            var sqlContext = new SqlContext(sparkContext);
            var dataframe  = sqlContext.Read()
                             .Format("com.databricks.spark.xml")
                             .Option("rowTag", "book")
                             .Load(inputXmlFilePath);

            var rowCount = dataframe.Count();

            logger.LogInfo("****Row count is " + rowCount + "****");
            var rowCollections = dataframe.Collect();

            logger.LogInfo("**********************************************");
            foreach (var row in rowCollections)
            {
                Console.WriteLine("{0}", row);
            }
            logger.LogInfo("*********************************************");
            logger.LogInfo("Executed Successfully.................");
        }
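Individual columns can be extracted from the collected rows instead of printing whole Row objects. A minimal sketch, assuming the rows expose an author element as in the spark-xml sample in Example #9:

            foreach (var row in rowCollections)
            {
                // GetAs<T> pulls a single typed column out of the Row
                Console.WriteLine(row.GetAs<string>("author"));
            }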
Example #3
        public void TestSqlContextGetConf()
        {
            // arrange
            const string key   = "key";
            const string value = "value";

            mockSqlContextProxy.Setup(m => m.GetConf(key, "")).Returns(value);
            var mockSparkContextProxy = new Mock <ISparkContextProxy>();

            var mockSparkSessionProxy = new Mock <ISparkSessionProxy>();
            var mockCatalogProxy      = new Mock <ICatalogProxy>();

            mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny <string>()));
            mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object);
            mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object);
            mockSparkSessionProxy.Setup(m => m.SqlContextProxy).Returns(mockSqlContextProxy.Object);

            var mockSparkConfProxy = new Mock <ISparkConfProxy>();

            mockSparkConfProxy.Setup(m => m.GetSparkConfAsString())
            .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;");

            var conf       = new SparkConf(mockSparkConfProxy.Object);
            var sqlContext = new SqlContext(new SparkContext(mockSparkContextProxy.Object, conf));

            sqlContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object;

            // act
            var actualValue = sqlContext.GetConf(key, "");

            // assert
            Assert.AreEqual(value, actualValue);
        }
Example #4
        /// <summary>
        /// Calculates the word count for the HDFS file.
        /// </summary>
        private static void WordCount()
        {
            var sparkConf = new SparkConf();

            sparkConf.SetAppName("MobiusWordCountC#");
            sparkConf.SetMaster("yarn");
            sparkContext = new SparkContext(sparkConf);
            try
            {
                var lines  = sparkContext.TextFile(hdfsFile);
                var counts = lines
                             .FlatMap(x => x.Split(' '))
                             .Map(w => new Tuple <string, int>(w, 1))
                             .ReduceByKey((x, y) => x + y);
                logger.LogInfo("**********************************************");

                foreach (var wordcount in counts.Collect())
                {
                    Console.WriteLine("{0}: {1}", wordcount.Item1, wordcount.Item2);
                }

                logger.LogInfo("**********************************************");
                logger.LogInfo("Executed Successfully.................");
            }
            catch (Exception ex)
            {
                logger.LogError("Error performing Word Count");
                logger.LogException(ex);
            }
        }
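Instead of collecting the pairs to the driver, the result could be persisted back to HDFS. A sketch, assuming an outputPath field alongside hdfsFile (hypothetical name):

            // hypothetical: write "word: count" lines out instead of collecting
            counts.Map(wc => wc.Item1 + ": " + wc.Item2)
                  .SaveAsTextFile(outputPath);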
Example #5
        /// <summary>
        /// Calculates the value of Pi.
        /// </summary>
        private static void Pi()
        {
            var sparkConf = new SparkConf();

            sparkConf.SetAppName("MobiusSimpleSamplePI");
            sparkConf.SetMaster("yarn");
            sparkContext = new SparkContext(sparkConf);
            try
            {
                const int slices        = 3;
                var       numberOfItems = (int)Math.Min(100000L * slices, int.MaxValue);
                var       values        = new List <int>(numberOfItems);
                for (var i = 0; i < numberOfItems; i++)
                {
                    values.Add(i);
                }

                var rdd = sparkContext.Parallelize(values, slices);

                logger.LogInfo("Started Calculating Pi");

                CalculatePiUsingAnonymousMethod(numberOfItems, rdd);

                CalculatePiUsingSerializedClassApproach(numberOfItems, rdd);

                logger.LogInfo("Completed calculating the value of Pi");
                logger.LogInfo("Executed Successfully.................");
            }
            catch (Exception ex)
            {
                logger.LogError("Error calculating Pi");
                logger.LogException(ex);
            }
        }
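The two helpers invoked above are not part of this listing. A Monte Carlo estimate over the prepared rdd might look like this sketch (the seeding, hit test, and Reduce aggregation are assumptions, not the sample's actual code):

            private static void CalculatePiUsingAnonymousMethod(int numberOfItems, RDD<int> rdd)
            {
                var hits = rdd.Map(i =>
                {
                    var rnd = new Random(i);             // per-element seed
                    var x   = rnd.NextDouble() * 2 - 1;  // random point in [-1, 1] x [-1, 1]
                    var y   = rnd.NextDouble() * 2 - 1;
                    return (x * x + y * y) <= 1 ? 1 : 0; // inside the unit circle?
                }).Reduce((a, b) => a + b);

                logger.LogInfo("Pi is roughly " + 4.0 * hits / numberOfItems);
            }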
Example #6
        public void TestHiveContextRefreshTable()
        {
            // arrange
            var mockSparkContextProxy = new Mock <ISparkContextProxy>();
            var mockSparkSessionProxy = new Mock <ISparkSessionProxy>();
            var mockCatalogProxy      = new Mock <ICatalogProxy>();

            mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny <string>()));
            mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object);
            mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object);

            var mockSparkConfProxy = new Mock <ISparkConfProxy>();

            mockSparkConfProxy.Setup(m => m.GetSparkConfAsString())
            .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;");

            var conf        = new SparkConf(mockSparkConfProxy.Object);
            var hiveContext = new HiveContext(new SparkContext(mockSparkContextProxy.Object, conf));

            hiveContext.SparkSession.SparkSessionProxy = mockSparkSessionProxy.Object;

            // act
            hiveContext.RefreshTable("table");

            // assert
            mockCatalogProxy.Verify(m => m.RefreshTable("table"));
        }
Example #7
        public void TestSparkConf()
        {
            var sparkConf = new SparkConf(false);

            sparkConf.SetMaster("master");
            sparkConf.SetAppName("test");
            sparkConf.SetSparkHome("test home");
            sparkConf.Set("key_string", "value");
            sparkConf.Set("key_int", "100");

            var expectedConfigs = new Dictionary <string, string>()
            {
                { "spark.master", "master" },
                { "spark.app.name", "test" },
                { "spark.home", "test home" },
                { "key_string", "value" },
                { "key_int", "100" }
            };

            foreach (KeyValuePair <string, string> kv in expectedConfigs)
            {
                Assert.Equal(kv.Value, sparkConf.Get(kv.Key, string.Empty));
            }

            Assert.Equal(100, sparkConf.GetInt("key_int", 0));

            // Validate GetAll().
            Dictionary <string, string> actualAllConfigs =
                sparkConf.GetAll().ToDictionary(x => x.Key, x => x.Value);

            Assert.Equal(expectedConfigs, actualAllConfigs);
        }
Example #8
        /// <summary>
        /// Reads a table over JDBC using the given SQL connection string.
        /// </summary>
        private static void JdbcDataFrame()
        {
            if (!string.IsNullOrEmpty(connectionString) && !string.IsNullOrEmpty(tableName))
            {
                var sparkConf = new SparkConf();
                sparkConf.SetAppName("SqlConnectionFromMobius");
                sparkConf.SetMaster("yarn");
                sparkConf.Set("spark.sql.warehouse.dir", "/user/hive/warehouse");
                sparkContext = new SparkContext(sparkConf);
                var sqlContext = new SqlContext(sparkContext);

                var df = sqlContext
                         .Read()
                         .Jdbc(connectionString, tableName, new Dictionary <string, string>());
                var rowCount = df.Count();

                logger.LogInfo("****Row count is " + rowCount + "****");
                logger.LogInfo("Executed Successfully.................");
            }
            else
            {
                logger.LogInfo("****Please provide correct connectionstring and table name****");
                GetValues();
                JdbcDataFrame();
            }
        }
Example #9
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(SparkXmlExample));

            var inputXmlFilePath  = args[0];
            var outputXmlFilePath = args[1];

            var sparkConf = new SparkConf();

            sparkConf.SetAppName("myapp");
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext   = new SqlContext(sparkContext);
            var df           = sqlContext.Read()
                               .Format("com.databricks.spark.xml")
                               .Option("rowTag", "book")
                               .Load(inputXmlFilePath);  //"D:\temp\books.xml", "file:/D:/temp/books.xml" or "hdfs://temp/books.xml"

            df.ShowSchema();
            var rowCount = df.Count();

            logger.LogInfo("Row count is " + rowCount);

            var selectedData = df.Select("author", "@id");

            selectedData.Write()
            .Format("com.databricks.spark.xml")
            .Option("rootTag", "books")
            .Option("rowTag", "book")
            .Save(outputXmlFilePath);             //"D:\temp\booksUpdated.xml", "file:/D:/temp/booksUpdated.xml" or "hdfs://temp/booksUpdated.xml"

            sparkContext.Stop();
        }
Example #10
        private static void InitializeSparkContext(string[] args)
        {
            var sparkConf = new SparkConf();

            sparkConf.Set("spark.local.dir", args[0]);
            sparkConf.SetAppName("SparkCLR perf suite - C#");
            SparkContext = new SparkContext(sparkConf);
            SqlContext   = new SqlContext(PerfBenchmark.SparkContext);
        }
Example #11
        /// <summary>
        /// Sets a list of config options based on the given SparkConf
        /// </summary>
        public Builder Config(SparkConf conf)
        {
            foreach (KeyValuePair <string, string> keyValuePair in conf.GetAll())
            {
                _options[keyValuePair.Key] = keyValuePair.Value;
            }

            return(this);
        }
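Usage mirrors the builder chain in Example #18; the options copied here feed the SparkConf that GetOrCreate assembles:

            var spark = SparkSession
                        .Builder()
                        .Config(sparkConf)
                        .AppName("MyApp")
                        .GetOrCreate();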
Example #12
        // Creates and returns a context
        private static SparkContext CreateSparkContext()
        {
            var conf = new SparkConf();

            if (Configuration.SparkLocalDirectoryOverride != null)
            {
                conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
            }
            return(new SparkContext(conf));
        }
Example #13
        internal static void DStreamDirectKafkaWithRepartitionSample()
        {
            count = 0;

            string directory      = SparkCLRSamples.Configuration.SampleDataLocation;
            string checkpointPath = Path.Combine(directory, "checkpoint");

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                                                                () =>
            {
                var conf                 = new SparkConf();
                SparkContext sc          = new SparkContext(conf);
                StreamingContext context = new StreamingContext(sc, 2000L);
                context.Checkpoint(checkpointPath);

                var kafkaParams = new List <Tuple <string, string> > {
                    new Tuple <string, string>("metadata.broker.list", brokers),
                    new Tuple <string, string>("auto.offset.reset", "smallest")
                };

                conf.Set("spark.mobius.streaming.kafka.numPartitions." + topic, partitions.ToString());
                var dstream = KafkaUtils.CreateDirectStream(context, new List <string> {
                    topic
                }, kafkaParams, Enumerable.Empty <Tuple <string, long> >());

                dstream.ForeachRDD((time, rdd) =>
                {
                    long batchCount   = rdd.Count();
                    int numPartitions = rdd.GetNumPartitions();

                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Count: " + batchCount);
                    Console.WriteLine("Partitions: " + numPartitions);

                    // only first batch has data and is repartitioned into 10 partitions
                    if (count++ == 0)
                    {
                        Assert.AreEqual(messages, batchCount);
                        Assert.IsTrue(numPartitions >= partitions);
                    }
                    else
                    {
                        Assert.AreEqual(0, batchCount);
                        Assert.IsTrue(numPartitions == 0);
                    }
                });

                return(context);
            });

            ssc.Start();
            ssc.AwaitTermination();
        }
Example #14
        static void Main(string[] args)
        {
            SparkConf    sparkConf    = new SparkConf();
            SparkContext sc           = new SparkContext(sparkConf);
            SqlContext   sqlContext   = new SqlContext(sc);
            var          scriptEngine = new RoslynScriptEngine(sc, sqlContext);
            var          repl         = new Repl(scriptEngine, new ConsoleIoHandler());

            repl.Init();
            repl.Run();
            scriptEngine.Close();
        }
Example #15
        /// <summary>
        /// Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
        /// one based on the options set in this builder.
        /// </summary>
        /// <returns></returns>
        public SparkSession GetOrCreate()
        {
            var sparkConf = new SparkConf();

            foreach (var option in options)
            {
                sparkConf.Set(option.Key, option.Value);
            }
            var sparkContext = SparkContext.GetOrCreate(sparkConf);

            return(SqlContext.GetOrCreate(sparkContext).SparkSession);
        }
Example #16
        /// <summary>
        /// Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
        /// one based on the options set in this builder.
        /// </summary>
        /// <returns></returns>
        public SparkSession GetOrCreate()
        {
            var sparkConf = new SparkConf();

            foreach (KeyValuePair <string, string> option in _options)
            {
                sparkConf.Set(option.Key, option.Value);
            }

            _jvmObject.Invoke("config", sparkConf);

            return(new SparkSession((JvmObjectReference)_jvmObject.Invoke("getOrCreate")));
        }
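Note the split between the two GetOrCreate variants: Example #15 (Mobius) resolves the session on the C# side via SqlContext.GetOrCreate, while this variant hands the assembled SparkConf to the JVM builder through _jvmObject and wraps the returned JvmObjectReference.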
Example #17
        // Creates and returns a context
        private static SparkContext CreateSparkContext()
        {
            var conf = new SparkConf();

            conf.SetMaster(Env.SPARK_MASTER_URL);
            if (Configuration.SparkLocalDirectoryOverride != null)
            {
                conf.Set("spark.local.dir", Configuration.SparkLocalDirectoryOverride);
            }
            return(new SparkContext(conf));
        }
Example #18
        public static void Main(string[] args)
        {
            // file used: https://www.kaggle.com/gbonesso/b3-stock-quotes/data?select=COTAHIST_A2009_to_A2020_P.csv
            // this PoC computes the average share price over that period

            SparkConf sparkConf = new SparkConf();

            sparkConf.SetMaster("local[*]");  // '*' indica pra usar todos os cores

            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config(sparkConf)
                                 .AppName("SparkNetPOC")
                                 .GetOrCreate();


            Stopwatch sw = new Stopwatch();

            sw.Start();


            DataFrame dataFrameGeral = spark.Read()
                                       .Schema("vazio STRING, TIPREG STRING,DATPRE STRING,CODBDI STRING,CODNEG STRING,TPMERC STRING,NOMRES STRING,ESPECI STRING," +
                                               "PRAZOT STRING,MODREF STRING,PREABE STRING,PREMAX STRING,PREMIN STRING,PREMED STRING,PREULT STRING,PREOFC STRING," +
                                               "PREOFV STRING,TOTNEG STRING,QUATOT STRING," +
                                               "VOLTOT STRING,PREEXE STRING,INDOPC STRING,DATVEN STRING,FATCOT STRING,PTOEXE STRING,CODISI STRING,DISMES STRING")
                                       .Csv(@"C:\InternetDownloads\10318_1101179_compressed_COTAHIST_A2009_to_A2020_P.csv\COTAHIST_A2009_to_A2020_P.csv");


            DataFrame dataFrameColunasUteis = dataFrameGeral
                                              .Drop("vazio", "TIPREG", "DATPRE", "CODBDI", "TPMERC", "NOMRES", "ESPECI", "PRAZOT", "MODREF", "PREABE", "PREMIN",
                                                    "PREMED", "PREULT", "PREOFC", "PREOFV", "TOTNEG", "QUATOT", "VOLTOT", "PREEXE", "INDOPC", "DATVEN", "FATCOT", "PTOEXE", "CODISI", "DISMES");

            DataFrame dataFrameFiltro = dataFrameColunasUteis
                                        .Filter("CODNEG = 'ITSA3' OR CODNEG = 'ABEV3' OR CODNEG = 'PETR4'");

            DataFrame dataFrameFinal = dataFrameFiltro
                                       .GroupBy("CODNEG")
                                       .Agg(Avg("PREMAX"));

            dataFrameFinal.Show();


            spark.Stop();

            sw.Stop();
            Console.WriteLine("Tempo = " + sw.ElapsedMilliseconds);
        }
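Avg here is presumably the aggregate function from Microsoft.Spark.Sql.Functions, imported at the top of the file (not shown in this listing). A sketch of the assumed directive:

            using static Microsoft.Spark.Sql.Functions;  // brings Avg and the other SQL functions into scope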
Example #19
        public void TestSparkConfMethods()
        {
            var sparkConf = new SparkConf();

            sparkConf.SetMaster("masterUrl");
            Assert.AreEqual("masterUrl", sparkConf.Get(MockSparkConfProxy.MockMasterKey, ""));

            sparkConf.SetAppName("app name ");
            Assert.AreEqual("app name ", sparkConf.Get(MockSparkConfProxy.MockAppNameKey, ""));

            sparkConf.SetSparkHome(@"c:\path\to\sparkfolder");
            Assert.AreEqual(@"c:\path\to\sparkfolder", sparkConf.Get(MockSparkConfProxy.MockHomeKey, ""));

            Assert.AreEqual("default value", sparkConf.Get("non existent key", "default value"));
            Assert.AreEqual(3, sparkConf.GetInt("non existent key", 3));
        }
Example #20
        /// <summary>
        /// Creates and returns a context
        /// </summary>
        /// <returns>SparkContext</returns>
        private static SparkContext CreateSparkContext()
        {
            var conf = new SparkConf();

            // set up local directory
            var tempDir = Environment.GetEnvironmentVariable("spark.local.dir");

            if (string.IsNullOrEmpty(tempDir))
            {
                tempDir = Path.GetTempPath();
            }

            conf.Set("spark.local.dir", tempDir);
            Logger.DebugFormat("spark.local.dir is set to {0}", tempDir);

            return(new SparkContext(conf));
        }
Example #21
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

            var sparkConf    = new SparkConf();
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext   = new SqlContext(sparkContext);
            var df           = sqlContext.Read()
                               .Jdbc("jdbc:sqlserver://localhost:1433;databaseName=Temp;;integratedSecurity=true;", "xyz",
                                     new Dictionary <string, string>());

            df.ShowSchema();
            var rowCount = df.Count();

            logger.LogInfo("Row count is " + rowCount);
        }
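The empty Dictionary is where extra JDBC options go; for example, the driver class can be pinned explicitly via Spark's "driver" option. A sketch, assuming the Microsoft JDBC driver jar is on the classpath:

            var options = new Dictionary<string, string>
            {
                { "driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver" }
            };
            var df = sqlContext.Read().Jdbc(connectionString, tableName, options);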
Example #22
        public RoslynScriptEngine(SparkContext sc, SqlContext sqlContext)
        {
            this.sc   = sc;
            sparkConf = sc.GetConf();
            host      = new SparkCLRHost
            {
                sc         = sc,
                sqlContext = sqlContext
            };

            var sparkLocalDir = sparkConf.Get("spark.local.dir", Path.GetTempPath());

            compilationDumpDirectory = Path.Combine(sparkLocalDir, Path.GetRandomFileName());
            Directory.CreateDirectory(compilationDumpDirectory);

            options = new CSharpParseOptions(LanguageVersion.CSharp6, DocumentationMode.Parse, SourceCodeKind.Script);
        }
Example #23
        public void TestDStreamMapWithState()
        {
            var mapwithStateDStreamProxy = new Mock <IDStreamProxy>();
            var streamingContextProxy    = new Mock <IStreamingContextProxy>();

            streamingContextProxy.Setup(p =>
                                        p.CreateCSharpStateDStream(It.IsAny <IDStreamProxy>(), It.IsAny <byte[]>(), It.IsAny <string>(), It.IsAny <string>(), It.IsAny <string>()))
            .Returns(mapwithStateDStreamProxy.Object);

            var sparkContextProxy = new Mock <ISparkContextProxy>();

            var sparkConfProxy = new Mock <ISparkConfProxy>();

            var sparkClrProxy = new Mock <ISparkCLRProxy>();

            sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
            sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
            sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny <ISparkConfProxy>())).Returns(sparkContextProxy.Object);
            sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny <bool>())).Returns(sparkConfProxy.Object);

            // capture the original proxy so it can be restored after the test completes
            var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;

            try
            {
                SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;

                var sparkConf = new SparkConf(false);
                var ssc       = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);

                var dstreamProxy = new Mock <IDStreamProxy>();
                var pairDStream  = new DStream <KeyValuePair <string, int> >(dstreamProxy.Object, ssc);

                var stateSpec       = new StateSpec <string, int, int, int>((k, v, s) => v);
                var stateDStream    = pairDStream.MapWithState(stateSpec);
                var snapshotDStream = stateDStream.StateSnapshots();

                Assert.IsNotNull(stateDStream);
                Assert.IsNotNull(snapshotDStream);
            }
            finally
            {
                SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
            }
        }
Example #24
        static void Main(string[] args)
        {
            if (args.Length < 2)
            {
                Console.WriteLine("Usage: HdfsWordCount <checkpointDirectory> <inputDirectory>");
                return;
            }

            string checkpointPath = args[0];
            string inputDir       = args[1];

            StreamingContext ssc = StreamingContext.GetOrCreate(checkpointPath,
                                                                () =>
            {
                var sparkConf = new SparkConf();
                sparkConf.SetAppName("HdfsWordCount");
                var sc = new SparkContext(sparkConf);
                StreamingContext context = new StreamingContext(sc, 30000);
                context.Checkpoint(checkpointPath);

                var lines      = context.TextFileStream(inputDir);
                var words      = lines.FlatMap(l => l.Split(' '));
                var pairs      = words.Map(w => new KeyValuePair <string, int>(w, 1));
                var wordCounts = pairs.ReduceByKey((x, y) => x + y);

                wordCounts.ForeachRDD((time, rdd) =>
                {
                    Console.WriteLine("-------------------------------------------");
                    Console.WriteLine("Time: {0}", time);
                    Console.WriteLine("-------------------------------------------");
                    object[] taken = rdd.Take(10);
                    foreach (object record in taken)
                    {
                        Console.WriteLine(record);
                    }
                    Console.WriteLine();
                });

                return(context);
            });

            ssc.Start();
            ssc.AwaitTermination();
            ssc.Stop();
        }
Example #25
        public void TestHiveContextConstructor()
        {
            var mockSparkContextProxy = new Mock <ISparkContextProxy>();

            var mockSparkSessionProxy = new Mock <ISparkSessionProxy>();
            var mockCatalogProxy      = new Mock <ICatalogProxy>();

            mockCatalogProxy.Setup(m => m.RefreshTable(It.IsAny <string>()));
            mockSparkSessionProxy.Setup(m => m.GetCatalog()).Returns(mockCatalogProxy.Object);
            mockSparkContextProxy.Setup(m => m.CreateSparkSession()).Returns(mockSparkSessionProxy.Object);

            var mockSparkConfProxy = new Mock <ISparkConfProxy>();

            mockSparkConfProxy.Setup(m => m.GetSparkConfAsString())
            .Returns("spark.master=master;spark.app.name=appname;config1=value1;config2=value2;");

            var conf        = new SparkConf(mockSparkConfProxy.Object);
            var hiveContext = new HiveContext(new SparkContext(mockSparkContextProxy.Object, conf));

            Assert.IsNotNull(hiveContext.SparkSession);
        }
Example #26
        static void Main(string[] args)
        {
            LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); //this is optional - DefaultLoggerService will be used if not set
            var logger = LoggerServiceFactory.GetLogger(typeof(JdbcDataFrameExample));

            //For SQL Server use the connection string formats below
            //"jdbc:sqlserver://localhost:1433;databaseName=Temp;integratedSecurity=true;" or
            //"jdbc:sqlserver://localhost;databaseName=Temp;user=MyUserName;password=myPassword;"
            var connectionString = args[0];
            var tableName        = args[1];

            var sparkConf    = new SparkConf();
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext   = new SqlContext(sparkContext);
            var df           = sqlContext
                               .Read()
                               .Jdbc(connectionString, tableName, new Dictionary <string, string>());

            df.ShowSchema();
            var rowCount = df.Count();

            logger.LogInfo("Row count is " + rowCount);
            sparkContext.Stop();
        }
Example #27
        static void Main(string[] args)
        {
            var cassandraHostName      = "localhost";
            var cassandraKeySpace      = "ks";
            var cassandraTableToRead   = "users";
            var cassandraTableToInsert = "filteredusers";

            if (args.Length == 4)
            {
                cassandraHostName      = args[0];
                cassandraKeySpace      = args[1];
                cassandraTableToRead   = args[2];
                cassandraTableToInsert = args[3];
            }

            /*
            ** CQL used to create table in Cassandra for this example **
            **
            **  CREATE TABLE users (
            **          username VARCHAR,
            **          firstname VARCHAR,
            **          lastname VARCHAR,
            **      PRIMARY KEY (username)
            **  );
            **
            **  INSERT INTO ks.users (username, firstname, lastname) VALUES ('JD123', 'John', 'Doe');
            **  INSERT INTO ks.users (username, firstname, lastname) VALUES ('BillJ', 'Bill', 'Jones');
            **  INSERT INTO ks.users (username, firstname, lastname) VALUES ('SL', 'Steve', 'Little');
            **
            **  CREATE TABLE filteredusers (
            **          username VARCHAR,
            **          firstname VARCHAR,
            **          lastname VARCHAR,
            **      PRIMARY KEY (username)
            **  );
            */

            var sparkConf    = new SparkConf().Set("spark.cassandra.connection.host", cassandraHostName);
            var sparkContext = new SparkContext(sparkConf);
            var sqlContext   = new SqlContext(sparkContext);

            //read from cassandra table
            var usersDataFrame =
                sqlContext.Read()
                .Format("org.apache.spark.sql.cassandra")
                .Options(new Dictionary <string, string> {
                { "keyspace", cassandraKeySpace }, { "table", cassandraTableToRead }
            })
                .Load();

            //display rows in the console
            usersDataFrame.Show();

            var createTempTableStatement =
                string.Format(
                    "CREATE TEMPORARY TABLE userstemp USING org.apache.spark.sql.cassandra OPTIONS(table \"{0}\", keyspace \"{1}\")",
                    cassandraTableToRead,
                    cassandraKeySpace);

            //create a temp table
            sqlContext.Sql(createTempTableStatement);

            //read from temp table, filter it and display schema and rows
            var filteredUsersDataFrame = sqlContext.Sql("SELECT * FROM userstemp").Filter("username = '******'");

            filteredUsersDataFrame.ShowSchema();
            filteredUsersDataFrame.Show();

            //write filtered rows to another table
            filteredUsersDataFrame.Write()
            .Format("org.apache.spark.sql.cassandra")
            .Options(new Dictionary <string, string> {
                { "keyspace", cassandraKeySpace }, { "table", cassandraTableToInsert }
            })
            .Save();

            //convert to RDD, execute map & filter and collect result
            var rddCollectedItems = usersDataFrame.ToRDD()
                                    .Map(
                r =>
                string.Format("{0},{1},{2}", r.GetAs <string>("username"),
                              r.GetAs <string>("firstname"),
                              r.GetAs <string>("lastname")))
                                    .Filter(s => s.Contains("SL"))
                                    .Collect();

            foreach (var rddCollectedItem in rddCollectedItems)
            {
                Console.WriteLine(rddCollectedItem);
            }

            Console.WriteLine("Completed running example");
        }
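On Spark 2.x, the CREATE TEMPORARY TABLE ... USING syntax used above is deprecated in favor of CREATE TEMPORARY VIEW; an equivalent statement would be (sketch):

            var createTempViewStatement = string.Format(
                "CREATE TEMPORARY VIEW userstemp USING org.apache.spark.sql.cassandra OPTIONS(table \"{0}\", keyspace \"{1}\")",
                cassandraTableToRead,
                cassandraKeySpace);
            sqlContext.Sql(createTempViewStatement);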
Example #28
        //private static RDD<string> getUserVisit
        static void Main(string[] args)
        {
            string filepath   = @"hdfs:///common/vistizationData/";
            var    OutputPath = @"hdfs:///user/t-zhuxia/vistizationRes/";

            string uetLogPath = filepath + "gat_20160902_0600.csv";
            var    UICLogPath = filepath + "uic_20160902_0600.csv";
            string AnidPath   = filepath + "ANID_20160831.csv";
            string MuidPath   = filepath + "MUID_20160831.csv";
            var    Visitization_AppInstall_Output = OutputPath + "Visitization_AppInstall_20160902_00";
            var    NewEscrowFile = OutputPath + "NewEscrowCandidates_20160902";


            SparkConf    conf = (new SparkConf()).SetAppName("VisitizationStreaming");
            SparkContext sc   = new SparkContext(conf);

            RDD <string> rawUetLogs = getDataFromFile(sc, uetLogPath);

            var uetLogs = getUETLogs(rawUetLogs);

            var uetLogsKeyValpair = uetLogs.Map(line =>
            {
                if (!string.IsNullOrEmpty(line))
                {
                    UETLogView data = UETLogView.Deserialize(line);
                    string key      = data.DedupKey + "," +
                                      data.ANID + "," +
                                      data.IsNewMUID + "," +
                                      data.UAIPId + "," +
                                      data.ReferrerURL + "," +
                                      data.QueryString + "," +
                                      data.AnalyticsGuid;
                    return(new KeyValuePair <string, string>(key, line));
                }
                return(new KeyValuePair <string, string>(null, null));
            });

            uetLogs = uetLogsKeyValpair.ReduceByKey((x, y) =>
            {
                if (!string.IsNullOrEmpty(x) && !string.IsNullOrEmpty(y))
                {
                    return(x + delimeter + y);
                }
                if (!string.IsNullOrEmpty(x))
                {
                    return(x);
                }
                if (!string.IsNullOrEmpty(y))
                {
                    return(y);
                }
                return(null);
            }).Map <string>(UETLogDedupReducer.INSTANCE.GetData).Filter(line => !string.IsNullOrEmpty(line));

/*****************************************to do after this ****************************************************/
            var uetLogs_PageVisit = uetLogs.Filter(line =>
            {
                UETLogView data = UETLogView.Deserialize(line);
                return(string.IsNullOrEmpty(data.AppInstallClickId));
            });

            Console.Out.WriteLine("----------------uetLogs_PageVisitCount: " + uetLogs_PageVisit.Count());

            var uetLogs_AppInstall = uetLogs.Filter(line =>
            {
                UETLogView data = UETLogView.Deserialize(line);
                return(!string.IsNullOrEmpty(data.AppInstallClickId));
            });
            RDD <string> appInstallVisits = uetLogs_AppInstall.Map <string>(AppInstallProcessor.INSTANCE.GetData);

            Console.Out.WriteLine("----------------appInstallVisitsCount: " + appInstallVisits.Count());

            //appInstallVisits.Repartition(1).SaveAsTextFile(Visitization_AppInstall_Output);

            //----- Get UIC log
            var uicRaw = getDataFromFile(sc, UICLogPath);

            var UserIdConverage = getUICData(uicRaw);

            //----- Join uetlog with uic log
            var uetColumns = uetLogs_PageVisit.Map(line =>
            {
                var uetLog = UETLogView.Deserialize(line);
                return(new KeyValuePair <Guid?, string>(uetLog.UETMatchingGuid, line));
            });

            var uicColumns = UserIdConverage.Map(line =>
            {
                var uic = UserIdCoverageShcema.Deserialize(line);
                return(new KeyValuePair <Guid?, Guid?>(uic.UETMatchingGuid, uic.AnalyticsGuid));
            });

            var UETLogProcessedEntriesPageVisit = uetColumns.LeftOuterJoin(uicColumns).Map(line =>
            {
                var value = UETLogView.Deserialize(line.Value.Item1);
                if (line.Value.Item2.IsDefined)
                {
                    var agid = line.Value.Item2.GetValue();
                    if (agid != null)
                    {
                        value.AnalyticsGuid = agid;
                    }
                    value.DedupKey    = null;
                    value.QueryString = null;
                }
                return(UETLogView.Serialize(value));
            });

            var visitsForUsersKeyValuePair = UETLogProcessedEntriesPageVisit.Map(line =>
            {
                var value = UETLogView.Deserialize(line);
                var key   = value.UAIPId.ToString() + "," + value.TagId.ToString();
                return(new KeyValuePair <string, string>(key, line));
            }).ReduceByKey((x, y) => { return(x + delimeter + y); });

            var visitsForUsers = visitsForUsersKeyValuePair.FlatMap <string>(line =>
            {
                return(VisitizeReducer.INSTANCE.GetData(line));
            });

            // Step 7: First field to fill is UserIdType and build the general "UETUserId", by default it is UAIPID during the construction of SAEventConversionFacts.
            // Step 7.1: Build the TypeOfUser field.
            // The way of deciding the TypeOfUser is:
            // 1. If MUID is not NULL and IsNewMUID is false, UserIdType is MUID (TypeOfUser 2), later will join with UMS MUID view.
            // 2. If MUID is NULL but ANID is not, UserIdType is ANID (TypeOfUser 1), later will join with UMS ANID view.
            // 3. If both MUID and ANID are NULL, but AnalyticsGuid is not NULL, UserIdType is AnalyticsGuid (TypeOfUser 3)
            // 4. If AnalyticsGuid is also NULL, UserIdType is Unknown (TypeOfUser -1)

            var VisitForUserWithTypeOfUser = getVisitsForUsersWithTypeOfUser(visitsForUsers);
            // Step 7.2: Get the ANID and MUID sub-table out of the VisitsForUsers_WithTypeOfUser because we need to update
            // the ANID/MUID to StableIdValue according to UMS mapping
            var VisitsForUsers_WithTypeOfUser_ANID = VisitForUserWithTypeOfUser.Filter(line =>
            {
                var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(data.TypeOfUser == 1);
            });
            var VisitsForUsers_WithTypeOfUser_MUID = VisitForUserWithTypeOfUser.Filter(line =>
            {
                var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(data.TypeOfUser == 2);
            });
            // Step 7.3: Build the UMS ANID/MUID view from "/shares/adCenter.BICore.SubjectArea/SubjectArea/Conversion/UMS/ANID_{yyyyMMdd}.ss(12.43GB)/MUID_{yyyyMMdd}.ss(166.66GB)"
            var UMS_ANIDData = getDataFromFile(sc, AnidPath);
            var UMS_MUIDData = getDataFromFile(sc, MuidPath);

            // Step 7.4: Join VisitsForUsers_WithTypeOfUser_ANID(MUID) with UMS_ANID(MUID)_MappingFile to get to use the StableIdValue.
            var VisitsForUsers_WithStableIdANIDGuid = VisitsForUsers_WithTypeOfUser_ANID.Map(line =>
            {
                var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(data.ANID);
            });

            Console.Out.WriteLine("----------------VisitsForUsers_WithStableIdANIDGuid: " + VisitsForUsers_WithStableIdANIDGuid.Count());

            var VisitsForUsers_WithStableIdMUIDGuid = VisitsForUsers_WithTypeOfUser_MUID.Map(line =>
            {
                var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(data.MUID);
            });

            Console.Out.WriteLine("----------------VisitsForUsers_WithStableIdMUIDGuid: " + VisitsForUsers_WithStableIdMUIDGuid.Count());

            var anid = getUMS_ANIDData(UMS_ANIDData).Map <KeyValuePair <Guid?, SerializaType> >(line =>
            {
                var an = line.DeserializeObject <UMS_ANID>();
                return(new KeyValuePair <Guid?, SerializaType>(an.ANID, line));
            }).FlatMap <KeyValuePair <Guid?, SerializaType> >(new BroadcastJoinWrapper(VisitsForUsers_WithStableIdANIDGuid, sc).Filter);

            var muid = getUMS_MUIDData(UMS_MUIDData).Map <KeyValuePair <Guid?, SerializaType> >(line =>
            {
                var an = line.DeserializeObject <UMS_MUID>();
                return(new KeyValuePair <Guid?, SerializaType>(an.MUID, line));
            }).FlatMap <KeyValuePair <Guid?, SerializaType> >(new BroadcastJoinWrapper(VisitsForUsers_WithStableIdMUIDGuid, sc).Filter);

            var VisitsForUsers_WithStableIdFromANID = VisitsForUsers_WithTypeOfUser_ANID.Map(line =>
            {
                VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(new KeyValuePair <Guid?, SerializaType>(data.ANID, line));
            }).LeftOuterJoin(anid).Map(line =>
            {
                VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line.Value.Item1);
                var VA                       = new VisitsForUsersWithStableIdFromID();
                VA.UAIPId                    = data.UAIPId;
                VA.TagId                     = data.TagId;
                VA.TagName                   = data.TagName;
                VA.AnalyticsGuid             = data.AnalyticsGuid;
                VA.SAEventConversionFactsRow = data.SAEventConversionFactsRow;
                if (line.Value.Item2.IsDefined)
                {
                    var an      = line.Value.Item2.GetValue().DeserializeObject <UMS_ANID>();
                    VA.StableId = an.ANID;
                }
                else
                {
                    VA.StableId = data.ANID;
                }
                return(VA.SerializeObject());
            });

            var VisitsForUsers_WithStableIdFromMUID = VisitsForUsers_WithTypeOfUser_MUID.Map(line =>
            {
                VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(new KeyValuePair <Guid?, SerializaType>(data.MUID, line));
            }).LeftOuterJoin(muid).Map(line =>
            {
                VisitsForUser_WithTypeOfUser data = VisitsForUser_WithTypeOfUser.Deserialize(line.Value.Item1);
                var VA                       = new VisitsForUsersWithStableIdFromID();
                VA.UAIPId                    = data.UAIPId;
                VA.TagId                     = data.TagId;
                VA.TagName                   = data.TagName;
                VA.AnalyticsGuid             = data.AnalyticsGuid;
                VA.SAEventConversionFactsRow = data.SAEventConversionFactsRow;
                if (line.Value.Item2.IsDefined)
                {
                    var an      = line.Value.Item2.GetValue().DeserializeObject <UMS_MUID>();
                    VA.StableId = an.MUID;
                }
                else
                {
                    VA.StableId = data.MUID;
                }
                return(VA.SerializeObject());
            });

            Console.WriteLine("-----------------VisitsForUsers_WithStableIdFromANID: " + VisitsForUsers_WithStableIdFromANID.Count());
            Console.WriteLine("-----------------VisitsForUsers_WithStableIdFromMUID: " + VisitsForUsers_WithStableIdFromMUID.Count());

            // Step 7.5: Select the UETUserId from the StableId and add the UserType according to whether it is from ANID or MUID
            var VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part1 = VisitsForUsers_WithStableIdFromANID.Map(line =>
            {
                var VA = line.DeserializeObject <VisitsForUsersWithStableIdFromID>();
                VisitsForUsersWithUETUserIdMUIDANIDPart data = new VisitsForUsersWithUETUserIdMUIDANIDPart();
                data.UETUserId                 = VA.StableId;
                data.TypeOfUser                = UserType.A;
                data.UAIPId                    = VA.UAIPId;
                data.TagId                     = VA.TagId;
                data.TagName                   = VA.TagName;
                data.AnalyticsGuid             = VA.AnalyticsGuid;
                data.SAEventConversionFactsRow = VA.SAEventConversionFactsRow;
                return(data.SerializeObject());
            });
            var VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part2 = VisitsForUsers_WithStableIdFromMUID.Map(line =>
            {
                var VA = line.DeserializeObject <VisitsForUsersWithStableIdFromID>();
                VisitsForUsersWithUETUserIdMUIDANIDPart data = new VisitsForUsersWithUETUserIdMUIDANIDPart();
                data.UETUserId                 = VA.StableId;
                data.TypeOfUser                = UserType.M;
                data.UAIPId                    = VA.UAIPId;
                data.TagId                     = VA.TagId;
                data.TagName                   = VA.TagName;
                data.AnalyticsGuid             = VA.AnalyticsGuid;
                data.SAEventConversionFactsRow = VA.SAEventConversionFactsRow;
                return(data.SerializeObject());
            });
            var VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part = VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part2.Union(VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part1);


            // Step 7.6: For the AnalyticsGuid sub-table of the VisitsForUsers_WithTypeOfUser, use AnalyticsGuid as the UETUserId and "AG" as the UserType.
            var VisitsForUsers_WithUETUserId_AnalyticsGuid_Other_UNION_Part = VisitForUserWithTypeOfUser.Filter(line =>
            {
                var data = VisitsForUser_WithTypeOfUser.Deserialize(line);
                return(data.TypeOfUser == 3 || data.TypeOfUser == -1);
            }).Map(line =>
            {
                var Visits = VisitsForUser_WithTypeOfUser.Deserialize(line);
                VisitsForUsersWithUETUserIdMUIDANIDPart data = new VisitsForUsersWithUETUserIdMUIDANIDPart();
                data.UAIPId                    = Visits.UAIPId;
                data.TagId                     = Visits.TagId;
                data.TagName                   = Visits.TagName;
                data.AnalyticsGuid             = Visits.AnalyticsGuid;
                data.SAEventConversionFactsRow = Visits.SAEventConversionFactsRow;
                if (Visits.TypeOfUser == 3)
                {
                    data.UETUserId  = Visits.AnalyticsGuid;
                    data.TypeOfUser = UserType.AG;
                }
                else
                {
                    data.UETUserId  = Visits.UAIPId;
                    data.TypeOfUser = UserType.UA;
                }
                return(data.SerializeObject());
            });
            // Step 7.7: Union result from 7.5 and 7.6
            var VisitsForUsers_WithUETUserId = VisitsForUsers_WithUETUserId_MUID_ANID_UNION_Part.Union(VisitsForUsers_WithUETUserId_AnalyticsGuid_Other_UNION_Part);

            // Step 7.8: Reduce on UETUserId, UAIPId, TagId, using UserCombineReducer
            VisitsForUsers_WithUETUserId = VisitsForUsers_WithUETUserId.Map(line =>
            {
                var data = line.DeserializeObject <VisitsForUsersWithUETUserIdMUIDANIDPart>();
                return(new VisitsForUsersWithUETUserId(data, data.SAEventConversionFactsRow.Visits[0].Events[0].EventDateTime).SerializeObject());
            });

            var VisitsForUsers_Current = VisitsForUsers_WithUETUserId
                                         .Map(line =>
            {
                var data = line.DeserializeObject <VisitsForUsersWithUETUserId>();
                return(new KeyValuePair <long, string>(data.EventDateTime, line));
            })
                                         .SortByKey()
                                         .Map(line =>
            {
                var data = line.Value.DeserializeObject <VisitsForUsersWithUETUserId>();
                var key  = string.Format("{0},{1},{2}", data.UETUserId, data.UAIPId, data.TagId);
                return(new KeyValuePair <string, string>(key, line.Value));
            })
                                         .ReduceByKey((x, y) =>
            {
                if (!string.IsNullOrEmpty(x) && !string.IsNullOrEmpty(y))
                {
                    return(x + delimeter + y);
                }
                if (!string.IsNullOrEmpty(x))
                {
                    return(x);
                }
                if (!string.IsNullOrEmpty(y))
                {
                    return(y);
                }
                return(null);
            }).Map <SerializaType>(UserCombineReducer.INSTANCE.getData);

            // Step 8: Handle the current hour's result with Escrow visits from the previous hour:
            // As the EscrowFile doesn't exist, this step is skipped.

            // Step 9: Calculate conversions for each visit using GoalConversionProcessor and output it.
            var VisitsWithConversions = VisitsForUsers_Current.MapPartitions(GoalConversionProcessor.INSTANCE.getData);

            // Step 10: Update the Escrow file
            var VisitsWithConversions_notUAIP = VisitsWithConversions.Filter(line =>
            {
                var data = line.DeserializeObject <VisitsWithConversion>();
                return(data.SAEventConversionFactsRow.UserIdType != UETUserIdType.UAIPID);
            });

            var NewEscrowCandidates = VisitsWithConversions_notUAIP.MapPartitions(EscrowCandidateProcessor.INSTANCE.getData);

            // Step 10.2: Output the result to the new escrow file
            NewEscrowCandidates.Repartition(1).SaveAsTextFile(NewEscrowFile);
            return;
        }