Example #1
0
        /// <summary>
        /// Runs the batch job: loads a CSV file, processes it, prints information about the
        /// processed data and writes it out.
        /// </summary>
        /// <param name="args">
        /// Must hold exactly two entries: the input path followed by the output path.
        /// </param>
        static void Main(string[] args)
        {
            bool hasBothPaths = args.Length == 2;
            if (!hasBothPaths)
            {
                Console.Error.WriteLine(
                    "Usage: Remember to include input and output path as arguments");
                Environment.Exit(1);
            }

            var extraSettings = SparkConfUtils.GetSparkConfigurationForFilePath(args);

            var session = SparkSession.Builder()
                .AppName("Batch Job example using Apache Spark .Net")
                .GetOrCreate();

            // The extra settings are optional; when present, copy each one onto the session.
            if (extraSettings != null)
            {
                foreach (var setting in extraSettings.ToList())
                {
                    session.Conf().Set(setting.Key, setting.Value);
                }
            }

            // The first line of the CSV is treated as a header and the schema is supplied explicitly.
            var csvReader = session
                .Read()
                .Schema(GetSchema())
                .Option("header", true);
            var rawData = csvReader.Csv(args[0]);

            var processed = ProcessDataset(rawData);
            ShowDatasetInfo(processed);
            WriteData(processed, args[1]);

            Console.WriteLine("Finished .Net Spark Job!!");
        }
Example #2
0
        /// <summary>
        /// Streams text from a local socket, pairs every incoming value with the output of
        /// <c>Predict</c>, and prints the resulting rows to the console until the query terminates.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var socketHost = "localhost";
            var socketPort = 9999;

            var session = SparkSession
                .Builder()
                .AppName("Emotion_Prediction")
                .GetOrCreate();

            var socketReader = session
                .ReadStream()
                .Format("socket")
                .Option("host", socketHost)
                .Option("port", socketPort);
            var incoming = socketReader.Load();

            // For each input string produce two elements: the text itself and its prediction.
            Func<Column, Column> toTextAndPrediction =
                Udf<string, string[]>(text => new string[] { text, " => " + Predict(text) });

            // Explode turns the two-element array into two separate output rows.
            var exploded = incoming.Select(Explode(toTextAndPrediction(incoming["value"])));

            var consoleQuery = exploded
                .WriteStream()
                .Format("console")
                .Start();

            consoleQuery.AwaitTermination();
        }
Example #3
0
        /// <summary>
        /// Runs the data source examples against the sample files found in the given directory.
        /// </summary>
        /// <param name="args">
        /// Exactly one entry: the directory containing the sample resource files.
        /// </param>
        public void Run(string[] args)
        {
            bool hasResourceDir = args.Length == 1;
            if (!hasResourceDir)
            {
                Console.Error.WriteLine(
                    "Usage: Datasource <path to SPARK_HOME/examples/src/main/resources/>");

                Environment.Exit(1);
            }

            string resourceDir = args[0];

            // Sample files expected inside the resource directory.
            string parquetFile = Path.Combine(resourceDir, "users.parquet");
            string jsonFile = Path.Combine(resourceDir, "people.json");
            string csvFile = Path.Combine(resourceDir, "people.csv");
            string orcFile = Path.Combine(resourceDir, "users.orc");

            var session = SparkSession.Builder()
                .AppName("SQL Datasource example using .NET for Apache Spark")
                .Config("spark.some.config.option", "some-value")
                .GetOrCreate();

            RunBasicDatasourceExample(session, parquetFile, jsonFile, csvFile, orcFile);
            RunParquetExample(session, jsonFile);
            RunDatasourceExample(session);

            session.Stop();
        }
Example #4
0
        /// <summary>
        /// Checks that a session activated on a child thread is visible on that thread, and that
        /// the calling thread still has no active session afterwards.
        /// </summary>
        public void TestThreadLocalSessions()
        {
            SparkSession.ClearActiveSession();

            // Runs the active-session checks on a dedicated thread and blocks until it finishes.
            void VerifyOnChildThread(string appName)
            {
                ThreadStart checks = () =>
                {
                    // A freshly started thread has no active session yet.
                    Assert.Null(SparkSession.GetActiveSession());

                    var childSession = SparkSession.Builder().AppName(appName).GetOrCreate();
                    SparkSession.SetActiveSession(childSession);

                    // On this child thread the active session must now be the one set above.
                    SparkSession activeSession = SparkSession.GetActiveSession();
                    Assert.NotNull(activeSession);
                    Assert.Equal(appName, activeSession.Conf().Get("spark.app.name", null));
                };

                var worker = new Thread(checks);
                worker.Start();
                worker.Join();
            }

            int round = 0;
            while (round < 5)
            {
                VerifyOnChildThread(round.ToString());
                ++round;
            }

            // Nothing was activated on the calling thread, so it must still see no active session.
            Assert.Null(SparkSession.GetActiveSession());
        }
Example #5
0
        /// <summary>
        /// Reads JSON messages from a Kafka topic as a stream, adds a "nota" column computed by
        /// the "AnaliseDeSentimento" UDF over the "opiniao" field, and prints the rows to the console.
        /// </summary>
        /// <param name="args">
        /// args[0]: Kafka bootstrap servers; args[1]: topic to subscribe to;
        /// args[2]: model value handed to <c>AnalisarSentimento</c>.
        /// </param>
        public void Run(string[] args)
        {
            string bootstrapServers = args[0];
            string topic = args[1];
            string model = args[2];

            // Obtain the Spark session.
            var session = SparkSession
                .Builder()
                .AppName("Exemplo Streaming com Kafka")
                .GetOrCreate();

            // Streaming DataFrame fed by Kafka; only the message value is kept, cast to a string.
            var kafkaReader = session
                .ReadStream()
                .Format("kafka")
                .Option("kafka.bootstrap.servers", bootstrapServers)
                .Option("subscribe", topic);
            var messages = kafkaReader
                .Load()
                .SelectExpr("CAST(value AS STRING)");

            /* Schema used to parse the JSON carried by each Kafka message.
             * Example JSON:
             * {
             *      "cliente": "Fulano",
             *      "produto": "Mochila",
             *      "opiniao": "Muito boa!"
             * }
             */
            var jsonSchema = new StructType(new[]
            {
                new StructField("cliente", new StringType()),
                new StructField("produto", new StringType()),
                new StructField("opiniao", new StringType())
            });

            // Parse the JSON into a struct column, then promote its fields to top-level columns.
            var parsedJson = Functions.FromJson(messages.Col("value"), jsonSchema.SimpleString);
            var opinions = messages
                .WithColumn("json", parsedJson)
                .Select("json.*");

            // Register the custom function so it can be invoked by name on the DataFrame.
            session.Udf().Register<string, float>(
                "AnaliseDeSentimento",
                text => AnalisarSentimento(text, model));

            // New column "nota" holding the sentiment analysis result for each opinion.
            var scored = opinions.WithColumn(
                "nota",
                Functions.CallUDF("AnaliseDeSentimento", opinions.Col("opiniao")));

            // Start the streaming query (a continuous trigger and a foreach sink were left
            // disabled in the original sample).
            var consoleQuery = scored
                .WriteStream()
                .OutputMode(OutputMode.Append)
                .Format("console")
                .Start();

            // Needed to keep the application alive so it keeps processing incoming data.
            consoleQuery.AwaitTermination();
        }
        /// <summary>
        /// Shows two ranges produced by the Spark session: one built from a single argument (5)
        /// and one built from two arguments (10, 12).
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var session = SparkSession.Builder().GetOrCreate();

            var firstRange = session.Range(5);
            firstRange.Show();

            var secondRange = session.Range(10, 12);
            secondRange.Show();
        }
Example #7
0
        /// <summary>
        /// Reads "input.txt", counts how often each space-separated word occurs and shows the
        /// counts ordered from most to least frequent.
        /// </summary>
        public static void leerTxt()
        {
            Console.WriteLine("Hello World!");

            var session = SparkSession.Builder().AppName("word_count_sample").GetOrCreate();

            // Each line of the text file ends up in a column named "value".
            var lines = session.Read().Text("input.txt");

            // Split every line on single spaces, then emit one row per resulting word.
            var wordArrays = lines.Select(
                Functions.Split(Functions.Col("value"), " ").Alias("words"));
            var singleWords = wordArrays.Select(
                Functions.Explode(Functions.Col("words")).Alias("word"));

            // Tally per word and sort by descending count.
            var tally = singleWords.GroupBy("word").Count();
            var ranked = tally.OrderBy(Functions.Col("count").Desc());

            ranked.Show();

            session.Stop();
        }
Example #8
0
        /// <summary>
        /// Generates 1000 ids with a random number each, saves them as a pipe-separated CSV
        /// (with header, overwriting existing output) and prints every even id to the console.
        /// </summary>
        /// <param name="args">args[0] is the output location passed to <c>Save</c>.</param>
        static void Main(string[] args)
        {
            var spark = SparkSession
                        .Builder()
                        .AppName("DemoApp")
                        .GetOrCreate();

            var dataFrame = spark.Sql("select id, rand() as random_number from range(1000)");

            dataFrame
            .Write()
            .Format("csv")
            .Option("header", true)
            .Option("sep", "|")
            .Mode("overwrite")
            .Save(args[0]);

            foreach (var row in dataFrame.Collect())
            {
                // The id column of range() is a 64-bit integer on the Spark side. The previous
                // `row[0] as int?` produced null whenever the value arrived boxed as long, which
                // made the comparison false for every row. Convert.ToInt64 accepts both boxed
                // int and boxed long, so the even-id filter works regardless of the boxed width.
                object id = row[0];
                if (id != null && Convert.ToInt64(id) % 2 == 0)
                {
                    Console.WriteLine($"line: {row[0]}");
                }
            }
        }
Example #9
0
        /// <summary>
        /// Reads "streets.csv" (semicolon-delimited), builds a STREET name from the CECHA,
        /// NAZWA_1 and NAZWA_2 columns, counts occurrences per street and writes the counts,
        /// sorted in descending order, as a single-partition CSV to "result".
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Start SparkSession");
            var session = SparkSession.Builder().AppName("Street Counter").GetOrCreate();

            // Every column is read as a string.
            string csvSchema =
                "WOJ string ,POW string ,GMI string ,RODZ_GMI string , " +
                "SYM string , SYM_UL string , " +
                "CECHA string , NAZWA_1 string ,NAZWA_2 string , " +
                "STAN_NA string";

            var streets = session
                .Read()
                .Option("delimiter", ";")
                .Schema(csvSchema)
                .Csv("streets.csv");

            // STREET = the three name parts joined with a single space.
            var streetName = Functions.ConcatWs(
                " ", streets["CECHA"], streets["NAZWA_1"], streets["NAZWA_2"]);
            var withStreet = streets.WithColumn("STREET", streetName);

            var countsPerStreet = withStreet
                .Select("STREET")
                .GroupBy("STREET")
                .Count()
                .WithColumnRenamed("count", "COUNT");

            var mostCommonFirst = countsPerStreet.OrderBy(countsPerStreet["COUNT"].Desc());

            // Coalesce(1) reduces the result to one partition before it is written.
            mostCommonFirst
                .Coalesce(1)
                .Write()
                .Option("delimiter", ";")
                .Csv("result");

            session.Stop();
            Console.WriteLine("Stop SparkSession");
        }
Example #10
0
        /// <summary>
        /// Loads a tab-separated file with a header row, prints its schema and some rows, then
        /// registers it as the "nyse" view and shows the average opening price per stock symbol.
        /// </summary>
        /// <param name="args">args[0] is the path of the file to read.</param>
        static void Main(string[] args)
        {
            var file = args[0];

            Console.WriteLine("Reading file from:" + file);

            // Session running against a local master.
            var sessionBuilder = SparkSession.Builder()
                .Master("local[*]")
                .AppName("SparkDotNet");
            var session = sessionBuilder.GetOrCreate();

            // Tab-separated input with a header line; column types are inferred.
            var reader = session.Read()
                .Option("sep", "\t")
                .Option("header", "true")
                .Option("inferSchema", "true");
            var quotes = reader.Csv(file);

            quotes.PrintSchema();
            quotes.Show();

            // Expose the data to SQL under the name "nyse" and aggregate per symbol.
            quotes.CreateOrReplaceTempView("nyse");
            var averages = session.Sql(
                "SELECT `stock_symbol`, AVG(`stock_price_open`) FROM nyse GROUP BY 1");
            averages.Show();
        }
Example #11
0
        /// <summary>
        /// Downloads a text file over HTTP, stores it on disk, and shows the word counts of
        /// "storie.txt" ordered from most to least frequent.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var spark = SparkSession
                        .Builder()
                        .AppName("word_count_sample")
                        .GetOrCreate();

            string content;

            // Dispose the client and the response once the body has been read, and fail fast on
            // a non-success status so an error page is never written out as input data.
            // Main is synchronous, so the async calls are awaited by blocking on .Result.
            using (var client = new HttpClient())
            using (var response = client
                                  .GetAsync("https://raw.githubusercontent.com/AMustapha/meduim_words/master/storie.txt")
                                  .Result)
            {
                response.EnsureSuccessStatusCode();
                content = response.Content.ReadAsStringAsync().Result;
            }

            // NOTE(review): the file is written to "../../../storie.txt" but read back from
            // "storie.txt"; this only lines up if Spark resolves the relative path against a
            // directory three levels above the process working directory -- confirm.
            File.WriteAllText("../../../storie.txt", content);

            DataFrame dataFrame = spark.Read().Text("storie.txt");

            // Split each line on single spaces, emit one row per word, then count and sort.
            var words = dataFrame
                        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
                        .Select(Functions.Explode(Functions.Col("words"))
                                .Alias("word"))
                        .GroupBy("word")
                        .Count()
                        .OrderBy(Functions.Col("count").Desc());

            // Show results
            words.Show();
        }
Example #12
0
        /// <summary>
        /// Loads "DataBook.csv" with an explicit schema, shows it, drops every row that contains
        /// a null value, then drops the four schema columns and shows what remains.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // 1. Create a Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();

            // 2. Create the initial DataFrame. The "Value" column is read as STRING
            //    (a BOOLEAN variant of this schema was previously tried and disabled).
            DataFrame dataFrame = spark.Read()
                                  .Schema("Assertid STRING,properties STRING,Value STRING,TimeSatmp TIMESTAMP")
                                  .Csv("DataBook.csv");

            dataFrame.Show();

            // Drop any row in which at least one column is null ("any" mode).
            DataFrame cleanedProjects = dataFrame.Na().Drop("any");

            // Remove the listed columns. These are all the columns declared in the schema above.
            cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeSatmp");
            cleanedProjects.Show();

            // Stop Spark session
            spark.Stop();
        }
        /// <summary>
        /// Reads a CSV of sudoku quizzes, applies the "SukoduUDF" function to the first
        /// <paramref name="nrows"/> quizzes through SQL and shows the result.
        /// </summary>
        /// <param name="file_path">Path of the CSV file to read.</param>
        /// <param name="cores">Value for the "spark.executor.cores" setting.</param>
        /// <param name="nodes">Value for the "spark.executor.instances" setting.</param>
        /// <param name="nrows">Maximum number of rows taken from the file.</param>
        public static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            var sessionBuilder = SparkSession
                .Builder()
                .AppName("word_count_sample")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes);
            var session = sessionBuilder.GetOrCreate();

            // CSV with a header row; an explicit two-column schema is supplied as well.
            var csvReader = session
                .Read()
                .Option("header", true)
                .Option("inferSchema", true)
                .Schema("quizzes string, solutions string");
            var allQuizzes = csvReader.Csv(file_path);

            var selectedQuizzes = allQuizzes.Limit(nrows);

            // Make the solver callable from SQL under the name "SukoduUDF".
            session.Udf().Register<string, string>(
                "SukoduUDF",
                grid => sudokusolution(grid));

            selectedQuizzes.CreateOrReplaceTempView("Resolved");
            var solved = session.Sql(
                "SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            solved.Show();

            session.Stop();
            Console.WriteLine("SCRAPY");
        }
Example #14
0
        /// <summary>
        /// Loads Avro data, derives a "Region" column from "address.city", drops the
        /// "orderunits" and "address" columns and saves the result as a single-partition CSV
        /// in a UTC-timestamped folder.
        /// </summary>
        /// <param name="args">
        /// args[0]: location of the Avro input; args[1]: base location for the CSV output.
        /// </param>
        static void Main(string[] args)
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("emrApp")
                                 .GetOrCreate();

            DataFrame dataFrame = spark
                                  .Read()
                                  .Format("avro")
                                  .Load(args[0]);

            RegionModel regionModel = new RegionModel();

            // city_23 --> 23 --> 2 --> {2 : Brisbane} --> ** Brisbane **
            // The first character after the underscore is the key into the conversion table.
            // When the key is not in the table, TryGetValue leaves the result as null.
            Func<Column, Column> udfConvertRegion = Udf<string, string>(
                city =>
                {
                    var regionCode = city.Split('_')[1].Substring(0, 1);
                    regionModel.ConversionTable.TryGetValue(regionCode, out var convertedRegion);
                    return convertedRegion;
                });

            dataFrame = dataFrame
                        .WithColumn("Region", udfConvertRegion(dataFrame["address.city"]))
                        .Drop("orderunits", "address");

            // Build the folder name with the invariant culture so that "/" is always a literal
            // slash (the culture's date separator would be substituted otherwise), and with the
            // 24-hour "HH" specifier so that runs twelve hours apart do not map to the same path.
            string runFolder = DateTime.UtcNow.ToString(
                "yyyy/MM/dd/HH-mm-ss",
                System.Globalization.CultureInfo.InvariantCulture);

            dataFrame
            .Coalesce(1)
            .Write()
            .Format("csv")
            .Save($"{args[1]}/{runFolder}");
        }
Example #15
0
        /// <summary>
        /// Starts the external Spark process, waits until it reports that the backend is
        /// running, and then creates the <c>SparkSession</c> used by the tests.
        /// </summary>
        /// <exception cref="Exception">
        /// Thrown when the worker directory environment variable is not set, or when the
        /// launched process exits before it reports readiness.
        /// </exception>
        public SparkFixture()
        {
            string workerDirEnvVarName = Services.ConfigurationService.WorkerDirEnvVarName;

            // The worker directory must be set for the Microsoft.Spark.Worker executable.
            if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable(workerDirEnvVarName)))
            {
                throw new Exception($"Environment variable '{workerDirEnvVarName}' must be set.");
            }

            BuildSparkCmd(out var filename, out var args);

            // Configure the process using the StartInfo properties.
            _process.StartInfo.FileName  = filename;
            _process.StartInfo.Arguments = args;
            // UseShellExecute defaults to true in .NET Framework,
            // but defaults to false in .NET Core. To support both, set it
            // to false which is required for stream redirection.
            _process.StartInfo.UseShellExecute        = false;
            _process.StartInfo.RedirectStandardInput  = true;
            _process.StartInfo.RedirectStandardOutput = true;
            _process.StartInfo.RedirectStandardError  = true;

            bool isSparkReady = false;

            _process.OutputDataReceived += (sender, arguments) =>
            {
                // Scala-side driver for .NET emits the following message after it is
                // launched and ready to accept connections.
                // Data is null when the redirected stream is closed (e.g. the process exited),
                // so it must be checked before it is searched; otherwise the handler throws a
                // NullReferenceException instead of letting the exit check below report it.
                if (!isSparkReady &&
                    arguments.Data != null &&
                    arguments.Data.Contains("Backend running debug mode"))
                {
                    isSparkReady = true;
                }
            };

            _process.Start();
            _process.BeginOutputReadLine();

            bool processExited = false;

            // Poll every 500 ms until either the readiness message was seen or the process died.
            while (!isSparkReady && !processExited)
            {
                processExited = _process.WaitForExit(500);
            }

            if (processExited)
            {
                _process.Dispose();

                // The process should not have been exited.
                throw new Exception(
                          $"Process exited prematurely with '{filename} {args}'.");
            }

            Spark = SparkSession
                    .Builder()
                    .AppName("Microsoft.Spark.E2ETest")
                    .GetOrCreate();
        }
Example #16
0
        /// <summary>
        /// Word count over "input.txt": splits lines on single spaces, counts each word and
        /// shows the counts in descending order.
        /// </summary>
        private static void Exemplo1()
        {
            var session = SparkSession.Builder().AppName("word_count_sample").GetOrCreate();

            // One row per line of the file, stored in the "value" column.
            var textLines = session.Read().Text("input.txt");

            // Line -> array of words -> one row per word.
            var splitLines = textLines.Select(
                Functions.Split(Functions.Col("value"), " ").Alias("words"));
            var oneWordPerRow = splitLines.Select(
                Functions.Explode(Functions.Col("words")).Alias("word"));

            // Occurrences per word, highest count first.
            var counted = oneWordPerRow.GroupBy("word").Count();
            var sortedCounts = counted.OrderBy(Functions.Col("count").Desc());

            sortedCounts.Show();

            session.Stop();
        }
Example #17
0
        /// <summary>
        /// Loads the posts JSON file and registers it as the "posts" temporary view. The actual
        /// analysis steps are still outstanding (see the TODO notes in the body).
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var sessionBuilder = SparkSession.Builder().AppName("post_analysis");
            var session = sessionBuilder.GetOrCreate();

            // Load the posts and expose them to SQL under the name "posts".
            var posts = session
                .Read()
                .Json(@"C:\data\3dprinting.meta.stackexchange.com\Posts.json");
            posts.CreateOrReplaceTempView("posts");

            // TODO: Filter tags to only contain C# questions. Sketch of a word count over "Body":
            //posts
            //    .Select(Split(Col("Body"), " ").As("words"))
            //    .Select(Explode(Col("words")).As("word"))
            //    .GroupBy("word").Count().OrderBy(Col("count").Desc())
            //    .Show();

            // TODO: process text to find code fences and extract C# code

            // TODO: parse remaining C# code
            //var parseCSharp =
            //    Udf<string, IDictionary<string, string[]>>(
            //        (str) => GetSyntaxKindsMap(str));
        }
Example #18
0
        /// <summary>
        /// Sets the debug backend port, then counts the words of "input.txt" and shows the
        /// counts in descending order.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // The debug backend port must match the one configured in the Dockerfile.
            System.Environment.SetEnvironmentVariable("DOTNETBACKEND_PORT", "12345");

            var session = SparkSession.Builder().AppName("word_count_sample").GetOrCreate();

            var fileLines = session.Read().Text("input.txt");

            // Break each line into words on single spaces and flatten to one word per row.
            var wordLists = fileLines.Select(
                Functions.Split(Functions.Col("value"), " ").Alias("words"));
            var flattened = wordLists.Select(
                Functions.Explode(Functions.Col("words")).Alias("word"));

            // Group identical words, count them and order by descending frequency.
            var frequencies = flattened
                .GroupBy("word")
                .Count()
                .OrderBy(Functions.Col("count").Desc());

            frequencies.Show();

            session.Stop();
        }
Example #19
0
        /// <summary>
        /// Counts the words of the text file given on the command line and shows the counts in
        /// descending order.
        /// </summary>
        /// <param name="args">args[0] is the path of the text file to read.</param>
        static void Main(string[] args)
        {
            var session = SparkSession.Builder().AppName("word_count_sample").GetOrCreate();

            // Load the file named by the first argument; each line lands in the "value" column.
            string inputFile = args[0];
            var lines = session.Read().Text(inputFile);

            // Split on single spaces and flatten to one word per row.
            var perLineWords = lines.Select(Split(Col("value"), " ").Alias("words"));
            var perRowWord = perLineWords.Select(Explode(Col("words")).Alias("word"));

            // Count per word and put the most frequent first.
            var grouped = perRowWord.GroupBy("word");
            var ordered = grouped.Count().OrderBy(Col("count").Desc());

            ordered.Show();

            session.Stop();
        }
Example #20
0
        /// <summary>
        /// Counts the words of a fixed text file after removing dots and commas and keeping only
        /// words longer than five characters; prints schemas and the ordered counts.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var session = SparkSession.Builder().AppName("DotNet-Word-Count").GetOrCreate();

            var source = session.Read().Text("file:/home/anderson.souza/bin/lorem_action.txt");
            source.PrintSchema();

            // Line -> array of words -> one word per row (the column keeps the name "words").
            var split = source.Select(
                Functions.Split(Functions.Col("value"), " ").Alias("words"));
            var flattened = split.Select(
                Functions.Explode(Functions.Col("words")).Alias("words"));

            // Strip "." and "," from every word.
            var cleaned = flattened.Select(
                Functions.RegexpReplace(Functions.Col("words"), "\\.|,", "").Alias("words"));

            // Keep only words longer than five characters.
            var longWords = cleaned
                .Filter(Functions.Length(Functions.Col("words")) > 5)
                .Alias("words");

            // Count per word, most frequent first.
            var counts = longWords
                .GroupBy("words")
                .Count()
                .OrderBy(Functions.Col("count").Desc());

            counts.PrintSchema();
            counts.Show();
        }
Example #21
0
        /// <summary>
        /// Reads "data/sample_data.csv", shows it, registers it as the "sample_data" temporary
        /// view and shows the result of selecting everything from that view through SQL.
        /// </summary>
        public static void leerJSON()
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            // A CSV dataset is pointed to by path.
            // The path can be either a single CSV file or a directory of CSV files
            string path = "data/sample_data.csv";

            DataFrame df = spark.Read().Csv(path);

            df.Show();
            // +------------------+
            // |               _c0|
            // +------------------+
            // |      name;age;job|
            // |Jorge;30;Developer|
            // |  Bob;32;Developer|
            // +------------------+

            // The SQL statement below refers to "sample_data" by name, so the DataFrame has to
            // be registered under that name first; without this the query has nothing to resolve
            // the name against within this method.
            df.CreateOrReplaceTempView("sample_data");

            // Query the registered view with SQL.
            DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

            // Show results
            sqlDf.Show();

            // Stop Spark session
            spark.Stop();
        }
        /// <summary>
        /// Loads the movies and ratings CSV files, then runs each requested metric and prints
        /// its rows together with the time the metric took in seconds.
        /// </summary>
        /// <param name="logLevel">Log level applied to the Spark context.</param>
        /// <param name="metricsToObtain">Metrics to run, in order.</param>
        private static void Run(string logLevel, List<string> metricsToObtain)
        {
            var session = SparkSession.Builder().AppName(SparkAppName).GetOrCreate();
            session.SparkContext.SetLogLevel(logLevel);

            // Both inputs are read as non-streaming DataFrames.
            var movies = ReadCsvIntoDataframe(session, MoviesCsvFile, SchemaLoader.MovieSchema);
            var ratings = ReadCsvIntoDataframe(session, RatingsCsvFile, SchemaLoader.RatingSchema);

            foreach (var metricName in metricsToObtain)
            {
                // Time only the metric computation itself.
                var timer = Stopwatch.StartNew();
                var metricRows = RunMetric(movies, ratings, metricName);
                timer.Stop();

                PrintRows(metricRows.ToList(), timer.Elapsed.TotalSeconds);
            }
        }
Example #23
0
        /// <summary>
        /// Creates the "InputData" database, registers "./ID.parquet" in it as table "id_list",
        /// and prints every table of that database together with its columns and data types.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var session = SparkSession.Builder().GetOrCreate();

            session.Sql("CREATE DATABASE InputData");

            var catalog = session.Catalog;
            catalog.SetCurrentDatabase("InputData");
            catalog.CreateTable("id_list", "./ID.parquet");

            var tableRows = session.Catalog.ListTables("InputData").Collect();
            foreach (var tableRow in tableRows)
            {
                // Positions 0 and 1 of a table row are read as table name and database name.
                var tableName = tableRow[0].ToString();
                var databaseName = tableRow[1].ToString();

                Console.WriteLine($"Database: {databaseName}, Table: {tableName}");

                var columnRows = session.Catalog.ListColumns(databaseName, tableName).Collect();
                foreach (var columnRow in columnRows)
                {
                    // Positions 0 and 2 of a column row are read as column name and data type.
                    var columnName = columnRow[0].ToString();
                    var dataType = columnRow[2].ToString();

                    Console.WriteLine($"{columnName}\t{dataType}");
                }
            }
        }
Example #24
0
        /// <summary>
        /// Streams the "value" column read from a local socket (port 9000) to the console in
        /// append mode until the query terminates.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var session = SparkSession.Builder().AppName(".NET for Spark Streaming").GetOrCreate();

            // Socket source on localhost:9000.
            var socketSource = session
                .ReadStream()
                .Format("socket")
                .Option("host", "localhost")
                .Option("port", 9000);
            var incoming = socketSource.Load();

            var values = incoming.Select(Col("value"));

            // Console sink; only newly arrived rows are emitted.
            var consoleSink = values
                .WriteStream()
                .OutputMode(OutputMode.Append)
                .Format("console");
            var running = consoleSink.Start();

            running.AwaitTermination();
        }
Example #25
0
        /// <summary>
        /// Loads "data/inventory.json", shows it, and runs a verification suite made of an
        /// error-level group of integrity checks and a warning-level distribution check, then
        /// prints debug output for the result.
        /// </summary>
        public static void ExecuteSimpleVerificationSuiteWithExternalFile()
        {
            var session = SparkSession.Builder().GetOrCreate();
            var inventory = session.Read().Json("data/inventory.json");

            inventory.Show();

            var suiteOnData = new VerificationSuite().OnData(inventory);

            // Error level: size, completeness, uniqueness, allowed values and non-negativity.
            var integrityChecks = new Check(CheckLevel.Error, "integrity checks")
                .HasSize(value => value == 5)
                .IsComplete("id")
                .IsUnique("id")
                .IsComplete("productName")
                .IsContainedIn("priority", new[] { "high", "low" })
                .IsNonNegative("numViews");
            var withIntegrity = suiteOnData.AddCheck(integrityChecks);

            // Warning level: the URL metric for "description" must be at least 0.5.
            var distributionChecks = new Check(CheckLevel.Warning, "distribution checks")
                .ContainsURL("description", value => value >= .5);
            var withAllChecks = withIntegrity.AddCheck(distributionChecks);

            VerificationResult outcome = withAllChecks.Run();

            outcome.Debug();
        }
Example #26
0
        /// <summary>
        /// Streams CSV files from the given path and hands the rows to a
        /// <c>TestForeachWriter</c>, using a processing-time trigger of 5000; blocks until the
        /// query terminates.
        /// </summary>
        /// <param name="streamInputPath">Location watched for CSV input.</param>
        public static void RunSparkStream(string streamInputPath)
        {
            var rowWriter = new TestForeachWriter();

            var session = SparkSession.Builder().AppName("itur").GetOrCreate();

            // All four columns are read as strings.
            var csvSchema = new Microsoft.Spark.Sql.Types.StructType(new[]
            {
                new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
                new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
            });

            var csvStream = session.ReadStream().Schema(csvSchema);
            var incomingRows = csvStream.Csv(streamInputPath);

            // The running query is kept in the static field s_query.
            var sink = incomingRows
                .WriteStream()
                .Foreach(rowWriter)
                .Trigger(Trigger.ProcessingTime(5000));
            s_query = sink.Start();

            s_query.AwaitTermination();
        }
Example #27
0
        /// <summary>
        /// Verifies the runtime types returned by the <c>SparkSession</c> members exercised
        /// below.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Context and builder.
            var context = _spark.SparkContext;
            Assert.IsType<SparkContext>(context);

            var builder = SparkSession.Builder();
            Assert.IsType<Builder>(builder);

            // Default session: clear it, set it to the fixture session, then read it back.
            SparkSession.ClearDefaultSession();
            SparkSession.SetDefaultSession(_spark);
            var defaultSession = SparkSession.GetDefaultSession();
            Assert.IsType<SparkSession>(defaultSession);

            // Configuration, new sessions and batch reader.
            Assert.IsType<RuntimeConfig>(_spark.Conf());
            Assert.IsType<SparkSession>(_spark.NewSession());
            Assert.IsType<DataFrameReader>(_spark.Read());

            // Every Range overload returns a DataFrame.
            Assert.IsType<DataFrame>(_spark.Range(10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

            // A registered temp view can be fetched back as a DataFrame.
            var viewSource = _spark.Range(10);
            viewSource.CreateOrReplaceTempView("testView");
            Assert.IsType<DataFrame>(_spark.Table("testView"));

            // Streaming reader, UDF registration and catalog.
            Assert.IsType<DataStreamReader>(_spark.ReadStream());
            Assert.IsType<UdfRegistration>(_spark.Udf());
            Assert.IsType<Catalog>(_spark.Catalog());
        }
        /// <summary>
        /// Loads "people.json" into a DataFrame and shows it.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var session = SparkSession.Builder().GetOrCreate();

            var people = session.Read().Json("people.json");
            people.Show();
        }
Example #29
0
        /// <summary>
        /// Hive integration sample: builds a Hive-enabled session, creates the database if
        /// needed, saves the JSON data as a table (overwriting it), then logs the row count and
        /// prints every row of that table.
        /// </summary>
        private static void HiveDataFrame()
        {
            // Hive-enabled session configured for YARN and the Hive warehouse directory.
            session = SparkSession.Builder()
                .EnableHiveSupport()
                .Config("spark.master", "yarn")
                .Config("spark.app.name", "HiveDataFrame")
                .Config("spark.sql.warehouse.dir", "/user/hive/warehouse")
                .GetOrCreate();

            var people = session.Read().Json(jsonFilePath);

            // Make sure the database exists, then switch to it.
            logger.LogInfo("****Create table if not exists****");
            string createDatabaseSql = string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName);
            session.Sql(createDatabaseSql);
            logger.LogInfo("****Database Created****");
            string useDatabaseSql = string.Format("USE {0}", dbName);
            session.Sql(useDatabaseSql);

            // Persist the JSON data as a table, replacing any previous contents.
            logger.LogInfo("****Create Table operation started****");
            var tableWriter = people.Write().Mode(SaveMode.Overwrite);
            tableWriter.SaveAsTable(tableName);
            logger.LogInfo("****Table Created successfully****");

            // Read the table back, log its row count and dump its rows.
            var storedTable = session.Table(tableName);
            string countMessage =
                string.Format("****Table count in database {0}: {1}", dbName, storedTable.Count()) + "****";
            logger.LogInfo(countMessage);

            var storedRows = storedTable.Collect();

            logger.LogInfo("**********************************************");
            foreach (var storedRow in storedRows)
            {
                Console.WriteLine("{0}", storedRow);
            }
            logger.LogInfo("*********************************************");
            logger.LogInfo("Executed Successfully.................");
        }
Example #30
0
        /// <summary>
        /// Solves the first <paramref name="nrows"/> sudoku quizzes of a CSV file through the
        /// "SukoduUDF" SQL function and shows the quizzes next to their resolution.
        /// </summary>
        /// <param name="file_path">Path of the CSV file to read.</param>
        /// <param name="cores">Value for "spark.executor.cores"; also part of the app name.</param>
        /// <param name="nodes">Value for "spark.executor.instances"; also part of the app name.</param>
        /// <param name="nrows">Maximum number of rows taken from the file.</param>
        static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // The application name records the run parameters.
            string applicationName =
                "Resolution de " + nrows + " sudokus par évolution combinatoire de " + cores + " noyau(x) et " + nodes + " noeud(s)";

            var session = SparkSession
                .Builder()
                .AppName(applicationName)
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Header-bearing CSV read with an explicit two-column schema.
            var quizzes = session
                .Read()
                .Option("header", true)
                .Option("inferSchema", true)
                .Schema("quizzes string, solutions string")
                .Csv(file_path);

            var firstQuizzes = quizzes.Limit(nrows);

            // Expose the solver to SQL.
            var udfRegistry = session.Udf();
            udfRegistry.Register<string, string>(
                "SukoduUDF",
                puzzle => sudokusolution(puzzle));

            firstQuizzes.CreateOrReplaceTempView("Resolved");

            var resolutions = session.Sql(
                "SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");
            resolutions.Show();

            session.Stop();
        }