Example #1
        public static void leerTxt()
        {
            Console.WriteLine("Hello World!");
            // Create a Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark.Read().Text("input.txt");

            // Count words
            DataFrame words = dataFrame
                              .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
                              .Select(Functions.Explode(Functions.Col("words"))
                                      .Alias("word"))
                              .GroupBy("word")
                              .Count()
                              .OrderBy(Functions.Col("count").Desc());

            // Show results
            words.Show();

            // Stop Spark session
            spark.Stop();
        }
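The snippets on this page omit their using directives. A minimal set that the examples assume (the static import covers the samples that call Split, Col, Explode, Avg or Udf without the Functions. prefix) would be roughly:

        using System;
        using Microsoft.Spark.Sql;                   // SparkSession, DataFrame, Column, Functions
        using Microsoft.Spark.Sql.Types;             // StructType, StructField, IntegerType, ...
        using static Microsoft.Spark.Sql.Functions;  // Split, Col, Explode, Avg, Udf, ...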
Example #2
        static void Main(string[] args)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .GetOrCreate();

            // Create initial DataFrame
            string    filePath  = args[0];
            DataFrame dataFrame = spark.Read().Text(filePath);

            // Count words
            DataFrame words =
                dataFrame
                .Select(Split(Col("value"), " ").Alias("words"))
                .Select(Explode(Col("words")).Alias("word"))
                .GroupBy("word")
                .Count()
                .OrderBy(Col("count").Desc());

            // Display results
            words.Show();

            // Stop Spark session
            spark.Stop();
        }
Example #3
        public static void leerJSON()
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            // A CSV dataset is pointed to by path.
            // The path can be either a single CSV file or a directory of CSV files
            string path = "data/sample_data.csv";

            DataFrame df = spark.Read().Csv(path);

            df.Show();
            // +------------------+
            // |               _c0|
            // +------------------+
            // |      name;age;job|
            // |Jorge;30;Developer|
            // |  Bob;32;Developer|
            // +------------------+

            // Count names with SQL; the DataFrame must be registered as a temp view first
            df.CreateOrReplaceTempView("sample_data");
            DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

            // Show results
            sqlDf.Show();

            // Stop Spark session
            spark.Stop();
        }
Example #4
        static void Main(string[] args)
        {
            // Set the debug backend port; it has to match the one in the Dockerfile.
            System.Environment.SetEnvironmentVariable("DOTNETBACKEND_PORT", "12345");

            // Create a Spark session.
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();

            // Create initial DataFrame.
            DataFrame dataFrame = spark.Read().Text("input.txt");

            // Count words.
            DataFrame words = dataFrame
                              .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
                              .Select(Functions.Explode(Functions.Col("words"))
                                      .Alias("word"))
                              .GroupBy("word")
                              .Count()
                              .OrderBy(Functions.Col("count").Desc());

            // Show results.
            words.Show();

            // Stop Spark session.
            spark.Stop();
        }
Example #5
        static void Main(string[] args)
        {
            Console.WriteLine("Start SparkSession");
            SparkSession sparkSession = SparkSession.Builder().AppName("Street Counter").GetOrCreate();
            DataFrame    dfCsv        =
                sparkSession
                .Read()
                .Option("delimiter", ";")
                .Schema("WOJ string ,POW string ,GMI string ,RODZ_GMI string , " +
                        "SYM string , SYM_UL string , " +
                        "CECHA string , NAZWA_1 string ,NAZWA_2 string , " +
                        "STAN_NA string")
                .Csv("streets.csv");
            DataFrame dataIn = dfCsv
                               .WithColumn("STREET", Functions.ConcatWs(" ", dfCsv["CECHA"], dfCsv["NAZWA_1"], dfCsv["NAZWA_2"]));
            DataFrame dataGroup = dataIn
                                  .Select("STREET")
                                  .GroupBy("STREET")
                                  .Count()
                                  .WithColumnRenamed("count", "COUNT");
            DataFrame dataOut = dataGroup
                                .OrderBy(dataGroup["COUNT"]
                                         .Desc()
                                         );

            dataOut
            .Coalesce(1)
            .Write()
            .Option("delimiter", ";")
            .Csv("result");
            sparkSession.Stop();
            Console.WriteLine("Stop SparkSession");
        }
Example #6
        static void Main(string[] args)
        {
            // 1. Create a Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            // 2. Create initial DataFrame
            DataFrame dataFrame = spark.Read()
                                  .Schema("Assertid STRING,properties STRING,Value STRING,TimeStamp TIMESTAMP")
                                  .Csv("DataBook.csv");

            dataFrame.Show();

            // Drop any rows with null/empty values
            DataFrameNaFunctions dropEmptytablesrows = dataFrame.Na();
            DataFrame            CleanedProjects     = dropEmptytablesrows.Drop("any");

            // Remove unnecessary columns (note: this drops every column defined above)
            CleanedProjects = CleanedProjects.Drop("Assertid", "properties", "Value", "TimeStamp");
            CleanedProjects.Show();
            // Stop Spark session
            spark.Stop();
        }
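Example #7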
        public static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame

            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
            Console.WriteLine("SCRAPY");
        }
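The sudokusolution helper registered as a UDF above is not shown (Examples #9 and #12 assume the same solver). A hypothetical stub, just so the snippet compiles; the real method would solve the puzzle:

        // Hypothetical stand-in for the solver that the UDF wraps.
        private static string sudokusolution(string sudoku)
        {
            // The real implementation would solve the 81-character puzzle string
            // and return the solved grid in the same format.
            return sudoku;
        }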
Example #8
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Datasource <path to SPARK_HOME/examples/src/main/resources/>");

                Environment.Exit(1);
            }

            string parquet = Path.Combine(args[0], "users.parquet");
            string json    = Path.Combine(args[0], "people.json");
            string csv     = Path.Combine(args[0], "people.csv");
            string orc     = Path.Combine(args[0], "users.orc");

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("SQL Datasource example using .NET for Apache Spark")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            RunBasicDatasourceExample(spark, parquet, json, csv, orc);

            RunParquetExample(spark, json);

            RunDatasourceExample(spark);

            spark.Stop();
        }
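The three Run* helpers are defined elsewhere in the dotnet/spark Datasource sample. As a rough sketch of the kind of work they do (an assumption, not the upstream code), RunParquetExample might look like:

        private static void RunParquetExample(SparkSession spark, string json)
        {
            // Round-trip the JSON data through Parquet, then query it with SQL.
            DataFrame peopleDf = spark.Read().Json(json);
            peopleDf.Write().Mode("overwrite").Parquet("people.parquet");

            DataFrame parquetDf = spark.Read().Parquet("people.parquet");
            parquetDf.CreateOrReplaceTempView("parquetFile");
            spark.Sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19").Show();
        }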
Example #9
        static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("Resolution de " + nrows + " sudokus par évolution combinatoire de " + cores + " noyau(x) et " + nodes + " noeud(s)")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
        }
Example #10
        public void TestStop()
        {
            var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
            var sparkSession          = new SparkSession(mockSparkSessionProxy.Object);

            sparkSession.Stop();
            mockSparkSessionProxy.Verify(m => m.Stop(), Times.Once);
        }
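TestStop above is a unit test: it mocks ISparkSessionProxy with Moq and verifies that SparkSession.Stop() delegates to the proxy exactly once. In a test project it would additionally carry the test framework's attribute (e.g. xUnit's [Fact]).

Example #11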
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: GitHubProjects <path to projects.csv>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("GitHub and Spark Batch")
                                 .GetOrCreate();

            DataFrame projectsDf = spark
                                   .Read()
                                   .Schema("id INT, url STRING, owner_id INT, " +
                                           "name STRING, descriptor STRING, language STRING, " +
                                           "created_at STRING, forked_from INT, deleted STRING, " +
                                           "updated_at STRING")
                                   .Csv(args[0]);

            projectsDf.Show();

            // Drop any rows with NA values
            DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
            DataFrame            cleanedProjects   = dropEmptyProjects.Drop("any");

            // Remove unnecessary columns
            cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
            cleanedProjects.Show();

            // Average number of times each language has been forked
            DataFrame groupedDF = cleanedProjects
                                  .GroupBy("language")
                                  .Agg(Avg(cleanedProjects["forked_from"]));

            // Sort by most forked languages first
            groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

            spark.Udf().Register<string, bool>(
                "MyUDF",
                (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
                (convertedDate > s_referenceDate));

            cleanedProjects.CreateOrReplaceTempView("dateView");

            DataFrame dateDf = spark.Sql(
                "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");

            dateDf.Show();

            spark.Stop();
        }
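The MyUDF registration above compares against a s_referenceDate field that the snippet does not show. A plausible definition (an assumption; any cutoff date works):

        // Hypothetical reference date used by MyUDF to flag recently updated projects.
        private static readonly DateTime s_referenceDate = new DateTime(2015, 1, 1);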
Example #12
        // Method called from Main to start a Spark session with a given number of cores and instances and solve the sudokus via Sudokusolution().
        //private static void Sudokures(string cores, string nodes, string mem, int nrows){
        private static void Sudokures(int nrows)
        {
            // Initialize the Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config("spark.executor.memory", "4G")
                                 .GetOrCreate();
            //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
            //.Config("spark.driver.cores", cores)
            //.Config("spark.executor.instances", nodes)
            //.Config("spark.executor.memory", mem)
            //.GetOrCreate();

            // Load the CSV into a DataFrame
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(_filePath);

            // Limit the DataFrame to the row count passed to the function
            DataFrame df2 = df.Limit(nrows);

            // Stopwatch covering only the sudoku resolution
            var watch2 = new System.Diagnostics.Stopwatch();

            watch2.Start();

            // Register the Spark user-defined function
            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => Sudokusolution(sudoku));

            // Call the UDF in a new Spark DataFrame that will also hold the results
            df2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");

            sqlDf.Show();

            watch2.Stop();

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution : " + watch2.ElapsedMilliseconds + " ms");
            //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();

            spark.Stop();
        }
Example #13
        public static void Main(string[] args)
        {
            // file used: https://www.kaggle.com/gbonesso/b3-stock-quotes/data?select=COTAHIST_A2009_to_A2020_P.csv
            // this POC computes the average stock price over that period

            SparkConf sparkConf = new SparkConf();

            sparkConf.SetMaster("local[*]");  // '*' means use all available cores

            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config(sparkConf)
                                 .AppName("SparkNetPOC")
                                 .GetOrCreate();


            Stopwatch sw = new Stopwatch();

            sw.Start();


            DataFrame dataFrameGeral = spark.Read()
                                       .Schema("vazio STRING, TIPREG STRING,DATPRE STRING,CODBDI STRING,CODNEG STRING,TPMERC STRING,NOMRES STRING,ESPECI STRING," +
                                               "PRAZOT STRING,MODREF STRING,PREABE STRING,PREMAX STRING,PREMIN STRING,PREMED STRING,PREULT STRING,PREOFC STRING," +
                                               "PREOFV STRING,TOTNEG STRING,QUATOT STRING," +
                                               "VOLTOT STRING,PREEXE STRING,INDOPC STRING,DATVEN STRING,FATCOT STRING,PTOEXE STRING,CODISI STRING,DISMES STRING")
                                       .Csv(@"C:\InternetDownloads\10318_1101179_compressed_COTAHIST_A2009_to_A2020_P.csv\COTAHIST_A2009_to_A2020_P.csv");


            DataFrame dataFrameColunasUteis = dataFrameGeral
                                              .Drop("vazio", "TIPREG", "DATPRE", "CODBDI", "TPMERC", "NOMRES", "ESPECI", "PRAZOT", "MODREF", "PREABE", "PREMIN",
                                                    "PREMED", "PREULT", "PREOFC", "PREOFV", "TOTNEG", "QUATOT", "VOLTOT", "PREEXE", "INDOPC", "DATVEN", "FATCOT", "PTOEXE", "CODISI", "DISMES");

            DataFrame dataFrameFiltro = dataFrameColunasUteis
                                        .Filter("CODNEG = 'ITSA3' OR CODNEG = 'ABEV3' OR CODNEG = 'PETR4'");

            DataFrame dataFrameFinal = dataFrameFiltro
                                       .GroupBy("CODNEG")
                                       .Agg(Avg("PREMAX"));

            dataFrameFinal.Show();


            spark.Stop();

            sw.Stop();
            Console.WriteLine("Tempo = " + sw.ElapsedMilliseconds);
        }
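Note that PREMAX is declared as STRING in the schema above, so Avg("PREMAX") leans on Spark's implicit cast. A slightly safer variant (an addition, not part of the original) casts explicitly before aggregating:

        DataFrame dataFrameFinal = dataFrameFiltro
                                   .WithColumn("PREMAX", dataFrameFiltro["PREMAX"].Cast("double"))
                                   .GroupBy("CODNEG")
                                   .Agg(Avg("PREMAX"));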
Example #14
        private static void Main(string[] args)
        {
            if (args.Length != 4)
            {
                Console.WriteLine("Usage:");
                Console.WriteLine("\t<spark-submit> --master local");
                Console.WriteLine("\t\t--class org.apache.spark.deploy.dotnet.DotnetRunner <path-to-microsoft-spark-jar>");
                Console.WriteLine("\t\tTpch.exe <tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>");

                return;
            }

            var tpchRoot     = args[0];
            var queryNumber  = args[1];
            var numIteration = int.Parse(args[2]);
            var isSQL        = bool.Parse(args[3]);

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("TPC-H Benchmark for DotNet")
                                 .GetOrCreate();

            for (var i = 0; i < numIteration; ++i)
            {
                Stopwatch sw     = Stopwatch.StartNew();
                Stopwatch swFunc = new Stopwatch();
                if (!isSQL)
                {
                    var tpchFunctional = new TpchFunctionalQueries(tpchRoot, spark);
                    swFunc.Start();
                    tpchFunctional.Run(queryNumber.ToString());
                    swFunc.Stop();
                }
                else
                {
                    var tpchSql = new TpchSqlQueries(tpchRoot, spark);
                    tpchSql.Run(queryNumber.ToString());
                }
                sw.Stop();

                var typeStr = isSQL ? "SQL" : "Functional";
                Console.WriteLine($"TPCH_Result,DotNet,{typeStr},{queryNumber},{i},{sw.ElapsedMilliseconds},{swFunc.ElapsedMilliseconds}");
            }

            spark.Stop();
        }
Example #15
        public string SparkTest([FromServices] IAWSSettings awsSettings)
        {
            string result = "ok";

            try
            {
                SparkSession spark = SparkSession
                                     .Builder()
                                     .AppName("itur")
                                     .GetOrCreate();

                var mySchema = new Microsoft.Spark.Sql.Types.StructType(new[]
                {
                    new StructField("IturCode", new Microsoft.Spark.Sql.Types.StringType()),
                    new StructField("IturERP", new Microsoft.Spark.Sql.Types.StringType()),
                    new StructField("QuantityEdit", new Microsoft.Spark.Sql.Types.StringType()),
                    new StructField("PartialQuantity", new Microsoft.Spark.Sql.Types.StringType())
                });

                string assemblyLocation = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
                string iturInputPath    = Path.Combine(assemblyLocation, "data", "itur.csv");

                DataFrame df = spark.Read()
                               .Format("csv")
                               .Schema(mySchema)
                               .Option("delimiter", ",")
                               .Option("header", true)
                               //.Option("dateFormat", "dd/MM/yyyy")
                               .Load(iturInputPath);

                string dt         = DateTime.Now.ToString("MMddhhmmss");
                string outputfile = Path.Combine(assemblyLocation, "outputData", $"itur_out{dt}.json");
                df.Write().Json(outputfile);

                //string toPath = $"s3n://{awsSettings.AccessKey}:{awsSettings.SecretKey}@{_bucketName}/{path}";
                //spark.Range(100).Repartition(5).Write().Mode("overwrite").Text(toPath) ;

                spark.Stop();
            }
            catch (Exception ex)
            {
                result = ex.Message;
            }
            return result;
        }
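Keep in mind that df.Write().Json(outputfile) creates a directory of part files rather than a single JSON file. If one output file is wanted, coalescing to a single partition first is a common workaround (a sketch, not in the original):

        df.Coalesce(1).Write().Json(outputfile);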
Example #16
        public void Run(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: <path to yelptest.csv> <path to MLModel.zip>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET for Apache Spark Sentiment Analysis")
                                 .GetOrCreate();

            // Read in and display Yelp reviews
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(args[0]);

            df.Show();

            // Use ML.NET in a UDF to evaluate each review
            spark.Udf().Register<string, bool>(
                "MLudf",
                (text) => Sentiment(text, args[1]));

            // Use Spark SQL to call ML.NET UDF
            // Display results of sentiment analysis on reviews
            df.CreateOrReplaceTempView("Reviews");
            DataFrame sqlDf = spark.Sql("SELECT ReviewText, MLudf(ReviewText) FROM Reviews");

            sqlDf.Show();

            // Print out first 20 rows of data
            // Prevent data getting cut off by setting truncate = 0
            sqlDf.Show(20, 0, false);

            spark.Stop();
        }
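The Sentiment helper wrapped by the UDF above is not shown. A rough sketch of how such a helper can score text with a saved ML.NET model (an assumption; the SentimentData/SentimentPrediction types are hypothetical and must match whatever schema the model at args[1] was trained on):

        // Requires the Microsoft.ML package (using Microsoft.ML; using Microsoft.ML.Data;).
        private static bool Sentiment(string text, string modelPath)
        {
            // Load the saved ML.NET model and run a single prediction.
            var mlContext = new MLContext();
            ITransformer model = mlContext.Model.Load(modelPath, out _);
            var engine = mlContext.Model.CreatePredictionEngine<SentimentData, SentimentPrediction>(model);
            return engine.Predict(new SentimentData { ReviewText = text }).Prediction;
        }

        // Hypothetical input/output types for the model above.
        public class SentimentData
        {
            public string ReviewText { get; set; }
        }

        public class SentimentPrediction
        {
            [ColumnName("PredictedLabel")]
            public bool Prediction { get; set; }
        }

Example #17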
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Sql.VectorDataFrameUdfs <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 // Lower the shuffle partitions to speed up groupBy() operations.
                                 .Config("spark.sql.shuffle.partitions", "3")
                                 .AppName("SQL VectorUdfs example using .NET for Apache Spark")
                                 .GetOrCreate();

            DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]);

            StructType schema = df.Schema();

            Console.WriteLine(schema.SimpleString);

            df.Show();

            df.PrintSchema();

            // Grouped Map Vector UDF
            // able to return different shapes and record lengths
            df.GroupBy("age")
            .Apply(
                new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            }),
                r => CountCharacters(r))
            .Show();

            spark.Stop();
        }
Example #18
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Logging <path to Apache User Logs>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Apache User Log Processing")
                                 .GetOrCreate();

            // Read input log file and display it
            DataFrame df = spark.Read().Text(args[0]);

            df.Show();

            // Step 1: UDF to determine if each line is a valid log entry
            // Remove any invalid entries before further filtering
            spark.Udf().Register<string, bool>(
                "GeneralReg",
                log => Regex.IsMatch(log, s_apacheRx));

            df.CreateOrReplaceTempView("Logs");

            // Apply the UDF to get valid log entries
            DataFrame generalDf = spark.Sql(
                "SELECT logs.value, GeneralReg(logs.value) FROM Logs");

            // Only keep log entries that matched the regex
            generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]);
            generalDf.Show();

            // View the resulting schema
            // Notice we created a new column "GeneralReg(value)"
            generalDf.PrintSchema();

            // Step 2: Choose valid log entries that start with 10
            spark.Udf().Register<string, bool>(
                "IPReg",
                log => Regex.IsMatch(log, "^(?=10)"));

            generalDf.CreateOrReplaceTempView("IPLogs");

            // Apply UDF to get valid log entries starting with 10
            // Use SQL "WHERE" rather than doing ipDf.Filter(),
            // which avoids creating an extra column "IPReg(value)"
            DataFrame ipDf = spark.Sql(
                "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)");

            ipDf.Show();

            // Step 3: Choose valid log entries that start
            // with 10 and deal with spam
            spark.Udf().Register<string, bool>(
                "SpamRegEx",
                log => Regex.IsMatch(log, "\\b(?=spam)\\b"));

            ipDf.CreateOrReplaceTempView("SpamLogs");

            // Apply UDF to get valid, start with 10, spam entries
            DataFrame spamDF = spark.Sql(
                "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)");

            // Let's explore the columns in the data we have filtered
            // Use LINQ to count the number of GET requests
            int numGetRequests = spamDF
                                 .Collect()
                                 .Where(r => ContainsGet(r.GetAs<string>("value")))
                                 .Count();

            Console.WriteLine("Number of GET requests: " + numGetRequests);

            spark.Stop();
        }
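The s_apacheRx pattern and the ContainsGet helper referenced above are not shown. Minimal stand-ins (assumptions; the real sample likely uses a stricter pattern) could be:

        // Rough Apache common-log pattern used to validate entries.
        private static readonly string s_apacheRx =
            "^(\\S+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(\\S+) (\\S+) (\\S+)\" (\\d{3}) (\\S+)";

        // Naive check for GET requests in a log line.
        private static bool ContainsGet(string logLine) => logLine.Contains("GET");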
Example #19
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName(".NET Spark SQL basic example")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            // Need to explicitly specify the schema since pickling vs. arrow formatting
            // will return different types. Pickling will turn longs into ints if the values fit.
            // Same as the "age INT, name STRING" DDL-format string.
            var inputSchema = new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            });
            DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]);

            StructType schema = df.Schema();
            Console.WriteLine(schema.SimpleString);

            IEnumerable<Row> rows = df.Collect();

            foreach (Row row in rows)
            {
                Console.WriteLine(row);
            }

            df.Show();

            df.PrintSchema();

            df.Select("name", "age", "age", "name").Show();

            df.Select(df["name"], df["age"] + 1).Show();

            df.Filter(df["age"] > 21).Show();

            df.GroupBy("age")
            .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
            .Show();

            df.CreateOrReplaceTempView("people");

            DataFrame sqlDf = spark.Sql("SELECT * FROM people");

            sqlDf.Show();

            // Register a UDF for use in SQL expressions.
            spark.Udf().Register<int?, string, string>(
                "my_udf",
                (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));

            sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
            sqlDf.Show();

            // Using UDF via data frames.
            Func<Column, Column, Column> addition = Udf<int?, string, string>(
                (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));

            df.Select(addition(df["age"], df["name"])).Show();

            // Chaining example:
            Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");

            df.Select(addition2(addition(df["age"], df["name"]))).Show();

            // Multiple UDF example:
            df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();

            // UDF return type as array.
            Func<Column, Column> udfArray =
                Udf<string, string[]>((str) => new string[] { str, str + str });

            df.Select(Explode(udfArray(df["name"]))).Show();

            // UDF return type as map.
            Func<Column, Column> udfMap =
                Udf<string, IDictionary<string, string[]>>(
                    (str) => new Dictionary<string, string[]>
                    {
                        { str, new[] { str, str } }
                    });

            df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50);

            // Joins.
            DataFrame joinedDf = df.Join(df, "name");

            joinedDf.Show();

            DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });

            joinedDf2.Show();

            DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");

            joinedDf3.Show();

            spark.Stop();
        }
Example #20
        public static void leerCSV()
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            // A CSV dataset is pointed to by path.
            // The path can be either a single CSV file or a directory of CSV files
            string path = "data/sample_data.csv";

            DataFrame df = spark.Read().Csv(path);

            df.Show();
            // +------------------+
            // |               _c0|
            // +------------------+
            // |      name;age;job|
            // |Jorge;30;Developer|
            // |  Bob;32;Developer|
            // +------------------+



            // Read a CSV with a custom delimiter; the default delimiter is ","
            DataFrame df2 = spark.Read().Option("delimiter", ";").Csv(path);

            df2.Show();
            // +-----+---+---------+
            // |  _c0|_c1|      _c2|
            // +-----+---+---------+
            // | name|age|      job|
            // |Jorge| 30|Developer|
            // |  Bob| 32|Developer|
            // +-----+---+---------+

            // Read a CSV with a custom delimiter and a header
            DataFrame df3 = spark.Read().Option("delimiter", ";").Option("header", "true").Csv(path);

            df3.Show();
            // +-----+---+---------+
            // | name|age|      job|
            // +-----+---+---------+
            // |Jorge| 30|Developer|
            // |  Bob| 32|Developer|
            // +-----+---+---------+

            // You can also use Options() to set multiple options at once
            Dictionary<string, string> optionsMap = new Dictionary<string, string>();

            optionsMap.Add("delimiter", ";");
            optionsMap.Add("header", "true");
            var df4 = spark.Read().Options(optionsMap).Csv(path);

            // "output" is a folder which contains multiple csv files and a _SUCCESS file.
            df3.Write().Csv("output");

            // Read all files in a folder; make sure the folder contains only CSV files.
            string    folderPath = "data/sample_data.csv";
            DataFrame df5        = spark.Read().Csv(folderPath);

            df5.Show();
            // Wrong schema because non-CSV files are read
            // +-----------+
            // |        _c0|
            // +-----------+
            // |238val_238|
            // |  86val_86|
            // |311val_311|
            // |  27val_27|
            // |165val_165|
            // +-----------+
            // Stop Spark session
            spark.Stop();
        }
Example #21
        public static void Main(string[] args)
        {
            // Create Spark session.
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Hyperspace example")
                                 .Config("spark.some.config.option", "some-value")
                                 .GetOrCreate();

            // Sample department records.
            var departments = new List<GenericRow>()
            {
                new GenericRow(new object[] { 10, "Accounting", "New York" }),
                new GenericRow(new object[] { 20, "Research", "Dallas" }),
                new GenericRow(new object[] { 30, "Sales", "Chicago" }),
                new GenericRow(new object[] { 40, "Operations", "Boston" })
            };

            // Sample employee records.
            var employees = new List<GenericRow>()
            {
                new GenericRow(new object[] { 7369, "SMITH", 20 }),
                new GenericRow(new object[] { 7499, "ALLEN", 30 }),
                new GenericRow(new object[] { 7521, "WARD", 30 }),
                new GenericRow(new object[] { 7566, "JONES", 20 }),
                new GenericRow(new object[] { 7698, "BLAKE", 30 }),
                new GenericRow(new object[] { 7782, "CLARK", 10 }),
                new GenericRow(new object[] { 7788, "SCOTT", 20 }),
                new GenericRow(new object[] { 7839, "KING", 10 }),
                new GenericRow(new object[] { 7844, "TURNER", 30 }),
                new GenericRow(new object[] { 7876, "ADAMS", 20 }),
                new GenericRow(new object[] { 7900, "JAMES", 30 }),
                new GenericRow(new object[] { 7934, "MILLER", 10 }),
                new GenericRow(new object[] { 7902, "FORD", 20 }),
                new GenericRow(new object[] { 7654, "MARTIN", 30 })
            };

            // Save example data records as Parquet.
            string deptLocation = "departments";

            spark.CreateDataFrame(departments, new StructType(new List<StructField>()
            {
                new StructField("deptId", new IntegerType()),
                new StructField("deptName", new StringType()),
                new StructField("location", new StringType())
            }))
            .Write()
            .Mode("overwrite")
            .Parquet(deptLocation);

            string empLocation = "employees";

            spark.CreateDataFrame(employees, new StructType(new List<StructField>()
            {
                new StructField("empId", new IntegerType()),
                new StructField("empName", new StringType()),
                new StructField("deptId", new IntegerType())
            }))
            .Write()
            .Mode("overwrite")
            .Parquet(empLocation);

            // Create Hyperspace indexes.
            var hyperspace = new Hyperspace(spark);

            DataFrame deptDF = spark.Read().Parquet(deptLocation);
            DataFrame empDF  = spark.Read().Parquet(empLocation);

            var deptIndexConfig = new IndexConfig(
                "deptIndex",
                new[] { "deptId" },
                new[] { "deptName" });
            var empIndexConfig = new IndexConfig("empIndex",
                                                 new[] { "deptId" },
                                                 new[] { "empName" });

            hyperspace.CreateIndex(deptDF, deptIndexConfig);
            hyperspace.CreateIndex(empDF, empIndexConfig);

            // List all indexes.
            hyperspace.Indexes().Show();

            // Enable Hyperspace to leverage indexes.
            spark.EnableHyperspace();

            // Example of index usage for filtered selection.
            DataFrame eqFilter = deptDF.Filter("deptId = 20").Select("deptName");

            eqFilter.Show();
            hyperspace.Explain(eqFilter, false);

            // Example of index usage for join.
            DataFrame eqJoin = empDF
                               .Join(deptDF, "deptId")
                               .Select(empDF.Col("empName"), deptDF.Col("deptName"));

            eqJoin.Show();
            hyperspace.Explain(eqJoin, false);

            // Stop Spark session.
            spark.Stop();
        }
Example #22
        static void Main(string[] args)
        {
            /*
             * Copy mysql-connector-java-8.0.19.jar into the Spark / Hadoop folder.
             * Run the command below from this project's root folder:
             *   %SPARK_HOME%\bin\spark-submit
             *   --master local
             *   --class org.apache.spark.deploy.dotnet.DotnetRunner
             *   bin\Debug\netcoreapp3.1\microsoft-spark-2-4_2.11-1.0.0.jar
             *   dotnet
             *   bin\Debug\netcoreapp3.1\BatchDemo.dll
             *   data\amostra.csv
             *   jdbc:mysql://localhost:3306/teste_spark beneficios spark_user my-secret-password
             */

            if (args.Length == 0)
            {
                throw new ArgumentException("Provide the paths where the CSV files are located");
            }

            string arquivoEntrada = args[0];

            // Get a reference to the Spark execution context
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Exemplo Batch")
                                 .GetOrCreate();

            // Define a fixed schema with the desired column names and types
            StructType schema = new StructType(new[]
            {
                new StructField("MES_REFERENCIA", new StringType()),
                new StructField("MES_COMPETENCIA", new StringType()),
                new StructField("UF", new StringType()),
                new StructField("CODIGO_MUNICIPIO", new IntegerType()),
                new StructField("MUNICIPIO", new StringType()),
                new StructField("CODIGO_FAVORECIDO", new StringType()),
                new StructField("NOME", new StringType()),
                new StructField("DATA_SAQUE", new DateType()),
                new StructField("VALOR_TEXTO", new StringType())
            });

            // Read the data from disk into Spark
            DataFrame df = spark.Read()
                           .Format("csv")
                           .Schema(schema)
                           .Option("sep", ";")
                           .Option("header", true)
                           .Option("dateFormat", "dd/MM/yyyy")
                           .Load(arquivoEntrada);

            df.PrintSchema();
            df.Show(5, 10);

            // Drop columns we no longer need
            df = df.Drop("MES_REFERENCIA")
                 .Drop("MES_COMPETENCIA")
                 .Drop("CODIGO_MUNICIPIO")
                 .Drop("CODIGO_FAVORECIDO");
            df.Show(5, 10);

            // Convert the VALOR column from string to decimal, allowing for the Brazilian number format (which differs from the US one)
            df = df.WithColumn("VALOR", RegexpReplace(
                                   RegexpReplace(
                                       df.Col("VALOR_TEXTO")
                                       , "\\.", "")
                                   , ",", ".")
                               .Cast("decimal(10,2)"))
                 .Drop("VALOR_TEXTO");
            df.PrintSchema();
            df.Show(5, 10);

            // Filter the data
            df = df.Where(df.Col("UF").NotEqual("AC"));
            //df = df.Where("UF <> 'AC'");  // passing a WHERE expression also works as a filter
            df.Show(5, 10);

            spark.Udf().Register<string, string, string>("ConcatenarMunicipio",
                                                         (uf, municipio) => ConcatenarMunicipio(uf, municipio));

            // Create a new column from a concatenation and drop the old columns that are no longer needed
            df = df.WithColumn("MUNICIPIO",
                               CallUDF("ConcatenarMunicipio", df.Col("UF"), df.Col("MUNICIPIO")))
                 .Drop("UF");
            // Perform an aggregation
            DataFrame somatorio = df.GroupBy("MUNICIPIO")
                                  .Sum("VALOR")
                                  .WithColumnRenamed("sum(VALOR)", "SOMA_BENEFICIOS");

            somatorio
            .OrderBy(somatorio.Col("SOMA_BENEFICIOS").Desc())
            .Show(15, 40);

            if (args.Length >= 5)
            {
                string urlJdbc = args[1];   // jdbc:mysql://localhost:3306/teste_spark
                string tabela  = args[2];   // beneficios
                string usuario = args[3];   // spark_user
                string senha   = args[4];   // my-secret-password

                // Save to the database using Spark's native JDBC support
                somatorio
                .Write()
                .Format("jdbc")
                .Option("driver", "com.mysql.cj.jdbc.Driver")
                .Option("url", "jdbc:mysql://localhost:3306/teste_spark")
                .Option("dbtable", "beneficios")
                .Option("user", "spark_user")
                .Option("password", "my-secret-password")
                .Mode(SaveMode.Overwrite)
                .Save();
            }
            spark.Stop();
        }
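The ConcatenarMunicipio method registered as a UDF above is not shown. A hypothetical stand-in matching how it is called (two strings in, one string out):

        // Hypothetical helper: prefix the municipality with its state code.
        private static string ConcatenarMunicipio(string uf, string municipio) =>
            uf + " - " + municipio;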
Example #23
        public string DeltaTest([FromServices] IAWSSettings awsSettings)
        {
            string result = String.Empty;

            try
            {
                SparkSession spark = SparkSession
                                     .Builder()
                                     .AppName("DeltaTest")
                                     .GetOrCreate();

                string tempDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);

                string dt   = DateTime.Now.ToString("MMddhhmmss");
                string path = Path.Combine(tempDirectory, $"delta-table{dt}");

                // Write data to a Delta table.
                DataFrame data = spark.Range(0, 5);

                result += "Write data to a Delta table >> spark.Range(0, 5)" + "              ";
                foreach (var row in data.ToDF().Collect())
                {
                    result += row.Values[0];
                    result += " | ";
                }
                result += "              ";
                data.Write().Format("delta").Save(path);

                // Create a second iteration of the table.
                data    = spark.Range(5, 10);
                result += "Create a second iteration of the table >> spark.Range(0, 5)" + "              ";
                foreach (var row in data.ToDF().Collect())
                {
                    result += row.Values[0];
                    result += " | ";
                }
                result += "              ";
                data.Write().Format("delta").Mode("overwrite").Save(path);

                // Load the data into a DeltaTable object.
                DeltaTable deltaTable = DeltaTable.ForPath(path);
                result += "Load the data into a DeltaTable object >> DeltaTable.ForPath" + "              ";
                foreach (var row in deltaTable.ToDF().Collect())
                {
                    result += row.Values[0];
                    result += " | ";
                }
                result += "              ";
                // Update every even value by adding 100 to it.
                deltaTable.Update(
                    condition: Functions.Expr("id % 2 == 0"),
                    set: new Dictionary<string, Column>()
                {
                    { "id", Functions.Expr("id + 100") }
                });

                result += "Update every even value by adding 100 to it." + "              ";
                foreach (var row in deltaTable.ToDF().Collect())
                {
                    result += row.Values[0];
                    result += " | ";
                }
                result += "              ";

                // Delete every even value.
                deltaTable.Delete(condition: Functions.Expr("id % 2 == 0"));
                result += "Delete every even value  id % 2 == 0" + "              ";
                foreach (var row in deltaTable.ToDF().Collect())
                {
                    result += row.Values[0];
                    result += " | ";
                }
                result += "              ";

                // Upsert (merge) new data.
                DataFrame newData = spark.Range(0, 20).As("newData").ToDF();
                result += "Upsert (merge) new data" + Environment.NewLine;
                foreach (var row in newData.ToDF().Collect())
                {
                    result += row.Values[0];
                    result += " | ";
                }
                result += "              ";

                deltaTable.As("oldData")
                .Merge(newData, "oldData.id = newData.id")
                .WhenMatched()
                .Update(
                    new Dictionary<string, Column>()
                {
                    { "id", Functions.Col("newData.id") }
                })
                .WhenNotMatched()
                .InsertExpr(new Dictionary<string, string>()
                {
                    { "id", "newData.id" }
                })
                .Execute();


                spark.Stop();
            }
            catch (Exception ex)
            {
                result = ex.Message;
            }
            return result;
        }
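Note that DeltaTable comes from the Microsoft.Spark.Extensions.Delta package, and the delta-core jar must be on the Spark classpath (for example via spark-submit --packages). On newer Delta releases the session also needs the Delta SQL extension configs; a sketch, with exact versions depending on your setup:

        SparkSession spark = SparkSession
                             .Builder()
                             .AppName("DeltaTest")
                             .Config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
                             .Config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaSparkSessionCatalog")
                             .GetOrCreate();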