Example #1
        private void RunBasicDatasourceExample(SparkSession spark, string parquet, string json, string csv, string orc)
        {
            DataFrame df = spark.Read().Load(parquet);

            df.PrintSchema();

            df.Select("name", "favorite_color")
            .Write()
            .Mode(SaveMode.Overwrite)
            .Save("namesPartByColor.parquet");

            df.Write()
            .Mode(SaveMode.Overwrite)
            .PartitionBy("favorite_color")
            .BucketBy(42, "name")
            .SaveAsTable("people_partitioned_bucketed");

            df = spark.Read().Format("json").Load(json);

            df.PrintSchema();

            df.Select("name", "age")
            .Write()
            .Mode(SaveMode.Overwrite)
            .Format("parquet")
            .Save("namesAndAges.parquet");

            df = spark.Read()
                 .Format("csv")
                 .Option("sep", ";")
                 .Option("inferSchema", true)
                 .Option("header", true)
                 .Load(csv);

            df = spark.Read().Orc(orc);

            df.Write()
            .Format("orc")
            .Options(new Dictionary <string, string>
            {
                { "orc.bloom.filter.columns", "favorite_color" },
                { "orc.dictionary.key.threshold", "1.0" },
                { "orc.column.encoding.direct", "name" }
            })
            .Mode(SaveMode.Overwrite)
            .Save("users_with_options.orc");

            df.Write()
            .BucketBy(42, "name")
            .SortBy("favorite_color")
            .SaveAsTable("people_bucketed");

            spark.Sql($"SELECT * FROM parquet.`{parquet}`").Show();

            spark.Sql("SELECT * FROM people_bucketed").Show();
            spark.Sql("SELECT * FROM people_partitioned_bucketed").Show();

            spark.Sql("DROP TABLE IF EXISTS people_bucketed");
            spark.Sql("DROP TABLE IF EXISTS people_partitioned_bucketed");
        }
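A minimal sketch of how this example might be invoked (the app name, the hypothetical Program class, and the file paths are illustrative, pointing at the sample data shipped with a Spark distribution):

        static void Main(string[] args)
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("datasource-example")
                                 .GetOrCreate();

            // RunBasicDatasourceExample is an instance method, so call it on an instance.
            new Program().RunBasicDatasourceExample(
                spark,
                "examples/src/main/resources/users.parquet",
                "examples/src/main/resources/people.json",
                "examples/src/main/resources/people.csv",
                "examples/src/main/resources/users.orc");

            spark.Stop();
        }

Example #2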
        static void BasicDfExample(SparkSession spark)
        {
            var dataFrame = spark.Read().Json("/Users/ed/spark-2.4.6-bin-without-hadoop/examples/src/main/resources/people.json");

            dataFrame.Show();

            dataFrame.PrintSchema();

            dataFrame.Select("name").Show();

            dataFrame.Select(dataFrame["name"], dataFrame["age"] + 1).Show();
            dataFrame.Select(dataFrame["name"], dataFrame["age"].Plus(1)).Show();

            dataFrame.Filter(dataFrame["age"] > 21).Show();
            dataFrame.Filter(dataFrame["age"].Gt(21)).Show();

            dataFrame.GroupBy(dataFrame["age"]).Count().Show();

            dataFrame.CreateOrReplaceTempView("people");
            var sqlDataFrame = spark.Sql("SELECT * FROM people");

            dataFrame.CreateGlobalTempView("people");
            spark.Sql("SELECT * FROM global_temp.people").Show();
            spark.NewSession().Sql("SELECT * FROM global_temp.people").Show();
        }
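For reference, the people.json sample that ships with Spark distributions (read above) contains three records:

            {"name":"Michael"}
            {"name":"Andy", "age":30}
            {"name":"Justin", "age":19}

which is why age is a nullable long in the printed schema and only Andy survives the age > 21 filter.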
Example #3
        public void TestBucketizer()
        {
            var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

            string expectedHandle    = "skip";
            string expectedUid       = "uid";
            string expectedInputCol  = "input_col";
            string expectedOutputCol = "output_col";

            var bucketizer = new Bucketizer(expectedUid);

            bucketizer.SetInputCol(expectedInputCol)
            .SetOutputCol(expectedOutputCol)
            .SetHandleInvalid(expectedHandle)
            .SetSplits(expectedSplits);

            Assert.Equal(expectedHandle, bucketizer.GetHandleInvalid());

            Assert.Equal(expectedUid, bucketizer.Uid());

            DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)");

            DataFrame output = bucketizer.Transform(input);

            Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol));

            Assert.Equal(expectedInputCol, bucketizer.GetInputCol());
            Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol());
            Assert.Equal(expectedSplits, bucketizer.GetSplits());
        }
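With the splits above, Bucketizer assigns each value the index of its half-open interval: [double.MinValue, 0) maps to 0.0, [0, 10) to 1.0, [10, 50) to 2.0, and [50, double.MaxValue] to 3.0 (only the last bucket includes its upper bound). One way to inspect the mapping on the transformed frame:

            output.Select("input_col", "output_col").Show();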
Example #4
        /// <summary>
        /// To integrate with Hive operations
        /// </summary>
        private static void HiveDataFrame()
        {
            var builder = SparkSession.Builder().EnableHiveSupport();

            builder = builder.Config("spark.master", "yarn");
            builder = builder.Config("spark.app.name", "HiveDataFrame");
            builder = builder.Config("spark.sql.warehouse.dir", "/user/hive/warehouse");
            session = builder.GetOrCreate();
            var peopleDataFrame = session.Read().Json(jsonFilePath);

            logger.LogInfo("****Create table if not exists****");
            session.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
            logger.LogInfo("****Database Created****");
            session.Sql(string.Format("USE {0}", dbName));

            logger.LogInfo("****Create Table operation started****");
            peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
            logger.LogInfo("****Table Created successfully****");
            var tablesDataFrame = session.Table(tableName);

            logger.LogInfo(string.Format("****Table count in database {0}: {1}", dbName, tablesDataFrame.Count()) + "****");
            var rowCollections = tablesDataFrame.Collect();

            logger.LogInfo("**********************************************");
            foreach (var row in rowCollections)
            {
                Console.WriteLine("{0}", row);
            }
            logger.LogInfo("*********************************************");
            logger.LogInfo("Executed Successfully.................");
        }
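The session, logger, jsonFilePath, dbName and tableName referenced above are fields of the enclosing class; a hedged sketch of plausible declarations (the names match the code, but the values and the logger type are illustrative):

        private static SparkSession session;
        private static ILoggerService logger;          // hypothetical logging abstraction
        private static string jsonFilePath = "people.json";
        private static string dbName = "testdb";
        private static string tableName = "people";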
Example #5
        public void TestBucketizer()
        {
            var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

            string expectedHandle    = "skip";
            string expectedUid       = "uid";
            string expectedInputCol  = "input_col";
            string expectedOutputCol = "output_col";

            var bucketizer = new Bucketizer(expectedUid);

            bucketizer.SetInputCol(expectedInputCol)
            .SetOutputCol(expectedOutputCol)
            .SetHandleInvalid(expectedHandle)
            .SetSplits(expectedSplits);

            Assert.Equal(expectedHandle, bucketizer.GetHandleInvalid());

            Assert.Equal(expectedUid, bucketizer.Uid());

            DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)");

            DataFrame output = bucketizer.Transform(input);

            Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol));

            Assert.Equal(expectedInputCol, bucketizer.GetInputCol());
            Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol());
            Assert.Equal(expectedSplits, bucketizer.GetSplits());

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "bucket");
                bucketizer.Save(savePath);

                Bucketizer loadedBucketizer = Bucketizer.Load(savePath);
                Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
            }

            Assert.NotEmpty(bucketizer.ExplainParams());

            Param handleInvalidParam = bucketizer.GetParam("handleInvalid");

            Assert.NotEmpty(handleInvalidParam.Doc);
            Assert.NotEmpty(handleInvalidParam.Name);
            Assert.Equal(handleInvalidParam.Parent, bucketizer.Uid());

            Assert.NotEmpty(bucketizer.ExplainParam(handleInvalidParam));
            bucketizer.Set(handleInvalidParam, "keep");
            Assert.Equal("keep", bucketizer.GetHandleInvalid());

            Assert.Equal("error", bucketizer.Clear(handleInvalidParam).GetHandleInvalid());
        }
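TemporaryDirectory is a test utility used throughout these examples; a minimal sketch of what it plausibly does (the real helper in the test suite may differ):

        internal sealed class TemporaryDirectory : IDisposable
        {
            public string Path { get; } =
                System.IO.Path.Combine(System.IO.Path.GetTempPath(), Guid.NewGuid().ToString());

            public TemporaryDirectory() => Directory.CreateDirectory(Path);

            // Remove the directory and everything the test saved into it.
            public void Dispose() => Directory.Delete(Path, recursive: true);
        }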
Example #6
        private static void AnalyseBrandDemand(SparkSession spark, CustomerSentimentContext context)
        {
            Console.WriteLine("Analysing popular brand demand");

            var brands = context.BrandSentiment
                         .OrderByDescending(b => b.ReviewCount)
                         .ThenBy(b => b.SentimentRank)
                         .Take(10)
                         .Select(b => $"'{b.Brand}'")
                         .ToArray();

            var brandList = string.Join(',', brands);

            var brandsDemand = spark.Sql(
                "SELECT EM.brand, FROM_UNIXTIME(ER.unix_time, 'MM') as month, COUNT(1) as demand " +
                "FROM ElectronicsReviews ER " +
                "JOIN ElectronicsMetadata EM ON EM.asin = ER.asin " +
                $"WHERE EM.brand IN ({brandList}) " +
                "GROUP BY EM.brand, FROM_UNIXTIME(ER.unix_time, 'MM') " +
                "ORDER BY EM.brand, FROM_UNIXTIME(ER.unix_time, 'MM')");

            brandsDemand.Cache();
            brandsDemand.CreateOrReplaceTempView("BrandsDemand");

            var items = Mapper.MapRows(
                brandsDemand.Collect(),
                r => new BrandDemand
            {
                Brand  = r.GetAs<string>(0),
                Month  = int.Parse(r.GetAs<string>(1)),
                // COUNT(1) comes back as a Spark bigint, so read it as long
                Demand = (int)r.GetAs<long>(2)
            },
                o => $"{o.Brand}-{o.Month}");

            context.BrandDemand.RemoveRange(context.BrandDemand);
            context.BrandDemand.AddRange(items);
            context.SaveChanges();

            foreach (var brand in brands)
            {
                Console.WriteLine($"Analysing consumer demand for {brand}");

                var brandDemand = spark.Sql(
                    "SELECT * " +
                    "FROM BrandsDemand " +
                    $"WHERE brand = {brand}");

                brandDemand.Show();
            }
        }
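Mapper.MapRows and the BrandDemand entity are project-specific helpers; a hedged sketch of the entity the mapping implies (property types inferred from the GetAs calls above):

        public class BrandDemand
        {
            public string Brand  { get; set; }
            public int    Month  { get; set; }
            public int    Demand { get; set; }
        }

Example #7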
        static void CreateUsingRangeInSql(SparkSession spark)
        {
            Console.WriteLine("Range in SQL");
            var dataFrame = spark.Sql("select id from range(1000)");

            dataFrame.Show(5);

            /*
             *  +---+
             | id|
             +---+
             |  0|
             |  1|
             |  2|
             |  3|
             |  4|
             +---+
             */

            dataFrame = spark.Sql("select id, 'Literal' as `Another Column` from range(1000)");
            dataFrame.Show(5);

            /*
             *  +---+--------------+
             | id|Another Column|
             +---+--------------+
             |  0|       Literal|
             |  1|       Literal|
             |  2|       Literal|
             |  3|       Literal|
             |  4|       Literal|
             +---+--------------+
             */

            dataFrame = spark.Sql("select id, 'Literal' as `Another Column`, pmod(id, 2) as `Mod`  from range(1000)");
            dataFrame.Show(5);

            /*
             *  +---+--------------+---+
             | id|Another Column|Mod|
             +---+--------------+---+
             |  0|       Literal|  0|
             |  1|       Literal|  1|
             |  2|       Literal|  0|
             |  3|       Literal|  1|
             |  4|       Literal|  0|
             +---+--------------+---+
             */
        }
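Example #8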
        public void TestTryAddThread()
        {
            using var threadPool = new JvmThreadPoolGC(
                      _loggerService, _jvmBridge, TimeSpan.FromMinutes(30));

            var thread = new Thread(() => _spark.Sql("SELECT TRUE"));

            thread.Start();

            Assert.True(threadPool.TryAddThread(thread));
            // Subsequent call should return false, because the thread has already been added.
            Assert.False(threadPool.TryAddThread(thread));

            thread.Join();
        }
Example #9
        static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("Resolution de " + nrows + " sudokus par évolution combinatoire de " + cores + " noyau(x) et " + nodes + " noeud(s)")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register <string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
        }
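The sudokusolution helper registered as the UDF is not shown in this listing; a hypothetical stand-in with the same string-to-string shape:

        static string sudokusolution(string sudoku)
        {
            // Hypothetical placeholder: a real implementation would parse the
            // 81-character quiz string and return the solved grid.
            return sudoku;
        }

Example #10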
        public static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame

            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register <string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
            Console.WriteLine("SCRAPY");
        }
Example #11
        public static void leerJSON()
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            // A CSV dataset is pointed to by path.
            // The path can be either a single CSV file or a directory of CSV files
            string path = "data/sample_data.csv";

            DataFrame df = spark.Read().Csv(path);

            df.Show();
            // +------------------+
            // |               _c0|
            // +------------------+
            // |      name;age;job|
            // |Jorge;30;Developer|
            // |  Bob;32;Developer|
            // +------------------+

            // Register the DataFrame as a temp view so it can be queried with SQL
            df.CreateOrReplaceTempView("sample_data");
            DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

            // Show results
            sqlDf.Show();

            // Stop Spark session
            spark.Stop();
        }
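Without any options the semicolon-separated file comes back as a single _c0 column, as the commented output above shows. A hedged variant that parses the real columns (same Option calls as in Example #1):

            DataFrame parsed = spark
                               .Read()
                               .Option("sep", ";")
                               .Option("header", true)
                               .Option("inferSchema", true)
                               .Csv(path);
            parsed.Show();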
Example #12
        private static DataFrame LoadReviewPathFile(string reviewPath, SparkSession spark)
        {
            Console.WriteLine("Loading Electronics_Reviews.json File");

            var ratingSchema = new StructType(new[]
            {
                new StructField("reviewerID", new StringType(), isNullable: false),
                new StructField("asin", new StringType(), isNullable: false),
                new StructField("reviewText", new StringType()),
                new StructField("unixReviewTime", new LongType())
            });

            var dfRatings = spark
                            .Read()
                            .Schema(ratingSchema)
                            .Json(reviewPath);

            var itemIds = spark.Sql(
                "SELECT asin AS id " +
                "FROM ElectronicsMetadata");

            var availableItemReviews = dfRatings
                                       .Join(itemIds, dfRatings["asin"] == itemIds["id"])
                                       .Drop("id");

            Console.WriteLine("Done");
            Console.WriteLine();

            return availableItemReviews;
        }
Example #13
        public void TestTokenizer()
        {
            string expectedUid       = "theUid";
            string expectedInputCol  = "input_col";
            string expectedOutputCol = "output_col";

            DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" +
                                         " from range(100)");

            Tokenizer tokenizer = new Tokenizer(expectedUid)
                                  .SetInputCol(expectedInputCol)
                                  .SetOutputCol(expectedOutputCol);

            DataFrame output = tokenizer.Transform(input);

            Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol));
            Assert.Equal(expectedInputCol, tokenizer.GetInputCol());
            Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol());

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "Tokenizer");
                tokenizer.Save(savePath);

                Tokenizer loadedTokenizer = Tokenizer.Load(savePath);
                Assert.Equal(tokenizer.Uid(), loadedTokenizer.Uid());
            }

            Assert.Equal(expectedUid, tokenizer.Uid());
        }
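Example #14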
        public void TestWord2VecModel()
        {
            DataFrame documentDataFrame =
                _spark.Sql("SELECT split('Hi I heard about Spark', ' ') as text");

            Word2Vec word2vec = new Word2Vec()
                                .SetInputCol("text")
                                .SetOutputCol("result")
                                .SetMinCount(1);

            Word2VecModel model = word2vec.Fit(documentDataFrame);

            const int expectedSynonyms = 2;
            DataFrame synonyms         = model.FindSynonyms("Hi", expectedSynonyms);

            Assert.Equal(expectedSynonyms, synonyms.Count());
            synonyms.Show();

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "word2vecModel");
                model.Save(savePath);

                Word2VecModel loadedModel = Word2VecModel.Load(savePath);
                Assert.Equal(model.Uid(), loadedModel.Uid());
            }
        }
Example #15
        private static void AnalyseCategoryDemand(SparkSession spark, CustomerSentimentContext context)
        {
            Console.WriteLine("Analysing category consumer demand");

            var categoriesDemand = spark.Sql(
                "SELECT EM.main_cat, FROM_UNIXTIME(ER.unix_time, 'MM') as month, COUNT(1) as demand " +
                "FROM ElectronicsReviews ER " +
                "JOIN ElectronicsMetadata EM ON EM.asin = ER.asin " +
                "GROUP BY EM.main_cat, from_unixtime(ER.unix_time, 'MM') " +
                "ORDER BY EM.main_cat, FROM_UNIXTIME(ER.unix_time, 'MM')");

            categoriesDemand.Cache();
            categoriesDemand.CreateOrReplaceTempView("CategoryDemand");

            var items = Mapper.MapRows(
                categoriesDemand.Collect(),
                r => new CategoryDemand
            {
                Category = r.GetAs<string>(0),
                Month    = int.Parse(r.GetAs<string>(1)),
                // COUNT(1) comes back as a Spark bigint, so read it as long
                Demand   = (int)r.GetAs<long>(2)
            },
                o => $"{o.Category}-{o.Month}");

            context.CategoryDemand.RemoveRange(context.CategoryDemand);
            context.CategoryDemand.AddRange(items);
            context.SaveChanges();

            var categories = spark.Sql("SELECT main_cat FROM CategoryDemand GROUP BY main_cat")
                             .Collect()
                             .Select(r => r.GetAs <string>(0))
                             .ToArray();

            foreach (var category in categories)
            {
                Console.WriteLine($"Analysing consumer demand for {category}");

                var categoryDemand = spark.Sql(
                    "SELECT * " +
                    "FROM CategoryDemand " +
                    $"WHERE main_cat = '{category}'");

                categoryDemand.Show();
            }
        }
Example #16
        private static void ElectronicsReviewsSentimentAnalysis(SparkSession spark)
        {
            spark.Udf().Register <string, int>("sentiment_udf", text => Sentiment(text));

            var reviewsSentiment = spark.Sql("SELECT *, sentiment_udf(review_text) AS sentiment FROM ElectronicsReviews");

            reviewsSentiment.Cache();
            reviewsSentiment.CreateOrReplaceTempView("ElectronicsReviewSentiment");
        }
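The Sentiment helper wrapped by the UDF is not shown in this listing; a minimal hypothetical stand-in with the same string-to-int shape:

        private static int Sentiment(string text)
        {
            // Hypothetical placeholder: a real implementation would score the
            // review text with a sentiment model or word list (1 = positive).
            return !string.IsNullOrEmpty(text) && text.Contains("great") ? 1 : 0;
        }

Example #17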
        public void TestSignaturesV3_0_X()
        {
            DataFrame df = _spark
                           .Read()
                           .Schema("age INT, name STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}people.json");

            DataFrameWriterV2 dfwV2 = df.WriteTo("testtable");

            Assert.IsType <DataFrameWriterV2>(dfwV2.Using("json"));

            Assert.IsType <DataFrameWriterV2>(dfwV2.Option("key1", "value"));
            Assert.IsType <DataFrameWriterV2>(dfwV2.Option("key2", true));
            Assert.IsType <DataFrameWriterV2>(dfwV2.Option("key3", 1L));
            Assert.IsType <DataFrameWriterV2>(dfwV2.Option("key4", 2D));

            Assert.IsType <DataFrameWriterV2>(dfwV2.Options(
                                                  new Dictionary <string, string>()
            {
                { "key", "value" }
            }));

            Assert.IsType <DataFrameWriterV2>(dfwV2.TableProperty("prop", "value"));

            _spark.Sql("DROP TABLE IF EXISTS default.testtable");
            dfwV2.Create();

            Assert.IsType <DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
            // with v2 tables.
            Assert.Throws <Exception>(() => dfwV2.Replace());

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
            // with v2 tables.
            Assert.Throws <Exception>(() => dfwV2.CreateOrReplace());

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
            // append in batch mode.
            Assert.Throws <Exception>(() => dfwV2.Append());

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
            // overwrite by filter in batch mode.
            Assert.Throws <Exception>(() => dfwV2.Overwrite(df.Col("age")));

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
            // dynamic overwrite in batch mode.
            Assert.Throws <Exception>(() => dfwV2.OverwritePartitions());
        }
Example #18
        // Method called from Main to start a Spark session with a given number of cores
        // and instances, and to solve the sudokus via the Sudokusolution() method.
        //private static void Sudokures(string cores, string nodes, string mem, int nrows){
        private static void Sudokures(int nrows)
        {
            // Initialize the Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .Config("spark.executor.memory", "4G")
                                 .GetOrCreate();
            //.AppName("Resolution of " + nrows + " sudokus using DlxLib with " + cores + " cores and " + nodes + " instances")
            //.Config("spark.driver.cores", cores)
            //.Config("spark.executor.instances", nodes)
            //.Config("spark.executor.memory", mem)
            //.GetOrCreate();

            // Load the csv into a DataFrame
            DataFrame df = spark
                           .Read()
                           .Option("header", true)
                           .Option("inferSchema", true)
                           .Csv(_filePath);

            // Limit the DataFrame to the number of rows passed to the function
            DataFrame df2 = df.Limit(nrows);

            // Stopwatch covering only the sudoku resolution
            var watch2 = new System.Diagnostics.Stopwatch();

            watch2.Start();

            // Create the Spark user-defined function
            spark.Udf().Register <string, string>(
                "SukoduUDF",
                (sudoku) => Sudokusolution(sudoku));

            // Call the UDF through SQL; the resulting DataFrame also holds the solutions
            df2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT Sudokus, SukoduUDF(Sudokus) as Resolution from Resolved");

            sqlDf.Show();

            watch2.Stop();

            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution : " + watch2.ElapsedMilliseconds + " ms");
            //Console.WriteLine($"Execution Time for " + nrows + " sudoku resolution with " + cores + " core and " + nodes + " instance: " + watch2.ElapsedMilliseconds + " ms");
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();
            Console.WriteLine();

            spark.Stop();
        }
Example #19
 /// <summary>
 /// Drops tables in <paramref name="tableNames"/> after calling <paramref name="action"/>.
 /// </summary>
 /// <param name="spark">The <see cref="SparkSession"/></param>
 /// <param name="tableNames">Names of the tables to drop</param>
 /// <param name="action"><see cref="Action"/> to execute.</param>
 public static void WithTable(SparkSession spark, IEnumerable <string> tableNames, Action action)
 {
     try
     {
         action();
     }
     finally
     {
         tableNames.ToList().ForEach(name => spark.Sql($"DROP TABLE IF EXISTS {name}"));
     }
 }
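A hedged usage sketch of WithTable (the table name and statements are illustrative); the finally block guarantees the drop even if the action throws:

     WithTable(
         spark,
         new[] { "demo_table" },
         () =>
         {
             spark.Sql("CREATE TABLE demo_table USING PARQUET AS SELECT id FROM range(10)");
             spark.Sql("SELECT COUNT(*) FROM demo_table").Show();
         });

Example #20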
        public void Run(string[] args)
        {
            if (args.Length != 1)
            {
                Console.Error.WriteLine(
                    "Usage: GitHubProjects <path to projects.csv>");
                Environment.Exit(1);
            }

            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("GitHub and Spark Batch")
                                 .GetOrCreate();

            DataFrame projectsDf = spark
                                   .Read()
                                   .Schema("id INT, url STRING, owner_id INT, " +
                                           "name STRING, descriptor STRING, language STRING, " +
                                           "created_at STRING, forked_from INT, deleted STRING, " +
                                           "updated_at STRING")
                                   .Csv(args[0]);

            projectsDf.Show();

            // Drop any rows with NA values
            DataFrameNaFunctions dropEmptyProjects = projectsDf.Na();
            DataFrame            cleanedProjects   = dropEmptyProjects.Drop("any");

            // Remove unnecessary columns
            cleanedProjects = cleanedProjects.Drop("id", "url", "owner_id");
            cleanedProjects.Show();

            // Average number of times each language has been forked
            DataFrame groupedDF = cleanedProjects
                                  .GroupBy("language")
                                  .Agg(Avg(cleanedProjects["forked_from"]));

            // Sort by most forked languages first
            groupedDF.OrderBy(Desc("avg(forked_from)")).Show();

            spark.Udf().Register <string, bool>(
                "MyUDF",
                (date) => DateTime.TryParse(date, out DateTime convertedDate) &&
                (convertedDate > s_referenceDate));

            cleanedProjects.CreateOrReplaceTempView("dateView");

            DataFrame dateDf = spark.Sql(
                "SELECT *, MyUDF(dateView.updated_at) AS datebefore FROM dateView");

            dateDf.Show();

            spark.Stop();
        }
Example #21
        internal void Run(string queryNumber)
        {
            Console.WriteLine($"Spark .NET TPCH SQL Query: #{queryNumber}");
            Type thisType    = GetType();
            var  queryString = (string)thisType.GetField(
                $"s_q{queryNumber}", BindingFlags.Static | BindingFlags.NonPublic).GetValue(null);

            var sw = Stopwatch.StartNew();

            _spark.Sql(queryString).Show(numRows: 20, truncate: 0);
            Console.WriteLine($"\tElapsed: {sw.Elapsed}");
        }
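The reflection lookup assumes the enclosing class declares one non-public static string field per TPCH query, following the s_q<N> naming convention the format string implies; a hypothetical example of such a field (the query body is illustrative):

        private static readonly string s_q1 = "SELECT COUNT(*) FROM lineitem";

Example #22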
        static void CreateUsingRangeAndDataFrameAPI(SparkSession spark)
        {
            Console.WriteLine("spark.Sql");
            var dataFrame = spark.Sql("select id from range(1000)");

            dataFrame.Show(5);

            /*
             *  +---+
             | id|
             +---+
             |  0|
             |  1|
             |  2|
             |  3|
             |  4|
             +---+
             *
             */

            Console.WriteLine("spark.Sql().WithColumn");
            dataFrame = dataFrame.WithColumn("Another Column", Functions.Lit("Literal"));
            dataFrame.Show(5);

            /*
             *  +---+--------------+
             | id|Another Column|
             +---+--------------+
             |  0|       Literal|
             |  1|       Literal|
             |  2|       Literal|
             |  3|       Literal|
             |  4|       Literal|
             +---+--------------+
             */

            Console.WriteLine("spark.Sql().WithColumn");
            dataFrame = dataFrame.WithColumn("Mod", Functions.Pmod(Functions.Col("id"), Functions.Lit(2)));
            dataFrame.Show(5);

            /*
             *  +---+--------------+---+
             | id|Another Column|Mod|
             +---+--------------+---+
             |  0|       Literal|  0|
             |  1|       Literal|  1|
             |  2|       Literal|  0|
             |  3|       Literal|  1|
             |  4|       Literal|  0|
             +---+--------------+---+
             */
        }
Example #23
 public void TestInnerJvmException()
 {
     try
     {
         _spark.Sql("THROW!!!");
     }
     catch (Exception ex)
     {
         Assert.NotNull(ex.InnerException);
         Assert.IsType <JvmException>(ex.InnerException);
         Assert.False(string.IsNullOrWhiteSpace(ex.InnerException.Message));
     }
 }
Example #24
        public void TestCountVectorizerModel()
        {
            DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " +
                                         "'TOKENIZE') as input from range(100)");

            const string inputColumn  = "input";
            const string outputColumn = "output";
            const double minTf        = 10.0;
            const bool   binary       = false;

            var vocabulary = new List <string>()
            {
                "hello",
                "I",
                "AM",
                "TO",
                "TOKENIZE"
            };

            var countVectorizerModel = new CountVectorizerModel(vocabulary);

            Assert.IsType <CountVectorizerModel>(new CountVectorizerModel("my-uid", vocabulary));

            countVectorizerModel = countVectorizerModel
                                   .SetInputCol(inputColumn)
                                   .SetOutputCol(outputColumn)
                                   .SetMinTF(minTf)
                                   .SetBinary(binary);

            Assert.Equal(inputColumn, countVectorizerModel.GetInputCol());
            Assert.Equal(outputColumn, countVectorizerModel.GetOutputCol());
            Assert.Equal(minTf, countVectorizerModel.GetMinTF());
            Assert.Equal(binary, countVectorizerModel.GetBinary());
            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "countVectorizerModel");
                countVectorizerModel.Save(savePath);

                CountVectorizerModel loadedModel = CountVectorizerModel.Load(savePath);
                Assert.Equal(countVectorizerModel.Uid(), loadedModel.Uid());
            }

            Assert.IsType <int>(countVectorizerModel.GetVocabSize());
            Assert.NotEmpty(countVectorizerModel.ExplainParams());
            Assert.NotEmpty(countVectorizerModel.ToString());

            Assert.IsType <StructType>(countVectorizerModel.TransformSchema(input.Schema()));
            Assert.IsType <DataFrame>(countVectorizerModel.Transform(input));

            TestFeatureBase(countVectorizerModel, "minDF", 100);
        }
Example #25
        public void TestPipelineModelTransform()
        {
            var expectedSplits =
                new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

            string expectedHandle    = "skip";
            string expectedUid       = "uid";
            string expectedInputCol  = "input_col";
            string expectedOutputCol = "output_col";

            var bucketizer = new Bucketizer(expectedUid);

            bucketizer.SetInputCol(expectedInputCol)
            .SetOutputCol(expectedOutputCol)
            .SetHandleInvalid(expectedHandle)
            .SetSplits(expectedSplits);

            var stages = new JavaTransformer[] {
                bucketizer
            };

            PipelineModel pipelineModel = new PipelineModel("randomUID", stages);

            DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)");

            DataFrame output = pipelineModel.Transform(input);

            Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol));

            Assert.Equal(expectedInputCol, bucketizer.GetInputCol());
            Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol());
            Assert.Equal(expectedSplits, bucketizer.GetSplits());

            Assert.IsType <StructType>(pipelineModel.TransformSchema(input.Schema()));
            Assert.IsType <DataFrame>(output);

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "pipelineModel");
                pipelineModel.Save(savePath);

                PipelineModel loadedPipelineModel = PipelineModel.Load(savePath);
                Assert.Equal(pipelineModel.Uid(), loadedPipelineModel.Uid());

                string writePath = Path.Join(tempDirectory.Path, "pipelineModelWithWrite");
                pipelineModel.Write().Save(writePath);

                PipelineModel loadedPipelineModelWithRead = pipelineModel.Read().Load(writePath);
                Assert.Equal(pipelineModel.Uid(), loadedPipelineModelWithRead.Uid());
            }
        }
Example #26
        private static void AnalyseCategorySentiment(SparkSession spark, CustomerSentimentContext context)
        {
            Console.WriteLine("Analyzing category sentiment");

            var itemCategorySentiment = spark.Sql(
                "SELECT EM.main_cat, SUM(ERS.sentiment) / COUNT(1) * 100 as sentiment_rank, COUNT(1) review_count " +
                "FROM ElectronicsMetadata EM " +
                "JOIN ElectronicsReviewSentiment ERS ON ERS.asin = EM.asin " +
                "GROUP BY EM.main_cat");

            itemCategorySentiment.Cache();
            itemCategorySentiment.CreateOrReplaceTempView("ItemCategorySentiment");

            Console.WriteLine("Analyzing categories with best consumer sentiment.");

            var categorySentiment = spark.Sql(
                "SELECT * " +
                "FROM ItemCategorySentiment " +
                "ORDER BY sentiment_rank DESC, review_count DESC");

            categorySentiment.Show();

            var items = Mapper.MapRows(
                categorySentiment.Collect(),
                r => new ItemCategorySentiment
            {
                Category      = r.GetAs<string>(0),
                SentimentRank = r.GetAs<double>(1),
                // COUNT(1) comes back as a Spark bigint, so read it as long
                ReviewCount   = (int)r.GetAs<long>(2)
            },
                o => o.Category);

            context.ItemCategorySentiment.RemoveRange(context.ItemCategorySentiment);
            context.ItemCategorySentiment.AddRange(items);

            context.SaveChanges();
        }
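Example #27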
        public void TestWord2Vec()
        {
            DataFrame documentDataFrame = _spark.Sql("SELECT split('Spark dotnet is cool', ' ')");

            const string expectedInputCol          = "text";
            const string expectedOutputCol         = "result";
            const int    expectedMinCount          = 0;
            const int    expectedMaxIter           = 10;
            const int    expectedMaxSentenceLength = 100;
            const int    expectedNumPartitions     = 1000;
            const int    expectedSeed       = 10000;
            const double expectedStepSize   = 1.9;
            const int    expectedVectorSize = 20;
            const int    expectedWindowSize = 200;

            Word2Vec word2vec = new Word2Vec()
                                .SetInputCol(expectedInputCol)
                                .SetOutputCol(expectedOutputCol)
                                .SetMinCount(expectedMinCount)
                                .SetMaxIter(expectedMaxIter)
                                .SetMaxSentenceLength(expectedMaxSentenceLength)
                                .SetNumPartitions(expectedNumPartitions)
                                .SetSeed(expectedSeed)
                                .SetStepSize(expectedStepSize)
                                .SetVectorSize(expectedVectorSize)
                                .SetWindowSize(expectedWindowSize);

            Assert.Equal(expectedInputCol, word2vec.GetInputCol());
            Assert.Equal(expectedOutputCol, word2vec.GetOutputCol());
            Assert.Equal(expectedMinCount, word2vec.GetMinCount());
            Assert.Equal(expectedMaxIter, word2vec.GetMaxIter());
            Assert.Equal(expectedMaxSentenceLength, word2vec.GetMaxSentenceLength());
            Assert.Equal(expectedNumPartitions, word2vec.GetNumPartitions());
            Assert.Equal(expectedSeed, word2vec.GetSeed());
            Assert.Equal(expectedStepSize, word2vec.GetStepSize());
            Assert.Equal(expectedVectorSize, word2vec.GetVectorSize());
            Assert.Equal(expectedWindowSize, word2vec.GetWindowSize());

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "word2vec");
                word2vec.Save(savePath);

                Word2Vec loadedWord2Vec = Word2Vec.Load(savePath);
                Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid());
            }

            TestFeatureBase(word2vec, "maxIter", 2);
        }
Example #28
        private void RunParquetExample(SparkSession spark, string json)
        {
            DataFrame peopleDf = spark.Read().Json(json);

            peopleDf.Write().Mode(SaveMode.Overwrite).Parquet("people.parquet");

            DataFrame parquetFile = spark.Read().Parquet("people.parquet");

            parquetFile.CreateTempView("parquet");

            DataFrame teenagers = spark.Sql(
                "SELECT name FROM parquet WHERE age >= 13 and age <= 19");

            teenagers.Show();
        }
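Note that CreateTempView throws if a view with that name already exists in the session; if the example can run more than once, the idempotent variant used elsewhere in these samples is:

            parquetFile.CreateOrReplaceTempView("parquet");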
Example #29
        public void TestIDFModel()
        {
            int    expectedDocFrequency = 1980;
            string expectedInputCol     = "rawFeatures";
            string expectedOutputCol    = "features";

            DataFrame sentenceData =
                _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

            Tokenizer tokenizer = new Tokenizer()
                                  .SetInputCol("sentence")
                                  .SetOutputCol("words");

            DataFrame wordsData = tokenizer.Transform(sentenceData);

            HashingTF hashingTF = new HashingTF()
                                  .SetInputCol("words")
                                  .SetOutputCol(expectedInputCol)
                                  .SetNumFeatures(20);

            DataFrame featurizedData = hashingTF.Transform(wordsData);

            IDF idf = new IDF()
                      .SetInputCol(expectedInputCol)
                      .SetOutputCol(expectedOutputCol)
                      .SetMinDocFreq(expectedDocFrequency);

            IDFModel idfModel = idf.Fit(featurizedData);

            DataFrame rescaledData = idfModel.Transform(featurizedData);

            Assert.Contains(expectedOutputCol, rescaledData.Columns());

            Assert.Equal(expectedInputCol, idfModel.GetInputCol());
            Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
            Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

            using (var tempDirectory = new TemporaryDirectory())
            {
                string modelPath = Path.Join(tempDirectory.Path, "idfModel");
                idfModel.Save(modelPath);

                IDFModel loadedModel = IDFModel.Load(modelPath);
                Assert.Equal(idfModel.Uid(), loadedModel.Uid());
            }

            TestFeatureBase(idfModel, "minDocFreq", 1000);
        }
Example #30
        /// <summary>
        /// Tests for the Catalog functions returned from SparkSession.Catalog().
        /// </summary>
        public void CatalogFunctions()
        {
            Catalog catalog = _spark.Catalog();

            Assert.IsType <DataFrame>(catalog.ListDatabases());
            Assert.IsType <DataFrame>(catalog.ListFunctions());
            Assert.IsType <DataFrame>(catalog.ListFunctions("default"));

            DataFrame table = catalog.CreateTable("users",
                                                  Path.Combine(TestEnvironment.ResourceDirectory, "users.parquet"));

            Assert.IsType <DataFrame>(table);

            Assert.IsType <string>(catalog.CurrentDatabase());
            Assert.IsType <bool>(catalog.DatabaseExists("default"));

            Assert.IsType <bool>(catalog.DropGlobalTempView("no-view"));
            Assert.IsType <bool>(catalog.DropTempView("no-view"));
            Assert.IsType <bool>(catalog.FunctionExists("default", "functionname"));
            Assert.IsType <bool>(catalog.FunctionExists("functionname"));
            Assert.IsType <Database>(catalog.GetDatabase("default"));
            Assert.IsType <Function>(catalog.GetFunction("abs"));
            Assert.IsType <Function>(catalog.GetFunction(null, "abs"));
            Assert.IsType <Table>(catalog.GetTable("users"));
            Assert.IsType <Table>(catalog.GetTable("default", "users"));
            Assert.IsType <bool>(catalog.IsCached("users"));
            Assert.IsType <DataFrame>(catalog.ListColumns("users"));
            Assert.IsType <DataFrame>(catalog.ListColumns("default", "users"));
            Assert.IsType <DataFrame>(catalog.ListDatabases());
            Assert.IsType <DataFrame>(catalog.ListFunctions());
            Assert.IsType <DataFrame>(catalog.ListFunctions("default"));
            Assert.IsType <DataFrame>(catalog.ListTables());
            Assert.IsType <DataFrame>(catalog.ListTables("default"));

            catalog.RefreshByPath("/");
            catalog.RefreshTable("users");
            catalog.SetCurrentDatabase("default");
            catalog.CacheTable("users");
            catalog.UncacheTable("users");
            catalog.ClearCache();

            Assert.IsType <bool>(catalog.TableExists("users"));
            Assert.IsType <bool>(catalog.TableExists("default", "users"));

            _spark.Sql(@"CREATE TABLE IF NOT EXISTS usersp USING PARQUET PARTITIONED BY (name)  
                            AS SELECT * FROM users");
            catalog.RecoverPartitions("usersp");
        }