Example #1
        private void RunBasicDatasourceExample(SparkSession spark, string parquet, string json, string csv, string orc)
        {
            DataFrame df = spark.Read().Load(parquet);

            df.PrintSchema();

            df.Select("name", "favorite_color")
            .Write()
            .Mode(SaveMode.Overwrite)
            .Save("namesPartByColor.parquet");

            df.Write()
            .Mode(SaveMode.Overwrite)
            .PartitionBy("favorite_color")
            .BucketBy(42, "name")
            .SaveAsTable("people_partitioned_bucketed");

            df = spark.Read().Format("json").Load(json);

            df.PrintSchema();

            df.Select("name", "age")
            .Write()
            .Mode(SaveMode.Overwrite)
            .Format("parquet")
            .Save("namesAndAges.parquet");

            df = spark.Read()
                 .Format("csv")
                 .Option("sep", ";")
                 .Option("inferSchema", true)
                 .Option("header", true)
                 .Load(csv);

            df = spark.Read().Orc(orc);

            df.Write()
            .Format("orc")
            .Options(new Dictionary<string, string>
            {
                { "orc.bloom.filter.columns", "favorite_color" },
                { "orc.dictionary.key.threshold", "1.0" },
                { "orc.column.encoding.direct", "name" }
            })
            .Mode(SaveMode.Overwrite)
            .Save("users_with_options.orc");

            df.Write()
            .BucketBy(42, "name")
            .SortBy("favorite_color")
            .SaveAsTable("people_bucketed");

            spark.Sql($"SELECT * FROM parquet.`{parquet}`").Show();

            spark.Sql("SELECT * FROM people_bucketed").Show();
            spark.Sql("SELECT * FROM people_partitioned_bucketed").Show();

            spark.Sql("DROP TABLE IF EXISTS people_bucketed");
            spark.Sql("DROP TABLE IF EXISTS people_partitioned_bucketed");
        }
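
A sketch of how this method might be invoked; the file paths are illustrative (matching sample files ship with the Spark distribution under examples/src/main/resources):

            // Hypothetical call site; the paths below are assumptions, not part of the example.
            RunBasicDatasourceExample(
                spark,
                "examples/src/main/resources/users.parquet",
                "examples/src/main/resources/people.json",
                "examples/src/main/resources/people.csv",
                "examples/src/main/resources/users.orc");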
Example #2
        public void TestEmailSearchTopNReducerBasics()
        {
            // Read the sample data.
            DataFrame df = _spark
                           .Read()
                           .Schema("Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, puser STRING, ptenant STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}neighbors.json");

            // Trim the IMAddress column.
            Func<Column, Column> trimIMAddress = Udf<string, string>((str) => str.StartsWith("sip:") ? str.Substring(4) : str);

            df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"]));

            // Reduce
            df = df.GroupBy("puser", "ptenant").Agg(CollectList("GivenName").Alias("GivenNames"),
                                                    CollectList("Surname").Alias("Surnames"),
                                                    CollectList("DisplayName").Alias("DisplayNames"),
                                                    CollectList("EmailAddress").Alias("EmailAddresses"),
                                                    CollectList("RelevanceScore").Alias("RelevanceScores"),
                                                    CollectList("IMAddress").Alias("IMAddresses"));
            // Format the output.
            df = df.Select(df["puser"],
                           df["ptenant"],
                           ConcatWs(";", df["GivenNames"]).Alias("GivenNames"),
                           ConcatWs(";", df["Surnames"]).Alias("Surnames"),
                           ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"),
                           ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"),
                           ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"),
                           ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses"));

            Assert.Equal(2, df.Count());
            foreach (Row row in df.Collect())
            {
                string puser = row.GetAs<string>("puser");
                Assert.Equal("MSFT", row.GetAs<string>("ptenant"));
                Assert.Equal("1101.0;900.0;857.0", row.GetAs<string>("RelevanceScores"));
                switch (puser)
                {
                case "ruih":
                    Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs <string>("GivenNames"));
                    Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs <string>("Surnames"));
                    Assert.Equal("AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN", row.GetAs <string>("DisplayNames"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("EmailAddresses"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("IMAddresses"));
                    break;

                case "rui":
                    Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs <string>("GivenNames"));
                    Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs <string>("Surnames"));
                    Assert.Equal("DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN", row.GetAs <string>("DisplayNames"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("EmailAddresses"));
                    Assert.Equal("[email protected];[email protected];[email protected]", row.GetAs <string>("IMAddresses"));
                    break;

                default:
                    throw new Exception($"Unexpected age: {puser}.");
                }
            }
        }
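
Udf, CollectList, and ConcatWs above are static methods on Microsoft.Spark.Sql.Functions; a minimal sketch of the usings this test assumes:

        // Assumed imports for the example above.
        using Microsoft.Spark.Sql;                  // DataFrame, Column, Row
        using static Microsoft.Spark.Sql.Functions; // Udf, CollectList, ConcatWs
        using Xunit;                                // Assert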
Example #3
        public void TestSignaturesV2_3_X()
        {
            DataFrameReader dfr = _spark.Read();

            Assert.IsType<DataFrameReader>(dfr.Format("json"));

            Assert.IsType<DataFrameReader>(
                dfr.Schema(
                    new StructType(new[]
                    {
                        new StructField("age", new IntegerType()),
                        new StructField("name", new StringType())
                    })));
            Assert.IsType<DataFrameReader>(dfr.Schema("age INT, name STRING"));

            Assert.IsType<DataFrameReader>(dfr.Option("stringOption", "value"));
            Assert.IsType<DataFrameReader>(dfr.Option("boolOption", true));
            Assert.IsType<DataFrameReader>(dfr.Option("longOption", 1L));
            Assert.IsType<DataFrameReader>(dfr.Option("doubleOption", 3D));

            Assert.IsType<DataFrameReader>(
                dfr.Options(
                    new Dictionary<string, string>
                    {
                        { "option1", "value1" },
                        { "option2", "value2" }
                    }));

            string jsonFile = $"{TestEnvironment.ResourceDirectory}people.json";

            Assert.IsType<DataFrame>(dfr.Load());
            Assert.IsType<DataFrame>(dfr.Load(jsonFile));
            Assert.IsType<DataFrame>(dfr.Load(jsonFile, jsonFile));

            Assert.IsType<DataFrame>(dfr.Json(jsonFile));
            Assert.IsType<DataFrame>(dfr.Json(jsonFile, jsonFile));

            string csvFile = $"{TestEnvironment.ResourceDirectory}people.csv";

            Assert.IsType<DataFrame>(dfr.Csv(csvFile));
            Assert.IsType<DataFrame>(dfr.Csv(csvFile, csvFile));

            string parquetFile = $"{TestEnvironment.ResourceDirectory}users.parquet";

            Assert.IsType<DataFrame>(dfr.Parquet(parquetFile));
            Assert.IsType<DataFrame>(dfr.Parquet(parquetFile, parquetFile));

            string orcFile = $"{TestEnvironment.ResourceDirectory}users.orc";

            Assert.IsType<DataFrame>(dfr.Orc(orcFile));
            Assert.IsType<DataFrame>(dfr.Orc(orcFile, orcFile));

            dfr = _spark.Read();
            string textFile = $"{TestEnvironment.ResourceDirectory}people.txt";

            Assert.IsType<DataFrame>(dfr.Text(textFile));
            Assert.IsType<DataFrame>(dfr.Text(textFile, textFile));
        }
Example #4
        public void TestGroupedMapUdf()
        {
            DataFrame df = _spark
                           .Read()
                           .Schema("age INT, name STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}more_people.json");

            // Data:
            // { "name":"Michael"}
            // { "name":"Andy", "age":30}
            // { "name":"Seth", "age":30}
            // { "name":"Justin", "age":19}
            // { "name":"Kathy", "age":19}

            Row[] rows = df.GroupBy("age")
                .Apply(
                    new StructType(new[]
                    {
                        new StructField("age", new IntegerType()),
                        new StructField("nameCharCount", new IntegerType())
                    }),
                    batch => ArrowBasedCountCharacters(batch))
                .Collect()
                .ToArray();

            Assert.Equal(3, rows.Length);
            foreach (Row row in rows)
            {
                int? age = row.GetAs<int?>("age");
                int charCount = row.GetAs<int>("nameCharCount");
                switch (age)
                {
                case null:
                    Assert.Equal(7, charCount);
                    break;

                case 19:
                    Assert.Equal(11, charCount);
                    break;

                case 30:
                    Assert.Equal(8, charCount);
                    break;

                default:
                    throw new Exception($"Unexpected age: {age}.");
                }
            }
        }
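
The ArrowBasedCountCharacters helper is not shown in this example. A sketch of what it plausibly looks like, assuming the Apache.Arrow and Apache.Arrow.Types namespaces: for each group's batch it sums the lengths of the names and emits a single (age, nameCharCount) record:

        private static RecordBatch ArrowBasedCountCharacters(RecordBatch records)
        {
            // Sum the character counts of every name in this group's batch.
            var nameColumn = (StringArray)records.Column("name");
            int characterCount = 0;
            for (int i = 0; i < nameColumn.Length; ++i)
            {
                characterCount += nameColumn.GetString(i).Length;
            }

            int ageFieldIndex = records.Schema.GetFieldIndex("age");
            Field ageField = records.Schema.GetFieldByIndex(ageFieldIndex);

            // Return one record if the batch had any rows, zero otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(ageField)
                    .Field(f => f.Name("nameCharCount").DataType(Int32Type.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column(ageFieldIndex),
                    new Int32Array.Builder().Append(characterCount).Build()
                },
                returnLength);
        }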
Example #5
        private void RunParquetExample(SparkSession spark, string json)
        {
            DataFrame peopleDf = spark.Read().Json(json);

            peopleDf.Write().Mode(SaveMode.Overwrite).Parquet("people.parquet");

            DataFrame parquetFile = spark.Read().Parquet("people.parquet");

            parquetFile.CreateTempView("parquet");

            DataFrame teenagers = spark.Sql(
                "SELECT name FROM parquet WHERE age >= 13 and age <= 19");

            teenagers.Show();
        }
Example #6
        private static DataFrame LoadMetadataFile(string metadataPath, SparkSession spark)
        {
            Console.WriteLine("Loading Electronics_Metadata.json File");

            var metadataSchema = new StructType(new[]
            {
                new StructField("asin", new StringType(), isNullable: false),
                new StructField("title", new StringType()),
                new StructField("brand", new StringType()),
                new StructField("main_cat", new StringType()),
                new StructField("price", new StringType()),
                new StructField("category", new ArrayType(new StringType())),
                new StructField("description", new StringType()),
                new StructField("image", new ArrayType(new StringType())),
                new StructField("date", new StringType()),
                new StructField("rank", new StringType()),
            });

            var dfMetadata = spark
                             .Read()
                             .Schema(metadataSchema)
                             .Json(metadataPath);

            Console.WriteLine("Done");
            Console.WriteLine();

            return dfMetadata;
        }
Example #7
        private static DataFrame LoadReviewPathFile(string reviewPath, SparkSession spark)
        {
            Console.WriteLine("Loading Electronics_Reviews.json File");

            var ratingSchema = new StructType(new[]
            {
                new StructField("reviewerID", new StringType(), isNullable: false),
                new StructField("asin", new StringType(), isNullable: false),
                new StructField("reviewText", new StringType()),
                new StructField("unixReviewTime", new LongType())
            });

            var dfRatings = spark
                            .Read()
                            .Schema(ratingSchema)
                            .Json(reviewPath);

            var itemIds = spark.Sql(
                "SELECT asin AS id " +
                "FROM ElectronicsMetadata");

            var availableItemReviews = dfRatings
                                       .Join(itemIds, dfRatings["asin"] == itemIds["id"])
                                       .Drop("id");

            Console.WriteLine("Done");
            Console.WriteLine();

            return availableItemReviews;
        }
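
The SQL above selects from an ElectronicsMetadata view that this method never creates; presumably the metadata DataFrame from the previous example is registered as a temp view beforehand, along these lines:

            // Assumed setup before LoadReviewPathFile is called (not shown in the example).
            dfMetadata.CreateOrReplaceTempView("ElectronicsMetadata");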
Example #8
 private static DataFrame ReadCsvIntoDataframe(SparkSession sparkSession, string filename, StructType schema)
 {
     return sparkSession.Read()
            .Format("csv")
            .Option("header", "true")
            .Schema(schema)
            .Load(filename);
 }
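
A hypothetical usage of this helper; the schema and file name are illustrative:

     // Illustrative only: a two-column schema and a made-up file name.
     var schema = new StructType(new[]
     {
         new StructField("name", new StringType()),
         new StructField("age", new IntegerType())
     });
     DataFrame people = ReadCsvIntoDataframe(spark, "people.csv", schema);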
Example #9
        public static void leerJSON()
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            // A CSV dataset is pointed to by path.
            // The path can be either a single CSV file or a directory of CSV files
            string path = "data/sample_data.csv";

            DataFrame df = spark.Read().Csv(path);

            df.Show();
            // +------------------+
            // |               _c0|
            // +------------------+
            // |      name;age;job|
            // |Jorge;30;Developer|
            // |  Bob;32;Developer|
            // +------------------+

            // Register the data as a temp view so the SQL query below can find it
            // (without this, the SELECT would fail with "table not found").
            df.CreateOrReplaceTempView("sample_data");
            DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

            // Show results
            sqlDf.Show();

            // Stop Spark session
            spark.Stop();
        }
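
As the commented output shows, everything lands in a single _c0 column because the reader was never told about the semicolon delimiter. A sketch of the options that would split the file into proper columns (same DataFrameReader API as in Example #1):

            DataFrame parsed = spark.Read()
                .Option("sep", ";")
                .Option("header", true)
                .Option("inferSchema", true)
                .Csv(path);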
Example #10
        static void Main(string[] args)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .GetOrCreate();

            // Create initial DataFrame
            string    filePath  = args[0];
            DataFrame dataFrame = spark.Read().Text(filePath);

            //Count words
            DataFrame words =
                dataFrame
                .Select(Split(Col("value"), " ").Alias("words"))
                .Select(Explode(Col("words")).Alias("word"))
                .GroupBy("word")
                .Count()
                .OrderBy(Col("count").Desc());

            // Display results
            words.Show();

            // Stop Spark session
            spark.Stop();
        }
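
The unqualified Split, Col, and Explode calls above assume a static import of the functions class (compare Example #17, which spells out the Functions. prefix instead):

            // Assumed import for the unqualified function calls above.
            using static Microsoft.Spark.Sql.Functions;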
Example #11
 public DataFrameFunctionsTests(SparkFixture fixture)
 {
     _spark = fixture.Spark;
     _df    = _spark
              .Read()
              .Json($"{TestEnvironment.ResourceDirectory}people.json");
 }
Example #12
        static void Main(string[] args)
        {
            Console.WriteLine("Start SparkSession");
            SparkSession sparkSession = SparkSession.Builder().AppName("Street Counter").GetOrCreate();
            DataFrame    dfCsv        =
                sparkSession
                .Read()
                .Option("delimiter", ";")
                .Schema("WOJ string ,POW string ,GMI string ,RODZ_GMI string , " +
                        "SYM string , SYM_UL string , " +
                        "CECHA string , NAZWA_1 string ,NAZWA_2 string , " +
                        "STAN_NA string")
                .Csv("streets.csv");
            DataFrame dataIn = dfCsv
                               .WithColumn("STREET", Functions.ConcatWs(" ", dfCsv["CECHA"], dfCsv["NAZWA_1"], dfCsv["NAZWA_2"]));
            DataFrame dataGroup = dataIn
                                  .Select("STREET")
                                  .GroupBy("STREET")
                                  .Count()
                                  .WithColumnRenamed("count", "COUNT");
            DataFrame dataOut = dataGroup
                                .OrderBy(dataGroup["COUNT"].Desc());

            dataOut
            .Coalesce(1)
            .Write()
            .Option("delimiter", ";")
            .Csv("result");
            sparkSession.Stop();
            Console.WriteLine("Stop SparkSession");
        }
Example #13
        static void Main(string[] args)
        {
            //1. Create a Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();
            //2. Create initial DataFrame
            DataFrame dataFrame = spark.Read()
                                  //.Schema("Assertid STRING,properties STRING,Value BOOLEAN,TimeSatmp TIMESTAMP")
                                  .Schema("Assertid STRING,properties STRING,Value STRING,TimeSatmp TIMESTAMP")
                                  .Csv("DataBook.csv");

            dataFrame.Show();

            // Drop any rows with null/empty values
            DataFrameNaFunctions na = dataFrame.Na();
            DataFrame cleanedProjects = na.Drop("any");

            // Remove unnecessary columns
            cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeSatmp");
            cleanedProjects.Show();
            // Stop Spark session--checked
            spark.Stop();
        }
Example #14
        public static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("word_count_sample")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // // Create initial DataFrame

            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
            Console.WriteLine("SCRAPY");
        }
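
The sudokusolution method wrapped by the UDF is not shown in this example. A hypothetical stub illustrating the shape the registration expects (a puzzle string in, a solved string out):

        // Hypothetical placeholder; the real solver is not part of this example.
        private static string sudokusolution(string sudoku)
        {
            // ... solve the 81-character puzzle string here ...
            return sudoku;
        }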
Example #15
        static void Main(string[] args)
        {
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("emrApp")
                                 .GetOrCreate();

            DataFrame dataFrame = spark
                                  .Read()
                                  .Format("avro")
                                  .Load(args[0]);

            RegionModel regionModel = new RegionModel();

            Func<Column, Column> udfConvertRegion = Udf<string, string>(
                city =>
                {
                    // e.g. "city_23" --> "23" --> "2" --> { 2 : Brisbane } --> "Brisbane"
                    var regionCode = city.Split('_')[1].Substring(0, 1);
                    regionModel.ConversionTable.TryGetValue(regionCode, out string convertedRegion);
                    return convertedRegion;
                });

            dataFrame = dataFrame
                        .WithColumn("Region", udfConvertRegion(dataFrame["address.city"]))
                        .Drop("orderunits", "address");

            dataFrame
            .Coalesce(1)
            .Write()
            .Format("csv")
            .Save($"{args[1]}/{DateTime.UtcNow.ToString("yyyy/MM/dd/hh-mm-ss")}");
        }
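
Note that the "avro" format is not bundled with Spark itself; it typically has to be supplied at submit time as the external spark-avro package, e.g. --packages org.apache.spark:spark-avro_2.12:<version> on the spark-submit command line (the exact coordinates depend on the Spark/Scala version in use).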
Example #16
        private static void RunDataFrameSample(bool createNewSession)
        {
            SparkSession ss = GetSparkSession();

            if (createNewSession)
            {
                ss = sparkSession.NewSession();
            }

            var peopleDataFrame = ss.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson));
            var count           = peopleDataFrame.Count();

            Console.WriteLine("Count of items in DataFrame {0}", count);

            var sortedDataFrame = peopleDataFrame.Sort(new string[] { "name", "age" }, new bool[] { true, false });

            sortedDataFrame.Show();

            if (SparkCLRSamples.Configuration.IsValidationEnabled)
            {
                var sortedDF = sortedDataFrame.Collect().ToArray();
                Assert.AreEqual("789", sortedDF[0].GetAs <string>("id"));
                Assert.AreEqual("123", sortedDF[1].GetAs <string>("id"));
                Assert.AreEqual("531", sortedDF[2].GetAs <string>("id"));
                Assert.AreEqual("456", sortedDF[3].GetAs <string>("id"));
            }
        }
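
Unlike most examples on this page, which target Microsoft.Spark, this one appears to use the older Mobius (SparkCLR) API, as the SparkCLRSamples configuration helper and the NUnit-style Assert.AreEqual calls suggest.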
Example #17
        static void Main(string[] args)
        {
            // Set the debug backend port; it has to be the same as the one in the Dockerfile.
            System.Environment.SetEnvironmentVariable("DOTNETBACKEND_PORT", "12345");

            // Create a Spark session.
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();

            // Create initial DataFrame.
            DataFrame dataFrame = spark.Read().Text("input.txt");

            // Count words.
            DataFrame words = dataFrame
                              .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
                              .Select(Functions.Explode(Functions.Col("words"))
                                      .Alias("word"))
                              .GroupBy("word")
                              .Count()
                              .OrderBy(Functions.Col("count").Desc());

            // Show results.
            words.Show();

            // Stop Spark session.
            spark.Stop();
        }
Example #18
        public static void leerTxt()
        {
            Console.WriteLine("Hello World!");
            // Create a Spark session
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("word_count_sample")
                                 .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark.Read().Text("input.txt");

            // Count words
            DataFrame words = dataFrame
                              .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
                              .Select(Functions.Explode(Functions.Col("words"))
                                      .Alias("word"))
                              .GroupBy("word")
                              .Count()
                              .OrderBy(Functions.Col("count").Desc());

            // Show results
            words.Show();

            // Stop Spark session
            spark.Stop();
        }
Example #19
        static void BasicDfExample(SparkSession spark)
        {
            var dataFrame = spark.Read().Json("/Users/ed/spark-2.4.6-bin-without-hadoop/examples/src/main/resources/people.json");

            dataFrame.Show();

            dataFrame.PrintSchema();

            dataFrame.Select("name").Show();

            dataFrame.Select(dataFrame["name"], dataFrame["age"] + 1).Show();
            dataFrame.Select(dataFrame["name"], dataFrame["age"].Plus(1)).Show();

            dataFrame.Filter(dataFrame["age"] > 21).Show();
            dataFrame.Filter(dataFrame["age"].Gt(21)).Show();

            dataFrame.GroupBy(dataFrame["age"]).Count().Show();

            dataFrame.CreateOrReplaceTempView("people");
            var sqlDataFrame = spark.Sql("SELECT * FROM people");

            dataFrame.CreateGlobalTempView("people");
            spark.Sql("SELECT * FROM global_temp.people").Show();
            spark.NewSession().Sql("SELECT * FROM global_temp.people").Show();
        }
Example #20
        /// <summary>
        /// To integrate with Hive operations
        /// </summary>
        private static void HiveDataFrame()
        {
            var builder = SparkSession.Builder().EnableHiveSupport();

            builder = builder.Config("spark.master", "yarn");
            builder = builder.Config("spark.app.name", "HiveDataFrame");
            builder = builder.Config("spark.sql.warehouse.dir", "/user/hive/warehouse");
            session = builder.GetOrCreate();
            var peopleDataFrame = session.Read().Json(jsonFilePath);

            logger.LogInfo("****Create table if not exists****");
            session.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists
            logger.LogInfo("****Database Created****");
            session.Sql(string.Format("USE {0}", dbName));

            logger.LogInfo("****Create Table operation started****");
            peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table
            logger.LogInfo("****Table Created successfully****");
            var tablesDataFrame = session.Table(tableName);

            logger.LogInfo(string.Format("****Table count in database {0}: {1}", dbName, tablesDataFrame.Count()) + "****");
            var rowCollections = tablesDataFrame.Collect();

            logger.LogInfo("**********************************************");
            foreach (var row in rowCollections)
            {
                Console.WriteLine("{0}", row);
            }
            logger.LogInfo("*********************************************");
            logger.LogInfo("Executed Successfully.................");
        }
Example #21
        static void runSpark(string file_path, string cores, string nodes, int nrows)
        {
            // Create Spark session
            SparkSession spark =
                SparkSession
                .Builder()
                .AppName("Resolution de " + nrows + " sudokus par évolution combinatoire de " + cores + " noyau(x) et " + nodes + " noeud(s)")
                .Config("spark.executor.cores", cores)
                .Config("spark.executor.instances", nodes)
                .GetOrCreate();

            // Create initial DataFrame
            DataFrame dataFrame = spark
                                  .Read()
                                  .Option("header", true)
                                  .Option("inferSchema", true)
                                  .Schema("quizzes string, solutions string")
                                  .Csv(file_path);

            DataFrame dataFrame2 = dataFrame.Limit(nrows);

            spark.Udf().Register<string, string>(
                "SukoduUDF",
                (sudoku) => sudokusolution(sudoku));

            dataFrame2.CreateOrReplaceTempView("Resolved");
            DataFrame sqlDf = spark.Sql("SELECT quizzes, SukoduUDF(quizzes) as Resolution from Resolved");

            sqlDf.Show();

            spark.Stop();
        }
Example #22
 public UdfComplexTypesTests(SparkFixture fixture)
 {
     _spark = fixture.Spark;
     _df    = _spark
              .Read()
              .Json(Path.Combine($"{TestEnvironment.ResourceDirectory}people.json"));
 }
Example #23
        public void TestSignaturesV2_3_X()
        {
            Assert.IsType<SparkContext>(_spark.SparkContext);

            Assert.IsType<Builder>(SparkSession.Builder());

            SparkSession.ClearDefaultSession();
            SparkSession.SetDefaultSession(_spark);
            Assert.IsType<SparkSession>(SparkSession.GetDefaultSession());

            Assert.IsType<RuntimeConfig>(_spark.Conf());

            Assert.IsType<SparkSession>(_spark.NewSession());

            Assert.IsType<DataFrameReader>(_spark.Read());

            Assert.IsType<DataFrame>(_spark.Range(10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10));
            Assert.IsType<DataFrame>(_spark.Range(10, 100, 10, 5));

            _spark.Range(10).CreateOrReplaceTempView("testView");
            Assert.IsType<DataFrame>(_spark.Table("testView"));

            Assert.IsType<DataStreamReader>(_spark.ReadStream());

            Assert.IsType<UdfRegistration>(_spark.Udf());

            Assert.IsType<Catalog>(_spark.Catalog());
        }
Example #24
        public void TestRead()
        {
            var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
            var sparkSession          = new SparkSession(mockSparkSessionProxy.Object);
            var reader = sparkSession.Read();

            mockSparkSessionProxy.Verify(m => m.Read(), Times.Once);
        }
Example #25
 public DataFrameTests(SparkFixture fixture)
 {
     _spark = fixture.Spark;
     _df    = _spark
              .Read()
              .Schema("age INT, name STRING")
              .Json($"{TestEnvironment.ResourceDirectory}people.json");
 }
Example #26
 public UdfSimpleTypesTests(SparkFixture fixture)
 {
     _spark = fixture.Spark;
     _df    = _spark
              .Read()
              .Schema("name STRING, age INT, date DATE")
              .Json(Path.Combine($"{TestEnvironment.ResourceDirectory}people.json"));
 }
Example #27
 public DataFrameTests()
 {
     fixture = new SparkFixture();
     _spark  = fixture.Spark;
     _df     = _spark
               .Read()
               .Schema("age INT, name STRING")
               .Json($"Resources/people.json");
 }
Example #28
        private void RunDatasourceExample(SparkSession spark)
        {
            DataFrame jdbcDf = spark.Read()
                               .Format("jdbc")
                               .Options(
                new Dictionary<string, string>
            {
                { "url", "jdbc:postgresql:postgres" },
                { "dbtable", "table_name" },
                { "user", "user_name" },
                { "password", "password" }
            })
                               .Load();

            jdbcDf.Show();

            DataFrame jdbcDf2 = spark.Read()
                                .Format("jdbc")
                                .Options(
                new Dictionary<string, string>
            {
                { "url", "jdbc:postgresql:postgres" },
                { "dbtable", "table_name" },
                { "user", "user_name" },
                { "password", "password" },
                { "customSchema", "another_id int, another_name STRING" }
            })
                                .Load();

            jdbcDf2.Show();

            jdbcDf.Write()
            .Format("jdbc")
            .Options(
                new Dictionary<string, string>
            {
                { "url", "jdbc:postgresql:postgres" },
                { "dbtable", "table_name" },
                { "user", "user_name" },
                { "password", "password" }
            })
            .Mode(SaveMode.Append)
            .Save();
        }
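
For the jdbc source to work, the matching driver has to be on Spark's classpath; for PostgreSQL that typically means passing the driver jar at submit time, e.g. --jars postgresql-<version>.jar or --packages org.postgresql:postgresql:<version> on the spark-submit command line.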
Example #29
        static void Main(string[] args)
        {
            // Verify environment variables
            if (args.Length != 4)
            {
                Console.Error.WriteLine("Usage: $TENANT_ID $ADLS_NAME $ADLS_SP_CLIENT_ID $ADLS_SP_CLIENT_SECRET");
                Environment.Exit(1);
            }

            // Specify file path in Azure Data Lake Gen1
            string filePath =
                $"adl://{args[1]}.azuredatalakestore.net/parquet/people.parquet";

            // Create SparkSession
            SparkSession spark = SparkSession
                                 .Builder()
                                 .AppName("Azure Data Lake Storage example using .NET for Apache Spark")
                                 .Config("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
                                 .Config("fs.adl.oauth2.access.token.provider.type", "ClientCredential")
                                 .Config("fs.adl.oauth2.client.id", args[2])
                                 .Config("fs.adl.oauth2.credential", args[3])
                                 .Config("fs.adl.oauth2.refresh.url", $"https://login.microsoftonline.com/{args[0]}/oauth2/token")
                                 .GetOrCreate();

            // Create sample data
            var data = new List<GenericRow>
            {
                new GenericRow(new object[] { 1, "John Doe" }),
                new GenericRow(new object[] { 2, "Jane Doe" }),
                new GenericRow(new object[] { 3, "Foo Bar" })
            };

            // Create schema for sample data
            var schema = new StructType(new List<StructField>()
            {
                new StructField("Id", new IntegerType()),
                new StructField("Name", new StringType()),
            });

            // Create DataFrame using data and schema
            DataFrame df = spark.CreateDataFrame(data, schema);

            // Print DataFrame
            df.Show();

            // Write DataFrame to Azure Data Lake Gen1
            df.Write().Mode(SaveMode.Overwrite).Parquet(filePath);

            // Read saved DataFrame from Azure Data Lake Gen1
            DataFrame readDf = spark.Read().Parquet(filePath);

            // Print DataFrame
            readDf.Show();
        }
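
The fs.adl.* settings above rely on the Hadoop ADLS Gen1 connector (the hadoop-azure-datalake module and the Azure Data Lake Store SDK) being available on the cluster's classpath; without it, the adl:// path will fail to resolve.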
Example #30
        public void TestSignaturesV3_0_X()
        {
            DataFrame df = _spark
                           .Read()
                           .Schema("age INT, name STRING")
                           .Json($"{TestEnvironment.ResourceDirectory}people.json");

            DataFrameWriterV2 dfwV2 = df.WriteTo("testtable");

            Assert.IsType<DataFrameWriterV2>(dfwV2.Using("json"));

            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key1", "value"));
            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key2", true));
            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key3", 1L));
            Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key4", 2D));

            Assert.IsType<DataFrameWriterV2>(dfwV2.Options(
                new Dictionary<string, string>()
                {
                    { "key", "value" }
                }));

            Assert.IsType<DataFrameWriterV2>(dfwV2.TableProperty("prop", "value"));

            _spark.Sql("DROP TABLE IF EXISTS default.testtable");
            dfwV2.Create();

            Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
            // with v2 tables.
            Assert.Throws<Exception>(() => dfwV2.Replace());

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
            // with v2 tables.
            Assert.Throws<Exception>(() => dfwV2.CreateOrReplace());

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
            // append in batch mode.
            Assert.Throws<Exception>(() => dfwV2.Append());

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
            // overwrite by filter in batch mode.
            Assert.Throws<Exception>(() => dfwV2.Overwrite(df.Col("age")));

            // Throws the following exception:
            // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
            // dynamic overwrite in batch mode.
            Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
        }