private void RunBasicDatasourceExample(SparkSession spark, string parquet, string json, string csv, string orc)
{
    DataFrame df = spark.Read().Load(parquet);
    df.PrintSchema();

    df.Select("name", "favorite_color")
        .Write()
        .Mode(SaveMode.Overwrite)
        .Save("namesPartByColor.parquet");

    df.Write()
        .Mode(SaveMode.Overwrite)
        .PartitionBy("favorite_color")
        .BucketBy(42, "name")
        .SaveAsTable("people_partitioned_bucketed");

    df = spark.Read().Format("json").Load(json);
    df.PrintSchema();

    df.Select("name", "age")
        .Write()
        .Mode(SaveMode.Overwrite)
        .Format("parquet")
        .Save("namesAndAges.parquet");

    df = spark.Read()
        .Format("csv")
        .Option("sep", ";")
        .Option("inferSchema", true)
        .Option("header", true)
        .Load(csv);

    df = spark.Read().Orc(orc);

    df.Write()
        .Format("orc")
        .Options(new Dictionary<string, string>
        {
            { "orc.bloom.filter.columns", "favorite_color" },
            { "orc.dictionary.key.threshold", "1.0" },
            { "orc.column.encoding.direct", "name" }
        })
        .Mode(SaveMode.Overwrite)
        .Save("users_with_options.orc");

    df.Write()
        .BucketBy(42, "name")
        .SortBy("favorite_color")
        .SaveAsTable("people_bucketed");

    spark.Sql($"SELECT * FROM parquet.`{parquet}`").Show();
    spark.Sql("SELECT * FROM people_bucketed").Show();
    spark.Sql("SELECT * FROM people_partitioned_bucketed").Show();

    spark.Sql("DROP TABLE IF EXISTS people_bucketed");
    spark.Sql("DROP TABLE IF EXISTS people_partitioned_bucketed");
}
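// A bucketed table created with SaveAsTable lives in the session catalog, so it can be
// read back by name instead of by path; partition and bucket metadata are restored
// automatically. A minimal sketch, assuming it runs before the DROP TABLE statements above:
private void ReadBucketedTable(SparkSession spark)
{
    DataFrame bucketed = spark.Table("people_partitioned_bucketed");
    bucketed.PrintSchema();
    bucketed.Show();
}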
public void TestEmailSearchTopNReducerBasics()
{
    // Read the sample data.
    DataFrame df = _spark
        .Read()
        .Schema("Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, puser STRING, ptenant STRING")
        .Json($"{TestEnvironment.ResourceDirectory}neighbors.json");

    // Trim the "sip:" prefix from the IMAddress column.
    Func<Column, Column> trimIMAddress =
        Udf<string, string>((str) => str.StartsWith("sip:") ? str.Substring(4) : str);
    df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"]));

    // Reduce: collect each column into a list per (puser, ptenant) group.
    df = df.GroupBy("puser", "ptenant")
        .Agg(
            CollectList("GivenName").Alias("GivenNames"),
            CollectList("Surname").Alias("Surnames"),
            CollectList("DisplayName").Alias("DisplayNames"),
            CollectList("EmailAddress").Alias("EmailAddresses"),
            CollectList("RelevanceScore").Alias("RelevanceScores"),
            CollectList("IMAddress").Alias("IMAddresses"));

    // Format the output: join each collected list into a ";"-delimited string.
    df = df.Select(
        df["puser"],
        df["ptenant"],
        ConcatWs(";", df["GivenNames"]).Alias("GivenNames"),
        ConcatWs(";", df["Surnames"]).Alias("Surnames"),
        ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"),
        ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"),
        ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"),
        ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses"));

    Assert.Equal(2, df.Count());

    foreach (Row row in df.Collect())
    {
        string puser = row.GetAs<string>("puser");
        Assert.Equal("MSFT", row.GetAs<string>("ptenant"));
        Assert.Equal("1101.0;900.0;857.0", row.GetAs<string>("RelevanceScores"));

        switch (puser)
        {
            case "ruih":
                Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs<string>("GivenNames"));
                Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs<string>("Surnames"));
                Assert.Equal(
                    "AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN",
                    row.GetAs<string>("DisplayNames"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("EmailAddresses"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("IMAddresses"));
                break;
            case "rui":
                Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs<string>("GivenNames"));
                Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs<string>("Surnames"));
                Assert.Equal(
                    "DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN",
                    row.GetAs<string>("DisplayNames"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("EmailAddresses"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("IMAddresses"));
                break;
            default:
                throw new Exception($"Unexpected puser: {puser}.");
        }
    }
}
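// The "sip:" prefix strip can also be done with the built-in RegexpReplace instead of
// a .NET UDF, avoiding the round trip through the worker process. A sketch, assuming
// "using static Microsoft.Spark.Sql.Functions;" as in the test above:
private static DataFrame TrimIMAddressBuiltIn(DataFrame df)
{
    // The anchored pattern removes only a leading "sip:".
    return df.WithColumn("IMAddress", RegexpReplace(df["IMAddress"], "^sip:", ""));
}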
public void TestSignaturesV2_3_X()
{
    DataFrameReader dfr = _spark.Read();

    Assert.IsType<DataFrameReader>(dfr.Format("json"));

    Assert.IsType<DataFrameReader>(
        dfr.Schema(
            new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("name", new StringType())
            })));
    Assert.IsType<DataFrameReader>(dfr.Schema("age INT, name STRING"));

    Assert.IsType<DataFrameReader>(dfr.Option("stringOption", "value"));
    Assert.IsType<DataFrameReader>(dfr.Option("boolOption", true));
    Assert.IsType<DataFrameReader>(dfr.Option("longOption", 1L));
    Assert.IsType<DataFrameReader>(dfr.Option("doubleOption", 3D));

    Assert.IsType<DataFrameReader>(
        dfr.Options(
            new Dictionary<string, string>
            {
                { "option1", "value1" },
                { "option2", "value2" }
            }));

    string jsonFile = $"{TestEnvironment.ResourceDirectory}people.json";
    Assert.IsType<DataFrame>(dfr.Load());
    Assert.IsType<DataFrame>(dfr.Load(jsonFile));
    Assert.IsType<DataFrame>(dfr.Load(jsonFile, jsonFile));

    Assert.IsType<DataFrame>(dfr.Json(jsonFile));
    Assert.IsType<DataFrame>(dfr.Json(jsonFile, jsonFile));

    string csvFile = $"{TestEnvironment.ResourceDirectory}people.csv";
    Assert.IsType<DataFrame>(dfr.Csv(csvFile));
    Assert.IsType<DataFrame>(dfr.Csv(csvFile, csvFile));

    string parquetFile = $"{TestEnvironment.ResourceDirectory}users.parquet";
    Assert.IsType<DataFrame>(dfr.Parquet(parquetFile));
    Assert.IsType<DataFrame>(dfr.Parquet(parquetFile, parquetFile));

    string orcFile = $"{TestEnvironment.ResourceDirectory}users.orc";
    Assert.IsType<DataFrame>(dfr.Orc(orcFile));
    Assert.IsType<DataFrame>(dfr.Orc(orcFile, orcFile));

    dfr = _spark.Read();
    string textFile = $"{TestEnvironment.ResourceDirectory}people.txt";
    Assert.IsType<DataFrame>(dfr.Text(textFile));
    Assert.IsType<DataFrame>(dfr.Text(textFile, textFile));
}
public void TestGroupedMapUdf()
{
    DataFrame df = _spark
        .Read()
        .Schema("age INT, name STRING")
        .Json($"{TestEnvironment.ResourceDirectory}more_people.json");
    // Data:
    // { "name":"Michael"}
    // { "name":"Andy", "age":30}
    // { "name":"Seth", "age":30}
    // { "name":"Justin", "age":19}
    // { "name":"Kathy", "age":19}

    Row[] rows = df.GroupBy("age")
        .Apply(
            new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            }),
            batch => ArrowBasedCountCharacters(batch))
        .Collect()
        .ToArray();

    Assert.Equal(3, rows.Length);
    foreach (Row row in rows)
    {
        int? age = row.GetAs<int?>("age");
        int charCount = row.GetAs<int>("nameCharCount");
        switch (age)
        {
            case null:
                Assert.Equal(7, charCount);
                break;
            case 19:
                Assert.Equal(11, charCount);
                break;
            case 30:
                Assert.Equal(8, charCount);
                break;
            default:
                throw new Exception($"Unexpected age: {age}.");
        }
    }
}
private void RunParquetExample(SparkSession spark, string json)
{
    DataFrame peopleDf = spark.Read().Json(json);
    peopleDf.Write().Mode(SaveMode.Overwrite).Parquet("people.parquet");

    DataFrame parquetFile = spark.Read().Parquet("people.parquet");
    // CreateOrReplaceTempView avoids a failure if the view already exists in this session.
    parquetFile.CreateOrReplaceTempView("parquet");
    DataFrame teenagers = spark.Sql(
        "SELECT name FROM parquet WHERE age >= 13 AND age <= 19");
    teenagers.Show();
}
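// The same teenager query can be expressed with the DataFrame API, with no temp view
// involved; a minimal sketch against the Parquet file written above:
private void RunParquetExampleWithoutSql(SparkSession spark)
{
    DataFrame parquetFile = spark.Read().Parquet("people.parquet");
    DataFrame teenagers = parquetFile
        .Filter(parquetFile["age"].Geq(13).And(parquetFile["age"].Leq(19)))
        .Select("name");
    teenagers.Show();
}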
private static DataFrame LoadMetadataFile(string metadataPath, SparkSession spark)
{
    Console.WriteLine("Loading Electronics_Metadata.json File");

    var metadataSchema = new StructType(new[]
    {
        new StructField("asin", new StringType(), isNullable: false),
        new StructField("title", new StringType()),
        new StructField("brand", new StringType()),
        new StructField("main_cat", new StringType()),
        new StructField("price", new StringType()),
        new StructField("category", new ArrayType(new StringType())),
        new StructField("description", new StringType()),
        new StructField("image", new ArrayType(new StringType())),
        new StructField("date", new StringType()),
        new StructField("rank", new StringType()),
    });

    var dfMetadata = spark
        .Read()
        .Schema(metadataSchema)
        .Json(metadataPath);

    Console.WriteLine("Done");
    Console.WriteLine();
    return dfMetadata;
}
private static DataFrame LoadReviewPathFile(string reviewPath, SparkSession spark)
{
    Console.WriteLine("Loading Electronics_Reviews.json File");

    var ratingSchema = new StructType(new[]
    {
        new StructField("reviewerID", new StringType(), isNullable: false),
        new StructField("asin", new StringType(), isNullable: false),
        new StructField("reviewText", new StringType()),
        new StructField("unixReviewTime", new LongType())
    });

    var dfRatings = spark
        .Read()
        .Schema(ratingSchema)
        .Json(reviewPath);

    // Keep only reviews whose item appears in the ElectronicsMetadata view,
    // which is expected to have been registered earlier.
    var itemIds = spark.Sql(
        "SELECT asin AS id " +
        "FROM ElectronicsMetadata");
    var availableItemReviews = dfRatings
        .Join(itemIds, dfRatings["asin"] == itemIds["id"])
        .Drop("id");

    Console.WriteLine("Done");
    Console.WriteLine();
    return availableItemReviews;
}
private static DataFrame ReadCsvIntoDataframe(SparkSession sparkSession, string filename, StructType schema)
{
    return sparkSession.Read()
        .Format("csv")
        .Option("header", "true")
        .Schema(schema)
        .Load(filename);
}
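// A hypothetical call site for the helper above; the schema and file name are
// illustrative, not taken from the original code:
private static void ReadCsvIntoDataframeExample(SparkSession spark)
{
    var salesSchema = new StructType(new[]
    {
        new StructField("OrderId", new IntegerType()),
        new StructField("Amount", new DoubleType())
    });
    DataFrame sales = ReadCsvIntoDataframe(spark, "sales.csv", salesSchema);
    sales.Show();
}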
public static void leerJSON()
{
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // A CSV dataset is pointed to by path.
    // The path can be either a single CSV file or a directory of CSV files.
    string path = "data/sample_data.csv";

    DataFrame df = spark.Read().Csv(path);
    df.Show();
    // +------------------+
    // |               _c0|
    // +------------------+
    // |      name;age;job|
    // |Jorge;30;Developer|
    // |  Bob;32;Developer|
    // +------------------+

    // Register a temp view so the DataFrame can be queried with SQL.
    df.CreateOrReplaceTempView("sample_data");

    // Query the names with SQL.
    DataFrame sqlDf = spark.Sql("SELECT * FROM sample_data");

    // Show results
    sqlDf.Show();

    // Stop Spark session
    spark.Stop();
}
static void Main(string[] args)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame
    string filePath = args[0];
    DataFrame dataFrame = spark.Read().Text(filePath);

    // Count words
    DataFrame words = dataFrame
        .Select(Split(Col("value"), " ").Alias("words"))
        .Select(Explode(Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Col("count").Desc());

    // Display results
    words.Show();

    // Stop Spark session
    spark.Stop();
}
public DataFrameFunctionsTests(SparkFixture fixture)
{
    _spark = fixture.Spark;
    _df = _spark
        .Read()
        .Json($"{TestEnvironment.ResourceDirectory}people.json");
}
static void Main(string[] args) { Console.WriteLine("Start SparkSession"); SparkSession sparkSession = SparkSession.Builder().AppName("Street Counter").GetOrCreate(); DataFrame dfCsv = sparkSession .Read() .Option("delimiter", ";") .Schema("WOJ string ,POW string ,GMI string ,RODZ_GMI string , " + "SYM string , SYM_UL string , " + "CECHA string , NAZWA_1 string ,NAZWA_2 string , " + "STAN_NA string") .Csv("streets.csv"); DataFrame dataIn = dfCsv .WithColumn("STREET", Functions.ConcatWs(" ", dfCsv["CECHA"], dfCsv["NAZWA_1"], dfCsv["NAZWA_2"])); DataFrame dataGroup = dataIn .Select("STREET") .GroupBy("STREET") .Count() .WithColumnRenamed("count", "COUNT"); DataFrame dataOut = dataGroup .OrderBy(dataGroup["COUNT"] .Desc() ); dataOut .Coalesce(1) .Write() .Option("delimiter", ";") .Csv("result"); sparkSession.Stop(); Console.WriteLine("Stop SparkSession"); }
static void Main(string[] args)
{
    // 1. Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // 2. Create initial DataFrame
    DataFrame dataFrame = spark.Read()
        .Schema("Assertid STRING, properties STRING, Value STRING, TimeStamp TIMESTAMP")
        .Csv("DataBook.csv");
    dataFrame.Show();

    // Drop any rows with null/empty values
    DataFrameNaFunctions naFunctions = dataFrame.Na();
    DataFrame cleanedProjects = naFunctions.Drop("any");

    // Remove unnecessary columns
    cleanedProjects = cleanedProjects.Drop("Assertid", "properties", "Value", "TimeStamp");
    cleanedProjects.Show();

    // Stop Spark session
    spark.Stop();
}
public static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);
    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    // Register the solver as a UDF so it can be called from SQL.
    spark.Udf().Register<string, string>(
        "SudokuUDF",
        (sudoku) => sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT quizzes, SudokuUDF(quizzes) AS Resolution FROM Resolved");
    sqlDf.Show();

    spark.Stop();
}
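// Once registered, a named UDF can also be invoked through the DataFrame API rather
// than SQL. A sketch, assuming the "SudokuUDF" registration above and
// "using static Microsoft.Spark.Sql.Functions;" (CallUDF calls a registered UDF by name):
static DataFrame SolveWithCallUdf(DataFrame quizzes)
{
    // One row per quiz: the original puzzle and its solved grid.
    return quizzes.Select(
        Col("quizzes"),
        CallUDF("SudokuUDF", Col("quizzes")).Alias("Resolution"));
}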
static void Main(string[] args) { SparkSession spark = SparkSession .Builder() .AppName("emrApp") .GetOrCreate(); DataFrame dataFrame = spark .Read() .Format("avro") .Load(args[0]); RegionModel regionModel = new RegionModel(); Func <Column, Column> udfConvertRegion = Udf <string, string>( city => { var regionCode = city.Split('_')[1].Substring(0, 1); var convertedRegion = String.Empty; regionModel.ConversionTable.TryGetValue(regionCode, out convertedRegion); return(convertedRegion); } // city_23 --> 23 --> 2 --> {2 : Brisbane} --> ** Brisbane ** ); dataFrame = dataFrame .WithColumn("Region", udfConvertRegion(dataFrame["address.city"])) .Drop("orderunits", "address"); dataFrame .Coalesce(1) .Write() .Format("csv") .Save($"{args[1]}/{DateTime.UtcNow.ToString("yyyy/MM/dd/hh-mm-ss")}"); }
private static void RunDataFrameSample(bool createNewSession)
{
    SparkSession ss = GetSparkSession();
    if (createNewSession)
    {
        ss = sparkSession.NewSession();
    }

    var peopleDataFrame = ss.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(DataFrameSamples.PeopleJson));
    var count = peopleDataFrame.Count();
    Console.WriteLine("Count of items in DataFrame {0}", count);

    var sortedDataFrame = peopleDataFrame.Sort(new string[] { "name", "age" }, new bool[] { true, false });
    sortedDataFrame.Show();

    if (SparkCLRSamples.Configuration.IsValidationEnabled)
    {
        var sortedDF = sortedDataFrame.Collect().ToArray();
        Assert.AreEqual("789", sortedDF[0].GetAs<string>("id"));
        Assert.AreEqual("123", sortedDF[1].GetAs<string>("id"));
        Assert.AreEqual("531", sortedDF[2].GetAs<string>("id"));
        Assert.AreEqual("456", sortedDF[3].GetAs<string>("id"));
    }
}
static void Main(string[] args)
{
    // Set the debug backend port; it has to match the one in the Dockerfile.
    System.Environment.SetEnvironmentVariable("DOTNETBACKEND_PORT", "12345");

    // Create a Spark session.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame.
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words.
    DataFrame words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results.
    words.Show();

    // Stop Spark session.
    spark.Stop();
}
public static void leerTxt()
{
    Console.WriteLine("Hello World!");

    // Create a Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("word_count_sample")
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark.Read().Text("input.txt");

    // Count words
    DataFrame words = dataFrame
        .Select(Functions.Split(Functions.Col("value"), " ").Alias("words"))
        .Select(Functions.Explode(Functions.Col("words")).Alias("word"))
        .GroupBy("word")
        .Count()
        .OrderBy(Functions.Col("count").Desc());

    // Show results
    words.Show();

    // Stop Spark session
    spark.Stop();
}
static void BasicDfExample(SparkSession spark)
{
    var dataFrame = spark.Read().Json("/Users/ed/spark-2.4.6-bin-without-hadoop/examples/src/main/resources/people.json");
    dataFrame.Show();
    dataFrame.PrintSchema();

    dataFrame.Select("name").Show();

    // Operator and method forms are equivalent.
    dataFrame.Select(dataFrame["name"], dataFrame["age"] + 1).Show();
    dataFrame.Select(dataFrame["name"], dataFrame["age"].Plus(1)).Show();

    dataFrame.Filter(dataFrame["age"] > 21).Show();
    dataFrame.Filter(dataFrame["age"].Gt(21)).Show();

    dataFrame.GroupBy(dataFrame["age"]).Count().Show();

    // Session-scoped temp view.
    dataFrame.CreateOrReplaceTempView("people");
    var sqlDataFrame = spark.Sql("SELECT * FROM people");
    sqlDataFrame.Show();

    // Global temp views live in the global_temp database and are visible to new sessions.
    dataFrame.CreateGlobalTempView("people");
    spark.Sql("SELECT * FROM global_temp.people").Show();
    spark.NewSession().Sql("SELECT * FROM global_temp.people").Show();
}
/// <summary> /// To integrate with Hive operations /// </summary> private static void HiveDataFrame() { var builder = SparkSession.Builder().EnableHiveSupport(); builder = builder.Config("spark.master", "yarn"); builder = builder.Config("spark.app.name", "HiveDataFrame"); builder = builder.Config("spark.sql.warehouse.dir", "/user/hive/warehouse"); session = builder.GetOrCreate(); var peopleDataFrame = session.Read().Json(jsonFilePath); logger.LogInfo("****Create table if not exists****"); session.Sql(string.Format("CREATE DATABASE IF NOT EXISTS {0}", dbName)); // create database if not exists logger.LogInfo("****Database Created****"); session.Sql(string.Format("USE {0}", dbName)); logger.LogInfo("****Create Table operation started****"); peopleDataFrame.Write().Mode(SaveMode.Overwrite).SaveAsTable(tableName); // create table logger.LogInfo("****Table Created successfully****"); var tablesDataFrame = session.Table(tableName); logger.LogInfo(string.Format("****Table count in database {0}: {1}", dbName, tablesDataFrame.Count()) + "****"); var rowCollections = tablesDataFrame.Collect(); logger.LogInfo("**********************************************"); foreach (var row in rowCollections) { Console.WriteLine("{0}", row); } logger.LogInfo("*********************************************"); logger.LogInfo("Executed Successfully................."); }
static void runSpark(string file_path, string cores, string nodes, int nrows)
{
    // Create Spark session
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Solving " + nrows + " sudokus by combinatorial evolution with " + cores + " core(s) and " + nodes + " node(s)")
        .Config("spark.executor.cores", cores)
        .Config("spark.executor.instances", nodes)
        .GetOrCreate();

    // Create initial DataFrame
    DataFrame dataFrame = spark
        .Read()
        .Option("header", true)
        .Option("inferSchema", true)
        .Schema("quizzes string, solutions string")
        .Csv(file_path);
    DataFrame dataFrame2 = dataFrame.Limit(nrows);

    // Register the solver as a UDF so it can be called from SQL.
    spark.Udf().Register<string, string>(
        "SudokuUDF",
        (sudoku) => sudokusolution(sudoku));

    dataFrame2.CreateOrReplaceTempView("Resolved");
    DataFrame sqlDf = spark.Sql("SELECT quizzes, SudokuUDF(quizzes) AS Resolution FROM Resolved");
    sqlDf.Show();

    spark.Stop();
}
public UdfComplexTypesTests(SparkFixture fixture)
{
    _spark = fixture.Spark;
    _df = _spark
        .Read()
        .Json($"{TestEnvironment.ResourceDirectory}people.json");
}
public void TestSignaturesV2_3_X() { Assert.IsType <SparkContext>(_spark.SparkContext); Assert.IsType <Builder>(SparkSession.Builder()); SparkSession.ClearDefaultSession(); SparkSession.SetDefaultSession(_spark); Assert.IsType <SparkSession>(SparkSession.GetDefaultSession()); Assert.IsType <RuntimeConfig>(_spark.Conf()); Assert.IsType <SparkSession>(_spark.NewSession()); Assert.IsType <DataFrameReader>(_spark.Read()); Assert.IsType <DataFrame>(_spark.Range(10)); Assert.IsType <DataFrame>(_spark.Range(10, 100)); Assert.IsType <DataFrame>(_spark.Range(10, 100, 10)); Assert.IsType <DataFrame>(_spark.Range(10, 100, 10, 5)); _spark.Range(10).CreateOrReplaceTempView("testView"); Assert.IsType <DataFrame>(_spark.Table("testView")); Assert.IsType <DataStreamReader>(_spark.ReadStream()); Assert.IsType <UdfRegistration>(_spark.Udf()); Assert.IsType <Catalog>(_spark.Catalog()); }
public void TestRead()
{
    var mockSparkSessionProxy = new Mock<ISparkSessionProxy>();
    var sparkSession = new SparkSession(mockSparkSessionProxy.Object);
    var reader = sparkSession.Read();
    mockSparkSessionProxy.Verify(m => m.Read(), Times.Once);
}
public DataFrameTests(SparkFixture fixture)
{
    _spark = fixture.Spark;
    _df = _spark
        .Read()
        .Schema("age INT, name STRING")
        .Json($"{TestEnvironment.ResourceDirectory}people.json");
}
public UdfSimpleTypesTests(SparkFixture fixture)
{
    _spark = fixture.Spark;
    _df = _spark
        .Read()
        .Schema("name STRING, age INT, date DATE")
        .Json($"{TestEnvironment.ResourceDirectory}people.json");
}
public DataFrameTests()
{
    fixture = new SparkFixture();
    _spark = fixture.Spark;
    _df = _spark
        .Read()
        .Schema("age INT, name STRING")
        .Json("Resources/people.json");
}
private void RunDatasourceExample(SparkSession spark)
{
    DataFrame jdbcDf = spark.Read()
        .Format("jdbc")
        .Options(
            new Dictionary<string, string>
            {
                { "url", "jdbc:postgresql:postgres" },
                { "dbtable", "table_name" },
                { "user", "user_name" },
                { "password", "password" }
            })
        .Load();
    jdbcDf.Show();

    DataFrame jdbcDf2 = spark.Read()
        .Format("jdbc")
        .Options(
            new Dictionary<string, string>
            {
                { "url", "jdbc:postgresql:postgres" },
                { "dbtable", "table_name" },
                { "user", "user_name" },
                { "password", "password" },
                // Override the types inferred from the database for selected columns.
                { "customSchema", "another_id int, another_name STRING" }
            })
        .Load();
    jdbcDf2.Show();

    jdbcDf.Write()
        .Format("jdbc")
        .Options(
            new Dictionary<string, string>
            {
                { "url", "jdbc:postgresql:postgres" },
                { "dbtable", "table_name" },
                { "user", "user_name" },
                { "password", "password" }
            })
        .Mode(SaveMode.Append)
        .Save();
}
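// JDBC reads can be parallelized with the standard partitioning options: Spark issues
// one query per partition over the [lowerBound, upperBound] range of a numeric column.
// A sketch using the same connection settings; the "id" column and bounds are illustrative:
private void RunPartitionedJdbcRead(SparkSession spark)
{
    DataFrame jdbcParallel = spark.Read()
        .Format("jdbc")
        .Option("url", "jdbc:postgresql:postgres")
        .Option("dbtable", "table_name")
        .Option("user", "user_name")
        .Option("password", "password")
        .Option("partitionColumn", "id")
        .Option("lowerBound", "1")
        .Option("upperBound", "100000")
        .Option("numPartitions", "8")
        .Load();
    jdbcParallel.Show();
}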
static void Main(string[] args)
{
    // Verify command line arguments
    if (args.Length != 4)
    {
        Console.Error.WriteLine("Usage: $TENANT_ID $ADLS_NAME $ADLS_SP_CLIENT_ID $ADLS_SP_CLIENT_SECRET");
        Environment.Exit(1);
    }

    // Specify file path in Azure Data Lake Gen1
    string filePath = $"adl://{args[1]}.azuredatalakestore.net/parquet/people.parquet";

    // Create SparkSession
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Azure Data Lake Storage example using .NET for Apache Spark")
        .Config("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
        .Config("fs.adl.oauth2.access.token.provider.type", "ClientCredential")
        .Config("fs.adl.oauth2.client.id", args[2])
        .Config("fs.adl.oauth2.credential", args[3])
        .Config("fs.adl.oauth2.refresh.url", $"https://login.microsoftonline.com/{args[0]}/oauth2/token")
        .GetOrCreate();

    // Create sample data
    var data = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "John Doe" }),
        new GenericRow(new object[] { 2, "Jane Doe" }),
        new GenericRow(new object[] { 3, "Foo Bar" })
    };

    // Create schema for sample data
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Id", new IntegerType()),
        new StructField("Name", new StringType()),
    });

    // Create DataFrame using data and schema
    DataFrame df = spark.CreateDataFrame(data, schema);

    // Print DataFrame
    df.Show();

    // Write DataFrame to Azure Data Lake Gen1
    df.Write().Mode(SaveMode.Overwrite).Parquet(filePath);

    // Read saved DataFrame from Azure Data Lake Gen1
    DataFrame readDf = spark.Read().Parquet(filePath);

    // Print DataFrame
    readDf.Show();
}
public void TestSignaturesV3_0_X()
{
    DataFrame df = _spark
        .Read()
        .Schema("age INT, name STRING")
        .Json($"{TestEnvironment.ResourceDirectory}people.json");

    DataFrameWriterV2 dfwV2 = df.WriteTo("testtable");

    Assert.IsType<DataFrameWriterV2>(dfwV2.Using("json"));

    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key1", "value"));
    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key2", true));
    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key3", 1L));
    Assert.IsType<DataFrameWriterV2>(dfwV2.Option("key4", 2D));

    Assert.IsType<DataFrameWriterV2>(dfwV2.Options(
        new Dictionary<string, string>() { { "key", "value" } }));

    Assert.IsType<DataFrameWriterV2>(dfwV2.TableProperty("prop", "value"));

    _spark.Sql("DROP TABLE IF EXISTS default.testtable");
    dfwV2.Create();

    Assert.IsType<DataFrameWriterV2>(dfwV2.PartitionedBy(df.Col("age")));

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
    // with v2 tables.
    Assert.Throws<Exception>(() => dfwV2.Replace());

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: REPLACE TABLE AS SELECT is only supported
    // with v2 tables.
    Assert.Throws<Exception>(() => dfwV2.CreateOrReplace());

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // append in batch mode.
    Assert.Throws<Exception>(() => dfwV2.Append());

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // overwrite by filter in batch mode.
    Assert.Throws<Exception>(() => dfwV2.Overwrite(df.Col("age")));

    // Throws the following exception:
    // org.apache.spark.sql.AnalysisException: Table default.testtable does not support
    // dynamic overwrite in batch mode.
    Assert.Throws<Exception>(() => dfwV2.OverwritePartitions());
}