public void TestConvertToDelta()
{
    string partitionColumnName = "id_plus_one";
    DataFrame data = _spark.Range(0, 5).Select(
        Functions.Col("id"),
        Functions.Expr($"(`id` + 1) AS `{partitionColumnName}`"));

    // Run the same test on the different overloads of DeltaTable.ConvertToDelta().
    void testWrapper(
        DataFrame dataFrame,
        Func<string, DeltaTable> convertToDelta,
        string partitionColumn = null)
    {
        using var tempDirectory = new TemporaryDirectory();
        string path = Path.Combine(tempDirectory.Path, "parquet-data");

        DataFrameWriter dataWriter = dataFrame.Write();
        if (!string.IsNullOrEmpty(partitionColumn))
        {
            dataWriter = dataWriter.PartitionBy(partitionColumn);
        }

        dataWriter.Parquet(path);

        Assert.False(DeltaTable.IsDeltaTable(path));

        string identifier = $"parquet.`{path}`";
        DeltaTable convertedDeltaTable = convertToDelta(identifier);

        ValidateRangeDataFrame(Enumerable.Range(0, 5), convertedDeltaTable.ToDF());
        Assert.True(DeltaTable.IsDeltaTable(path));
    }

    testWrapper(data, identifier => DeltaTable.ConvertToDelta(_spark, identifier));
    testWrapper(
        data.Repartition(Functions.Col(partitionColumnName)),
        identifier => DeltaTable.ConvertToDelta(
            _spark,
            identifier,
            $"{partitionColumnName} bigint"),
        partitionColumnName);
    testWrapper(
        data.Repartition(Functions.Col(partitionColumnName)),
        identifier => DeltaTable.ConvertToDelta(
            _spark,
            identifier,
            new StructType(new[]
            {
                new StructField(partitionColumnName, new IntegerType())
            })),
        partitionColumnName);
}
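// Illustrative sketch (not part of the test above): converting an existing partitioned
// Parquet directory to Delta via the same DeltaTable.ConvertToDelta overloads the test
// exercises. The "/tmp/events" path, the "date" partition column, and the app name are
// made-up placeholders; an active SparkSession with the Delta Lake extension is assumed.
SparkSession spark = SparkSession
    .Builder()
    .AppName("convert-to-delta-sketch")
    .GetOrCreate();

// The identifier uses the same "parquet.`<path>`" form as in the test.
string path = "/tmp/events";
DeltaTable table = DeltaTable.ConvertToDelta(spark, $"parquet.`{path}`", "date string");

// After conversion the directory is recognized as a Delta table.
bool isDelta = DeltaTable.IsDeltaTable(path);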
public void TestPartitionBy()
{
    // Arrange
    mockDataFrameWriterProxy.Setup(m => m.PartitionBy(It.IsAny<string[]>()));
    var dataFrameWriter = new DataFrameWriter(mockDataFrameWriterProxy.Object);
    var colNames = new string[] { "col1", "col2", "col3" };

    // Act
    dataFrameWriter.PartitionBy(colNames);

    // Assert
    mockDataFrameWriterProxy.Verify(m => m.PartitionBy(colNames));
}
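// Illustrative sketch (not part of the mock-based unit test above): what PartitionBy does
// against a real session. The "spark" session, input file, column name, and output path are
// assumed placeholders. Each distinct "col1" value is written to its own "col1=<value>"
// subdirectory under the output path.
DataFrame df = spark.Read().Json("input.json");
df.Write()
    .Mode(SaveMode.Overwrite)
    .PartitionBy("col1")
    .Parquet("/tmp/partitioned-output");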
public void TestSignaturesV2_3_X()
{
    {
        DataFrameWriter dfw = _spark
            .Read()
            .Schema("age INT, name STRING")
            .Json($"{TestEnvironment.ResourceDirectory}people.json")
            .Write();

        Assert.IsType<DataFrameWriter>(dfw.Mode(SaveMode.Ignore));
        Assert.IsType<DataFrameWriter>(dfw.Mode("overwrite"));
        Assert.IsType<DataFrameWriter>(dfw.Format("json"));
        Assert.IsType<DataFrameWriter>(dfw.Option("stringOption", "value"));
        Assert.IsType<DataFrameWriter>(dfw.Option("boolOption", true));
        Assert.IsType<DataFrameWriter>(dfw.Option("longOption", 1L));
        Assert.IsType<DataFrameWriter>(dfw.Option("doubleOption", 3D));
        Assert.IsType<DataFrameWriter>(
            dfw.Options(
                new Dictionary<string, string>
                {
                    { "option1", "value1" },
                    { "option2", "value2" }
                }));
        Assert.IsType<DataFrameWriter>(dfw.PartitionBy("age"));
        Assert.IsType<DataFrameWriter>(dfw.PartitionBy("age", "name"));
        Assert.IsType<DataFrameWriter>(dfw.BucketBy(3, "age"));
        Assert.IsType<DataFrameWriter>(dfw.BucketBy(3, "age", "name"));
        Assert.IsType<DataFrameWriter>(dfw.SortBy("name"));
    }

    using (var tempDir = new TemporaryDirectory())
    {
        DataFrameWriter dfw = _spark
            .Read()
            .Csv($"{TestEnvironment.ResourceDirectory}people.csv")
            .Write();

        // TODO: Test dfw.Jdbc without running a local db.

        dfw.Option("path", tempDir.Path).SaveAsTable("TestTable");
        dfw.InsertInto("TestTable");
        dfw.Option("path", $"{tempDir.Path}TestSavePath1").Save();
        dfw.Save($"{tempDir.Path}TestSavePath2");
        dfw.Json($"{tempDir.Path}TestJsonPath");
        dfw.Parquet($"{tempDir.Path}TestParquetPath");
        dfw.Orc($"{tempDir.Path}TestOrcPath");
        dfw.Text($"{tempDir.Path}TestTextPath");
        dfw.Csv($"{tempDir.Path}TestCsvPath");
    }
}
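// Illustrative sketch of the builder-style chaining the assertions above rely on: each
// configuration call returns the same DataFrameWriter, so a typical write is one chain.
// The "spark" session, file names, and output path are assumed placeholders. Note that
// BucketBy/SortBy generally take effect only when writing a managed table via SaveAsTable,
// not with a path-based Save.
DataFrame people = spark
    .Read()
    .Schema("age INT, name STRING")
    .Json("people.json");

people.Write()
    .Mode(SaveMode.Overwrite)
    .Format("parquet")
    .Option("compression", "snappy")
    .PartitionBy("age")
    .Save("/tmp/people-by-age");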
public void TestSignaturesV2_3_X()
{
    {
        DataFrameWriter dfw = _spark
            .Read()
            .Schema("age INT, name STRING")
            .Json($"{TestEnvironment.ResourceDirectory}people.json")
            .Write();

        Assert.IsType<DataFrameWriter>(dfw.Mode(SaveMode.Ignore));
        Assert.IsType<DataFrameWriter>(dfw.Mode("overwrite"));
        Assert.IsType<DataFrameWriter>(dfw.Format("json"));
        Assert.IsType<DataFrameWriter>(dfw.Option("stringOption", "value"));
        Assert.IsType<DataFrameWriter>(dfw.Option("boolOption", true));
        Assert.IsType<DataFrameWriter>(dfw.Option("longOption", 1L));
        Assert.IsType<DataFrameWriter>(dfw.Option("doubleOption", 3D));
        Assert.IsType<DataFrameWriter>(
            dfw.Options(
                new Dictionary<string, string>
                {
                    { "option1", "value1" },
                    { "option2", "value2" }
                }));
        Assert.IsType<DataFrameWriter>(dfw.PartitionBy("age"));
        Assert.IsType<DataFrameWriter>(dfw.PartitionBy("age", "name"));
        Assert.IsType<DataFrameWriter>(dfw.BucketBy(3, "age"));
        Assert.IsType<DataFrameWriter>(dfw.BucketBy(3, "age", "name"));
        Assert.IsType<DataFrameWriter>(dfw.SortBy("name"));
    }

    using (var tempDir = new TemporaryDirectory())
    {
        DataFrameWriter dfw = _spark
            .Read()
            .Csv($"{TestEnvironment.ResourceDirectory}people.csv")
            .Write();

        // TODO: Test dfw.Jdbc without running a local db.

        dfw.Save($"{tempDir.Path}TestSavePath1");
        dfw.Json($"{tempDir.Path}TestJsonPath");
        dfw.Parquet($"{tempDir.Path}TestParquetPath");
        dfw.Orc($"{tempDir.Path}TestOrcPath");
        dfw.Text($"{tempDir.Path}TestTextPath");
        dfw.Csv($"{tempDir.Path}TestCsvPath");
        dfw.Option("path", tempDir.Path).SaveAsTable("TestTable");
        dfw.InsertInto("TestTable");

        // In Spark 3.1.1+, setting the `path` option and then calling .Save(path) is not
        // supported unless the `spark.sql.legacy.pathOptionBehavior.enabled` conf is set.
        // .Json(path), .Parquet(path), etc. follow the same code path, so the conf
        // needs to be set in those scenarios as well.
        dfw.Option("path", $"{tempDir.Path}TestSavePath2").Save();
    }
}
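// Illustrative sketch tied to the Spark 3.1.1+ comment above: a possible way to opt back in
// to the legacy behavior before mixing the "path" option with Save(). The "spark" session,
// "df" DataFrame, and output path are assumed placeholders, and setting this SQL conf at
// runtime (rather than at session construction) is an assumption; the conf name itself
// comes from the comment in the test.
spark.Conf().Set("spark.sql.legacy.pathOptionBehavior.enabled", true);

df.Write()
    .Mode(SaveMode.Overwrite)
    .Option("path", "/tmp/legacy-path-output")
    .Save();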