예제 #1
0
        /// <summary>
        /// Writes a small range DataFrame as parquet and converts it to a Delta table through
        /// each overload of DeltaTable.ConvertToDelta() used below: no partition schema, a DDL
        /// string partition schema, and a StructType partition schema.
        /// </summary>
        public void TestConvertToDelta()
        {
            string derivedColumn = "id_plus_one";
            DataFrame range = _spark.Range(0, 5);
            DataFrame source = range.Select(
                Functions.Col("id"),
                Functions.Expr($"(`id` + 1) AS `{derivedColumn}`"));

            // Shared scenario: write parquet (optionally partitioned), check that the location
            // is not yet a Delta table, convert it, then check the contents and that the
            // location is now reported as a Delta table.
            void AssertConversion(
                DataFrame input,
                Func<string, DeltaTable> convert,
                string partitionBy = null)
            {
                using var scratch = new TemporaryDirectory();
                string parquetPath = Path.Combine(scratch.Path, "parquet-data");

                DataFrameWriter writer = string.IsNullOrEmpty(partitionBy)
                    ? input.Write()
                    : input.Write().PartitionBy(partitionBy);
                writer.Parquet(parquetPath);

                Assert.False(DeltaTable.IsDeltaTable(parquetPath));

                DeltaTable converted = convert($"parquet.`{parquetPath}`");

                ValidateRangeDataFrame(Enumerable.Range(0, 5), converted.ToDF());
                Assert.True(DeltaTable.IsDeltaTable(parquetPath));
            }

            // Overload without a partition schema.
            DeltaTable ConvertUnpartitioned(string identifier) =>
                DeltaTable.ConvertToDelta(_spark, identifier);

            // Overload taking the partition schema as a DDL string.
            DeltaTable ConvertWithDdlSchema(string identifier) =>
                DeltaTable.ConvertToDelta(_spark, identifier, $"{derivedColumn} bigint");

            // Overload taking the partition schema as a StructType.
            // NOTE(review): this declares the partition column as IntegerType while the DDL
            // overload above declares it as bigint - confirm the difference is intentional.
            DeltaTable ConvertWithStructSchema(string identifier) =>
                DeltaTable.ConvertToDelta(
                    _spark,
                    identifier,
                    new StructType(new[]
                    {
                        new StructField(derivedColumn, new IntegerType())
                    }));

            AssertConversion(source, ConvertUnpartitioned);
            AssertConversion(
                source.Repartition(Functions.Col(derivedColumn)),
                ConvertWithDdlSchema,
                derivedColumn);
            AssertConversion(
                source.Repartition(Functions.Col(derivedColumn)),
                ConvertWithStructSchema,
                derivedColumn);
        }
예제 #2
0
        /// <summary>
        /// Checks that calling PartitionBy on a DataFrameWriter results in a PartitionBy call
        /// on the mocked writer proxy with the same column names.
        /// </summary>
        public void TestPartitionBy()
        {
            // Arrange
            string[] partitionColumns = { "col1", "col2", "col3" };
            mockDataFrameWriterProxy.Setup(proxy => proxy.PartitionBy(It.IsAny<string[]>()));
            var writer = new DataFrameWriter(mockDataFrameWriterProxy.Object);

            // Act
            writer.PartitionBy(partitionColumns);

            // Assert
            mockDataFrameWriterProxy.Verify(proxy => proxy.PartitionBy(partitionColumns));
        }
예제 #3
0
        /// <summary>
        /// Calls the DataFrameWriter methods listed below to check their signatures: the
        /// builder-style methods are asserted to return a DataFrameWriter, and the output
        /// methods are invoked against a temporary directory.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            {
                DataFrameWriter dfw = _spark
                                      .Read()
                                      .Schema("age INT, name STRING")
                                      .Json($"{TestEnvironment.ResourceDirectory}people.json")
                                      .Write();

                Assert.IsType<DataFrameWriter>(dfw.Mode(SaveMode.Ignore));

                Assert.IsType<DataFrameWriter>(dfw.Mode("overwrite"));

                Assert.IsType<DataFrameWriter>(dfw.Format("json"));

                Assert.IsType<DataFrameWriter>(dfw.Option("stringOption", "value"));
                Assert.IsType<DataFrameWriter>(dfw.Option("boolOption", true));
                Assert.IsType<DataFrameWriter>(dfw.Option("longOption", 1L));
                Assert.IsType<DataFrameWriter>(dfw.Option("doubleOption", 3D));

                Assert.IsType<DataFrameWriter>(
                    dfw.Options(
                        new Dictionary<string, string>
                        {
                            { "option1", "value1" },
                            { "option2", "value2" }
                        }));

                Assert.IsType<DataFrameWriter>(dfw.PartitionBy("age"));
                Assert.IsType<DataFrameWriter>(dfw.PartitionBy("age", "name"));

                Assert.IsType<DataFrameWriter>(dfw.BucketBy(3, "age"));
                Assert.IsType<DataFrameWriter>(dfw.BucketBy(3, "age", "name"));

                Assert.IsType<DataFrameWriter>(dfw.SortBy("name"));
            }

            using (var tempDir = new TemporaryDirectory())
            {
                DataFrameWriter dfw = _spark
                                      .Read()
                                      .Csv($"{TestEnvironment.ResourceDirectory}people.csv")
                                      .Write();

                // TODO: Test dfw.Jdbc without running a local db.

                // Writes that take an explicit path run first, before the `path` option is
                // ever set on this writer (see the note further down).
                dfw.Save($"{tempDir.Path}TestSavePath2");

                dfw.Json($"{tempDir.Path}TestJsonPath");

                dfw.Parquet($"{tempDir.Path}TestParquetPath");

                dfw.Orc($"{tempDir.Path}TestOrcPath");

                dfw.Text($"{tempDir.Path}TestTextPath");

                dfw.Csv($"{tempDir.Path}TestCsvPath");

                // In Spark 3.1.1+ setting the `path` Option and then calling .Save(path) is not
                // supported unless `spark.sql.legacy.pathOptionBehavior.enabled` conf is set.
                // .Json(path), .Parquet(path), etc follow the same code path, so every call
                // that sets the `path` option is kept after the explicit-path writes above.
                dfw.Option("path", tempDir.Path).SaveAsTable("TestTable");

                dfw.InsertInto("TestTable");

                dfw.Option("path", $"{tempDir.Path}TestSavePath1").Save();
            }
        }
예제 #4
0
        /// <summary>
        /// Calls the DataFrameWriter methods listed below to check their signatures: the
        /// builder-style methods are asserted to return a DataFrameWriter, and the output
        /// methods are invoked against a temporary directory.
        /// </summary>
        public void TestSignaturesV2_3_X()
        {
            // Part 1: builder-style methods, each asserted to return a DataFrameWriter.
            DataFrameWriter jsonWriter = _spark
                .Read()
                .Schema("age INT, name STRING")
                .Json($"{TestEnvironment.ResourceDirectory}people.json")
                .Write();

            Assert.IsType<DataFrameWriter>(jsonWriter.Mode(SaveMode.Ignore));
            Assert.IsType<DataFrameWriter>(jsonWriter.Mode("overwrite"));
            Assert.IsType<DataFrameWriter>(jsonWriter.Format("json"));

            Assert.IsType<DataFrameWriter>(jsonWriter.Option("stringOption", "value"));
            Assert.IsType<DataFrameWriter>(jsonWriter.Option("boolOption", true));
            Assert.IsType<DataFrameWriter>(jsonWriter.Option("longOption", 1L));
            Assert.IsType<DataFrameWriter>(jsonWriter.Option("doubleOption", 3D));

            var extraOptions = new Dictionary<string, string>
            {
                ["option1"] = "value1",
                ["option2"] = "value2"
            };
            Assert.IsType<DataFrameWriter>(jsonWriter.Options(extraOptions));

            Assert.IsType<DataFrameWriter>(jsonWriter.PartitionBy("age"));
            Assert.IsType<DataFrameWriter>(jsonWriter.PartitionBy("age", "name"));

            Assert.IsType<DataFrameWriter>(jsonWriter.BucketBy(3, "age"));
            Assert.IsType<DataFrameWriter>(jsonWriter.BucketBy(3, "age", "name"));

            Assert.IsType<DataFrameWriter>(jsonWriter.SortBy("name"));

            // Part 2: output methods, all writing beneath a temporary directory.
            using (var scratch = new TemporaryDirectory())
            {
                DataFrameWriter csvWriter = _spark
                    .Read()
                    .Csv($"{TestEnvironment.ResourceDirectory}people.csv")
                    .Write();
                string root = scratch.Path;

                // TODO: Test csvWriter.Jdbc without running a local db.

                // Explicit-path writes come first; the statement order in this block is
                // deliberate (see the note before the final Save()).
                csvWriter.Save($"{root}TestSavePath1");
                csvWriter.Json($"{root}TestJsonPath");
                csvWriter.Parquet($"{root}TestParquetPath");
                csvWriter.Orc($"{root}TestOrcPath");
                csvWriter.Text($"{root}TestTextPath");
                csvWriter.Csv($"{root}TestCsvPath");

                csvWriter.Option("path", root).SaveAsTable("TestTable");
                csvWriter.InsertInto("TestTable");

                // In Spark 3.1.1+ setting the `path` Option and then calling .Save(path) is not
                // supported unless `spark.sql.legacy.pathOptionBehavior.enabled` conf is set.
                // .Json(path), .Parquet(path), etc follow the same code path so the conf
                // needs to be set in these scenarios as well.
                csvWriter.Option("path", $"{root}TestSavePath2").Save();
            }
        }