        public void include_no_metrics_in_loaded_AnalysisResults_if_requested() =>
            Evaluate(_session, (context, repository) =>
            {
                repository.Save(new ResultKey(DATE_ONE, new Dictionary<string, string>(REGION_EU)), context);
                repository.Save(new ResultKey(DATE_TWO, new Dictionary<string, string>(REGION_NA)), context);

                // Requesting an empty analyzer set should load no metrics.
                DataFrame analysisResultsAsDataFrame = repository.Load()
                    .After(DATE_ONE)
                    .ForAnalyzers(Enumerable.Empty<IAnalyzer<IMetric>>())
                    .GetSuccessMetricsAsDataFrame(_session, Enumerable.Empty<string>());

                // The result should equal an empty DataFrame with the success-metrics schema.
                List<GenericRow> elements = new List<GenericRow>();

                StructType schema = new StructType(new List<StructField>
                {
                    new StructField("entity", new StringType()),
                    new StructField("instance", new StringType()),
                    new StructField("name", new StringType()),
                    new StructField("value", new DoubleType()),
                    new StructField("dataset_date", new LongType()),
                    new StructField("region", new StringType())
                });

                DataFrame df = _session.CreateDataFrame(elements, schema);

                AssertSameRows(analysisResultsAsDataFrame, df);
            });
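
AssertSameRows is a helper on the surrounding test fixture, not a Microsoft.Spark API. A minimal sketch of what it might look like, assuming xunit and an order-insensitive comparison (names are illustrative):

        private static void AssertSameRows(DataFrame actual, DataFrame expected)
        {
            // Hypothetical sketch: render each collected row to a string and compare
            // the two small test DataFrames ignoring row order.
            string[] Render(DataFrame df) => df.Collect()
                .Select(row => string.Join("|", row.Values))
                .OrderBy(s => s)
                .ToArray();

            Assert.Equal(Render(expected), Render(actual));
        }
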
Example #2
        public static (DataFrame, DataFrame) GetDfWithStrongPositiveCorrelationPartitioned(SparkSession session)
        {
            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new IntegerType()),
                new StructField("att2", new IntegerType())
            });

            // In both partitions att2 == 2 * att1, i.e. a perfect positive correlation.
            List<GenericRow> first = new List<GenericRow>
            {
                new GenericRow(new object[] { 1, 2 }),
                new GenericRow(new object[] { 2, 4 }),
                new GenericRow(new object[] { 3, 6 })
            };

            DataFrame firstDataframe = session.CreateDataFrame(first, schema);

            List<GenericRow> second = new List<GenericRow>
            {
                new GenericRow(new object[] { 4, 8 }),
                new GenericRow(new object[] { 5, 10 }),
                new GenericRow(new object[] { 6, 12 })
            };

            DataFrame secondDataframe = session.CreateDataFrame(second, schema);

            return (firstDataframe, secondDataframe);
        }
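
Since att2 is exactly 2 * att1 in both halves, one quick way to confirm the fixture behaves as its name says is to union the parts and aggregate with Corr. A hedged usage sketch, assuming `using static Microsoft.Spark.Sql.Functions;`:

        // Unions both partitions and computes the Pearson correlation; expected value: 1.0.
        (DataFrame first, DataFrame second) = GetDfWithStrongPositiveCorrelationPartitioned(session);
        first.Union(second).Agg(Corr(Col("att1"), Col("att2"))).Show();
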
        /// <summary>
        /// Get the <see cref="AssemblyInfo"/> for the "Microsoft.Spark" assembly running
        /// on the Spark Driver and make a "best effort" attempt at determining the
        /// <see cref="AssemblyInfo"/> of the "Microsoft.Spark.Worker"
        /// assembly on the Spark Executors.
        ///
        /// There is no guarantee that a Spark Executor will be run on all the nodes in
        /// a cluster. To increase the likelihood, the spark conf `spark.executor.instances`
        /// and the <paramref name="numPartitions"/> settings should be adjusted to a
        /// reasonable number relative to the number of nodes in the Spark cluster.
        /// </summary>
        /// <param name="session">The <see cref="SparkSession"/></param>
        /// <param name="numPartitions">Number of partitions</param>
        /// <returns>
        /// A <see cref="DataFrame"/> containing the <see cref="AssemblyInfo"/>
        /// </returns>
        public static DataFrame GetAssemblyInfo(this SparkSession session, int numPartitions = 10)
        {
            var schema = new StructType(new StructField[]
            {
                new StructField("AssemblyName", new StringType(), isNullable: false),
                new StructField("AssemblyVersion", new StringType(), isNullable: false),
                new StructField("HostName", new StringType(), isNullable: false)
            });

            DataFrame driverAssemblyInfoDf = session.CreateDataFrame(
                new GenericRow[] { CreateGenericRow(MicrosoftSparkAssemblyInfo()) },
                schema);

            // The UDF runs on the executors, so it reports the worker-side assembly info.
            Func<Column, Column> executorAssemblyInfoUdf = Udf<int>(
                i => CreateGenericRow(MicrosoftSparkWorkerAssemblyInfo()),
                schema);
            // 10 rows per partition, so every partition has work after the repartition below.
            DataFrame df = session.CreateDataFrame(Enumerable.Range(0, 10 * numPartitions));

            string tempColName = "ExecutorAssemblyInfo";
            DataFrame executorAssemblyInfoDf = df
                .Repartition(numPartitions)
                .WithColumn(tempColName, executorAssemblyInfoUdf(df["_1"]))
                .Select(schema.Fields.Select(f => Col($"{tempColName}.{f.Name}")).ToArray());

            return driverAssemblyInfoDf
                .Union(executorAssemblyInfoDf)
                .DropDuplicates()
                .Sort(schema.Fields.Select(f => Col(f.Name)).ToArray());
        }
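
Calling the extension is a one-liner; a sketch, assuming the namespace containing GetAssemblyInfo is imported:

        // Shows one row for the driver plus one row per distinct executor host that ran the UDF.
        session.GetAssemblyInfo(numPartitions: 10).Show();
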
Example #4
        public void TestCreateDataFrame()
        {
            // Calling CreateDataFrame with schema
            {
                var data = new List<GenericRow>();
                data.Add(new GenericRow(new object[] { "Alice", 20 }));
                data.Add(new GenericRow(new object[] { "Bob", 30 }));

                var schema = new StructType(new List<StructField>()
                {
                    new StructField("Name", new StringType()),
                    new StructField("Age", new IntegerType())
                });
                DataFrame df = _spark.CreateDataFrame(data, schema);
                ValidateDataFrame(df, data.Select(a => a.Values), schema);
            }

            // Calling CreateDataFrame(IEnumerable<string> _) without schema
            {
                var data = new List<string>(new string[] { "Alice", "Bob" });
                var schema = SchemaWithSingleColumn(new StringType());

                DataFrame df = _spark.CreateDataFrame(data);
                ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
            }

            // Calling CreateDataFrame(IEnumerable<int> _) without schema
            {
                var data = new List<int>(new int[] { 1, 2 });
                var schema = SchemaWithSingleColumn(new IntegerType());

                DataFrame df = _spark.CreateDataFrame(data);
                ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
            }

            // Calling CreateDataFrame(IEnumerable<double> _) without schema
            {
                var data = new List<double>(new double[] { 1.2, 2.3 });
                var schema = SchemaWithSingleColumn(new DoubleType());

                DataFrame df = _spark.CreateDataFrame(data);
                ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
            }

            // Calling CreateDataFrame(IEnumerable<bool> _) without schema
            {
                var data = new List<bool>(new bool[] { true, false });
                var schema = SchemaWithSingleColumn(new BooleanType());

                DataFrame df = _spark.CreateDataFrame(data);
                ValidateDataFrame(df, data.Select(a => new object[] { a }), schema);
            }
        }
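
SchemaWithSingleColumn is a fixture helper rather than a library call; a plausible sketch, assuming the column is named "_1" to match what CreateDataFrame over a plain IEnumerable<T> produces:

        private static StructType SchemaWithSingleColumn(DataType dataType) =>
            // CreateDataFrame(IEnumerable<T>) yields a single column called "_1".
            new StructType(new[] { new StructField("_1", dataType) });
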
        public void TestUdfWithDuplicateTimestamps()
        {
            var timestamp = new Timestamp(2020, 1, 1, 0, 0, 0, 0);
            var schema = new StructType(new StructField[]
            {
                new StructField("ts", new TimestampType())
            });
            var data = new GenericRow[]
            {
                new GenericRow(new object[] { timestamp }),
                new GenericRow(new object[] { timestamp }),
                new GenericRow(new object[] { timestamp })
            };

            var expectedTimestamp = new Timestamp(1970, 1, 2, 0, 0, 0, 0);
            Func<Column, Column> udf = Udf<Timestamp, Timestamp>(
                ts => new Timestamp(1970, 1, 2, 0, 0, 0, 0));

            DataFrame df = _spark.CreateDataFrame(data, schema);

            Row[] rows = df.Select(udf(df["ts"])).Collect().ToArray();

            Assert.Equal(3, rows.Length);
            foreach (Row row in rows)
            {
                Assert.Single(row.Values);
                Assert.Equal(expectedTimestamp, row.Values[0]);
            }
        }
        private DataFrame GetTestData(SparkSession session)
        {
            var schema = new StructType(new[]
            {
                new StructField("item", new StringType(), false),
                new StructField("origin", new StringType()),
                new StructField("sales", new IntegerType(), false),
                new StructField("marketplace", new StringType(), false)
            });

            var rowData = new List<GenericRow>
            {
                new GenericRow(new object[] { "item1", "US", 100, "EU" }),
                new GenericRow(new object[] { "item1", "US", 1000, "EU" }),
                new GenericRow(new object[] { "item1", "US", 20, "EU" }),

                new GenericRow(new object[] { "item2", "DE", 20, "EU" }),
                new GenericRow(new object[] { "item2", "DE", 333, "EU" }),

                new GenericRow(new object[] { "item3", null, 12, "EU" }),
                new GenericRow(new object[] { "item4", null, 45, "EU" }),
                new GenericRow(new object[] { "item5", null, 123, "EU" })
            };

            return session.CreateDataFrame(rowData, schema);
        }
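
A short usage sketch of this fixture, assuming `using static Microsoft.Spark.Sql.Functions;` and a surrounding test that holds a SparkSession:

        // Hypothetical usage: total sales per item across the fixture rows.
        GetTestData(session)
            .GroupBy("item")
            .Agg(Sum(Col("sales")).Alias("totalSales"))
            .Show();
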
        public UdfSimpleTypesTests(SparkFixture fixture)
        {
            _spark = fixture.Spark;
            var data = new List<GenericRow>
            {
                new GenericRow(new object[]
                {
                    null,
                    new Date(2020, 1, 1),
                    new Timestamp(2020, 1, 1, 0, 0, 0, 0)
                }),
                new GenericRow(new object[]
                {
                    30,
                    new Date(2020, 1, 2),
                    new Timestamp(2020, 1, 2, 15, 30, 30, 123456)
                })
            };
            var schema = new StructType(new List<StructField>()
            {
                new StructField("age", new IntegerType()),
                new StructField("date", new DateType()),
                new StructField("time", new TimestampType())
            });

            _df = _spark.CreateDataFrame(data, schema);
        }
Example #8
        public static DataFrame GetDFWithUniqueColumns(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "1", "0", "3", "1", "5", "0" }),
                new GenericRow(new object[] { "2", "0", "3", "2", "6", "0" }),
                new GenericRow(new object[] { "3", "0", "3", null, "7", "0" }),
                new GenericRow(new object[] { "4", "5", null, "3", "0", "4" }),
                new GenericRow(new object[] { "5", "6", null, "4", "0", "5" }),
                new GenericRow(new object[] { "6", "7", null, "5", "0", "6" })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("unique", new StringType()),
                new StructField("nonUnique", new StringType()),
                new StructField("nonUniqueWithNulls", new StringType()),
                new StructField("uniqueWithNulls", new StringType()),
                new StructField("onlyUniqueWithOtherNonUnique", new StringType()),
                new StructField("halfUniqueCombinedWithNonUnique", new StringType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
Example #9
        public static DataFrame GetDfWithStrongPositiveCorrelationFilter(SparkSession session)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { 65, 64 }),
                new GenericRow(new object[] { 3426, 2634 }),
                new GenericRow(new object[] { 2345, 23434 }),
                new GenericRow(new object[] { 2374, 234 }),
                new GenericRow(new object[] { 767, 2676 }),
                new GenericRow(new object[] { 1, 2 }),
                new GenericRow(new object[] { 2, 4 }),
                new GenericRow(new object[] { 3, 6 }),
                new GenericRow(new object[] { 4, 8 }),
                new GenericRow(new object[] { 5, 10 }),
                new GenericRow(new object[] { 6, 12 })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new IntegerType()),
                new StructField("att2", new IntegerType())
            });

            return session.CreateDataFrame(elements, schema);
        }
Example #10
        public void TestSignaturesV3_X_X()
        {
            // Validate ToLocalIterator
            var data = new List<GenericRow>
            {
                new GenericRow(new object[] { "Alice", 20 }),
                new GenericRow(new object[] { "Bob", 30 })
            };
            var schema = new StructType(new List<StructField>()
            {
                new StructField("Name", new StringType()),
                new StructField("Age", new IntegerType())
            });
            DataFrame df = _spark.CreateDataFrame(data, schema);
            IEnumerable<Row> actual = df.ToLocalIterator(true).ToArray();
            IEnumerable<Row> expected = data.Select(r => new Row(r.Values, schema));

            Assert.Equal(expected, actual);

            // Validate the DataFrame APIs introduced in Spark 3.x
            Assert.IsType<DataFrame>(df.Observe("metrics", Count("Name").As("CountNames")));

            Assert.IsType<Row[]>(_df.Tail(1).ToArray());

            _df.PrintSchema(1);

            _df.Explain("simple");
            _df.Explain("extended");
            _df.Explain("codegen");
            _df.Explain("cost");
            _df.Explain("formatted");
        }
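
ToLocalIterator(true) above exercises the Spark 3.x prefetch overload; a hedged sketch of iterating rows lazily instead of collecting everything at once, continuing from the df defined in the test:

        // Fetches partitions to the driver one at a time; 'true' prefetches the next
        // partition while the current one is being consumed.
        foreach (Row row in df.ToLocalIterator(true))
        {
            Console.WriteLine(row.GetAs<string>("Name"));
        }
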
Example #11
        public static DataFrame GetDFMissing(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "1", "a", "f" }),
                new GenericRow(new object[] { "2", "b", "d" }),
                new GenericRow(new object[] { "3", null, "f" }),
                new GenericRow(new object[] { "4", "a", null }),
                new GenericRow(new object[] { "5", "a", "f" }),
                new GenericRow(new object[] { "6", null, "d" }),
                new GenericRow(new object[] { "7", null, "d" }),
                new GenericRow(new object[] { "8", "b", null }),
                new GenericRow(new object[] { "9", "a", "f" }),
                new GenericRow(new object[] { "10", null, null }),
                new GenericRow(new object[] { "11", null, "f" }),
                new GenericRow(new object[] { "12", null, "d" })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("item", new StringType()),
                new StructField("att1", new StringType()),
                new StructField("att2", new StringType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
        static void Main(string[] args)
        {
            // Verify command-line arguments
            if (args.Length != 4)
            {
                Console.Error.WriteLine("Usage: $TENANT_ID $ADLS_NAME $ADLS_SP_CLIENT_ID $ADLS_SP_CLIENT_SECRET");
                Environment.Exit(1);
            }

            // Specify file path in Azure Data Lake Gen1
            string filePath =
                $"adl://{args[1]}.azuredatalakestore.net/parquet/people.parquet";

            // Create SparkSession with service-principal (ClientCredential) auth for ADLS Gen1
            SparkSession spark = SparkSession
                .Builder()
                .AppName("Azure Data Lake Storage example using .NET for Apache Spark")
                .Config("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
                .Config("fs.adl.oauth2.access.token.provider.type", "ClientCredential")
                .Config("fs.adl.oauth2.client.id", args[2])
                .Config("fs.adl.oauth2.credential", args[3])
                .Config("fs.adl.oauth2.refresh.url", $"https://login.microsoftonline.com/{args[0]}/oauth2/token")
                .GetOrCreate();

            // Create sample data
            var data = new List<GenericRow>
            {
                new GenericRow(new object[] { 1, "John Doe" }),
                new GenericRow(new object[] { 2, "Jane Doe" }),
                new GenericRow(new object[] { 3, "Foo Bar" })
            };

            // Create schema for sample data
            var schema = new StructType(new List<StructField>()
            {
                new StructField("Id", new IntegerType()),
                new StructField("Name", new StringType())
            });

            // Create DataFrame using data and schema
            DataFrame df = spark.CreateDataFrame(data, schema);

            // Print DataFrame
            df.Show();

            // Write DataFrame to Azure Data Lake Gen1
            df.Write().Mode(SaveMode.Overwrite).Parquet(filePath);

            // Read saved DataFrame from Azure Data Lake Gen1
            DataFrame readDf = spark.Read().Parquet(filePath);

            // Print DataFrame
            readDf.Show();
        }
        public void TestUdfWithSimpleArrayType()
        {
            var schema = new StructType(new StructField[]
            {
                new StructField("name", new StringType()),
                new StructField("ids", new ArrayType(new IntegerType()))
            });

            var data = new GenericRow[]
            {
                new GenericRow(new object[] { "Name1", new int[] { 1, 2, 3 } }),
                new GenericRow(new object[] { "Name2", null }),
                new GenericRow(new object[] { "Name3", new int[] { 4 } })
            };

            DataFrame df = _spark.CreateDataFrame(data, schema);

            var expected = new string[] { "Name1|1,2,3", "Name2", "Name3|4" };

            {
                // Test using array
                Func<Column, Column, Column> udf =
                    Udf<string, int[], string>((name, ids) =>
                    {
                        if (ids == null)
                        {
                            return name;
                        }

                        return AppendEnumerable(name, ids);
                    });

                Row[] rows = df.Select(udf(df["name"], df["ids"])).Collect().ToArray();
                Assert.Equal(expected, rows.Select(r => r.GetAs<string>(0)));
            }
            {
                // Test using ArrayList
                Func<Column, Column, Column> udf =
                    Udf<string, ArrayList, string>((name, ids) =>
                    {
                        if (ids == null)
                        {
                            return name;
                        }

                        return AppendEnumerable(name, ids.ToArray());
                    });

                Row[] rows = df.Select(udf(df["name"], df["ids"])).Collect().ToArray();
                Assert.Equal(expected, rows.Select(r => r.GetAs<string>(0)));
            }
        }
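
AppendEnumerable is defined elsewhere in the test class; a minimal sketch consistent with the expected strings above (e.g. "Name1|1,2,3"):

        private static string AppendEnumerable<T>(string name, IEnumerable<T> ids) =>
            // Joins the ids with commas and appends them to the name after a '|'.
            $"{name}|{string.Join(",", ids)}";
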
Example #14
        public void should_execute_a_basic_example()
        {
            var data = _session.CreateDataFrame(
                new List<GenericRow>
                {
                    new GenericRow(new object[] { 1, "Thingy A", "awesome thing. http://thingb.com", "high", 0 }),
                    new GenericRow(new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }),
                    new GenericRow(new object[] { 3, null, null, "low", 5 }),
                    new GenericRow(new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 }),
                    new GenericRow(new object[] { 5, "Thingy E", null, "high", 12 })
                },
                new StructType(new List<StructField>
                {
                    new StructField("id", new IntegerType()),
                    new StructField("productName", new StringType()),
                    new StructField("description", new StringType()),
                    new StructField("priority", new StringType()),
                    new StructField("numViews", new IntegerType())
                }));

            var result = new VerificationSuite()
                .OnData(data)
                .AddCheck(
                    new Check(CheckLevel.Error, "integrity checks")
                        .HasSize(val => val == 5)
                        .IsComplete("id")
                        .IsUnique("id")
                        .IsComplete("productName")
                        .IsContainedIn("priority", new[] { "high", "low" })
                        .IsNonNegative("numViews"))
                .AddCheck(
                    new Check(CheckLevel.Warning, "distribution checks")
                        .ContainsURL("description", val => val >= 0.5))
                .Run();

            result.Debug(_helper.WriteLine);
        }
        static void Main(string[] args)
        {
            // Verify command-line arguments
            if (args.Length != 2)
            {
                Console.Error.WriteLine("Usage: $AZURE_STORAGE_ACCOUNT $AZURE_STORAGE_KEY");
                Environment.Exit(1);
            }

            // Specify file path in Azure Storage
            string filePath =
                $"wasbs://dotnet-spark@{args[0]}.blob.core.windows.net/json/people.json";

            // Create SparkSession with the storage account key for wasbs:// access
            SparkSession spark = SparkSession
                .Builder()
                .AppName("Azure Storage example using .NET for Apache Spark")
                .Config("fs.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
                .Config($"fs.azure.account.key.{args[0]}.blob.core.windows.net", args[1])
                .GetOrCreate();

            // Create sample data
            var data = new List<GenericRow>
            {
                new GenericRow(new object[] { 1, "John Doe" }),
                new GenericRow(new object[] { 2, "Jane Doe" }),
                new GenericRow(new object[] { 3, "Foo Bar" })
            };

            // Create schema for sample data
            var schema = new StructType(new List<StructField>()
            {
                new StructField("Id", new IntegerType()),
                new StructField("Name", new StringType())
            });

            // Create DataFrame using data and schema
            DataFrame df = spark.CreateDataFrame(data, schema);

            // Print DataFrame
            df.Show();

            // Write DataFrame to Azure Storage
            df.Write().Mode(SaveMode.Overwrite).Json(filePath);

            // Read saved DataFrame from Azure Storage
            DataFrame readDf = spark.Read().Json(filePath);

            // Print DataFrame
            readDf.Show();
        }
        private static void CreateUsingGenericRowAndStructType(SparkSession spark)
        {
            Console.WriteLine("spark.CreateDataFrame using StructType");
            var rowOne = new GenericRow(new object[]
            {
                "columnOne Row One", 1.1
            });

            var rowTwo = new GenericRow(new object[]
            {
                "columnOne Row Two", null
            });

            var rowThree = new GenericRow(new object[]
            {
                "columnOne Row Three", 3.3
            });

            var rows = new List<GenericRow>()
            {
                rowOne, rowTwo, rowThree
            };

            var structType = new StructType(new List<StructField>()
            {
                new StructField("column one", new StringType(), isNullable: false),
                new StructField("column two", new DoubleType(), isNullable: true)
            });

            var dataFrame = spark.CreateDataFrame(rows, structType);

            dataFrame.Show();

            /*
                +-------------------+----------+
                |         column one|column two|
                +-------------------+----------+
                |  columnOne Row One|       1.1|
                |  columnOne Row Two|      null|
                |columnOne Row Three|       3.3|
                +-------------------+----------+
            */

            dataFrame.PrintSchema();

            /*
                root
                 |-- column one: string (nullable = false)
                 |-- column two: double (nullable = true)
            */
        }
Example #17
        public void TestWithDuplicatedRows()
        {
            var timestamp = new Timestamp(2020, 1, 1, 0, 0, 0, 0);
            var schema = new StructType(new StructField[]
            {
                new StructField("ts", new TimestampType())
            });
            var data = new GenericRow[]
            {
                new GenericRow(new object[] { timestamp })
            };

            DataFrame df = _spark.CreateDataFrame(data, schema);

            // Nest the timestamp twice: ts, Struct(ts), and Struct(Struct(ts)).
            Row[] rows = df
                .WithColumn("tsRow", Struct("ts"))
                .WithColumn("tsRowRow", Struct("tsRow"))
                .Collect()
                .ToArray();

            Assert.Single(rows);

            Row row = rows[0];

            Assert.Equal(3, row.Values.Length);
            Assert.Equal(timestamp, row.Values[0]);

            Row tsRow = row.Values[1] as Row;

            Assert.Single(tsRow.Values);
            Assert.Equal(timestamp, tsRow.Values[0]);

            Row tsRowRow = row.Values[2] as Row;

            Assert.Single(tsRowRow.Values);
            Assert.Equal(tsRow, tsRowRow.Values[0]);
        }
Example #18
        private static DataFrame GetSearchTermTFIDF(SparkSession spark, string searchTerm,
                                                    Tokenizer tokenizer, HashingTF hashingTF, IDFModel idfModel)
        {
            // Run the search term through the same tokenize/hash/IDF pipeline as the corpus.
            var searchTermDataFrame = spark.CreateDataFrame(new List<string>() { searchTerm })
                .WithColumnRenamed("_1", "Content");
            var searchWords = tokenizer.Transform(searchTermDataFrame);
            var featurizedSearchTerm = hashingTF.Transform(searchWords);
            var search = idfModel.Transform(featurizedSearchTerm)
                .WithColumnRenamed("features", "features2")
                .WithColumn("norm2", udfCalcNorm(Col("features2")));

            return search;
        }
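
udfCalcNorm is not shown in this snippet. A rough, explicitly hypothetical sketch of what it might do: compute the L2 norm of the TF-IDF vector so cosine similarity can be normalized later, assuming the ML vector reaches the UDF as a Row whose last field holds the array of values:

        // Hypothetical sketch, not the sample's exact code: L2 norm of the feature vector.
        private static readonly Func<Column, Column> udfCalcNorm = Udf<Row, double>(vector =>
        {
            var values = (ArrayList)vector.Values[vector.Values.Length - 1];
            var norm = 0.0;
            foreach (object value in values)
            {
                double d = Convert.ToDouble(value);
                norm += d * d;
            }
            return Math.Sqrt(norm);
        });
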
Example #19
        public static DataFrame GetDfFractionalStringTypes(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "1", "1.0" }),
                new GenericRow(new object[] { "2", "a" })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("item", new StringType()),
                new StructField("att1", new StringType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
Example #20
        public void correctly_return_a_DataFrame_of_multiple_AnalysisResults_that_is_formatted_as_expected() =>
            Evaluate(_session, (context, repository) =>
            {
                repository.Save(new ResultKey(DATE_ONE, new Dictionary<string, string>(REGION_EU)), context);
                repository.Save(new ResultKey(DATE_TWO, new Dictionary<string, string>(REGION_NA)), context);

                DataFrame analysisResultsAsDataFrame = repository.Load()
                    .GetSuccessMetricsAsDataFrame(_session, Enumerable.Empty<string>());

                List<GenericRow> elements = new List<GenericRow>
                {
                    new GenericRow(new object[] { "Dataset", "*", "Size", 4.0, DATE_ONE, "EU" }),
                    new GenericRow(new object[] { "Column", "att1", "Completeness", 1.0, DATE_ONE, "EU" }),
                    new GenericRow(new object[] { "Column", "item", "Distinctness", 1.0, DATE_ONE, "EU" }),
                    new GenericRow(new object[] { "Multicolumn", "att1,att2", "Uniqueness", 0.25, DATE_ONE, "EU" }),
                    new GenericRow(new object[] { "Dataset", "*", "Size", 4.0, DATE_TWO, "NA" }),
                    new GenericRow(new object[] { "Column", "att1", "Completeness", 1.0, DATE_TWO, "NA" }),
                    new GenericRow(new object[] { "Column", "item", "Distinctness", 1.0, DATE_TWO, "NA" }),
                    new GenericRow(new object[] { "Multicolumn", "att1,att2", "Uniqueness", 0.25, DATE_TWO, "NA" })
                };

                StructType schema = new StructType(new List<StructField>
                {
                    new StructField("entity", new StringType()),
                    new StructField("instance", new StringType()),
                    new StructField("name", new StringType()),
                    new StructField("value", new DoubleType()),
                    new StructField("dataset_date", new LongType()),
                    new StructField("region", new StringType())
                });

                DataFrame df = _session.CreateDataFrame(elements, schema);

                FixtureSupport.AssertSameRows(analysisResultsAsDataFrame, df, Option<ITestOutputHelper>.None);
            });
Example #21
        public static DataFrame GetDFWithNRows(SparkSession sparkSession, int N)
        {
            StructType schema = new StructType(new List<StructField>
            {
                new StructField("c0", new StringType()),
                new StructField("c1", new StringType()),
                new StructField("c2", new StringType())
            });

            return sparkSession.CreateDataFrame(
                Enumerable.Range(1, N)
                    .Select(value => new GenericRow(new object[] { $"{value}", $"c1-r{value}", $"c2-r{value}" }))
                    .ToList(),
                schema);
        }
Example #22
        public static DataFrame GetDfWithConditionallyInformativeColumns(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { 1, 4 }),
                new GenericRow(new object[] { 2, 5 }),
                new GenericRow(new object[] { 3, 6 })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new IntegerType()),
                new StructField("att2", new IntegerType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
Example #23
        public static DataFrame GetDfWithVariableStringLengthValues(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "" }),
                new GenericRow(new object[] { "a" }),
                new GenericRow(new object[] { "bb" }),
                new GenericRow(new object[] { "ccc" }),
                new GenericRow(new object[] { "dddd" })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new StringType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
Example #25
        public void TestSQLTransformer()
        {
            DataFrame input = _spark.CreateDataFrame(
                new List<GenericRow>
                {
                    new GenericRow(new object[] { 0, 1.0, 3.0 }),
                    new GenericRow(new object[] { 2, 2.0, 5.0 })
                },
                new StructType(new List<StructField>
                {
                    new StructField("id", new IntegerType()),
                    new StructField("v1", new DoubleType()),
                    new StructField("v2", new DoubleType())
                }));

            string expectedUid = "theUid";
            // __THIS__ is SQLTransformer's placeholder for the input DataFrame.
            string inputStatement = "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__";

            SQLTransformer sqlTransformer = new SQLTransformer(expectedUid)
                .SetStatement(inputStatement);

            string outputStatement = sqlTransformer.GetStatement();

            DataFrame output = sqlTransformer.Transform(input);
            StructType outputSchema = sqlTransformer.TransformSchema(input.Schema());

            Assert.Contains(output.Schema().Fields, f => f.Name == "v3");
            Assert.Contains(output.Schema().Fields, f => f.Name == "v4");
            Assert.Contains(outputSchema.Fields, f => f.Name == "v3");
            Assert.Contains(outputSchema.Fields, f => f.Name == "v4");
            Assert.Equal(inputStatement, outputStatement);

            using (var tempDirectory = new TemporaryDirectory())
            {
                string savePath = Path.Join(tempDirectory.Path, "SQLTransformer");
                sqlTransformer.Save(savePath);

                SQLTransformer loadedSqlTransformer = SQLTransformer.Load(savePath);
                Assert.Equal(sqlTransformer.Uid(), loadedSqlTransformer.Uid());
            }
            Assert.Equal(expectedUid, sqlTransformer.Uid());
        }
Example #26
        public static DataFrame GetDfWithDistinctValues(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "a", null }),
                new GenericRow(new object[] { "a", null }),
                new GenericRow(new object[] { null, "x" }),
                new GenericRow(new object[] { "b", "x" }),
                new GenericRow(new object[] { "b", "x" }),
                new GenericRow(new object[] { "c", "y" })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new StringType()),
                new StructField("att2", new StringType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
Example #27
        public static DataFrame GetDFWithNegativeNumbers(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "1", "-1", "-1.0" }),
                new GenericRow(new object[] { "2", "-2", "-2.0" }),
                new GenericRow(new object[] { "3", "-3", "-3.0" }),
                new GenericRow(new object[] { "4", "-4", "-4.0" })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("item", new StringType()),
                new StructField("att1", new StringType()),
                new StructField("att2", new StringType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
Example #28
        public static DataFrame GetDfWithStrongNegativeCorrelation(SparkSession session)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { 1, 12 }),
                new GenericRow(new object[] { 2, 10 }),
                new GenericRow(new object[] { 3, 8 }),
                new GenericRow(new object[] { 4, 6 }),
                new GenericRow(new object[] { 5, 4 }),
                new GenericRow(new object[] { 6, 2 })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new IntegerType()),
                new StructField("att2", new IntegerType())
            });

            return session.CreateDataFrame(elements, schema);
        }
Example #29
        public static DataFrame GetDfWithLowCorrelation(SparkSession session)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { 12, 8 }),
                new GenericRow(new object[] { 10, 12 }),
                new GenericRow(new object[] { 8, 1 }),
                new GenericRow(new object[] { 6, 30 }),
                new GenericRow(new object[] { 4, 9 }),
                new GenericRow(new object[] { 2, 7 })
            };

            StructType schema = new StructType(new List<StructField>
            {
                new StructField("att1", new IntegerType()),
                new StructField("att2", new IntegerType())
            });

            return session.CreateDataFrame(elements, schema);
        }
Example #30
        public static DataFrame GetDfWithNumericFractionalValues(SparkSession sparkSession)
        {
            List<GenericRow> elements = new List<GenericRow>
            {
                new GenericRow(new object[] { "1", 1.0, 0.0 }),
                new GenericRow(new object[] { "2", 2.0, 0.0 }),
                new GenericRow(new object[] { "3", 3.0, 0.0 }),
                new GenericRow(new object[] { "4", 4.0, 0.0 }),
                new GenericRow(new object[] { "5", 5.0, 0.0 }),
                new GenericRow(new object[] { "6", 6.0, 0.0 })
            };

            // The rows hold doubles, so the fractional columns are typed DoubleType
            // rather than IntegerType.
            StructType schema = new StructType(new List<StructField>
            {
                new StructField("item", new StringType()),
                new StructField("att1", new DoubleType()),
                new StructField("att2", new DoubleType())
            });

            return sparkSession.CreateDataFrame(elements, schema);
        }
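
A closing usage sketch for this fixture, assuming `using static Microsoft.Spark.Sql.Functions;` and a SparkSession named spark:

        // Hypothetical usage: the mean of att1 over the six rows above is 3.5.
        GetDfWithNumericFractionalValues(spark).Agg(Avg(Col("att1"))).Show();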