/// <summary>
/// Loading analysis results while requesting an empty analyzer set must
/// produce a DataFrame with the success-metrics schema but no rows.
/// </summary>
public void include_no_metrics_in_loaded_AnalysisResults_if_requested() =>
    Evaluate(_session, (context, repository) =>
    {
        // Persist results under two different dates/region tags.
        repository.Save(new ResultKey(DATE_ONE, new Dictionary<string, string>(REGION_EU)), context);
        repository.Save(new ResultKey(DATE_TWO, new Dictionary<string, string>(REGION_NA)), context);

        // Filter to an empty analyzer collection: no metric should survive.
        DataFrame loadedMetrics = repository.Load()
            .After(DATE_ONE)
            .ForAnalyzers(Enumerable.Empty<IAnalyzer<IMetric>>())
            .GetSuccessMetricsAsDataFrame(_session, Enumerable.Empty<string>());

        // Expected: an empty frame carrying the usual success-metrics columns.
        StructType expectedSchema = new StructType(
            new List<StructField>
            {
                new StructField("entity", new StringType()),
                new StructField("instance", new StringType()),
                new StructField("name", new StringType()),
                new StructField("value", new DoubleType()),
                new StructField("dataset_date", new LongType()),
                new StructField("region", new StringType())
            });
        DataFrame expected = _session.CreateDataFrame(new List<GenericRow>(), expectedSchema);

        AssertSameRows(loadedMetrics, expected);
    });
/// <summary>
/// Builds two DataFrames sharing one schema that together form a perfectly
/// positively correlated series (att2 == 2 * att1), split into two partitions.
/// </summary>
/// <param name="session">Session used to create the DataFrames.</param>
/// <returns>The first and second partition of the correlated data.</returns>
public static (DataFrame, DataFrame) GetDfWithStrongPositiveCorrelationPartitioned(SparkSession session)
{
    var schema = new StructType(new List<StructField>
    {
        new StructField("att1", new IntegerType()),
        new StructField("att2", new IntegerType())
    });

    var firstRows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, 2 }),
        new GenericRow(new object[] { 2, 4 }),
        new GenericRow(new object[] { 3, 6 })
    };

    var secondRows = new List<GenericRow>
    {
        new GenericRow(new object[] { 4, 8 }),
        new GenericRow(new object[] { 5, 10 }),
        new GenericRow(new object[] { 6, 12 })
    };

    return (session.CreateDataFrame(firstRows, schema), session.CreateDataFrame(secondRows, schema));
}
/// <summary>
/// Get the <see cref="AssemblyInfo"/> for the "Microsoft.Spark" assembly running
/// on the Spark Driver and make a "best effort" attempt in determining the
/// <see cref="AssemblyInfo"/> of "Microsoft.Spark.Worker"
/// assembly on the Spark Executors.
///
/// There is no guarantee that a Spark Executor will be run on all the nodes in
/// a cluster. To increase the likelihood, the spark conf `spark.executor.instances`
/// and the <paramref name="numPartitions"/> settings should be adjusted to a
/// reasonable number relative to the number of nodes in the Spark cluster.
/// </summary>
/// <param name="session">The <see cref="SparkSession"/></param>
/// <param name="numPartitions">Number of partitions</param>
/// <returns>
/// A <see cref="DataFrame"/> containing the <see cref="AssemblyInfo"/>
/// </returns>
public static DataFrame GetAssemblyInfo(this SparkSession session, int numPartitions = 10)
{
    var schema = new StructType(new StructField[]
    {
        new StructField("AssemblyName", new StringType(), isNullable: false),
        new StructField("AssemblyVersion", new StringType(), isNullable: false),
        new StructField("HostName", new StringType(), isNullable: false)
    });

    // The driver's assembly info is known locally; wrap it in a one-row frame.
    DataFrame driverAssemblyInfoDf = session.CreateDataFrame(
        new GenericRow[] { CreateGenericRow(MicrosoftSparkAssemblyInfo()) },
        schema);

    // Udf evaluated on the executors; the int argument only fans the work out.
    Func<Column, Column> executorAssemblyInfoUdf = Udf<int>(
        i => CreateGenericRow(MicrosoftSparkWorkerAssemblyInfo()),
        schema);

    // Several rows per partition raise the chance that every executor (and
    // therefore every node) evaluates the Udf at least once.
    DataFrame df = session.CreateDataFrame(Enumerable.Range(0, 10 * numPartitions));

    string tempColName = "ExecutorAssemblyInfo";
    DataFrame executorAssemblyInfoDf = df
        .Repartition(numPartitions)
        .WithColumn(tempColName, executorAssemblyInfoUdf(df["_1"]))
        .Select(schema.Fields.Select(f => Col($"{tempColName}.{f.Name}")).ToArray());

    return driverAssemblyInfoDf
        .Union(executorAssemblyInfoDf)
        .DropDuplicates()
        .Sort(schema.Fields.Select(f => Col(f.Name)).ToArray());
}
/// <summary>
/// Exercises every CreateDataFrame overload: with an explicit schema, and the
/// schemaless string/int/double/bool single-column variants.
/// </summary>
public void TestCreateDataFrame()
{
    // Calling CreateDataFrame with schema
    {
        var rows = new List<GenericRow>
        {
            new GenericRow(new object[] { "Alice", 20 }),
            new GenericRow(new object[] { "Bob", 30 })
        };
        var schema = new StructType(new List<StructField>()
        {
            new StructField("Name", new StringType()),
            new StructField("Age", new IntegerType())
        });

        DataFrame df = _spark.CreateDataFrame(rows, schema);
        ValidateDataFrame(df, rows.Select(r => r.Values), schema);
    }

    // Calling CreateDataFrame(IEnumerable<string> _) without schema
    {
        var values = new List<string> { "Alice", "Bob" };
        DataFrame df = _spark.CreateDataFrame(values);
        ValidateDataFrame(df, values.Select(v => new object[] { v }), SchemaWithSingleColumn(new StringType()));
    }

    // Calling CreateDataFrame(IEnumerable<int> _) without schema
    {
        var values = new List<int> { 1, 2 };
        DataFrame df = _spark.CreateDataFrame(values);
        ValidateDataFrame(df, values.Select(v => new object[] { v }), SchemaWithSingleColumn(new IntegerType()));
    }

    // Calling CreateDataFrame(IEnumerable<double> _) without schema
    {
        var values = new List<double> { 1.2, 2.3 };
        DataFrame df = _spark.CreateDataFrame(values);
        ValidateDataFrame(df, values.Select(v => new object[] { v }), SchemaWithSingleColumn(new DoubleType()));
    }

    // Calling CreateDataFrame(IEnumerable<bool> _) without schema
    {
        var values = new List<bool> { true, false };
        DataFrame df = _spark.CreateDataFrame(values);
        ValidateDataFrame(df, values.Select(v => new object[] { v }), SchemaWithSingleColumn(new BooleanType()));
    }
}
/// <summary>
/// A Udf over a timestamp column must be applied to every row, even when the
/// column holds duplicate timestamp values.
/// </summary>
public void TestUdfWithDuplicateTimestamps()
{
    var inputTimestamp = new Timestamp(2020, 1, 1, 0, 0, 0, 0);
    var schema = new StructType(new StructField[]
    {
        new StructField("ts", new TimestampType())
    });
    // Three identical rows.
    var rows = new GenericRow[]
    {
        new GenericRow(new object[] { inputTimestamp }),
        new GenericRow(new object[] { inputTimestamp }),
        new GenericRow(new object[] { inputTimestamp })
    };

    var expectedTimestamp = new Timestamp(1970, 1, 2, 0, 0, 0, 0);
    // Maps every input timestamp to the same fixed value.
    Func<Column, Column> udf = Udf<Timestamp, Timestamp>(
        ts => new Timestamp(1970, 1, 2, 0, 0, 0, 0));

    DataFrame df = _spark.CreateDataFrame(rows, schema);
    Row[] collected = df.Select(udf(df["ts"])).Collect().ToArray();

    Assert.Equal(3, collected.Length);
    foreach (Row row in collected)
    {
        Assert.Single(row.Values);
        Assert.Equal(expectedTimestamp, row.Values[0]);
    }
}
/// <summary>
/// Builds a small sales fixture: item/origin/sales/marketplace, with a few
/// null origins to exercise nullable handling.
/// </summary>
/// <param name="session">Session used to create the DataFrame.</param>
/// <returns>The fixture DataFrame.</returns>
private DataFrame GetTestData(SparkSession session)
{
    var schema = new StructType(new[]
    {
        new StructField("item", new StringType(), false),
        new StructField("origin", new StringType()),
        new StructField("sales", new IntegerType(), false),
        new StructField("marketplace", new StringType(), false)
    });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "item1", "US", 100, "EU" }),
        new GenericRow(new object[] { "item1", "US", 1000, "EU" }),
        new GenericRow(new object[] { "item1", "US", 20, "EU" }),
        new GenericRow(new object[] { "item2", "DE", 20, "EU" }),
        new GenericRow(new object[] { "item2", "DE", 333, "EU" }),
        new GenericRow(new object[] { "item3", null, 12, "EU" }),
        new GenericRow(new object[] { "item4", null, 45, "EU" }),
        new GenericRow(new object[] { "item5", null, 123, "EU" })
    };

    return session.CreateDataFrame(rows, schema);
}
/// <summary>
/// Shares the fixture's SparkSession and prepares a two-row DataFrame with a
/// nullable int, a Date, and a Timestamp column for the Udf tests.
/// </summary>
/// <param name="fixture">Shared Spark test fixture.</param>
public UdfSimpleTypesTests(SparkFixture fixture)
{
    _spark = fixture.Spark;

    var rows = new List<GenericRow>
    {
        // First row exercises a null "age".
        new GenericRow(new object[]
        {
            null,
            new Date(2020, 1, 1),
            new Timestamp(2020, 1, 1, 0, 0, 0, 0)
        }),
        new GenericRow(new object[]
        {
            30,
            new Date(2020, 1, 2),
            new Timestamp(2020, 1, 2, 15, 30, 30, 123456)
        })
    };

    var schema = new StructType(new List<StructField>()
    {
        new StructField("age", new IntegerType()),
        new StructField("date", new DateType()),
        new StructField("time", new TimestampType())
    });

    _df = _spark.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns a six-row string DataFrame whose columns cover the uniqueness
/// spectrum: fully unique, non-unique, unique/non-unique with nulls, and
/// combinations thereof (see column names).
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The uniqueness fixture DataFrame.</returns>
public static DataFrame GetDFWithUniqueColumns(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("unique", new StringType()),
            new StructField("nonUnique", new StringType()),
            new StructField("nonUniqueWithNulls", new StringType()),
            new StructField("uniqueWithNulls", new StringType()),
            new StructField("onlyUniqueWithOtherNonUnique", new StringType()),
            new StructField("halfUniqueCombinedWithNonUnique", new StringType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "1", "0", "3", "1", "5", "0" }),
        new GenericRow(new object[] { "2", "0", "3", "2", "6", "0" }),
        new GenericRow(new object[] { "3", "0", "3", null, "7", "0" }),
        new GenericRow(new object[] { "4", "5", null, "3", "0", "4" }),
        new GenericRow(new object[] { "5", "6", null, "4", "0", "5" }),
        new GenericRow(new object[] { "6", "7", null, "5", "0", "6" })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns integer pairs where a noisy prefix precedes a perfectly correlated
/// tail (att2 == 2 * att1); intended for tests that filter before computing
/// correlation.
/// </summary>
/// <param name="session">Session used to create the DataFrame.</param>
/// <returns>The correlation fixture DataFrame.</returns>
public static DataFrame GetDfWithStrongPositiveCorrelationFilter(SparkSession session)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("att1", new IntegerType()),
            new StructField("att2", new IntegerType())
        });

    var rows = new List<GenericRow>
    {
        // Uncorrelated noise that a filter is expected to remove.
        new GenericRow(new object[] { 65, 64 }),
        new GenericRow(new object[] { 3426, 2634 }),
        new GenericRow(new object[] { 2345, 23434 }),
        new GenericRow(new object[] { 2374, 234 }),
        new GenericRow(new object[] { 767, 2676 }),
        // Perfectly correlated tail.
        new GenericRow(new object[] { 1, 2 }),
        new GenericRow(new object[] { 2, 4 }),
        new GenericRow(new object[] { 3, 6 }),
        new GenericRow(new object[] { 4, 8 }),
        new GenericRow(new object[] { 5, 10 }),
        new GenericRow(new object[] { 6, 12 })
    };

    return session.CreateDataFrame(rows, schema);
}
/// <summary>
/// Smoke-tests the DataFrame APIs introduced in Spark 3.x: ToLocalIterator
/// with prefetch, Observe, Tail, PrintSchema with a level, and the Explain
/// mode overloads.
/// </summary>
public void TestSignaturesV3_X_X()
{
    // Validate ToLocalIterator
    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "Alice", 20 }),
        new GenericRow(new object[] { "Bob", 30 })
    };
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Name", new StringType()),
        new StructField("Age", new IntegerType())
    });
    DataFrame df = _spark.CreateDataFrame(rows, schema);

    IEnumerable<Row> expected = rows.Select(r => new Row(r.Values, schema));
    IEnumerable<Row> actual = df.ToLocalIterator(true).ToArray();
    Assert.Equal(expected, actual);

    Assert.IsType<DataFrame>(df.Observe("metrics", Count("Name").As("CountNames")));

    Assert.IsType<Row[]>(_df.Tail(1).ToArray());

    _df.PrintSchema(1);

    // All Explain modes must be accepted.
    _df.Explain("simple");
    _df.Explain("extended");
    _df.Explain("codegen");
    _df.Explain("cost");
    _df.Explain("formatted");
}
/// <summary>
/// Returns a twelve-row item/att1/att2 fixture with nulls scattered through
/// att1 and att2 for completeness/missing-value tests.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The missing-values fixture DataFrame.</returns>
public static DataFrame GetDFMissing(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("item", new StringType()),
            new StructField("att1", new StringType()),
            new StructField("att2", new StringType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "1", "a", "f" }),
        new GenericRow(new object[] { "2", "b", "d" }),
        new GenericRow(new object[] { "3", null, "f" }),
        new GenericRow(new object[] { "4", "a", null }),
        new GenericRow(new object[] { "5", "a", "f" }),
        new GenericRow(new object[] { "6", null, "d" }),
        new GenericRow(new object[] { "7", null, "d" }),
        new GenericRow(new object[] { "8", "b", null }),
        new GenericRow(new object[] { "9", "a", "f" }),
        new GenericRow(new object[] { "10", null, null }),
        new GenericRow(new object[] { "11", null, "f" }),
        new GenericRow(new object[] { "12", null, "d" })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Round-trips a small DataFrame through Azure Data Lake Gen1 as Parquet.
/// Expects four arguments: tenant id, ADLS account name, service-principal
/// client id, and service-principal client secret.
/// </summary>
/// <param name="args">Command-line arguments (see summary).</param>
static void Main(string[] args)
{
    // Verify environment variables
    if (args.Length != 4)
    {
        Console.Error.WriteLine("Usage: $TENANT_ID $ADLS_NAME $ADLS_SP_CLIENT_ID $ADLS_SP_CLIENT_SECRET");
        Environment.Exit(1);
    }

    // Specify file path in Azure Data Lake Gen1
    string filePath = $"adl://{args[1]}.azuredatalakestore.net/parquet/people.parquet";

    // Create SparkSession, wiring up the ADLS OAuth2 client-credential flow.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Azure Data Lake Storage example using .NET for Apache Spark")
        .Config("fs.adl.impl", "org.apache.hadoop.fs.adl.AdlFileSystem")
        .Config("fs.adl.oauth2.access.token.provider.type", "ClientCredential")
        .Config("fs.adl.oauth2.client.id", args[2])
        .Config("fs.adl.oauth2.credential", args[3])
        .Config("fs.adl.oauth2.refresh.url", $"https://login.microsoftonline.com/{args[0]}/oauth2/token")
        .GetOrCreate();

    // Create sample data
    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "John Doe" }),
        new GenericRow(new object[] { 2, "Jane Doe" }),
        new GenericRow(new object[] { 3, "Foo Bar" })
    };

    // Create schema for sample data
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Id", new IntegerType()),
        new StructField("Name", new StringType()),
    });

    // Create DataFrame using data and schema, then show it.
    DataFrame df = spark.CreateDataFrame(rows, schema);
    df.Show();

    // Write DataFrame to Azure Data Lake Gen1
    df.Write().Mode(SaveMode.Overwrite).Parquet(filePath);

    // Read it back and print, proving the round trip worked.
    DataFrame readDf = spark.Read().Parquet(filePath);
    readDf.Show();
}
/// <summary>
/// Verifies a Udf over an array column works both when the array parameter is
/// typed as int[] and as ArrayList, including a null array element.
/// </summary>
public void TestUdfWithSimpleArrayType()
{
    var schema = new StructType(new StructField[]
    {
        new StructField("name", new StringType()),
        new StructField("ids", new ArrayType(new IntegerType()))
    });
    var rows = new GenericRow[]
    {
        new GenericRow(new object[] { "Name1", new int[] { 1, 2, 3 } }),
        new GenericRow(new object[] { "Name2", null }),
        new GenericRow(new object[] { "Name3", new int[] { 4 } }),
    };
    DataFrame df = _spark.CreateDataFrame(rows, schema);

    var expected = new string[] { "Name1|1,2,3", "Name2", "Name3|4" };

    {
        // Test using array
        Func<Column, Column, Column> udf = Udf<string, int[], string>(
            (name, ids) => ids == null ? name : AppendEnumerable(name, ids));

        Row[] collected = df.Select(udf(df["name"], df["ids"])).Collect().ToArray();
        Assert.Equal(expected, collected.Select(r => r.GetAs<string>(0)));
    }

    {
        // Test using ArrayList
        Func<Column, Column, Column> udf = Udf<string, ArrayList, string>(
            (name, ids) => ids == null ? name : AppendEnumerable(name, ids.ToArray()));

        Row[] collected = df.Select(udf(df["name"], df["ids"])).Collect().ToArray();
        Assert.Equal(expected, collected.Select(r => r.GetAs<string>(0)));
    }
}
/// <summary>
/// End-to-end example: runs a VerificationSuite with an error-level integrity
/// check and a warning-level distribution check over a small product fixture,
/// then dumps the result for inspection.
/// </summary>
public void should_execute_a_basic_example()
{
    var schema = new StructType(new List<StructField>
    {
        new StructField("id", new IntegerType()),
        new StructField("productName", new StringType()),
        new StructField("description", new StringType()),
        new StructField("priority", new StringType()),
        new StructField("numViews", new IntegerType()),
    });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "Thingy A", "awesome thing. http://thingb.com", "high", 0 }),
        new GenericRow(new object[] { 2, "Thingy B", "available at http://thingb.com", null, 0 }),
        new GenericRow(new object[] { 3, null, null, "low", 5 }),
        new GenericRow(new object[] { 4, "Thingy D", "checkout https://thingd.ca", "low", 10 }),
        new GenericRow(new object[] { 5, "Thingy E", null, "high", 12 })
    };

    var data = _session.CreateDataFrame(rows, schema);

    var result = new VerificationSuite()
        .OnData(data)
        .AddCheck(
            new Check(CheckLevel.Error, "integrity checks")
                .HasSize(val => val == 5)
                .IsComplete("id")
                .IsUnique("id")
                .IsComplete("productName")
                .IsContainedIn("priority", new[] { "high", "low" })
                .IsNonNegative("numViews")
        )
        .AddCheck(
            new Check(CheckLevel.Warning, "distribution checks")
                .ContainsURL("description", val => val >= .5)
        )
        .Run();

    result.Debug(_helper.WriteLine);
}
/// <summary>
/// Round-trips a small DataFrame through Azure Blob Storage as JSON.
/// Expects two arguments: storage account name and account key.
/// </summary>
/// <param name="args">Command-line arguments (see summary).</param>
static void Main(string[] args)
{
    // Verify environment variables
    if (args.Length != 2)
    {
        Console.Error.WriteLine("Usage: $AZURE_STORAGE_ACCOUNT $AZURE_STORAGE_KEY");
        Environment.Exit(1);
    }

    // Specify file path in Azure Storage
    string filePath = $"wasbs://dotnet-spark@{args[0]}.blob.core.windows.net/json/people.json";

    // Create SparkSession configured for the wasbs:// filesystem.
    SparkSession spark = SparkSession
        .Builder()
        .AppName("Azure Storage example using .NET for Apache Spark")
        .Config("fs.wasbs.impl", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
        .Config($"fs.azure.account.key.{args[0]}.blob.core.windows.net", args[1])
        .GetOrCreate();

    // Create sample data
    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, "John Doe" }),
        new GenericRow(new object[] { 2, "Jane Doe" }),
        new GenericRow(new object[] { 3, "Foo Bar" })
    };

    // Create schema for sample data
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Id", new IntegerType()),
        new StructField("Name", new StringType()),
    });

    // Create DataFrame using data and schema, then show it.
    DataFrame df = spark.CreateDataFrame(rows, schema);
    df.Show();

    // Write DataFrame to Azure Storage
    df.Write().Mode(SaveMode.Overwrite).Json(filePath);

    // Read it back and print, proving the round trip worked.
    DataFrame readDf = spark.Read().Json(filePath);
    readDf.Show();
}
/// <summary>
/// Demonstrates building a DataFrame from GenericRow instances plus an
/// explicit StructType, then printing its contents and schema.
/// </summary>
/// <param name="spark">Session used to create the DataFrame.</param>
private static void CreateUsingGenericRowAndStructType(SparkSession spark)
{
    Console.WriteLine("spark.CreateDataFrame using StructType");

    var rows = new List<GenericRow>()
    {
        new GenericRow(new object[] { "columnOne Row One", 1.1 }),
        // null is valid here because "column two" is declared nullable below.
        new GenericRow(new object[] { "columnOne Row Two", null }),
        new GenericRow(new object[] { "columnOne Row Three", 3.3 })
    };

    var structType = new StructType(new List<StructField>()
    {
        new StructField("column one", new StringType(), isNullable: false),
        new StructField("column two", new DoubleType(), isNullable: true)
    });

    var dataFrame = spark.CreateDataFrame(rows, structType);

    dataFrame.Show();
    /*
     * +-------------------+----------+
     * |         column one|column two|
     * +-------------------+----------+
     * |  columnOne Row One|       1.1|
     * |  columnOne Row Two|      null|
     * |columnOne Row Three|       3.3|
     * +-------------------+----------+
     */

    dataFrame.PrintSchema();
    /*
     * root
     * |-- column one: string (nullable = false)
     * |-- column two: double (nullable = true)
     */
}
/// <summary>
/// Verifies that nesting the same value through Struct() twice produces the
/// expected row-in-row structure: the top row holds the raw timestamp, a row
/// wrapping it, and a row wrapping that row.
/// </summary>
public void TestWithDuplicatedRows()
{
    var timestamp = new Timestamp(2020, 1, 1, 0, 0, 0, 0);
    var schema = new StructType(new StructField[]
    {
        new StructField("ts", new TimestampType())
    });
    var data = new GenericRow[]
    {
        new GenericRow(new object[] { timestamp })
    };

    DataFrame df = _spark.CreateDataFrame(data, schema);
    Row[] rows = df
        .WithColumn("tsRow", Struct("ts"))
        .WithColumn("tsRowRow", Struct("tsRow"))
        .Collect()
        .ToArray();

    Assert.Single(rows);

    Row row = rows[0];
    Assert.Equal(3, row.Values.Length);
    Assert.Equal(timestamp, row.Values[0]);

    Row tsRow = row.Values[1] as Row;
    Assert.Single(tsRow.Values);
    Assert.Equal(timestamp, tsRow.Values[0]);

    Row tsRowRow = row.Values[2] as Row;
    Assert.Single(tsRowRow.Values);
    // Fixed: expected value (tsRow) comes first in xUnit's Assert.Equal,
    // matching the convention used throughout this test.
    Assert.Equal(tsRow, tsRowRow.Values[0]);
}
/// <summary>
/// Runs a single search term through the same tokenize → hash → IDF pipeline
/// used for the corpus, and attaches the vector norm needed for cosine
/// similarity scoring.
/// </summary>
/// <param name="spark">Session used to create the single-row DataFrame.</param>
/// <param name="searchTerm">Raw search text.</param>
/// <param name="tokenizer">Tokenizer fitted/configured for the corpus.</param>
/// <param name="hashingTF">Term-frequency hasher used for the corpus.</param>
/// <param name="idfModel">IDF model trained on the corpus.</param>
/// <returns>A DataFrame with "features2" and "norm2" columns for the term.</returns>
private static DataFrame GetSearchTermTFIDF(SparkSession spark, string searchTerm,
                                            Tokenizer tokenizer, HashingTF hashingTF, IDFModel idfModel)
{
    // Single-row frame; the default "_1" column is renamed to match the pipeline input.
    var searchTermDataFrame = spark
        .CreateDataFrame(new List<string>() { searchTerm })
        .WithColumnRenamed("_1", "Content");

    var searchWords = tokenizer.Transform(searchTermDataFrame);
    var featurizedSearchTerm = hashingTF.Transform(searchWords);

    // "features2"/"norm2" keep the term's columns distinct from the corpus columns.
    var search = idfModel
        .Transform(featurizedSearchTerm)
        .WithColumnRenamed("features", "features2")
        .WithColumn("norm2", udfCalcNorm(Col("features2")));

    return search;
}
/// <summary>
/// Returns two string rows where att1 holds one parseable fractional value
/// ("1.0") and one non-numeric value ("a"), for type-inference tests.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The fractional-string fixture DataFrame.</returns>
public static DataFrame GetDfFractionalStringTypes(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("item", new StringType()),
            new StructField("att1", new StringType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "1", "1.0" }),
        new GenericRow(new object[] { "2", "a" })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Saving two AnalysisResults and loading them back as a DataFrame must yield
/// one metric row per analyzer per result key, tagged with date and region.
/// </summary>
public void correctly_return_a_DataFrame_of_multiple_AnalysisResults_that_is_formatted_as_expected() =>
    Evaluate(_session, (context, repository) =>
    {
        // Persist the same context under two different dates/region tags.
        repository.Save(new ResultKey(DATE_ONE, new Dictionary<string, string>(REGION_EU)), context);
        repository.Save(new ResultKey(DATE_TWO, new Dictionary<string, string>(REGION_NA)), context);

        DataFrame loadedMetrics = repository.Load()
            .GetSuccessMetricsAsDataFrame(_session, Enumerable.Empty<string>());

        // Every analyzer's metric should appear once per saved result key.
        var expectedRows = new List<GenericRow>
        {
            new GenericRow(new object[] { "Dataset", "*", "Size", 4.0, DATE_ONE, "EU" }),
            new GenericRow(new object[] { "Column", "att1", "Completeness", 1.0, DATE_ONE, "EU" }),
            new GenericRow(new object[] { "Column", "item", "Distinctness", 1.0, DATE_ONE, "EU" }),
            new GenericRow(new object[] { "Multicolumn", "att1,att2", "Uniqueness", 0.25, DATE_ONE, "EU" }),
            new GenericRow(new object[] { "Dataset", "*", "Size", 4.0, DATE_TWO, "NA" }),
            new GenericRow(new object[] { "Column", "att1", "Completeness", 1.0, DATE_TWO, "NA" }),
            new GenericRow(new object[] { "Column", "item", "Distinctness", 1.0, DATE_TWO, "NA" }),
            new GenericRow(new object[] { "Multicolumn", "att1,att2", "Uniqueness", 0.25, DATE_TWO, "NA" })
        };

        StructType expectedSchema = new StructType(
            new List<StructField>
            {
                new StructField("entity", new StringType()),
                new StructField("instance", new StringType()),
                new StructField("name", new StringType()),
                new StructField("value", new DoubleType()),
                new StructField("dataset_date", new LongType()),
                new StructField("region", new StringType())
            });

        DataFrame expected = _session.CreateDataFrame(expectedRows, expectedSchema);

        FixtureSupport.AssertSameRows(loadedMetrics, expected, Option<ITestOutputHelper>.None);
    });
/// <summary>
/// Generates an N-row, three-column string DataFrame whose values encode the
/// row number ("i", "c1-ri", "c2-ri").
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <param name="N">Number of rows to generate.</param>
/// <returns>The generated DataFrame.</returns>
public static DataFrame GetDFWithNRows(SparkSession sparkSession, int N)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("c0", new StringType()),
            new StructField("c1", new StringType()),
            new StructField("c2", new StringType())
        });

    List<GenericRow> rows = Enumerable
        .Range(1, N)
        .Select(i => new GenericRow(new object[] { $"{i}", $"c1-r{i}", $"c2-r{i}" }))
        .ToList();

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns three integer pairs where att2 == att1 + 3, i.e. each column fully
/// determines the other.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The fixture DataFrame.</returns>
public static DataFrame GetDfWithConditionallyInformativeColumns(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("att1", new IntegerType()),
            new StructField("att2", new IntegerType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, 4 }),
        new GenericRow(new object[] { 2, 5 }),
        new GenericRow(new object[] { 3, 6 })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns a single string column whose values range in length from 0 to 4
/// characters, for min/max string-length tests.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The string-length fixture DataFrame.</returns>
public static DataFrame GetDfWithVariableStringLengthValues(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("att1", new StringType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "" }),
        new GenericRow(new object[] { "a" }),
        new GenericRow(new object[] { "bb" }),
        new GenericRow(new object[] { "ccc" }),
        new GenericRow(new object[] { "dddd" })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Validates the Spark 3.x ToLocalIterator(prefetch) overload: the iterated
/// rows must match the rows the DataFrame was built from.
/// </summary>
public void TestSignaturesV3_X_X()
{
    // Validate ToLocalIterator
    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "Alice", 20 }),
        new GenericRow(new object[] { "Bob", 30 })
    };
    var schema = new StructType(new List<StructField>()
    {
        new StructField("Name", new StringType()),
        new StructField("Age", new IntegerType())
    });
    DataFrame df = _spark.CreateDataFrame(rows, schema);

    IEnumerable<Row> expected = rows.Select(r => new Row(r.Values, schema));
    IEnumerable<Row> actual = df.ToLocalIterator(true).ToArray();
    Assert.Equal(expected, actual);
}
/// <summary>
/// Exercises SQLTransformer end to end: statement get/set, Transform and
/// TransformSchema producing the derived v3/v4 columns, save/load round trip,
/// and Uid stability.
/// </summary>
public void TestSQLTransformer()
{
    DataFrame input = _spark.CreateDataFrame(
        new List<GenericRow>
        {
            new GenericRow(new object[] { 0, 1.0, 3.0 }),
            new GenericRow(new object[] { 2, 2.0, 5.0 })
        },
        new StructType(new List<StructField>
        {
            new StructField("id", new IntegerType()),
            new StructField("v1", new DoubleType()),
            new StructField("v2", new DoubleType())
        }));

    string expectedUid = "theUid";
    string inputStatement = "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__";

    SQLTransformer sqlTransformer = new SQLTransformer(expectedUid)
        .SetStatement(inputStatement);
    string outputStatement = sqlTransformer.GetStatement();

    DataFrame output = sqlTransformer.Transform(input);
    StructType outputSchema = sqlTransformer.TransformSchema(input.Schema());

    // Both the transformed frame and the transformed schema must expose v3/v4.
    Assert.Contains(output.Schema().Fields, f => f.Name == "v3");
    Assert.Contains(output.Schema().Fields, f => f.Name == "v4");
    Assert.Contains(outputSchema.Fields, f => f.Name == "v3");
    Assert.Contains(outputSchema.Fields, f => f.Name == "v4");
    Assert.Equal(inputStatement, outputStatement);

    // Save/load must preserve the transformer's Uid.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string savePath = Path.Join(tempDirectory.Path, "SQLTransformer");
        sqlTransformer.Save(savePath);

        SQLTransformer loadedsqlTransformer = SQLTransformer.Load(savePath);
        Assert.Equal(sqlTransformer.Uid(), loadedsqlTransformer.Uid());
    }

    Assert.Equal(expectedUid, sqlTransformer.Uid());
}
/// <summary>
/// Returns six rows over two string columns with repeated values and nulls,
/// for distinctness/entropy tests.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The distinct-values fixture DataFrame.</returns>
public static DataFrame GetDfWithDistinctValues(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("att1", new StringType()),
            new StructField("att2", new StringType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "a", null }),
        new GenericRow(new object[] { "a", null }),
        new GenericRow(new object[] { null, "x" }),
        new GenericRow(new object[] { "b", "x" }),
        new GenericRow(new object[] { "b", "x" }),
        new GenericRow(new object[] { "c", "y" })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns four rows where att1 holds negative integers and att2 negative
/// decimals, both encoded as strings, for non-negativity checks.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The negative-numbers fixture DataFrame.</returns>
public static DataFrame GetDFWithNegativeNumbers(SparkSession sparkSession)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("item", new StringType()),
            new StructField("att1", new StringType()),
            new StructField("att2", new StringType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { "1", "-1", "-1.0" }),
        new GenericRow(new object[] { "2", "-2", "-2.0" }),
        new GenericRow(new object[] { "3", "-3", "-3.0" }),
        new GenericRow(new object[] { "4", "-4", "-4.0" })
    };

    return sparkSession.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns six integer pairs forming a perfectly negatively correlated series
/// (att2 decreases by 2 as att1 increases by 1).
/// </summary>
/// <param name="session">Session used to create the DataFrame.</param>
/// <returns>The negative-correlation fixture DataFrame.</returns>
public static DataFrame GetDfWithStrongNegativeCorrelation(SparkSession session)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("att1", new IntegerType()),
            new StructField("att2", new IntegerType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 1, 12 }),
        new GenericRow(new object[] { 2, 10 }),
        new GenericRow(new object[] { 3, 8 }),
        new GenericRow(new object[] { 4, 6 }),
        new GenericRow(new object[] { 5, 4 }),
        new GenericRow(new object[] { 6, 2 })
    };

    return session.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns six integer pairs with no meaningful linear relationship between
/// att1 and att2, for low-correlation tests.
/// </summary>
/// <param name="session">Session used to create the DataFrame.</param>
/// <returns>The low-correlation fixture DataFrame.</returns>
public static DataFrame GetDfWithLowCorrelation(SparkSession session)
{
    var schema = new StructType(
        new List<StructField>
        {
            new StructField("att1", new IntegerType()),
            new StructField("att2", new IntegerType())
        });

    var rows = new List<GenericRow>
    {
        new GenericRow(new object[] { 12, 8 }),
        new GenericRow(new object[] { 10, 12 }),
        new GenericRow(new object[] { 8, 1 }),
        new GenericRow(new object[] { 6, 30 }),
        new GenericRow(new object[] { 4, 9 }),
        new GenericRow(new object[] { 2, 7 })
    };

    return session.CreateDataFrame(rows, schema);
}
/// <summary>
/// Returns six rows whose att1/att2 columns hold fractional (double) values;
/// "item" is a string row id.
/// </summary>
/// <param name="sparkSession">Session used to create the DataFrame.</param>
/// <returns>The numeric-fractional fixture DataFrame.</returns>
public static DataFrame GetDfWithNumericFractionalValues(SparkSession sparkSession)
{
    List<GenericRow> elements = new List<GenericRow>
    {
        new GenericRow(new object[] { "1", 1.0, 0.0 }),
        new GenericRow(new object[] { "2", 2.0, 0.0 }),
        new GenericRow(new object[] { "3", 3.0, 0.0 }),
        new GenericRow(new object[] { "4", 4.0, 0.0 }),
        new GenericRow(new object[] { "5", 5.0, 0.0 }),
        new GenericRow(new object[] { "6", 6.0, 0.0 })
    };
    // att1/att2 are DoubleType: the row data holds doubles, and declaring
    // IntegerType (as before) mismatched the fractional values this fixture
    // is named for.
    StructType schema = new StructType(
        new List<StructField>
        {
            new StructField("item", new StringType()),
            new StructField("att1", new DoubleType()),
            new StructField("att2", new DoubleType())
        });
    return sparkSession.CreateDataFrame(elements, schema);
}