public void TestWithDuplicateDates()
{
    var date = new Date(2020, 1, 1);
    var schema = new StructType(new StructField[]
    {
        new StructField("date", new DateType())
    });
    var data = new GenericRow[]
    {
        new GenericRow(new object[] { date }),
        new GenericRow(new object[] { date }),
        new GenericRow(new object[] { date })
    };

    DataFrame df = _spark.CreateDataFrame(data, schema);
    Row[] rows = df.Collect().ToArray();

    Assert.Equal(3, rows.Length);
    foreach (Row row in rows)
    {
        Assert.Single(row.Values);
        Assert.Equal(date, row.GetAs<Date>(0));
    }
}
public void TestWithDuplicateTimestamps()
{
    var timestamp = new Timestamp(2020, 1, 1, 0, 0, 0, 0);
    var schema = new StructType(new StructField[]
    {
        new StructField("ts", new TimestampType())
    });
    var data = new GenericRow[]
    {
        new GenericRow(new object[] { timestamp }),
        new GenericRow(new object[] { timestamp }),
        new GenericRow(new object[] { timestamp })
    };

    DataFrame df = _spark.CreateDataFrame(data, schema);
    Row[] rows = df.Collect().ToArray();

    Assert.Equal(3, rows.Length);
    foreach (Row row in rows)
    {
        Assert.Single(row.Values);
        Assert.Equal(timestamp, row.GetAs<Timestamp>(0));
    }
}
public void TestEmailSearchTopNReducerBasics()
{
    // Read the sample data.
    DataFrame df = _spark
        .Read()
        .Schema(
            "Id STRING, DisplayName STRING, GivenName STRING, Surname STRING, " +
            "IMAddress STRING, EmailAddress STRING, RelevanceScore DOUBLE, " +
            "puser STRING, ptenant STRING")
        .Json($"{TestEnvironment.ResourceDirectory}neighbors.json");

    // Trim the "sip:" prefix from the IMAddress column.
    Func<Column, Column> trimIMAddress =
        Udf<string, string>((str) => str.StartsWith("sip:") ? str.Substring(4) : str);
    df = df.WithColumn("IMAddress", trimIMAddress(df["IMAddress"]));

    // Reduce: collect each user's neighbors into per-column lists.
    df = df.GroupBy("puser", "ptenant")
        .Agg(
            CollectList("GivenName").Alias("GivenNames"),
            CollectList("Surname").Alias("Surnames"),
            CollectList("DisplayName").Alias("DisplayNames"),
            CollectList("EmailAddress").Alias("EmailAddresses"),
            CollectList("RelevanceScore").Alias("RelevanceScores"),
            CollectList("IMAddress").Alias("IMAddresses"));

    // Format the output: semicolon-join each list column.
    df = df.Select(
        df["puser"],
        df["ptenant"],
        ConcatWs(";", df["GivenNames"]).Alias("GivenNames"),
        ConcatWs(";", df["Surnames"]).Alias("Surnames"),
        ConcatWs(";", df["DisplayNames"]).Alias("DisplayNames"),
        ConcatWs(";", df["EmailAddresses"]).Alias("EmailAddresses"),
        ConcatWs(";", df["RelevanceScores"]).Alias("RelevanceScores"),
        ConcatWs(";", df["IMAddresses"]).Alias("IMAddresses"));

    Assert.Equal(2, df.Count());
    foreach (Row row in df.Collect())
    {
        string puser = row.GetAs<string>("puser");
        Assert.Equal("MSFT", row.GetAs<string>("ptenant"));
        Assert.Equal("1101.0;900.0;857.0", row.GetAs<string>("RelevanceScores"));
        switch (puser)
        {
            case "ruih":
                Assert.Equal("AliceFN;BobFN;CharlieFN", row.GetAs<string>("GivenNames"));
                Assert.Equal("AliceLN;BobLN;CharlieLN", row.GetAs<string>("Surnames"));
                Assert.Equal(
                    "AliceFN AliceLN;BobFN BobLN;CharlieFN CharlieLN",
                    row.GetAs<string>("DisplayNames"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("EmailAddresses"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("IMAddresses"));
                break;
            case "rui":
                Assert.Equal("DougFN;ElvaFN;FrankFN", row.GetAs<string>("GivenNames"));
                Assert.Equal("DougLN;ElvaLN;FrankLN", row.GetAs<string>("Surnames"));
                Assert.Equal(
                    "DougFN DougLN;ElvaFN ElvaLN;FrankFN FrankLN",
                    row.GetAs<string>("DisplayNames"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("EmailAddresses"));
                Assert.Equal(
                    "[email protected];[email protected];[email protected]",
                    row.GetAs<string>("IMAddresses"));
                break;
            default:
                throw new Exception($"Unexpected puser: {puser}.");
        }
    }
}
private void ValidateDataFrame(
    DataFrame actual,
    IEnumerable<object[]> expectedRows,
    StructType expectedSchema)
{
    Assert.Equal(expectedSchema, actual.Schema());
    Assert.Equal(expectedRows, actual.Collect().Select(r => r.Values));
}
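// A hypothetical caller of the helper above, to show the expected shapes:
// expectedRows mirrors Row.Values (one object[] per row). The test name,
// column name, and values here are illustrative only, not from the suite.
public void TestValidateDataFrameUsage()
{
    var schema = new StructType(new[] { new StructField("id", new IntegerType()) });
    DataFrame df = _spark.CreateDataFrame(
        new GenericRow[]
        {
            new GenericRow(new object[] { 1 }),
            new GenericRow(new object[] { 2 })
        },
        schema);
    ValidateDataFrame(df, new[] { new object[] { 1 }, new object[] { 2 } }, schema);
}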
public void TestCollect()
{
    Row[] rows = _df.Collect().ToArray();
    Assert.Equal(3, rows.Length);

    Row row1 = rows[0];
    Assert.Equal("Michael", row1.GetAs<string>("name"));
    Assert.Null(row1.Get("age"));

    Row row2 = rows[1];
    Assert.Equal("Andy", row2.GetAs<string>("name"));
    Assert.Equal(30, row2.GetAs<int>("age"));

    Row row3 = rows[2];
    Assert.Equal("Justin", row3.GetAs<string>("name"));
    Assert.Equal(19, row3.GetAs<int>("age"));
}
public void TestWithColumn()
{
    Func<Column, Column> sizeNameAgeUdf = Udf<Row, string>(
        r =>
        {
            string name = r.GetAs<string>("name");
            int? age = r.GetAs<int?>("age");
            if (age.HasValue)
            {
                return $"{r.Size()},{name},{age.Value}";
            }

            return $"{r.Size()},{name},{string.Empty}";
        });

    string[] allCols = _df.Columns().ToArray();
    DataFrame nameAgeColDF =
        _df.WithColumn("NameAgeCol", Struct(allCols[0], allCols.Skip(1).ToArray()));
    DataFrame sizeNameAgeColDF =
        nameAgeColDF.WithColumn("SizeNameAgeCol", sizeNameAgeUdf(nameAgeColDF["NameAgeCol"]));

    Row[] originalDFRows = _df.Collect().ToArray();
    Assert.Equal(3, originalDFRows.Length);

    Row[] sizeNameAgeColDFRows = sizeNameAgeColDF.Collect().ToArray();
    Assert.Equal(3, sizeNameAgeColDFRows.Length);

    {
        Row row = sizeNameAgeColDFRows[0];
        Assert.Equal("Michael", row.GetAs<string>("name"));
        Assert.Null(row.Get("age"));
        Assert.IsType<Row>(row.Get("NameAgeCol"));
        Assert.Equal(originalDFRows[0], row.GetAs<Row>("NameAgeCol"));
        Assert.Equal("2,Michael,", row.GetAs<string>("SizeNameAgeCol"));
    }

    {
        Row row = sizeNameAgeColDFRows[1];
        Assert.Equal("Andy", row.GetAs<string>("name"));
        Assert.Equal(30, row.GetAs<int>("age"));
        Assert.IsType<Row>(row.Get("NameAgeCol"));
        Assert.Equal(originalDFRows[1], row.GetAs<Row>("NameAgeCol"));
        Assert.Equal("2,Andy,30", row.GetAs<string>("SizeNameAgeCol"));
    }

    {
        Row row = sizeNameAgeColDFRows[2];
        Assert.Equal("Justin", row.GetAs<string>("name"));
        Assert.Equal(19, row.GetAs<int>("age"));
        Assert.IsType<Row>(row.Get("NameAgeCol"));
        Assert.Equal(originalDFRows[2], row.GetAs<Row>("NameAgeCol"));
        Assert.Equal("2,Justin,19", row.GetAs<string>("SizeNameAgeCol"));
    }
}
public void TestVersion()
{
    DataFrame versionDf = _spark.GetAssemblyInfo();
    Row[] versionRows = versionDf.Collect().ToArray();

    Assert.Equal(2, versionRows.Length);
    Assert.Equal(
        new string[] { "Microsoft.Spark", "Microsoft.Spark.Worker" },
        versionRows.Select(r => r.GetAs<string>("AssemblyName")));
    for (int i = 0; i < 2; ++i)
    {
        Assert.False(
            string.IsNullOrWhiteSpace(versionRows[i].GetAs<string>("AssemblyVersion")));
        Assert.False(
            string.IsNullOrWhiteSpace(versionRows[i].GetAs<string>("HostName")));
    }
}
private void AssertSameRows(DataFrame dataFrameA, DataFrame dataFrameB)
{
    // Materialize both DataFrames once; repeatedly enumerating Collect()
    // from the start for each row would re-trigger the Spark job.
    Row[] rowsA = dataFrameA.Collect().ToArray();
    Row[] rowsB = dataFrameB.Collect().ToArray();

    for (int i = 0; i < rowsA.Length; i++)
    {
        Row rowA = rowsA[i];
        Row rowB = rowsB[i];
        _helper.WriteLine($"Computed - {rowA}");
        _helper.WriteLine($"Expected - {rowB}");

        int columnSize = rowA.Size();
        for (int j = 0; j < columnSize; j++)
        {
            // ShouldBe comes from the Shouldly assertion library.
            rowA[j].ShouldBe(rowB[j]);
        }
    }
}
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() .AppName(".NET Spark SQL basic example") .Config("spark.some.config.option", "some-value") .GetOrCreate(); // Need to explicitly specify the schema since pickling vs. arrow formatting // will return different types. Pickling will turn longs into ints if the values fit. // Same as the "age INT, name STRING" DDL-format string. var inputSchema = new StructType(new[] { new StructField("age", new IntegerType()), new StructField("name", new StringType()) }); DataFrame df = spark.Read().Schema(inputSchema).Json(args[0]); Spark.Sql.Types.StructType schema = df.Schema(); Console.WriteLine(schema.SimpleString); IEnumerable <Row> rows = df.Collect(); foreach (Row row in rows) { Console.WriteLine(row); } df.Show(); df.PrintSchema(); df.Select("name", "age", "age", "name").Show(); df.Select(df["name"], df["age"] + 1).Show(); df.Filter(df["age"] > 21).Show(); df.GroupBy("age") .Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"])) .Show(); df.CreateOrReplaceTempView("people"); // Registering Udf for SQL expression. DataFrame sqlDf = spark.Sql("SELECT * FROM people"); sqlDf.Show(); spark.Udf().Register <int?, string, string>( "my_udf", (age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null")); sqlDf = spark.Sql("SELECT my_udf(*) FROM people"); sqlDf.Show(); // Using UDF via data frames. Func <Column, Column, Column> addition = Udf <int?, string, string>( (age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0)); df.Select(addition(df["age"], df["name"])).Show(); // Chaining example: Func <Column, Column> addition2 = Udf <string, string>(str => $"hello {str}!"); df.Select(addition2(addition(df["age"], df["name"]))).Show(); // Multiple UDF example: df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show(); // UDF return type as array. Func <Column, Column> udfArray = Udf <string, string[]>((str) => new string[] { str, str + str }); df.Select(Explode(udfArray(df["name"]))).Show(); // UDF return type as map. Func <Column, Column> udfMap = Udf <string, IDictionary <string, string[]> >( (str) => new Dictionary <string, string[]> { { str, new[] { str, str } } }); df.Select(udfMap(df["name"]).As("UdfMap")).Show(truncate: 50); // Joins. DataFrame joinedDf = df.Join(df, "name"); joinedDf.Show(); DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" }); joinedDf2.Show(); DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer"); joinedDf3.Show(); spark.Stop(); }
private string[] ToStringArray(DataFrame df)
{
    Row[] rows = df.Collect().ToArray();
    return rows.Select(row => row[0].ToString()).ToArray();
}
private void TestAndValidateForeach(
    string streamInputPath,
    TestForeachWriter foreachWriter,
    int expectedCSVFiles,
    int expectedExceptionFiles,
    IEnumerable<int> expectedOutput)
{
    // Temporary folder the TestForeachWriter will write to.
    using var dstTempDirectory = new TemporaryDirectory();
    foreachWriter.WritePath = dstTempDirectory.Path;

    // Read streamInputPath, repartition the data, then
    // run TestForeachWriter on it.
    DataStreamWriter dsw = _spark
        .ReadStream()
        .Schema(new StructType(new[]
        {
            new StructField("id", new IntegerType()),
            new StructField("idStr", new StringType()),
            new StructField("idAndIdStr", new StructType(new[]
            {
                new StructField("id", new IntegerType()),
                new StructField("idStr", new StringType())
            }))
        }))
        .Json(streamInputPath)
        .Repartition(expectedCSVFiles)
        .WriteStream()
        .Foreach(foreachWriter);

    // Trigger the stream batch once.
    if (expectedExceptionFiles > 0)
    {
        Assert.Throws<Exception>(
            () => dsw.Trigger(Trigger.Once()).Start().AwaitTermination());
    }
    else
    {
        dsw.Trigger(Trigger.Once()).Start().AwaitTermination();
    }

    // Verify that TestForeachWriter created a unique .csv when
    // ForeachWriter.Open was called on each partitionId.
    Assert.Equal(
        expectedCSVFiles,
        Directory.GetFiles(dstTempDirectory.Path, "*.csv").Length);

    // Only if ForeachWriter.Process(Row) throws an exception will
    // ForeachWriter.Close(Exception) create a file with the
    // .exception extension.
    Assert.Equal(
        expectedExceptionFiles,
        Directory.GetFiles(dstTempDirectory.Path, "*.exception").Length);

    // Read in the *.csv file(s) generated by the TestForeachWriter.
    // If there are multiple input files, sorting by "id" makes
    // validation simpler. The *.csv contents are only populated
    // on successful calls to ForeachWriter.Process.
    DataFrame foreachWriterOutputDF = _spark
        .Read()
        .Schema("id INT")
        .Csv(dstTempDirectory.Path)
        .Sort("id");

    // Validate the expected *.csv data.
    Assert.Equal(
        expectedOutput.Select(i => new object[] { i }),
        foreachWriterOutputDF.Collect().Select(r => r.Values));
}
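// A minimal sketch of the TestForeachWriter consumed by the helper above,
// inferred from how the test uses it; the real implementation in the test
// suite may differ. Assumptions: Open creates one *.csv per partition under
// WritePath, Process appends the row's "id", and Close drops a *.exception
// marker file alongside the .csv when Process threw.
[Serializable]
internal class TestForeachWriter : IForeachWriter
{
    internal string WritePath { get; set; }

    private StreamWriter _streamWriter;
    private string _csvPath;

    public bool Open(long partitionId, long epochId)
    {
        // One output file per (partition, epoch); illustrative naming.
        _csvPath = Path.Combine(WritePath, $"sink-{partitionId}-{epochId}.csv");
        _streamWriter = new StreamWriter(_csvPath);
        return true;
    }

    public void Process(Row row)
    {
        _streamWriter.WriteLine(row.GetAs<int>("id"));
    }

    public void Close(Exception errorOrNull)
    {
        _streamWriter?.Dispose();
        if (errorOrNull != null)
        {
            // Leave the (possibly partial) .csv in place and create a
            // marker file so the test can count failed partitions.
            File.Create(Path.ChangeExtension(_csvPath, ".exception")).Dispose();
        }
    }
}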
public void TestForeachBatch()
{
    // Temporary folder for the test stream input.
    using var srcTempDirectory = new TemporaryDirectory();

    // Temporary folder for the ForeachBatch output.
    using var dstTempDirectory = new TemporaryDirectory();

    Func<Column, Column> outerUdf = Udf<int, int>(i => i + 100);

    // id column: [0, 1, ..., 9]
    WriteCsv(0, 10, Path.Combine(srcTempDirectory.Path, "input1.csv"));

    DataStreamWriter dsw = _spark
        .ReadStream()
        .Schema("id INT")
        .Csv(srcTempDirectory.Path)
        .WriteStream()
        .ForeachBatch((df, id) =>
        {
            Func<Column, Column> innerUdf = Udf<int, int>(i => i + 200);
            df.Select(outerUdf(innerUdf(Col("id"))))
                .Write()
                .Csv(Path.Combine(dstTempDirectory.Path, id.ToString()));
        });

    StreamingQuery sq = dsw.Start();

    // Process until all available data in the source has been processed and
    // committed to the ForeachBatch sink.
    sq.ProcessAllAvailable();

    // Add a new file to the source path. The stream will read any new files
    // added there.
    // id column: [10, 11, ..., 19]
    WriteCsv(10, 10, Path.Combine(srcTempDirectory.Path, "input2.csv"));

    // Process until all available data in the source has been processed and
    // committed to the ForeachBatch sink.
    sq.ProcessAllAvailable();
    sq.Stop();

    // Verify the folders in the destination path, one per batch id.
    string[] csvPaths =
        Directory.GetDirectories(dstTempDirectory.Path).OrderBy(s => s).ToArray();
    var expectedPaths = new string[]
    {
        Path.Combine(dstTempDirectory.Path, "0"),
        Path.Combine(dstTempDirectory.Path, "1")
    };
    Assert.True(expectedPaths.SequenceEqual(csvPaths));

    // Read the generated csv paths and verify the contents:
    // outerUdf(innerUdf(id)) adds 300 to each id in [0, 20).
    DataFrame df = _spark
        .Read()
        .Schema("id INT")
        .Csv(csvPaths[0], csvPaths[1])
        .Sort("id");

    IEnumerable<int> actualIds = df.Collect().Select(r => r.GetAs<int>("id"));
    Assert.True(Enumerable.Range(300, 20).SequenceEqual(actualIds));
}
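// A plausible definition of the WriteCsv helper used above, assuming it
// writes `count` consecutive integers starting at `start`, one per line,
// so the files parse against the "id INT" schema.
private void WriteCsv(int start, int count, string path)
{
    using var streamWriter = new StreamWriter(path);
    foreach (int i in Enumerable.Range(start, count))
    {
        streamWriter.WriteLine(i);
    }
}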
public void TestDataFrameCollect()
{
    string jsonSchema = @"
        {
          ""type"" : ""struct"",
          ""fields"" : [ {
            ""name"" : ""address"",
            ""type"" : {
              ""type"" : ""struct"",
              ""fields"" : [ {
                ""name"" : ""city"",
                ""type"" : ""string"",
                ""nullable"" : true,
                ""metadata"" : { }
              }, {
                ""name"" : ""state"",
                ""type"" : ""string"",
                ""nullable"" : true,
                ""metadata"" : { }
              } ]
            },
            ""nullable"" : true,
            ""metadata"" : { }
          }, {
            ""name"" : ""age"",
            ""type"" : ""long"",
            ""nullable"" : true,
            ""metadata"" : { }
          }, {
            ""name"" : ""id"",
            ""type"" : ""string"",
            ""nullable"" : true,
            ""metadata"" : { }
          }, {
            ""name"" : ""name"",
            ""type"" : ""string"",
            ""nullable"" : true,
            ""metadata"" : { }
          } ]
        }";

    int localPort = 4000;
    object row1 = new object[]
    {
        new object[] { "Columbus", "Ohio" },
        34,
        "123",
        "Bill"
    };
    object row2 = new object[]
    {
        new object[] { "Seattle", "Washington" },
        43,
        "789",
        "Bill"
    };

    IStructTypeProxy structTypeProxy = new MockStructTypeProxy(jsonSchema);
    IDataFrameProxy dataFrameProxy = new MockDataFrameProxy(
        localPort,
        new List<object>() { row1, row2 },
        structTypeProxy);
    DataFrame dataFrame = new DataFrame(dataFrameProxy, null);

    var rows = new List<Row>();
    foreach (var row in dataFrame.Collect())
    {
        rows.Add(row);
        Console.WriteLine("{0}", row);
    }

    Assert.AreEqual(2, rows.Count);

    Row firstRow = rows[0];
    string id = firstRow.GetAs<string>("id");
    Assert.AreEqual("123", id);

    string name = firstRow.GetAs<string>("name");
    Assert.AreEqual("Bill", name);

    int age = firstRow.GetAs<int>("age");
    Assert.AreEqual(34, age);

    Row address = firstRow.GetAs<Row>("address");
    Assert.IsNotNull(address);

    string city = address.GetAs<string>("city");
    Assert.AreEqual("Columbus", city);

    string state = address.GetAs<string>("state");
    Assert.AreEqual("Ohio", state);
}
public void TestEmailSearchSuccessActionReducerBasics()
{
    // Read the sample data.
    DataFrame df =
        _spark.Read().Json($"{TestEnvironment.ResourceDirectory}search_actions.json");

    // Select the required columns.
    df = df.Select(
        "ImpressionId",
        "ConversationId",
        "EntityType",
        "FolderIdList",
        "ReferenceIdList",
        "ItemIdList",
        "ItemImmutableIdList");

    // Convert the columns of semicolon-concatenated strings to arrays of strings.
    Func<Column, Column> toStringArrayUdf = Udf<string, string[]>((str) => str.Split(';'));
    df = df.WithColumn("FolderIdList", toStringArrayUdf(df["FolderIdList"]))
        .WithColumn("ReferenceIdList", toStringArrayUdf(df["ReferenceIdList"]))
        .WithColumn("ItemIdList", toStringArrayUdf(df["ItemIdList"]))
        .WithColumn("ItemImmutableIdList", toStringArrayUdf(df["ItemImmutableIdList"]));

    // Apply ArraysZip to combine the i-th elements of each array.
    df = df.Select(
        df["ConversationId"],
        df["ImpressionId"],
        df["EntityType"],
        ArraysZip(
            df["FolderIdList"],
            df["ReferenceIdList"],
            df["ItemIdList"],
            df["ItemImmutableIdList"]).Alias("ZippedColumn"));

    // Apply Explode to split the zipped array into multiple rows.
    df = df.Select(
        df["ConversationId"],
        df["ImpressionId"],
        df["EntityType"],
        Explode(df["ZippedColumn"]).Alias("NewColumn"));

    // Create one column per zipped field.
    df = df.WithColumn("FolderId", df["NewColumn"].GetField("FolderIdList"))
        .WithColumn("ReferenceId", df["NewColumn"].GetField("ReferenceIdList"))
        .WithColumn("ItemId", df["NewColumn"].GetField("ItemIdList"))
        .WithColumn("ItemImmutableId", df["NewColumn"].GetField("ItemImmutableIdList"))
        .Select(
            "ConversationId",
            "ImpressionId",
            "EntityType",
            "FolderId",
            "ItemId",
            "ReferenceId",
            "ItemImmutableId");

    // Check the results.
    Assert.Equal(3, df.Count());
    int i = 0;
    foreach (Row row in df.Collect())
    {
        string impressionId = row.GetAs<string>("ImpressionId");
        string conversationId = row.GetAs<string>("ConversationId");
        string entityType = row.GetAs<string>("EntityType");
        Assert.Equal("Imp1", impressionId);
        Assert.Equal("DD8A6B40-B4C9-426F-8194-895E9053077C", conversationId);
        Assert.Equal("Message", entityType);

        string folderId = row.GetAs<string>("FolderId");
        string itemId = row.GetAs<string>("ItemId");
        string referenceId = row.GetAs<string>("ReferenceId");
        string itemImmutableId = row.GetAs<string>("ItemImmutableId");
        if (i == 0)
        {
            Assert.Equal("F1", folderId);
            Assert.Equal("ItemId1", itemId);
            Assert.Equal("R1", referenceId);
            Assert.Equal("ItemImmutableId1", itemImmutableId);
        }
        else if (i == 1)
        {
            Assert.Equal("F2", folderId);
            Assert.Equal("ItemId2", itemId);
            Assert.Equal("R2", referenceId);
            Assert.Equal("ItemImmutableId2", itemImmutableId);
        }
        else if (i == 2)
        {
            Assert.Equal("F3", folderId);
            Assert.Equal("ItemId3", itemId);
            Assert.Equal("R3", referenceId);
            Assert.Equal("ItemImmutableId3", itemImmutableId);
        }
        else
        {
            throw new Exception(
                $"Unexpected row: ConversationId={conversationId}, ImpressionId={impressionId}");
        }

        i++;
    }
}
public void Run(string[] args) { if (args.Length != 1) { Console.Error.WriteLine( "Usage: Logging <path to Apache User Logs>"); Environment.Exit(1); } SparkSession spark = SparkSession .Builder() .AppName("Apache User Log Processing") .GetOrCreate(); // Read input log file and display it DataFrame df = spark.Read().Text(args[0]); df.Show(); // Step 1: UDF to determine if each line is a valid log entry // Remove any invalid entries before further filtering spark.Udf().Register <string, bool>( "GeneralReg", log => Regex.IsMatch(log, s_apacheRx)); df.CreateOrReplaceTempView("Logs"); // Apply the UDF to get valid log entries DataFrame generalDf = spark.Sql( "SELECT logs.value, GeneralReg(logs.value) FROM Logs"); // Only keep log entries that matched the reg ex generalDf = generalDf.Filter(generalDf["GeneralReg(value)"]); generalDf.Show(); // View the resulting schema // Notice we created a new column "GeneralReg(value)" generalDf.PrintSchema(); // Step 2: Choose valid log entries that start with 10 spark.Udf().Register <string, bool>( "IPReg", log => Regex.IsMatch(log, "^(?=10)")); generalDf.CreateOrReplaceTempView("IPLogs"); // Apply UDF to get valid log entries starting with 10 // Use SQL "WHERE" rather than doing ipDf.Filter(), // which avoids creating an extra column "IPReg(value)" DataFrame ipDf = spark.Sql( "SELECT iplogs.value FROM IPLogs WHERE IPReg(iplogs.value)"); ipDf.Show(); // Step 3: Choose valid log entries that start // with 10 and deal with spam spark.Udf().Register <string, bool>( "SpamRegEx", log => Regex.IsMatch(log, "\\b(?=spam)\\b")); ipDf.CreateOrReplaceTempView("SpamLogs"); // Apply UDF to get valid, start with 10, spam entries DataFrame spamDF = spark.Sql( "SELECT spamlogs.value FROM SpamLogs WHERE SpamRegEx(spamlogs.value)"); // Let's explore the columns in the data we have filtered // Use LINQ to count the number of GET requests int numGetRequests = spamDF .Collect() .Where(r => ContainsGet(r.GetAs <string>("value"))) .Count(); Console.WriteLine("Number of GET requests: " + numGetRequests); spark.Stop(); }