public void TestWithColumn()
{
    // UDF that flattens a struct row into "<size>,<name>,<age>" (empty age
    // when the column is null).
    Func<Column, Column> sizeNameAgeUdf = Udf<Row, string>(
        r =>
        {
            string name = r.GetAs<string>("name");
            int? age = r.GetAs<int?>("age");
            string ageText = age.HasValue ? age.Value.ToString() : string.Empty;
            return $"{r.Size()},{name},{ageText}";
        });

    // Pack every existing column into a single struct column, then derive
    // the flattened string column from it via the UDF.
    string[] allCols = _df.Columns().ToArray();
    DataFrame nameAgeColDF =
        _df.WithColumn("NameAgeCol", Struct(allCols[0], allCols.Skip(1).ToArray()));
    DataFrame sizeNameAgeColDF = nameAgeColDF.WithColumn(
        "SizeNameAgeCol",
        sizeNameAgeUdf(nameAgeColDF["NameAgeCol"]));

    Row[] originalDFRows = _df.Collect().ToArray();
    Assert.Equal(3, originalDFRows.Length);

    Row[] sizeNameAgeColDFRows = sizeNameAgeColDF.Collect().ToArray();
    Assert.Equal(3, sizeNameAgeColDFRows.Length);

    // Expected (name, age, flattened) per row; a null age means the source
    // column was null for that row.
    var expectedRows = new (string Name, int? Age, string Combined)[]
    {
        ("Michael", null, "2,Michael,"),
        ("Andy", 30, "2,Andy,30"),
        ("Justin", 19, "2,Justin,19"),
    };

    for (int i = 0; i < expectedRows.Length; ++i)
    {
        Row row = sizeNameAgeColDFRows[i];
        Assert.Equal(expectedRows[i].Name, row.GetAs<string>("name"));
        if (expectedRows[i].Age.HasValue)
        {
            Assert.Equal(expectedRows[i].Age.Value, row.GetAs<int>("age"));
        }
        else
        {
            Assert.Null(row.Get("age"));
        }

        // The struct column must round-trip as a Row equal to the original.
        Assert.IsType<Row>(row.Get("NameAgeCol"));
        Assert.Equal(originalDFRows[i], row.GetAs<Row>("NameAgeCol"));
        Assert.Equal(expectedRows[i].Combined, row.GetAs<string>("SizeNameAgeCol"));
    }
}
/// <summary>
/// Asserts that every row of <paramref name="dataFrameA"/> appears in
/// <paramref name="dataFrameB"/>, comparing rows by their cell values with
/// both frames projected onto the same (descending) column order.
/// </summary>
/// <param name="dataFrameA">The computed DataFrame.</param>
/// <param name="dataFrameB">The expected DataFrame.</param>
/// <param name="helper">Optional test-output sink for logging computed rows.</param>
public static void AssertSameRows(DataFrame dataFrameA, DataFrame dataFrameB, Option<ITestOutputHelper> helper)
{
    // Order both frames' columns identically so rows are comparable
    // regardless of each frame's original column order.
    Column[] dfAOrderedColumns = dataFrameA
        .Columns()
        .OrderByDescending(val => val)
        .Select(Column)
        .ToArray();
    Column[] dfBOrderedColumns = dataFrameB
        .Columns()
        .OrderByDescending(val => val)
        .Select(Column)
        .ToArray();

    // Materialize once: Collect() is deferred, so re-enumerating the
    // sequence would re-execute the Spark job on every pass.
    Row[] dfASeq = dataFrameA.Select(dfAOrderedColumns).Collect().ToArray();
    Row[] dfBSeq = dataFrameB.Select(dfBOrderedColumns).Collect().ToArray();

    foreach (Row rowA in dfASeq)
    {
        if (helper.HasValue)
        {
            helper.Value.WriteLine($"Computed - {rowA}");
        }
    }

    // Compare rows by their joined cell values. The original compared
    // row.Values.ToString(), which on an object[] returns the type name
    // ("System.Object[]") and made the assertion always pass. The expected
    // side is also built once instead of re-projected per iteration.
    List<string> dfBValues =
        dfBSeq.Select(row => string.Join(",", row.Values)).ToList();
    foreach (Row rowA in dfASeq)
    {
        dfBValues.ShouldContain(string.Join(",", rowA.Values));
    }
}
/// <summary>
/// Converts a tag name into a safe, lower-case DataFrame column name,
/// stripping every character that is not alphanumeric or '_'. If the
/// resulting name already exists in <paramref name="dataFrame"/>, a "_2"
/// suffix is appended to avoid the clash.
/// </summary>
/// <param name="tagName">The raw tag name to sanitize.</param>
/// <param name="dataFrame">The DataFrame whose columns must not be shadowed.</param>
/// <returns>The sanitized, collision-free column name.</returns>
private static string FormatTagColumnNameInDataFrame(string tagName, DataFrame dataFrame)
{
    // BUG FIX: the original used string.Replace, which performs a literal
    // (non-regex) match, so the pattern "[^A-Za-z0-9_]" never matched and
    // no characters were ever stripped. Regex.Replace applies the pattern
    // as intended. Fully qualified to avoid requiring a new using directive.
    string tagColumnName = System.Text.RegularExpressions.Regex
        .Replace(tagName, "[^A-Za-z0-9_]", string.Empty)
        .ToLowerInvariant();

    if (dataFrame.Columns().Contains(tagColumnName))
    {
        tagColumnName += "_2";
    }

    return tagColumnName;
}
/// <summary>
/// Unions two DataFrames whose column sets may differ, by first projecting
/// each frame onto the combined, distinct set of both frames' columns
/// (presumably WithAllColumns fills in the columns a frame is missing —
/// its definition is outside this view) so Union aligns positionally.
/// </summary>
/// <param name="dataFrameOne">The first DataFrame.</param>
/// <param name="dataFrameTwo">The second DataFrame.</param>
/// <returns>The union of both frames over the combined column set.</returns>
private DataFrame DataFrameUnion(DataFrame dataFrameOne, DataFrame dataFrameTwo)
{
    string[] columnsOne = dataFrameOne.Columns().ToArray();
    string[] columnsTwo = dataFrameTwo.Columns().ToArray();

    // Materialize the distinct column set once. Distinct() is deferred, and
    // the original enumerated the query twice (once per Select below).
    string[] columnTotal = columnsOne.Concat(columnsTwo).Distinct().ToArray();

    return dataFrameOne
        .Select(WithAllColumns(columnsOne, columnTotal).ToArray())
        .Union(dataFrameTwo.Select(WithAllColumns(columnsTwo, columnTotal)));
}
public void TestIDFModel()
{
    const string expectedInputCol = "rawFeatures";
    const string expectedOutputCol = "features";
    const int expectedDocFrequency = 1980;

    DataFrame sentenceData =
        _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    // Tokenize the sentence, then hash the tokens into a fixed-width
    // term-frequency vector that IDF will be fitted on.
    DataFrame wordsData = new Tokenizer()
        .SetInputCol("sentence")
        .SetOutputCol("words")
        .Transform(sentenceData);

    DataFrame featurizedData = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol(expectedInputCol)
        .SetNumFeatures(20)
        .Transform(wordsData);

    IDF idf = new IDF()
        .SetInputCol(expectedInputCol)
        .SetOutputCol(expectedOutputCol)
        .SetMinDocFreq(expectedDocFrequency);

    IDFModel idfModel = idf.Fit(featurizedData);
    DataFrame rescaledData = idfModel.Transform(featurizedData);

    // The fitted model must expose the configured columns/params and emit
    // the output column when applied.
    Assert.Contains(expectedOutputCol, rescaledData.Columns());
    Assert.Equal(expectedInputCol, idfModel.GetInputCol());
    Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
    Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

    // Round-trip the fitted model through disk; its Uid must survive.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string modelPath = Path.Join(tempDirectory.Path, "idfModel");
        idfModel.Save(modelPath);

        IDFModel loadedModel = IDFModel.Load(modelPath);
        Assert.Equal(idfModel.Uid(), loadedModel.Uid());
    }

    TestFeatureBase(idfModel, "minDocFreq", 1000);
}
public void TestHashingTF()
{
    const string expectedInputCol = "input_col";
    const string expectedOutputCol = "output_col";
    const int expectedFeatures = 10;

    // The parameterless constructor must produce a HashingTF as well.
    Assert.IsType<HashingTF>(new HashingTF());

    HashingTF hashingTf = new HashingTF("my-unique-id")
        .SetNumFeatures(expectedFeatures)
        .SetInputCol(expectedInputCol)
        .SetOutputCol(expectedOutputCol);

    // Every setter above must be readable back through its getter.
    Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures());
    Assert.Equal(expectedInputCol, hashingTf.GetInputCol());
    Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol());

    // Transform a literal array column and verify the configured output
    // column appears in the projected result.
    DataFrame input = _spark.Sql(
        "SELECT array('this', 'is', 'a', 'string', 'a', 'a') as input_col");
    DataFrame outputVector = hashingTf.Transform(input).Select(expectedOutputCol);
    Assert.Contains(expectedOutputCol, outputVector.Columns());

    // Round-trip the transformer through disk; its Uid must survive.
    using (var tempDirectory = new TemporaryDirectory())
    {
        string savePath = Path.Join(tempDirectory.Path, "hashingTF");
        hashingTf.Save(savePath);

        HashingTF loadedHashingTf = HashingTF.Load(savePath);
        Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
    }

    hashingTf.SetBinary(true);
    Assert.True(hashingTf.GetBinary());

    TestFeatureBase(hashingTf, "numFeatures", 1000);
}
/// <summary>
/// Verifies that DataFrame.Columns() returns the field names reported by
/// the underlying schema proxy.
/// </summary>
public void TestColumns()
{
    // Arrange: wire the mock proxy chain so GetSchema() yields a schema
    // with a single field named "column1".
    // (The unused local 'expectedResultDataFrameProxy' from the original
    // has been removed — it was never referenced.)
    const string columnName = "column1";
    var mockSchemaProxy = new Mock<IStructTypeProxy>();
    var mockFieldProxy = new Mock<IStructFieldProxy>();
    mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
    mockSchemaProxy
        .Setup(m => m.GetStructTypeFields())
        .Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
    mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
    var sc = new SparkContext(null);

    // Act
    var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
    var actualColumns = originalDataFrame.Columns();

    // Assert
    CollectionAssert.AreEqual(new[] { columnName }, actualColumns.ToArray());
}
// Smoke test: exercises every DataFrame API surface available in Spark 2.3.x
// to verify the interop signatures resolve and execute without throwing.
// Return values are intentionally discarded — only Assert.* lines check state.
public void TestSignaturesV2_3_X()
{
    // Column indexer access.
    Column col = _df["name"];
    col = _df["age"];

    // ToDF with and without renamed columns.
    DataFrame df = _df.ToDF();
    df = df.ToDF("name2", "age2");

    // Schema inspection and explain/debug output.
    StructType schema = _df.Schema();
    Assert.NotNull(schema);
    _df.PrintSchema();
    _df.Explain();
    _df.Explain(true);
    _df.Explain(false);
    Assert.Equal(2, _df.Columns().ToArray().Length);
    _df.IsLocal();
    _df.IsStreaming();

    // Checkpointing variants (require a checkpoint directory).
    using (var tempDir = new TemporaryDirectory())
    {
        // The following is required for *CheckPoint().
        _spark.SparkContext.SetCheckpointDir(tempDir.Path);
        _df.Checkpoint();
        _df.Checkpoint(false);
        _df.LocalCheckpoint();
        _df.LocalCheckpoint(false);
    }

    _df.WithWatermark("time", "10 minutes");

    // Show overloads.
    _df.Show();
    _df.Show(10);
    _df.Show(10, 10);
    _df.Show(10, 10, true);

    // Join / cross-join overloads (self-joins).
    _df.Join(_df);
    _df.Join(_df, "name");
    _df.Join(_df, new[] { "name" });
    _df.Join(_df, new[] { "name" }, "outer");
    _df.Join(_df, _df["age"] == _df["age"]);
    _df.Join(_df, _df["age"] == _df["age"], "outer");
    _df.CrossJoin(_df);

    // Sorting overloads: SortWithinPartitions / Sort / OrderBy.
    _df.SortWithinPartitions("age");
    _df.SortWithinPartitions("age", "name");
    _df.SortWithinPartitions();
    _df.SortWithinPartitions(_df["age"]);
    _df.SortWithinPartitions(_df["age"], _df["name"]);
    _df.Sort("age");
    _df.Sort("age", "name");
    _df.Sort();
    _df.Sort(_df["age"]);
    _df.Sort(_df["age"], _df["name"]);
    _df.OrderBy("age");
    _df.OrderBy("age", "name");
    _df.OrderBy();
    _df.OrderBy(_df["age"]);
    _df.OrderBy(_df["age"], _df["name"]);

    // Hints and column/alias accessors.
    _df.Hint("broadcast");
    _df.Hint("broadcast", new[] { "hello", "world" });
    _df.Col("age");
    _df.ColRegex("age");
    _df.As("alias");
    _df.Alias("alias");

    // Projection: Select / SelectExpr.
    _df.Select("age");
    _df.Select("age", "name");
    _df.Select();
    _df.Select(_df["age"]);
    _df.Select(_df["age"], _df["name"]);
    _df.SelectExpr();
    _df.SelectExpr("age * 2");
    _df.SelectExpr("age * 2", "abs(age)");

    // Filtering: Filter / Where, column and SQL-string forms.
    _df.Filter(_df["age"] > 21);
    _df.Filter("age > 21");
    _df.Where(_df["age"] > 21);
    _df.Where("age > 21");

    // Grouping: GroupBy / Rollup / Cube.
    _df.GroupBy("age");
    _df.GroupBy("age", "name");
    _df.GroupBy();
    _df.GroupBy(_df["age"]);
    _df.GroupBy(_df["age"], _df["name"]);
    _df.Rollup("age");
    _df.Rollup("age", "name");
    _df.Rollup();
    _df.Rollup(_df["age"]);
    _df.Rollup(_df["age"], _df["name"]);
    _df.Cube("age");
    _df.Cube("age", "name");
    _df.Cube();
    _df.Cube(_df["age"]);
    _df.Cube(_df["age"], _df["name"]);

    // Aggregation and row limiting.
    _df.Agg(Avg(_df["age"]));
    _df.Agg(Avg(_df["age"]), Avg(_df["name"]));
    _df.Limit(10);

    // Set operations (self-applied).
    _df.Union(_df);
    _df.UnionByName(_df);
    _df.Intersect(_df);
    _df.Except(_df);

    // Sampling and random splits.
    _df.Sample(0.5);
    _df.Sample(0.5, true);
    _df.Sample(0.5, false, 12345);
    _df.RandomSplit(new[] { 0.2, 0.8 });
    _df.RandomSplit(new[] { 0.2, 0.8 }, 12345);

    // Column addition, renaming, and dropping.
    _df.WithColumn("age2", _df["age"]);
    _df.WithColumnRenamed("age", "age2");
    _df.Drop();
    _df.Drop("age");
    _df.Drop("age", "name");
    _df.Drop(_df["age"]);
    _df.DropDuplicates();
    _df.DropDuplicates("age");
    _df.DropDuplicates("age", "name");

    // Statistics: Describe / Summary.
    _df.Describe();
    _df.Describe("age");
    _df.Describe("age", "name");
    _df.Summary();
    _df.Summary("count");
    _df.Summary("count", "mean");

    // Row retrieval / materialization.
    _df.Head(2);
    _df.Head();
    _df.First();
    _df.Take(3).ToArray();
    _df.Collect().ToArray();
    _df.ToLocalIterator().ToArray();
    _df.Count();

    // Partitioning: Repartition / RepartitionByRange / Coalesce.
    _df.Repartition(2);
    _df.Repartition(2, _df["age"]);
    _df.Repartition(_df["age"]);
    _df.Repartition();
    _df.RepartitionByRange(2, _df["age"]);
    _df.RepartitionByRange(_df["age"]);
    _df.Coalesce(1);
    _df.Distinct();

    // Caching lifecycle.
    _df.Persist();
    _df.Cache();
    _df.Unpersist();

    // Temp view registration (session-scoped and global).
    _df.CreateTempView("view");
    _df.CreateOrReplaceTempView("view");
    _df.CreateGlobalTempView("global_view");
    _df.CreateOrReplaceGlobalTempView("global_view");
}