/// <summary>
/// Applies a grouped-map user-defined function to every group of the current
/// DataFrame and returns the combined output as a new DataFrame.
///
/// The function receives an <see cref="FxDataFrame"/> and must produce an
/// <see cref="FxDataFrame"/>. All columns of a group are handed to the function
/// together as a single <see cref="FxDataFrame"/>, and the frames it returns are
/// combined into the resulting DataFrame.
///
/// A returned <see cref="FxDataFrame"/> may have any number of rows, but its
/// schema must match <paramref name="returnType"/>.
/// </summary>
/// <param name="returnType">
/// The <see cref="StructType"/> describing the schema of the returned data set.
/// </param>
/// <param name="func">A grouped map user-defined function.</param>
/// <returns>New DataFrame object with the UDF applied.</returns>
public DataFrame Apply(StructType returnType, Func<FxDataFrame, FxDataFrame> func)
{
    // Adapt the user delegate to the worker-side execution delegate.
    var udfWrapper = new DataFrameGroupedMapUdfWrapper(func);
    DataFrameGroupedMapWorkerFunction.ExecuteDelegate executeDelegate = udfWrapper.Execute;

    // Register the serialized delegate as a grouped-map UDF whose declared
    // return schema is the JSON form of returnType.
    var groupedMapUdf = UserDefinedFunction.Create(
        Reference.Jvm,
        func.Method.ToString(),
        CommandSerDe.Serialize(
            executeDelegate,
            CommandSerDe.SerializedMode.Row,
            CommandSerDe.SerializedMode.Row),
        UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        returnType.Json);

    // Every column of the underlying DataFrame is passed to the UDF.
    IReadOnlyList<string> names = _dataFrame.Columns();
    var inputColumns = new Column[names.Count];
    int position = 0;
    foreach (string name in names)
    {
        inputColumns[position] = _dataFrame[name];
        ++position;
    }

    Column appliedUdf = groupedMapUdf.Apply(inputColumns);

    var resultReference = (JvmObjectReference)Reference.Invoke(
        "flatMapGroupsInPandas",
        appliedUdf.Expr());
    return new DataFrame(resultReference);
}
/// <summary>
/// Checks that a UDF taking a <see cref="Row"/> works on a struct column added
/// through <c>WithColumn</c>: all columns of <c>_df</c> are packed into a
/// "NameAgeCol" struct, the UDF renders that struct as "size,name,age" into
/// "SizeNameAgeCol", and each of the three collected rows is verified.
/// </summary>
public void TestWithColumn()
{
    // UDF under test: formats the row's size, name and (when present) age.
    Func<Column, Column> sizeNameAgeUdf = Udf<Row, string>(
        r =>
        {
            string name = r.GetAs<string>("name");
            int? age = r.GetAs<int?>("age");
            return age.HasValue
                ? $"{r.Size()},{name},{age.Value}"
                : $"{r.Size()},{name},{string.Empty}";
        });

    // Pack every column of the source DataFrame into one struct column.
    string[] allCols = _df.Columns().ToArray();
    string firstCol = allCols[0];
    string[] remainingCols = allCols.Skip(1).ToArray();
    DataFrame nameAgeColDF = _df.WithColumn("NameAgeCol", Struct(firstCol, remainingCols));

    // Apply the UDF to the struct column.
    Column structColumn = nameAgeColDF["NameAgeCol"];
    DataFrame sizeNameAgeColDF =
        nameAgeColDF.WithColumn("SizeNameAgeCol", sizeNameAgeUdf(structColumn));

    Row[] originalDFRows = _df.Collect().ToArray();
    Assert.Equal(3, originalDFRows.Length);

    Row[] sizeNameAgeColDFRows = sizeNameAgeColDF.Collect().ToArray();
    Assert.Equal(3, sizeNameAgeColDFRows.Length);

    // Expected values per row, in collection order. A null Age means the
    // "age" cell itself is expected to be null.
    var expectations = new[]
    {
        new { Name = "Michael", Age = (int?)null, Formatted = "2,Michael," },
        new { Name = "Andy", Age = (int?)30, Formatted = "2,Andy,30" },
        new { Name = "Justin", Age = (int?)19, Formatted = "2,Justin,19" },
    };

    for (int index = 0; index < expectations.Length; ++index)
    {
        var expected = expectations[index];
        Row actual = sizeNameAgeColDFRows[index];

        Assert.Equal(expected.Name, actual.GetAs<string>("name"));

        if (expected.Age.HasValue)
        {
            Assert.Equal(expected.Age.Value, actual.GetAs<int>("age"));
        }
        else
        {
            Assert.Null(actual.Get("age"));
        }

        // The struct column must round-trip as a Row equal to the source row.
        Assert.IsType<Row>(actual.Get("NameAgeCol"));
        Assert.Equal(originalDFRows[index], actual.GetAs<Row>("NameAgeCol"));
        Assert.Equal(expected.Formatted, actual.GetAs<string>("SizeNameAgeCol"));
    }
}