/// <summary>
/// Exercises DataFrame-based vector UDFs in four ways: a single UDF, two chained
/// UDFs, two UDFs in the same projection, and a UDF registered by name and invoked
/// through SQL. Each scenario expects three rows (Michael, Andy, Justin).
/// </summary>
public void TestDataFrameVectorUdf()
{
    // Builds "<name> is <age>" for every row; a null age is rendered as 0.
    Func<PrimitiveDataFrameColumn<int>, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> describePerson =
        (ages, names) =>
        {
            int rowCount = (int)names.Length;
            var described = new string[rowCount];
            for (int idx = 0; idx < rowCount; ++idx)
            {
                described[idx] = $"{names[idx]} is {ages[idx] ?? 0}";
            }

            var arrowStrings = (StringArray)ToArrowArray(described);
            return ToArrowStringDataFrameColumn(arrowStrings);
        };

    // Wraps every incoming string as "hello <value>!".
    Func<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> greet =
        (strings) =>
        {
            int rowCount = (int)strings.Length;
            var greetings = new string[rowCount];
            for (int idx = 0; idx < rowCount; ++idx)
            {
                greetings[idx] = $"hello {strings[idx]}!";
            }

            var arrowStrings = (StringArray)ToArrowArray(greetings);
            return ToArrowStringDataFrameColumn(arrowStrings);
        };

    // Single UDF.
    Func<Column, Column, Column> describeUdf =
        ExperimentalDataFrameFunctions.VectorUdf(describePerson);

    string[] expectedDescriptions = { "Michael is 0", "Andy is 30", "Justin is 19" };

    Column describedColumn = describeUdf(_df["age"], _df["name"]);
    Row[] singleRows = _df.Select(describedColumn).Collect().ToArray();
    Assert.Equal(3, singleRows.Length);
    for (int r = 0; r < expectedDescriptions.Length; ++r)
    {
        Assert.Equal(expectedDescriptions[r], singleRows[r].GetAs<string>(0));
    }

    // Chained UDFs: the greeting UDF consumes the output of the describing UDF.
    Func<Column, Column> greetUdf =
        ExperimentalDataFrameFunctions
            .VectorUdf<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(greet);

    string[] expectedChained =
    {
        "hello Michael is 0!",
        "hello Andy is 30!",
        "hello Justin is 19!"
    };

    Column chainedColumn = greetUdf(describeUdf(_df["age"], _df["name"]));
    Row[] chainedRows = _df.Select(chainedColumn).Collect().ToArray();
    Assert.Equal(3, chainedRows.Length);
    for (int r = 0; r < expectedChained.Length; ++r)
    {
        Assert.Equal(expectedChained[r], chainedRows[r].GetAs<string>(0));
    }

    // Multiple UDFs: both UDFs are evaluated side by side in one Select.
    string[] expectedGreetings = { "hello Michael!", "hello Andy!", "hello Justin!" };

    Row[] multiRows = _df
        .Select(describeUdf(_df["age"], _df["name"]), greetUdf(_df["name"]))
        .Collect()
        .ToArray();
    Assert.Equal(3, multiRows.Length);
    for (int r = 0; r < expectedDescriptions.Length; ++r)
    {
        Assert.Equal(expectedDescriptions[r], multiRows[r].GetAs<string>(0));
        Assert.Equal(expectedGreetings[r], multiRows[r].GetAs<string>(1));
    }

    // Register UDF: expose the describing function to SQL under the name "udf1".
    _df.CreateOrReplaceTempView("people");
    _spark.Udf().RegisterVector("udf1", describePerson);

    Row[] sqlRows = _spark.Sql("SELECT udf1(age, name) FROM people")
        .Collect()
        .ToArray();
    Assert.Equal(3, sqlRows.Length);
    for (int r = 0; r < expectedDescriptions.Length; ++r)
    {
        Assert.Equal(expectedDescriptions[r], sqlRows[r].GetAs<string>(0));
    }
}
/// <summary>
/// Exercises DataFrame-based vector UDFs built on <c>Apply</c> in four ways: a single
/// UDF, two chained UDFs, two UDFs in the same projection, and a UDF registered by
/// name and invoked through SQL. Each scenario expects three rows
/// (Michael, Andy, Justin).
/// </summary>
public void TestDataFrameVectorUdf()
{
    // Builds "<name> is <age>" for every name; a null age is rendered as 0.
    Func<Int32DataFrameColumn, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> describePerson =
        (ages, names) =>
        {
            // Position in `ages` paired with the name currently handed to the callback;
            // advanced once per invocation, before the age is read.
            long position = 0;
            return names.Apply(personName =>
            {
                long current = position++;
                int age = ages[current] ?? 0;
                return $"{personName} is {age}";
            });
        };

    // Single UDF.
    Func<Column, Column, Column> describeUdf =
        ExperimentalDataFrameFunctions.VectorUdf(describePerson);

    string[] expectedDescriptions = { "Michael is 0", "Andy is 30", "Justin is 19" };

    Column describedColumn = describeUdf(_df["age"], _df["name"]);
    Row[] singleRows = _df.Select(describedColumn).Collect().ToArray();
    Assert.Equal(3, singleRows.Length);
    for (int r = 0; r < expectedDescriptions.Length; ++r)
    {
        Assert.Equal(expectedDescriptions[r], singleRows[r].GetAs<string>(0));
    }

    // Chained UDFs: the greeting UDF wraps each incoming string as "hello <value>!"
    // and here consumes the output of the describing UDF.
    Func<Column, Column> greetUdf =
        ExperimentalDataFrameFunctions
            .VectorUdf<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(text => $"hello {text}!"));

    string[] expectedChained =
    {
        "hello Michael is 0!",
        "hello Andy is 30!",
        "hello Justin is 19!"
    };

    Column chainedColumn = greetUdf(describeUdf(_df["age"], _df["name"]));
    Row[] chainedRows = _df.Select(chainedColumn).Collect().ToArray();
    Assert.Equal(3, chainedRows.Length);
    for (int r = 0; r < expectedChained.Length; ++r)
    {
        Assert.Equal(expectedChained[r], chainedRows[r].GetAs<string>(0));
    }

    // Multiple UDFs: both UDFs are evaluated side by side in one Select.
    string[] expectedGreetings = { "hello Michael!", "hello Andy!", "hello Justin!" };

    Row[] multiRows = _df
        .Select(describeUdf(_df["age"], _df["name"]), greetUdf(_df["name"]))
        .Collect()
        .ToArray();
    Assert.Equal(3, multiRows.Length);
    for (int r = 0; r < expectedDescriptions.Length; ++r)
    {
        Assert.Equal(expectedDescriptions[r], multiRows[r].GetAs<string>(0));
        Assert.Equal(expectedGreetings[r], multiRows[r].GetAs<string>(1));
    }

    // Register UDF: expose the describing function to SQL under the name "udf1".
    _df.CreateOrReplaceTempView("people");
    _spark.Udf().RegisterVector("udf1", describePerson);

    Row[] sqlRows = _spark.Sql("SELECT udf1(age, name) FROM people")
        .Collect()
        .ToArray();
    Assert.Equal(3, sqlRows.Length);
    for (int r = 0; r < expectedDescriptions.Length; ++r)
    {
        Assert.Equal(expectedDescriptions[r], sqlRows[r].GetAs<string>(0));
    }
}