Exemplo n.º 1
0
        // Exercises vector (DataFrame-column based) UDFs against _df in four ways: a single
        // UDF, two chained UDFs, two UDFs in one projection, and a UDF registered for SQL.
        public void TestDataFrameVectorUdf()
        {
            // Produces "<name> is <age>" for every row; a null age is rendered as 0.
            Func<PrimitiveDataFrameColumn<int>, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> describePerson =
                (ageColumn, nameColumn) =>
                {
                    int rowCount = (int)nameColumn.Length;
                    string[] lines = new string[rowCount];
                    for (int row = 0; row < rowCount; ++row)
                    {
                        lines[row] = $"{nameColumn[row]} is {ageColumn[row] ?? 0}";
                    }

                    StringArray arrowLines = (StringArray)ToArrowArray(lines);
                    return ToArrowStringDataFrameColumn(arrowLines);
                };

            // Wraps every incoming string as "hello <value>!".
            Func<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> greet =
                (inputColumn) =>
                {
                    int rowCount = (int)inputColumn.Length;
                    string[] greetings = new string[rowCount];
                    for (int row = 0; row < rowCount; ++row)
                    {
                        greetings[row] = $"hello {inputColumn[row]}!";
                    }

                    StringArray arrowGreetings = (StringArray)ToArrowArray(greetings);
                    return ToArrowStringDataFrameColumn(arrowGreetings);
                };

            Func<Column, Column, Column> describeUdf =
                ExperimentalDataFrameFunctions.VectorUdf(describePerson);
            Func<Column, Column> greetUdf =
                ExperimentalDataFrameFunctions.VectorUdf<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(greet);

            // Expected values, indexed by row position in the collected result.
            string[] expectedDescriptions = { "Michael is 0", "Andy is 30", "Justin is 19" };
            string[] expectedGreetedDescriptions =
            {
                "hello Michael is 0!",
                "hello Andy is 30!",
                "hello Justin is 19!"
            };
            string[] expectedGreetedNames = { "hello Michael!", "hello Andy!", "hello Justin!" };

            // Single UDF.
            Column describeColumn = describeUdf(_df["age"], _df["name"]);
            Row[] singleRows = _df.Select(describeColumn).Collect().ToArray();
            Assert.Equal(3, singleRows.Length);
            for (int i = 0; i < expectedDescriptions.Length; ++i)
            {
                Assert.Equal(expectedDescriptions[i], singleRows[i].GetAs<string>(0));
            }

            // Chained UDFs: the output of the first UDF feeds the second one.
            Column chainedColumn = greetUdf(describeUdf(_df["age"], _df["name"]));
            Row[] chainedRows = _df.Select(chainedColumn).Collect().ToArray();
            Assert.Equal(3, chainedRows.Length);
            for (int i = 0; i < expectedGreetedDescriptions.Length; ++i)
            {
                Assert.Equal(expectedGreetedDescriptions[i], chainedRows[i].GetAs<string>(0));
            }

            // Multiple UDFs: two independent UDF columns in the same projection.
            Row[] multiRows = _df
                .Select(describeUdf(_df["age"], _df["name"]), greetUdf(_df["name"]))
                .Collect()
                .ToArray();
            Assert.Equal(3, multiRows.Length);
            for (int i = 0; i < expectedDescriptions.Length; ++i)
            {
                Assert.Equal(expectedDescriptions[i], multiRows[i].GetAs<string>(0));
                Assert.Equal(expectedGreetedNames[i], multiRows[i].GetAs<string>(1));
            }

            // Register UDF: expose the same function to Spark SQL under the name "udf1".
            _df.CreateOrReplaceTempView("people");
            _spark.Udf().RegisterVector("udf1", describePerson);
            DataFrame sqlResult = _spark.Sql("SELECT udf1(age, name) FROM people");
            Row[] sqlRows = sqlResult.Collect().ToArray();
            Assert.Equal(3, sqlRows.Length);
            for (int i = 0; i < expectedDescriptions.Length; ++i)
            {
                Assert.Equal(expectedDescriptions[i], sqlRows[i].GetAs<string>(0));
            }
        }
Exemplo n.º 2
0
        // Exercises vector UDFs built on ArrowStringDataFrameColumn.Apply against _df:
        // a single UDF, chained UDFs, multiple UDFs in one Select, and a SQL-registered UDF.
        public void TestDataFrameVectorUdf()
        {
            // Produces "<name> is <age>" per element; a null age is rendered as 0.
            Func<Int32DataFrameColumn, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> describePerson =
                (ageColumn, nameColumn) =>
                {
                    // Apply only hands over the current name, so a running counter is
                    // used to look up the matching entry in the age column.
                    long nextIndex = 0;
                    return nameColumn.Apply(name =>
                    {
                        long current = nextIndex++;
                        return $"{name} is {ageColumn[current] ?? 0}";
                    });
                };

            // Wraps every incoming string as "hello <value>!".
            Func<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn> greet =
                (inputColumn) =>
                {
                    return inputColumn.Apply(value => $"hello {value}!");
                };

            // Expected values, indexed by row position in the collected result.
            string[] expectedDescriptions = { "Michael is 0", "Andy is 30", "Justin is 19" };
            string[] expectedGreetedNames = { "hello Michael!", "hello Andy!", "hello Justin!" };

            // Single UDF.
            Func<Column, Column, Column> describeUdf =
                ExperimentalDataFrameFunctions.VectorUdf(describePerson);
            {
                DataFrame projected = _df.Select(describeUdf(_df["age"], _df["name"]));
                Row[] collected = projected.Collect().ToArray();
                Assert.Equal(3, collected.Length);
                for (int i = 0; i < expectedDescriptions.Length; ++i)
                {
                    Assert.Equal(expectedDescriptions[i], collected[i].GetAs<string>(0));
                }
            }

            // Chained UDFs: the output of the first UDF feeds the second one.
            Func<Column, Column> greetUdf =
                ExperimentalDataFrameFunctions.VectorUdf<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(greet);
            {
                string[] expectedChained =
                {
                    "hello Michael is 0!",
                    "hello Andy is 30!",
                    "hello Justin is 19!"
                };

                Column chained = greetUdf(describeUdf(_df["age"], _df["name"]));
                Row[] collected = _df.Select(chained).Collect().ToArray();
                Assert.Equal(3, collected.Length);
                for (int i = 0; i < expectedChained.Length; ++i)
                {
                    Assert.Equal(expectedChained[i], collected[i].GetAs<string>(0));
                }
            }

            // Multiple UDFs: two independent UDF columns in the same projection.
            {
                Column described = describeUdf(_df["age"], _df["name"]);
                Column greeted = greetUdf(_df["name"]);
                Row[] collected = _df.Select(described, greeted).Collect().ToArray();
                Assert.Equal(3, collected.Length);
                for (int i = 0; i < expectedDescriptions.Length; ++i)
                {
                    Assert.Equal(expectedDescriptions[i], collected[i].GetAs<string>(0));
                    Assert.Equal(expectedGreetedNames[i], collected[i].GetAs<string>(1));
                }
            }

            // Register UDF: expose the same function to Spark SQL under the name "udf1".
            {
                _df.CreateOrReplaceTempView("people");
                _spark.Udf().RegisterVector("udf1", describePerson);
                DataFrame sqlResult = _spark.Sql("SELECT udf1(age, name) FROM people");
                Row[] collected = sqlResult.Collect().ToArray();
                Assert.Equal(3, collected.Length);
                for (int i = 0; i < expectedDescriptions.Length; ++i)
                {
                    Assert.Equal(expectedDescriptions[i], collected[i].GetAs<string>(0));
                }
            }
        }