Ejemplo n.º 1
0
        /// <summary>
        /// Applies a grouped-map user-defined function to every group of the current
        /// DataFrame and returns the combined output as a new DataFrame.
        ///
        /// For each group, all columns are handed to <paramref name="func"/> together as a
        /// single <see cref="FxDataFrame"/>; the <see cref="FxDataFrame"/> instances it
        /// returns are combined into the resulting DataFrame.
        ///
        /// A returned <see cref="FxDataFrame"/> may be of arbitrary length, but its schema
        /// must match <paramref name="returnType"/>.
        /// </summary>
        /// <param name="returnType">
        /// The <see cref="StructType"/> describing the schema of the returned data set.
        /// </param>
        /// <param name="func">
        /// A grouped-map user-defined function that takes an <see cref="FxDataFrame"/> and
        /// returns another <see cref="FxDataFrame"/>.
        /// </param>
        /// <returns>New DataFrame object with the UDF applied.</returns>
        public DataFrame Apply(StructType returnType, Func<FxDataFrame, FxDataFrame> func)
        {
            DataFrameGroupedMapWorkerFunction.ExecuteDelegate execute =
                new DataFrameGroupedMapUdfWrapper(func).Execute;

            // The pieces below are gathered in the same order in which they are
            // handed to UserDefinedFunction.Create.
            var jvm = Reference.Jvm;
            string udfName = func.Method.ToString();
            var serializedCommand = CommandSerDe.Serialize(
                execute,
                CommandSerDe.SerializedMode.Row,
                CommandSerDe.SerializedMode.Row);
            var returnSchemaJson = returnType.Json;

            var groupedMapUdf = UserDefinedFunction.Create(
                jvm,
                udfName,
                serializedCommand,
                UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                returnSchemaJson);

            // Every column of the underlying DataFrame is passed to the UDF.
            IReadOnlyList<string> names = _dataFrame.Columns();
            var inputColumns = new Column[names.Count];
            int next = 0;
            foreach (string name in names)
            {
                inputColumns[next++] = _dataFrame[name];
            }

            Column appliedUdf = groupedMapUdf.Apply(inputColumns);
            var udfExpression = appliedUdf.Expr();

            var jvmResult = (JvmObjectReference)Reference.Invoke(
                "flatMapGroupsInPandas",
                udfExpression);

            return new DataFrame(jvmResult);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Exercises WithColumn by adding a struct column built from all existing columns
        /// and a string column computed by a Row-based UDF over that struct, then checks
        /// the collected rows against the expected values.
        /// </summary>
        public void TestWithColumn()
        {
            // The UDF renders a row as "<row size>,<name>,<age or empty>".
            Func<Column, Column> describeRow = Udf<Row, string>(
                r =>
                {
                    string name = r.GetAs<string>("name");
                    int? age = r.GetAs<int?>("age");

                    return age.HasValue
                        ? $"{r.Size()},{name},{age.Value}"
                        : $"{r.Size()},{name},{string.Empty}";
                });

            string[] columnNames = _df.Columns().ToArray();
            var structColumn = Struct(columnNames[0], columnNames.Skip(1).ToArray());

            DataFrame withStruct = _df.WithColumn("NameAgeCol", structColumn);
            DataFrame withSummary = withStruct.WithColumn(
                "SizeNameAgeCol",
                describeRow(withStruct["NameAgeCol"]));

            Row[] sourceRows = _df.Collect().ToArray();
            Assert.Equal(3, sourceRows.Length);

            Row[] resultRows = withSummary.Collect().ToArray();
            Assert.Equal(3, resultRows.Length);

            // Expected values per row, in collection order. A null age means the
            // "age" column itself is expected to be null for that row.
            string[] expectedNames = { "Michael", "Andy", "Justin" };
            int?[] expectedAges = { null, 30, 19 };
            string[] expectedSummaries = { "2,Michael,", "2,Andy,30", "2,Justin,19" };

            for (int i = 0; i < expectedNames.Length; ++i)
            {
                Row row = resultRows[i];

                Assert.Equal(expectedNames[i], row.GetAs<string>("name"));

                if (expectedAges[i].HasValue)
                {
                    Assert.Equal(expectedAges[i].Value, row.GetAs<int>("age"));
                }
                else
                {
                    Assert.Null(row.Get("age"));
                }

                // The struct column must round-trip as a Row equal to the source row.
                Assert.IsType<Row>(row.Get("NameAgeCol"));
                Assert.Equal(sourceRows[i], row.GetAs<Row>("NameAgeCol"));
                Assert.Equal(expectedSummaries[i], row.GetAs<string>("SizeNameAgeCol"));
            }
        }