Beispiel #1
0
        /// <summary>
        /// Maps each group of the current DataFrame using a UDF and
        /// returns the result as a DataFrame.
        ///
        /// The user-defined function should take an Apache Arrow RecordBatch
        /// and return another Apache Arrow RecordBatch. For each group, all
        /// columns are passed together as a RecordBatch to the user-function and
        /// the returned RecordBatches are combined into a DataFrame.
        ///
        /// The returned <see cref="RecordBatch"/> can be of arbitrary length and its
        /// schema must match <paramref name="returnType"/>.
        /// </summary>
        /// <param name="returnType">
        /// The <see cref="StructType"/> that represents the shape of the return data set.
        /// </param>
        /// <param name="func">A grouped map user-defined function.</param>
        /// <returns>New DataFrame object with the UDF applied.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="returnType"/> or <paramref name="func"/> is null.
        /// </exception>
        public DataFrame Apply(StructType returnType, Func<RecordBatch, RecordBatch> func)
        {
            // Validate up front so a null argument is reported by name instead of
            // surfacing later as a NullReferenceException from func.Method or
            // returnType.Json, after the command has already been built.
            if (returnType is null)
            {
                throw new ArgumentNullException(nameof(returnType));
            }

            if (func is null)
            {
                throw new ArgumentNullException(nameof(func));
            }

            // Adapt the user function to the delegate type that gets serialized.
            ArrowGroupedMapWorkerFunction.ExecuteDelegate wrapper =
                new ArrowGroupedMapUdfWrapper(func).Execute;

            UserDefinedFunction udf = UserDefinedFunction.Create(
                Reference.Jvm,
                func.Method.ToString(),
                CommandSerDe.Serialize(
                    wrapper,
                    CommandSerDe.SerializedMode.Row,
                    CommandSerDe.SerializedMode.Row),
                UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                returnType.Json);

            // The UDF is applied over every column of the underlying DataFrame,
            // in the order reported by Columns().
            IReadOnlyList<string> columnNames = _dataFrame.Columns();
            var columns = new Column[columnNames.Count];

            for (int i = 0; i < columnNames.Count; ++i)
            {
                columns[i] = _dataFrame[columnNames[i]];
            }

            Column udfColumn = udf.Apply(columns);

            // Hand the UDF expression to flatMapGroupsInPandas on this object's
            // reference and wrap the resulting reference in a DataFrame.
            return new DataFrame((JvmObjectReference)Reference.Invoke(
                "flatMapGroupsInPandas",
                udfColumn.Expr()));
        }