/// <summary>
/// Applies a grouped map UDF to every group of the current DataFrame and
/// returns the combined output as a DataFrame.
///
/// The user-defined function receives an Apache Arrow RecordBatch and must
/// produce another Apache Arrow RecordBatch. For each group, all columns are
/// handed to the user-function together as one RecordBatch, and the
/// RecordBatches it returns are combined into the resulting DataFrame.
///
/// A returned <see cref="RecordBatch"/> may have any length, but its schema
/// has to match <paramref name="returnType"/>.
/// </summary>
/// <param name="returnType">
/// The <see cref="StructType"/> describing the shape of the returned data set.
/// </param>
/// <param name="func">A grouped map user-defined function.</param>
/// <returns>New DataFrame object with the UDF applied.</returns>
public DataFrame Apply(StructType returnType, Func<RecordBatch, RecordBatch> func)
{
    // Adapt the user function to the delegate that is serialized below.
    ArrowGroupedMapWorkerFunction.ExecuteDelegate executeDelegate =
        new ArrowGroupedMapUdfWrapper(func).Execute;

    UserDefinedFunction groupedMapUdf = UserDefinedFunction.Create(
        Reference.Jvm,
        func.Method.ToString(),
        CommandSerDe.Serialize(
            executeDelegate,
            CommandSerDe.SerializedMode.Row,
            CommandSerDe.SerializedMode.Row),
        UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        returnType.Json);

    // Every column of the DataFrame is passed to the UDF, in column order.
    IReadOnlyList<string> names = _dataFrame.Columns();
    var udfArguments = new Column[names.Count];
    int position = 0;
    foreach (string columnName in names)
    {
        udfArguments[position] = _dataFrame[columnName];
        position++;
    }

    Column applied = groupedMapUdf.Apply(udfArguments);
    var result = Reference.Invoke("flatMapGroupsInPandas", applied.Expr());

    return new DataFrame((JvmObjectReference)result);
}
/// <summary>
/// Helper that serializes a wrapped udf, creates the corresponding
/// <see cref="UserDefinedFunction"/> and registers it under the given name.
/// </summary>
/// <param name="name">Name of the udf</param>
/// <param name="func">Wrapped UDF function</param>
/// <param name="evalType">The EvalType of the function</param>
/// <param name="returnType">The return type of the function in JSON format</param>
private void Register(
    string name,
    Delegate func,
    UdfUtils.PythonEvalType evalType,
    string returnType)
{
    // The same serialized mode is used for both mode arguments.
    var rowMode = CommandSerDe.SerializedMode.Row;
    byte[] serializedFunc = CommandSerDe.Serialize(func, rowMode, rowMode);

    UserDefinedFunction wrappedUdf = UserDefinedFunction.Create(
        _jvmObject.Jvm,
        name,
        serializedFunc,
        evalType,
        returnType);

    _jvmObject.Invoke("registerPython", name, wrappedUdf);
}
/// <summary>
/// Helper that serializes a wrapped udf and registers it under the given
/// name, deriving the return type from <typeparamref name="TResult"/>.
/// </summary>
/// <typeparam name="TResult">Return type of the udf</typeparam>
/// <param name="name">Name of the udf</param>
/// <param name="func">Wrapped UDF function</param>
/// <param name="evalType">The EvalType of the function.</param>
internal void Register<TResult>(
    string name,
    Delegate func,
    UdfUtils.PythonEvalType evalType)
{
    byte[] serializedFunc = CommandSerDe.Serialize(
        func,
        CommandSerDe.SerializedMode.Row,
        CommandSerDe.SerializedMode.Row);

    // Return type is computed from the generic type argument.
    var udfReturnType = UdfUtils.GetReturnType(typeof(TResult));

    UserDefinedFunction wrappedUdf = UserDefinedFunction.Create(
        _jvmObject.Jvm,
        name,
        serializedFunc,
        evalType,
        udfReturnType);

    _jvmObject.Invoke("registerPython", name, wrappedUdf);
}
/// <summary>
/// Invokes <c>foreach</c> on the underlying JVM object with a
/// <c>PythonForeachWriter</c> built from the serialized
/// <paramref name="writer"/> callback and the schema of the DataFrame.
/// </summary>
/// <param name="writer">Writer whose processing callback is wrapped and serialized.</param>
/// <returns>This DataStreamWriter object.</returns>
public DataStreamWriter Foreach(IForeachWriter writer)
{
    // Wrap the writer's Process callback in the delegate that gets serialized.
    var writerWrapper = new ForeachWriterWrapper(writer);
    RDD.WorkerFunction.ExecuteDelegate executeDelegate =
        new ForeachWriterWrapperUdfWrapper(writerWrapper.Process).Execute;

    byte[] serializedWriter = CommandSerDe.Serialize(
        executeDelegate,
        CommandSerDe.SerializedMode.Row,
        CommandSerDe.SerializedMode.Row);

    JvmObjectReference pythonFunction =
        UdfUtils.CreatePythonFunction(_jvmObject.Jvm, serializedWriter);

    // The schema is passed to the JVM as a DataType created from its JSON form.
    var schemaType = DataType.FromJson(_jvmObject.Jvm, _df.Schema().Json);

    var pythonForeachWriter = _jvmObject.Jvm.CallConstructor(
        "org.apache.spark.sql.execution.python.PythonForeachWriter",
        pythonFunction,
        schemaType);

    _jvmObject.Invoke("foreach", pythonForeachWriter);

    return this;
}
/// <summary>
/// Helper that serializes a wrapped udf, constructs a
/// <c>UserDefinedPythonFunction</c> for it on the JVM side and registers
/// the result under the given name.
/// </summary>
/// <typeparam name="TResult">Return type of the udf</typeparam>
/// <param name="name">Name of the udf</param>
/// <param name="func">Wrapped UDF function</param>
private void Register<TResult>(string name, Delegate func)
{
    byte[] serializedFunc = CommandSerDe.Serialize(
        func,
        CommandSerDe.SerializedMode.Row,
        CommandSerDe.SerializedMode.Row);

    JvmObjectReference pythonFunc =
        UdfUtils.CreatePythonFunction(_jvmObject.Jvm, serializedFunc);

    var jvmUdf = _jvmObject.Jvm.CallConstructor(
        "org.apache.spark.sql.execution.python.UserDefinedPythonFunction",
        name,
        pythonFunc,
        GetDataType<TResult>(),
        (int)UdfUtils.GetPythonEvalType(),
        true); // udfDeterministic

    UserDefinedFunction wrappedUdf = new UserDefinedFunction(jvmUdf);

    _jvmObject.Invoke("registerPython", name, wrappedUdf);
}
/// <summary>
/// Writes the given commands to the stream. The number of commands is
/// written first; then, for each command, the number of argument offsets
/// followed by each offset, and the number of chained UDFs followed by the
/// length and bytes of each serialized UDF.
/// </summary>
/// <param name="stream">Stream to write to</param>
/// <param name="commands">Commands to write</param>
public void Write(Stream stream, Command[] commands)
{
    SerDe.Write(stream, commands.Length);

    for (int i = 0; i < commands.Length; ++i)
    {
        Command current = commands[i];

        // Argument offsets: count, then the values.
        var argOffsets = current.ArgOffsets;
        SerDe.Write(stream, argOffsets.Length);
        foreach (int offset in argOffsets)
        {
            SerDe.Write(stream, offset);
        }

        // Chained UDFs: count, then a length-prefixed payload per UDF.
        var chainedUdfs = current.ChainedUdfs;
        SerDe.Write(stream, chainedUdfs.Length);
        foreach (Delegate chainedUdf in chainedUdfs)
        {
            byte[] payload = CommandSerDe.Serialize(
                chainedUdf,
                CommandSerDe.SerializedMode.Row,
                CommandSerDe.SerializedMode.Row);

            SerDe.Write(stream, payload.Length);
            SerDe.Write(stream, payload);
        }
    }
}