コード例 #1
0
ファイル: CommandProcessor.cs プロジェクト: seanshi007/spark
        /// <summary>
        /// Reads a single RDD command from the given stream.
        /// </summary>
        /// <param name="stream">Stream to read the command from</param>
        /// <returns>
        /// An RDDCommand whose worker function, serializer mode and deserializer
        /// mode are populated from the deserialized data.
        /// </returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when the command size read from the stream is zero or negative.
        /// </exception>
        private static RDDCommand ReadRDDCommand(Stream stream)
        {
            // The size is only validated in this method; it is not used afterwards.
            int payloadSize = SerDe.ReadInt32(stream);
            if (payloadSize <= 0)
            {
                throw new InvalidDataException(
                    $"Invalid command size: {payloadSize}");
            }

            var result = new RDDCommand();

            // The run mode reported by the deserializer is not needed here, so it
            // is discarded.
            result.WorkerFunction = new RDD.WorkerFunction(
                CommandSerDe.Deserialize<RDD.WorkerFunction.ExecuteDelegate>(
                    stream,
                    out CommandSerDe.SerializedMode serializerMode,
                    out CommandSerDe.SerializedMode deserializerMode,
                    out _));
            result.SerializerMode = serializerMode;
            result.DeserializerMode = deserializerMode;

            return result;
        }
コード例 #2
0
        /// <summary>
        /// Applies a grouped map UDF to each group of the current DataFrame and
        /// returns the combined result as a new DataFrame.
        ///
        /// The user-defined function receives an Apache Arrow RecordBatch and
        /// must return an Apache Arrow RecordBatch. For every group, all columns
        /// are handed to the function together as one RecordBatch, and the
        /// returned RecordBatch instances are combined into a DataFrame.
        ///
        /// The returned <see cref="RecordBatch"/> may have any length, but its
        /// schema must match <paramref name="returnType"/>.
        /// </summary>
        /// <param name="returnType">
        /// The <see cref="StructType"/> describing the shape of the returned data set.
        /// </param>
        /// <param name="func">A grouped map user-defined function.</param>
        /// <returns>New DataFrame object with the UDF applied.</returns>
        public DataFrame Apply(StructType returnType, Func<RecordBatch, RecordBatch> func)
        {
            ArrowGroupedMapWorkerFunction.ExecuteDelegate execute =
                new ArrowGroupedMapUdfWrapper(func).Execute;

            // Arguments for the UDF are gathered in the same order in which they
            // are passed to UserDefinedFunction.Create.
            var jvm = Reference.Jvm;
            string udfName = func.Method.ToString();
            var serializedUdf = CommandSerDe.Serialize(
                execute,
                CommandSerDe.SerializedMode.Row,
                CommandSerDe.SerializedMode.Row);
            var returnTypeJson = returnType.Json;

            UserDefinedFunction groupedMapUdf = UserDefinedFunction.Create(
                jvm,
                udfName,
                serializedUdf,
                UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                returnTypeJson);

            // Every column of the underlying DataFrame is passed to the UDF.
            IReadOnlyList<string> names = _dataFrame.Columns();
            var inputColumns = new Column[names.Count];
            int position = 0;
            foreach (string columnName in names)
            {
                inputColumns[position++] = _dataFrame[columnName];
            }

            Column applied = groupedMapUdf.Apply(inputColumns);
            var resultReference = (JvmObjectReference)Reference.Invoke(
                "flatMapGroupsInPandas",
                applied.Expr());

            return new DataFrame(resultReference);
        }
コード例 #3
0
        /// <summary>
        /// Serializes a wrapped UDF, creates the corresponding
        /// <see cref="UserDefinedFunction"/> and registers it under the given name
        /// by invoking "registerPython" on the underlying JVM object.
        /// </summary>
        /// <param name="name">Name of the udf</param>
        /// <param name="func">Wrapped UDF function</param>
        /// <param name="evalType">The EvalType of the function</param>
        /// <param name="returnType">The return type of the function in JSON format</param>
        private void Register(string name, Delegate func, UdfUtils.PythonEvalType evalType, string returnType)
        {
            // The same mode is used for both the serializer and the deserializer.
            var rowMode = CommandSerDe.SerializedMode.Row;
            byte[] serializedUdf = CommandSerDe.Serialize(func, rowMode, rowMode);

            UserDefinedFunction registeredUdf = UserDefinedFunction.Create(
                _jvmObject.Jvm, name, serializedUdf, evalType, returnType);

            _jvmObject.Invoke("registerPython", name, registeredUdf);
        }
コード例 #4
0
        /// <summary>
        /// Serializes a wrapped UDF, creates the corresponding user-defined
        /// function with a return type derived from <typeparamref name="TResult"/>,
        /// and registers it under the given name by invoking "registerPython" on
        /// the underlying JVM object.
        /// </summary>
        /// <typeparam name="TResult">Return type of the udf</typeparam>
        /// <param name="name">Name of the udf</param>
        /// <param name="func">Wrapped UDF function</param>
        /// <param name="evalType">The EvalType of the function.</param>
        internal void Register<TResult>(string name, Delegate func, UdfUtils.PythonEvalType evalType)
        {
            byte[] serializedUdf = CommandSerDe.Serialize(
                func,
                CommandSerDe.SerializedMode.Row,
                CommandSerDe.SerializedMode.Row);

            // Locals are evaluated in the same order as the original argument list.
            var jvm = _jvmObject.Jvm;
            var returnType = UdfUtils.GetReturnType(typeof(TResult));

            var registeredUdf = UserDefinedFunction.Create(
                jvm, name, serializedUdf, evalType, returnType);

            _jvmObject.Invoke("registerPython", name, registeredUdf);
        }
コード例 #5
0
ファイル: DataStreamWriter.cs プロジェクト: Kollou/spark
        /// <summary>
        /// Wraps the given writer's processing method as a worker function,
        /// serializes it, and passes it to the JVM side by constructing a
        /// "org.apache.spark.sql.execution.python.PythonForeachWriter" and
        /// invoking "foreach" on the underlying JVM object.
        /// </summary>
        /// <param name="writer">Writer whose processing logic is wrapped</param>
        /// <returns>This DataStreamWriter object</returns>
        public DataStreamWriter Foreach(IForeachWriter writer)
        {
            var writerWrapper = new ForeachWriterWrapper(writer);
            RDD.WorkerFunction.ExecuteDelegate execute =
                new ForeachWriterWrapperUdfWrapper(writerWrapper.Process).Execute;

            var jvm = _jvmObject.Jvm;

            // The Python function is created before the schema is converted,
            // matching the argument evaluation order of the constructor call.
            var pythonFunction = UdfUtils.CreatePythonFunction(
                jvm,
                CommandSerDe.Serialize(
                    execute,
                    CommandSerDe.SerializedMode.Row,
                    CommandSerDe.SerializedMode.Row));

            // The JVM data type is built from the JSON form of the DataFrame schema.
            var schema = DataType.FromJson(jvm, _df.Schema().Json);

            var foreachWriter = jvm.CallConstructor(
                "org.apache.spark.sql.execution.python.PythonForeachWriter",
                pythonFunction,
                schema);

            _jvmObject.Invoke("foreach", foreachWriter);

            return this;
        }
コード例 #6
0
ファイル: UDFRegistration.cs プロジェクト: johnpaulada/spark
        /// <summary>
        /// Serializes a wrapped UDF, builds a JVM-side
        /// "org.apache.spark.sql.execution.python.UserDefinedPythonFunction" for it,
        /// and registers it under the given name by invoking "registerPython" on
        /// the underlying JVM object.
        /// </summary>
        /// <typeparam name="TResult">Return type of the udf</typeparam>
        /// <param name="name">Name of the udf</param>
        /// <param name="func">Wrapped UDF function</param>
        private void Register<TResult>(string name, Delegate func)
        {
            byte[] serializedUdf = CommandSerDe.Serialize(
                func,
                CommandSerDe.SerializedMode.Row,
                CommandSerDe.SerializedMode.Row);

            JvmObjectReference pythonFunction =
                UdfUtils.CreatePythonFunction(_jvmObject.Jvm, serializedUdf);

            // Constructor arguments are evaluated in the order they are passed.
            var jvm = _jvmObject.Jvm;
            var dataType = GetDataType<TResult>();
            int pythonEvalType = (int)UdfUtils.GetPythonEvalType();
            bool udfDeterministic = true;

            var registeredUdf = new UserDefinedFunction(
                jvm.CallConstructor(
                    "org.apache.spark.sql.execution.python.UserDefinedPythonFunction",
                    name,
                    pythonFunction,
                    dataType,
                    pythonEvalType,
                    udfDeterministic));

            _jvmObject.Invoke("registerPython", name, registeredUdf);
        }
コード例 #7
0
ファイル: PayloadWriter.cs プロジェクト: imback82/spark-3
        /// <summary>
        /// Writes the given commands to the stream.
        ///
        /// The number of commands is written first. For each command, the number
        /// of argument offsets is written followed by each offset, then the number
        /// of chained UDFs followed by, for each UDF, the length of its serialized
        /// form and the serialized bytes.
        /// </summary>
        /// <param name="stream">Stream to write to</param>
        /// <param name="commands">Commands to write</param>
        public void Write(Stream stream, Command[] commands)
        {
            SerDe.Write(stream, commands.Length);

            foreach (Command current in commands)
            {
                // Argument offsets: count, then every offset.
                var offsets = current.ArgOffsets;
                SerDe.Write(stream, offsets.Length);
                foreach (int offset in offsets)
                {
                    SerDe.Write(stream, offset);
                }

                // Chained UDFs: count, then a length-prefixed payload per UDF.
                var chainedUdfs = current.ChainedUdfs;
                SerDe.Write(stream, chainedUdfs.Length);
                foreach (Delegate chainedUdf in chainedUdfs)
                {
                    byte[] payload = CommandSerDe.Serialize(
                        chainedUdf,
                        CommandSerDe.SerializedMode.Row,
                        CommandSerDe.SerializedMode.Row);

                    SerDe.Write(stream, payload.Length);
                    SerDe.Write(stream, payload);
                }
            }
        }
コード例 #8
0
        /// <summary>
        /// Reads SqlCommands from the stream.
        ///
        /// The number of commands is read first. For each command, the argument
        /// offsets are read, followed by the chained functions, which are
        /// deserialized according to <paramref name="evalType"/> and combined
        /// into a single worker function.
        /// </summary>
        /// <param name="evalType">Evaluation type for the current commands</param>
        /// <param name="stream">Stream to read from</param>
        /// <returns>SqlCommand objects</returns>
        /// <exception cref="InvalidDataException">
        /// Thrown when a command size read from the stream is zero or negative, or
        /// when a grouped map UDF is combined with other UDFs.
        /// </exception>
        private static SqlCommand[] ReadSqlCommands(
            PythonEvalType evalType,
            Stream stream)
        {
            int udfCount = SerDe.ReadInt32(stream);
            var result = new SqlCommand[udfCount];

            for (int udfIndex = 0; udfIndex < udfCount; ++udfIndex)
            {
                var current = new SqlCommand();

                int offsetCount = SerDe.ReadInt32(stream);
                current.ArgOffsets = new int[offsetCount];
                for (int offsetIndex = 0; offsetIndex < offsetCount; ++offsetIndex)
                {
                    current.ArgOffsets[offsetIndex] = SerDe.ReadInt32(stream);
                }

                current.NumChainedFunctions = SerDe.ReadInt32(stream);
                for (int chainIndex = 0; chainIndex < current.NumChainedFunctions; ++chainIndex)
                {
                    // Reject invalid sizes up front; the size is not used otherwise.
                    int payloadSize = SerDe.ReadInt32(stream);
                    if (payloadSize <= 0)
                    {
                        throw new InvalidDataException(
                            $"Invalid command size: {payloadSize}");
                    }

                    CommandSerDe.SerializedMode serializerMode;
                    CommandSerDe.SerializedMode deserializerMode;

                    if (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF)
                    {
                        var arrowFunction = new ArrowWorkerFunction(
                            CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
                                stream,
                                out serializerMode,
                                out deserializerMode,
                                out string _));

                        // The first function is used as-is; later ones are chained
                        // onto the function accumulated so far.
                        if (current.WorkerFunction == null)
                        {
                            current.WorkerFunction = arrowFunction;
                        }
                        else
                        {
                            current.WorkerFunction = ArrowWorkerFunction.Chain(
                                (ArrowWorkerFunction)current.WorkerFunction,
                                arrowFunction);
                        }
                    }
                    else if (evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF)
                    {
                        // Exactly one UDF with a single function is allowed here.
                        if ((udfCount != 1) || (current.WorkerFunction != null))
                        {
                            throw new InvalidDataException(
                                "Grouped map UDFs do not support combining multiple UDFs");
                        }

                        current.WorkerFunction = new ArrowGroupedMapWorkerFunction(
                            CommandSerDe.Deserialize<ArrowGroupedMapWorkerFunction.ExecuteDelegate>(
                                stream,
                                out serializerMode,
                                out deserializerMode,
                                out string _));
                    }
                    else
                    {
                        var picklingFunction = new PicklingWorkerFunction(
                            CommandSerDe.Deserialize<PicklingWorkerFunction.ExecuteDelegate>(
                                stream,
                                out serializerMode,
                                out deserializerMode,
                                out string _));

                        if (current.WorkerFunction == null)
                        {
                            current.WorkerFunction = picklingFunction;
                        }
                        else
                        {
                            current.WorkerFunction = PicklingWorkerFunction.Chain(
                                (PicklingWorkerFunction)current.WorkerFunction,
                                picklingFunction);
                        }
                    }

                    // The modes of the most recently read function are kept.
                    current.SerializerMode = serializerMode;
                    current.DeserializerMode = deserializerMode;
                }

                result[udfIndex] = current;
            }

            return result;
        }