/// <summary>
/// Read SqlCommands from the stream based on the given version.
/// </summary>
/// <param name="evalType">Evaluation type for the current commands</param>
/// <param name="stream">Stream to read from</param>
/// <param name="version">Spark version</param>
/// <returns>SqlCommand objects</returns>
private static SqlCommand[] ReadSqlCommands(
    PythonEvalType evalType,
    Stream stream,
    Version version)
{
    // Guard: only these three eval types have a reader implementation.
    bool isSupportedEvalType =
        (evalType == PythonEvalType.SQL_BATCHED_UDF) ||
        (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF) ||
        (evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF);
    if (!isSupportedEvalType)
    {
        throw new NotImplementedException($"{evalType} is not supported.");
    }

    // Dispatch to the processor matching the Spark wire protocol version.
    if (version.Major == 2)
    {
        if (version.Minor == 3)
        {
            return SqlCommandProcessorV2_3_X.Process(evalType, stream);
        }

        if (version.Minor == 4)
        {
            return SqlCommandProcessorV2_4_X.Process(evalType, stream);
        }
    }

    throw new NotSupportedException($"Spark {version} not supported.");
}
/// <summary>
/// Reads SqlCommands from the stream and, for Arrow-based eval types,
/// consumes the trailing timezone string that follows them.
/// </summary>
/// <param name="evalType">Evaluation type for the current commands</param>
/// <param name="stream">Stream to read from</param>
/// <returns>SqlCommand objects</returns>
internal static SqlCommand[] Process(PythonEvalType evalType, Stream stream)
{
    SqlCommand[] commands = ReadSqlCommands(evalType, stream);

    bool isArrowEvalType =
        (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF) ||
        (evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF);
    if (isArrowEvalType)
    {
        // Reads the timezone information. This is not going to be used until
        // timestamp column is supported in Arrow.
        SerDe.ReadString(stream);
    }

    return commands;
}
/// <summary>
/// Read SqlCommands from the stream based on the given version.
/// </summary>
/// <param name="evalType">Evaluation type for the current commands</param>
/// <param name="stream">Stream to read from</param>
/// <param name="version">Spark version</param>
/// <returns>SqlCommand objects</returns>
private static SqlCommand[] ReadSqlCommands(
    PythonEvalType evalType,
    Stream stream,
    Version version)
{
    // Guard: only these three eval types have a reader implementation.
    bool isSupportedEvalType =
        (evalType == PythonEvalType.SQL_BATCHED_UDF) ||
        (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF) ||
        (evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF);
    if (!isSupportedEvalType)
    {
        throw new NotImplementedException($"{evalType} is not supported.");
    }

    // Spark 2.4.x and all 3.x minors share the same command wire format,
    // so both route to the V2_4_X processor.
    if (((version.Major == 2) && (version.Minor == 4)) || (version.Major == 3))
    {
        return SqlCommandProcessorV2_4_X.Process(evalType, stream);
    }

    throw new NotSupportedException($"Spark {version} not supported.");
}
/// <summary>
/// For pandas-based eval types, drains the runner configuration key/value
/// pairs from the stream, then reads the SqlCommands that follow.
/// </summary>
/// <param name="evalType">Evaluation type for the current commands</param>
/// <param name="stream">Stream to read from</param>
/// <returns>SqlCommand objects</returns>
internal static SqlCommand[] Process(PythonEvalType evalType, Stream stream)
{
    switch (evalType)
    {
        case PythonEvalType.SQL_SCALAR_PANDAS_UDF:
        case PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        case PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
        case PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF:
            int numConf = SerDe.ReadInt32(stream);
            for (int confIndex = 0; confIndex < numConf; ++confIndex)
            {
                // Currently this setting is not used.
                // When Arrow supports timestamp type, "spark.sql.session.timeZone"
                // can be retrieved from here.
                SerDe.ReadString(stream); // conf key
                SerDe.ReadString(stream); // conf value
            }
            break;
    }

    return ReadSqlCommands(evalType, stream);
}
/// <summary>
/// Read SqlCommands from the stream.
/// Wire format per command: arg-offset count, arg offsets, chained-function
/// count, then for each chained function a byte count followed by the
/// serialized function payload (deserialized via CommandSerDe).
/// </summary>
/// <param name="stream">Stream to read from</param>
/// <param name="evalType">Evaluation type for the current commands</param>
/// <returns>SqlCommand objects</returns>
private static SqlCommand[] ReadSqlCommands(
    PythonEvalType evalType,
    Stream stream)
{
    int numUdfs = SerDe.ReadInt32(stream);
    var commands = new SqlCommand[numUdfs];

    for (int i = 0; i < numUdfs; ++i)
    {
        var command = new SqlCommand();

        // Offsets of the input columns this UDF consumes.
        int numArgsOffsets = SerDe.ReadInt32(stream);
        command.ArgOffsets = new int[numArgsOffsets];
        for (int argIndex = 0; argIndex < numArgsOffsets; ++argIndex)
        {
            command.ArgOffsets[argIndex] = SerDe.ReadInt32(stream);
        }

        command.NumChainedFunctions = SerDe.ReadInt32(stream);
        for (int funcIndex = 0; funcIndex < command.NumChainedFunctions; ++funcIndex)
        {
            // Byte count of the serialized function; a non-positive value is
            // treated as corrupt input and rejected below.
            int commandBytesCount = SerDe.ReadInt32(stream);
            if (commandBytesCount > 0)
            {
                CommandSerDe.SerializedMode serializerMode;
                CommandSerDe.SerializedMode deserializerMode;
                if (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF)
                {
                    // Scalar pandas UDFs use the Arrow execution path; chained
                    // functions are folded into one composed worker function.
                    var curWorkerFunction = new ArrowWorkerFunction(
                        CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
                            stream,
                            out serializerMode,
                            out deserializerMode,
                            out string runMode));
                    command.WorkerFunction = (command.WorkerFunction == null) ?
                        curWorkerFunction :
                        ArrowWorkerFunction.Chain(
                            (ArrowWorkerFunction)command.WorkerFunction,
                            curWorkerFunction);
                }
                else if (evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF)
                {
                    // Grouped map UDFs must arrive as exactly one un-chained
                    // function; anything else is rejected as invalid data.
                    if ((numUdfs != 1) || (command.WorkerFunction != null))
                    {
                        throw new InvalidDataException(
                            "Grouped map UDFs do not support combining multiple UDFs");
                    }
                    command.WorkerFunction = new ArrowGroupedMapWorkerFunction(
                        CommandSerDe.Deserialize<ArrowGroupedMapWorkerFunction.ExecuteDelegate>(
                            stream,
                            out serializerMode,
                            out deserializerMode,
                            out string runMode));
                }
                else
                {
                    // All remaining eval types use the pickling path, again
                    // composing chained functions into a single worker function.
                    var curWorkerFunction = new PicklingWorkerFunction(
                        CommandSerDe.Deserialize<PicklingWorkerFunction.ExecuteDelegate>(
                            stream,
                            out serializerMode,
                            out deserializerMode,
                            out string runMode));
                    command.WorkerFunction = (command.WorkerFunction == null) ?
                        curWorkerFunction :
                        PicklingWorkerFunction.Chain(
                            (PicklingWorkerFunction)command.WorkerFunction,
                            curWorkerFunction);
                }

                // NOTE(review): modes from the last chained function win;
                // presumably all functions in one chain share the same modes —
                // confirm against CommandSerDe.Serialize.
                command.SerializerMode = serializerMode;
                command.DeserializerMode = deserializerMode;
            }
            else
            {
                throw new InvalidDataException(
                    $"Invalid command size: {commandBytesCount}");
            }
        }

        commands[i] = command;
    }

    return (commands);
}