protected override CommandExecutorStat ExecuteCore(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    // On the Spark side, each object in the following List<> is considered a row.
    // See the ICommandRunner comments above for the types that make up a row.
    var outputRows = new List<object>();

    // If the input is empty (no rows) or all rows have been read, then
    // SpecialLengths.END_OF_DATA_SECTION is sent as the messageLength.
    // For example, no rows:
    // +---+----+
    // |age|name|
    // +---+----+
    // +---+----+
    int messageLength = 0;
    while ((messageLength = SerDe.ReadInt32(inputStream)) !=
        (int)SpecialLengths.END_OF_DATA_SECTION)
    {
        if ((messageLength > 0) || (messageLength == (int)SpecialLengths.NULL))
        {
            if (messageLength <= 0)
            {
                throw new InvalidDataException(
                    $"Invalid message length: {messageLength}");
            }

            // Each row in inputRows is of type object[]. If a null is present in a row,
            // then the corresponding index of the row's object[] is set to null.
            // For example, (inputRows.Length == 2) and (inputRows[0][0] == null):
            // +----+
            // | age|
            // +----+
            // |null|
            // |  11|
            // +----+
            object[] inputRows =
                PythonSerDe.GetUnpickledObjects(inputStream, messageLength);

            for (int i = 0; i < inputRows.Length; ++i)
            {
                // Split id is not used for SQL UDFs, so 0 is passed.
                outputRows.Add(commandRunner.Run(0, inputRows[i]));
            }

            // The initial (estimated) buffer size for pickling rows is set to the size
            // of the input pickled rows because the number of rows is the same for both
            // input and output.
            WriteOutput(outputStream, outputRows, messageLength);
            stat.NumEntriesProcessed += inputRows.Length;
            outputRows.Clear();
        }
    }

    return stat;
}
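// The WriteOutput helper called above is not shown in this listing. What follows
// is a minimal, hedged sketch of what it could look like, assuming Razorvine.Pickle's
// Pickler (the library backing PythonSerDe) and a reusable static buffer sized by the
// sizeHint parameter; the actual implementation may differ in detail, and the field
// names here (s_pickleBuffer, s_pickler) are placeholders.
private static MemoryStream s_pickleBuffer;
private static Pickler s_pickler;

private static void WriteOutput(Stream stream, List<object> rows, int sizeHint)
{
    // Reuse one buffer across batches. sizeHint (the input's pickled size) is a
    // reasonable initial capacity because input and output row counts match.
    MemoryStream buffer =
        s_pickleBuffer ?? (s_pickleBuffer = new MemoryStream(sizeHint));
    buffer.SetLength(0);

    Pickler pickler = s_pickler ?? (s_pickler = new Pickler());
    pickler.dump(rows, buffer);

    // Length-prefixed frame: the Spark side reads the length, then the payload.
    SerDe.Write(stream, (int)buffer.Length);
    stream.Write(buffer.GetBuffer(), 0, (int)buffer.Length);
}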
protected override CommandExecutorStat ExecuteCore(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    // TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
    // For now, we write to a temporary seekable MemoryStream which we then copy to
    // the actual destination stream.
    MemoryStream tmp = s_writeOutputStream ?? (s_writeOutputStream = new MemoryStream());

    ArrowStreamWriter writer = null;
    Schema resultSchema = null;
    foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
    {
        // Split id is currently not used, so 0 is passed.
        IArrowArray[] results = commandRunner.Run(0, input);

        // Assumes all columns have the same length, so uses 0th for num entries.
        int numEntries = results[0].Length;
        stat.NumEntriesProcessed += numEntries;

        tmp.SetLength(0);

        if (writer == null)
        {
            Debug.Assert(resultSchema == null);
            resultSchema = BuildSchema(results);
            writer = new ArrowStreamWriter(tmp, resultSchema, leaveOpen: true);
        }

        var recordBatch = new RecordBatch(resultSchema, results, numEntries);

        // TODO: Remove sync-over-async once WriteRecordBatch exists.
        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();

        tmp.Position = 0;
        tmp.CopyTo(outputStream);
        outputStream.Flush();
    }

    SerDe.Write(outputStream, 0);

    if (writer != null)
    {
        writer.Dispose();
    }

    return stat;
}
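// BuildSchema is referenced above but not shown. A hedged sketch of how the
// result schema could be derived with Apache.Arrow's Schema.Builder, using the
// data type carried by each result array; the column names ("_0", "_1", ...)
// are placeholders, not necessarily the worker's actual naming convention.
private static Schema BuildSchema(IArrowArray[] resultColumns)
{
    var builder = new Schema.Builder();
    for (int i = 0; i < resultColumns.Length; ++i)
    {
        string name = "_" + i;
        builder = builder.Field(f => f
            .Name(name)
            .DataType(resultColumns[i].Data.DataType)
            .Nullable(true));
    }

    return builder.Build();
}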
protected override CommandExecutorStat ExecuteCore(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    Debug.Assert(
        commands.Length == 1,
        "Grouped Map UDFs do not support combining multiple UDFs.");

    var stat = new CommandExecutorStat();
    var worker = (ArrowGroupedMapWorkerFunction)commands[0].WorkerFunction;

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    // TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
    // For now, we write to a temporary seekable MemoryStream which we then copy to
    // the actual destination stream.
    MemoryStream tmp = s_writeOutputStream ?? (s_writeOutputStream = new MemoryStream());

    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        RecordBatch result = worker.Func(input);

        int numEntries = result.Length;
        stat.NumEntriesProcessed += numEntries;

        tmp.SetLength(0);

        if (writer == null)
        {
            writer = new ArrowStreamWriter(tmp, result.Schema, leaveOpen: true);
        }

        // TODO: Remove sync-over-async once WriteRecordBatch exists.
        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();

        tmp.Position = 0;
        tmp.CopyTo(outputStream);
        outputStream.Flush();
    }

    SerDe.Write(outputStream, 0);

    if (writer != null)
    {
        writer.Dispose();
    }

    return stat;
}
private CommandExecutorStat ExecuteDataFrameSqlCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);

        var inputColumns = new DataFrameColumn[input.ColumnCount];
        for (int i = 0; i < dataFrame.Columns.Count; ++i)
        {
            inputColumns[i] = dataFrame.Columns[i];
        }

        DataFrameColumn[] results = commandRunner.Run(inputColumns);

        var resultDataFrame = new FxDataFrame(results);
        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch result in recordBatches)
        {
            stat.NumEntriesProcessed += result.Length;

            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true);
            }

            // TODO: Remove sync-over-async once WriteRecordBatch exists.
            writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
        }
    }

    SerDe.Write(outputStream, 0);

    if (writer != null)
    {
        writer.Dispose();
    }

    return stat;
}
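// Context (hedged): FxDataFrame is an alias for Microsoft.Data.Analysis.DataFrame.
// This sketch isolates the Arrow round trip used by both DataFrame paths:
// FromArrowRecordBatch wraps the incoming Arrow buffers into DataFrame columns,
// and ToArrowRecordBatches may yield more than one batch, which is why the
// executors loop over its results. RoundTrip is a hypothetical helper name,
// not part of the worker.
private static IEnumerable<RecordBatch> RoundTrip(
    RecordBatch input, Func<FxDataFrame, FxDataFrame> transform)
{
    FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
    FxDataFrame result = transform(dataFrame);
    return result.ToArrowRecordBatches();
}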
protected override CommandExecutorStat ExecuteCore(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    ArrowStreamWriter writer = null;
    Schema resultSchema = null;
    foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
    {
        IArrowArray[] results = commandRunner.Run(input);

        // Assumes all columns have the same length, so uses 0th for num entries.
        int numEntries = results[0].Length;
        stat.NumEntriesProcessed += numEntries;

        if (writer == null)
        {
            Debug.Assert(resultSchema == null);
            resultSchema = BuildSchema(results);
            writer = new ArrowStreamWriter(outputStream, resultSchema, leaveOpen: true);
        }

        var recordBatch = new RecordBatch(resultSchema, results, numEntries);

        // TODO: Remove sync-over-async once WriteRecordBatch exists.
        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
    }

    SerDe.Write(outputStream, 0);

    if (writer != null)
    {
        writer.Dispose();
    }

    return stat;
}
private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    Debug.Assert(
        commands.Length == 1,
        "Grouped Map UDFs do not support combining multiple UDFs.");

    var stat = new CommandExecutorStat();
    var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
        FxDataFrame resultDataFrame = worker.Func(dataFrame);
        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch result in recordBatches)
        {
            stat.NumEntriesProcessed += result.Length;

            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true);
            }

            // TODO: Remove sync-over-async once WriteRecordBatch exists.
            writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
        }
    }

    SerDe.Write(outputStream, 0);

    if (writer != null)
    {
        writer.Dispose();
    }

    return stat;
}
protected override CommandExecutorStat ExecuteCore(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    Debug.Assert(
        commands.Length == 1,
        "Grouped Map UDFs do not support combining multiple UDFs.");

    var stat = new CommandExecutorStat();
    var worker = (ArrowGroupedMapWorkerFunction)commands[0].WorkerFunction;

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        RecordBatch result = worker.Func(input);

        int numEntries = result.Length;
        stat.NumEntriesProcessed += numEntries;

        if (writer == null)
        {
            writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true);
        }

        // TODO: Remove sync-over-async once WriteRecordBatch exists.
        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
    }

    SerDe.Write(outputStream, 0);

    if (writer != null)
    {
        writer.Dispose();
    }

    return stat;
}
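// Caller-side sketch (hedged): shows how a Grouped Map UDF served by the
// executor above might be registered through the DataFrame API, assuming the
// GroupBy().Apply() overload that takes a Func<RecordBatch, RecordBatch>.
// CountCharacters is a hypothetical user function; the StructType must
// describe the schema of the batches it returns.
private static DataFrame ApplyGroupedMapUdf(DataFrame df)
{
    return df
        .GroupBy("age")
        .Apply(
            new StructType(new[]
            {
                new StructField("age", new IntegerType()),
                new StructField("nameCharCount", new IntegerType())
            }),
            batch => CountCharacters(batch)); // CountCharacters: user-defined
}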
/// <summary>
/// Executes the commands on the input data read from the input stream
/// and writes the results to the output stream.
/// </summary>
/// <param name="inputStream">Input stream to read data from</param>
/// <param name="outputStream">Output stream to write results to</param>
/// <param name="splitIndex">Split index for this task</param>
/// <param name="command">Contains the command to execute</param>
/// <returns>Statistics captured during the Execute() run</returns>
internal CommandExecutorStat Execute(
    Stream inputStream,
    Stream outputStream,
    int splitIndex,
    RDDCommand command)
{
    var stat = new CommandExecutorStat();

    CommandSerDe.SerializedMode serializerMode = command.SerializerMode;
    CommandSerDe.SerializedMode deserializerMode = command.DeserializerMode;

    RDD.WorkerFunction.ExecuteDelegate func = command.WorkerFunction.Func;
    foreach (object output in func(
        splitIndex,
        GetInputIterator(inputStream, deserializerMode)))
    {
        WriteOutput(outputStream, serializerMode, output);
        ++stat.NumEntriesProcessed;
    }

    return stat;
}
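// The RDD-side WriteOutput helper is not shown in this listing. A hedged
// sketch for the common Byte serializer mode, where each output object is
// already a serialized byte[] payload; other modes (e.g. String) would encode
// differently, and the real worker handles them as well.
private static void WriteOutput(
    Stream stream, CommandSerDe.SerializedMode mode, object output)
{
    Debug.Assert(mode == CommandSerDe.SerializedMode.Byte);

    var payload = (byte[])output;
    SerDe.Write(stream, payload.Length);      // length prefix
    stream.Write(payload, 0, payload.Length); // payload bytes
}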