private CommandExecutorStat ExecuteDataFrameSqlCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        // Convert the Arrow batch into an FxDataFrame so the UDF can
        // operate on DataFrameColumn values.
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);

        var inputColumns = new DataFrameColumn[input.ColumnCount];
        for (int i = 0; i < dataFrame.Columns.Count; ++i)
        {
            inputColumns[i] = dataFrame.Columns[i];
        }

        DataFrameColumn[] results = commandRunner.Run(inputColumns);

        var resultDataFrame = new FxDataFrame(results);
        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch result in recordBatches)
        {
            stat.NumEntriesProcessed += result.Length;

            // The writer is created lazily because the result schema is not
            // known until the first result batch has been produced.
            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
            }

            // TODO: Remove sync-over-async once WriteRecordBatch exists.
            writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
        }
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
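// For reference: GetInputIterator is used above but defined elsewhere in this
// class. A minimal sketch of what it might look like, assuming Apache.Arrow's
// ArrowStreamReader is used to pull record batches off the input stream until
// it is exhausted (the real implementation may also handle end-of-stream
// tokens and empty partitions):
//
//     private IEnumerable<RecordBatch> GetInputIterator(Stream inputStream)
//     {
//         using var reader = new ArrowStreamReader(inputStream, leaveOpen: true);
//         RecordBatch batch;
//         while ((batch = reader.ReadNextRecordBatch()) != null)
//         {
//             yield return batch;
//         }
//     }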
private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    Debug.Assert(commands.Length == 1,
        "Grouped Map UDFs do not support combining multiple UDFs.");

    var stat = new CommandExecutorStat();
    var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        // A grouped-map UDF receives the whole group as a DataFrame and
        // returns a new DataFrame.
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
        FxDataFrame resultDataFrame = worker.Func(dataFrame);

        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch batch in recordBatches)
        {
            // Wrap the result columns in a struct where applicable before writing.
            RecordBatch final = WrapColumnsInStructIfApplicable(batch);
            stat.NumEntriesProcessed += final.Length;

            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, final.Schema, leaveOpen: true, ipcOptions);
            }

            writer.WriteRecordBatch(final);
        }
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
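// For reference: WrapColumnsInStructIfApplicable is defined elsewhere in this
// class. Sketched below is one way it could work, assuming newer Spark servers
// expect a grouped-map result batch as a single struct column; the version
// check and field handling are illustrative, not the exact implementation:
//
//     private RecordBatch WrapColumnsInStructIfApplicable(RecordBatch batch)
//     {
//         if (!ServerExpectsStructResults())  // hypothetical version check
//         {
//             return batch;
//         }
//
//         var fields = new Field[batch.Schema.Fields.Count];
//         for (int i = 0; i < fields.Length; ++i)
//         {
//             fields[i] = batch.Schema.GetFieldByIndex(i);
//         }
//
//         var structType = new StructType(fields);
//         var structArray = new StructArray(
//             structType, batch.Length, batch.Arrays, ArrowBuffer.Empty);
//         Schema schema = new Schema.Builder()
//             .Field(new Field("Struct", structType, nullable: false))
//             .Build();
//         return new RecordBatch(schema, new[] { structArray }, batch.Length);
//     }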
private CommandExecutorStat ExecuteArrowSqlCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    Schema resultSchema = null;
    foreach (ReadOnlyMemory<IArrowArray> input in GetArrowInputIterator(inputStream))
    {
        IArrowArray[] results = commandRunner.Run(input);

        // Assumes all columns have the same length, so uses 0th for num entries.
        int numEntries = results[0].Length;
        stat.NumEntriesProcessed += numEntries;

        // The schema and writer are created lazily from the first set of results.
        if (writer == null)
        {
            Debug.Assert(resultSchema == null);
            resultSchema = BuildSchema(results);
            writer = new ArrowStreamWriter(outputStream, resultSchema, leaveOpen: true, ipcOptions);
        }

        var recordBatch = new RecordBatch(resultSchema, results, numEntries);

        // TODO: Remove sync-over-async once WriteRecordBatch exists.
        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
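// For reference: BuildSchema is defined elsewhere in this class. A minimal
// sketch, assuming each result column is given a positional name since raw
// Arrow UDF results carry no field names of their own (the names and
// nullability here are illustrative):
//
//     private static Schema BuildSchema(IArrowArray[] results)
//     {
//         var builder = new Schema.Builder();
//         for (int i = 0; i < results.Length; ++i)
//         {
//             int idx = i;
//             builder = builder.Field(f => f
//                 .Name("_" + idx)
//                 .DataType(results[idx].Data.DataType)
//                 .Nullable(true));
//         }
//         return builder.Build();
//     }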
private void LogStat(CommandExecutorStat stat, bool readComplete)
{
    s_logger.LogInfo(
        $"[{TaskId}] Processed a task: readComplete:{readComplete}, entries:{stat.NumEntriesProcessed}");
}