private CommandExecutorStat ExecuteDataFrameSqlCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            // Runs DataFrame-based SQL UDFs over an incoming Arrow stream and writes the
            // resulting record batches back to the output as a new Arrow stream.
            //
            // Fixes:
            //  * inputColumns was sized by input.ColumnCount but filled by iterating
            //    dataFrame.Columns.Count — if the two ever differed this left null slots
            //    or threw IndexOutOfRangeException. It is now sized by the loop bound.
            //  * The ArrowStreamWriter is disposed in a finally block so it is not
            //    leaked when a UDF or a write throws mid-stream.
            var            stat          = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions = ArrowIpcOptions();
            ArrowStreamWriter writer     = null;

            try
            {
                foreach (RecordBatch input in GetInputIterator(inputStream))
                {
                    FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);

                    // Materialize the input columns for the command runner.
                    var inputColumns = new DataFrameColumn[dataFrame.Columns.Count];
                    for (int i = 0; i < inputColumns.Length; ++i)
                    {
                        inputColumns[i] = dataFrame.Columns[i];
                    }

                    DataFrameColumn[] results = commandRunner.Run(inputColumns);

                    var resultDataFrame = new FxDataFrame(results);
                    foreach (RecordBatch result in resultDataFrame.ToArrowRecordBatches())
                    {
                        stat.NumEntriesProcessed += result.Length;

                        // The writer is created lazily because the result schema is only
                        // known once the first output batch has been produced.
                        if (writer == null)
                        {
                            writer = new ArrowStreamWriter(
                                outputStream, result.Schema, leaveOpen: true, ipcOptions);
                        }

                        // TODO: Remove sync-over-async once WriteRecordBatch exists.
                        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                    }
                }

                WriteEnd(outputStream, ipcOptions);
            }
            finally
            {
                writer?.Dispose();
            }

            return stat;
        }
// Example #2
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            // Runs a single Grouped Map UDF: each incoming Arrow batch is converted to a
            // DataFrame, passed through the worker function, and the resulting batches
            // are written back to the output Arrow stream.
            //
            // Fix: the ArrowStreamWriter is now disposed in a finally block so it is not
            // leaked when the worker function or a write throws mid-stream.
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat   = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions = ArrowIpcOptions();
            ArrowStreamWriter writer     = null;

            try
            {
                foreach (RecordBatch input in GetInputIterator(inputStream))
                {
                    FxDataFrame dataFrame       = FxDataFrame.FromArrowRecordBatch(input);
                    FxDataFrame resultDataFrame = worker.Func(dataFrame);

                    foreach (RecordBatch batch in resultDataFrame.ToArrowRecordBatches())
                    {
                        RecordBatch final = WrapColumnsInStructIfApplicable(batch);
                        stat.NumEntriesProcessed += final.Length;

                        // The writer is created lazily because the result schema is only
                        // known once the first output batch has been produced.
                        if (writer == null)
                        {
                            writer = new ArrowStreamWriter(
                                outputStream, final.Schema, leaveOpen: true, ipcOptions);
                        }

                        writer.WriteRecordBatch(final);
                    }
                }

                WriteEnd(outputStream, ipcOptions);
            }
            finally
            {
                writer?.Dispose();
            }

            return stat;
        }
        // NOTE(review): this duplicates the name of the preceding
        // ExecuteDataFrameGroupedMapCommand — the two appear to be alternate versions
        // of the same example (this one uses sync-over-async writes). Only one can be
        // kept in a compiling class; confirm which version is current and drop the other.
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            // Runs a single Grouped Map UDF over the incoming Arrow stream and writes
            // the resulting record batches back as an Arrow stream.
            //
            // Fix: the ArrowStreamWriter is now disposed in a finally block so it is not
            // leaked when the worker function or a write throws mid-stream.
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat   = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions = ArrowIpcOptions();
            ArrowStreamWriter writer     = null;

            try
            {
                foreach (RecordBatch input in GetInputIterator(inputStream))
                {
                    FxDataFrame dataFrame       = FxDataFrame.FromArrowRecordBatch(input);
                    FxDataFrame resultDataFrame = worker.Func(dataFrame);

                    foreach (RecordBatch result in resultDataFrame.ToArrowRecordBatches())
                    {
                        stat.NumEntriesProcessed += result.Length;

                        // The writer is created lazily because the result schema is only
                        // known once the first output batch has been produced.
                        if (writer == null)
                        {
                            writer = new ArrowStreamWriter(
                                outputStream, result.Schema, leaveOpen: true, ipcOptions);
                        }

                        // TODO: Remove sync-over-async once WriteRecordBatch exists.
                        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                    }
                }

                WriteEnd(outputStream, ipcOptions);
            }
            finally
            {
                writer?.Dispose();
            }

            return stat;
        }
        private CommandExecutorStat ExecuteArrowSqlCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            // Runs Arrow-array-based SQL UDFs over the incoming Arrow stream and writes
            // the resulting record batches back as an Arrow stream.
            //
            // Fix: the ArrowStreamWriter is now disposed in a finally block so it is not
            // leaked when a UDF or a write throws mid-stream.
            var            stat          = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions   = ArrowIpcOptions();
            ArrowStreamWriter writer       = null;
            Schema            resultSchema = null;

            try
            {
                foreach (ReadOnlyMemory<IArrowArray> input in GetArrowInputIterator(inputStream))
                {
                    IArrowArray[] results = commandRunner.Run(input);

                    // Assumes all columns have the same length, so uses 0th for num entries.
                    int numEntries = results[0].Length;
                    stat.NumEntriesProcessed += numEntries;

                    // The schema and writer are created lazily because the result schema
                    // is only known once the first output arrays have been produced.
                    if (writer == null)
                    {
                        Debug.Assert(resultSchema == null);
                        resultSchema = BuildSchema(results);

                        writer = new ArrowStreamWriter(
                            outputStream, resultSchema, leaveOpen: true, ipcOptions);
                    }

                    var recordBatch = new RecordBatch(resultSchema, results, numEntries);

                    // TODO: Remove sync-over-async once WriteRecordBatch exists.
                    writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
                }

                WriteEnd(outputStream, ipcOptions);
            }
            finally
            {
                writer?.Dispose();
            }

            return stat;
        }
// Example #5
 // Logs a one-line summary of a completed task: whether the input stream was fully
 // read and how many entries were processed.
 private void LogStat(CommandExecutorStat stat, bool readComplete) =>
     s_logger.LogInfo($"[{TaskId}] Processed a task: readComplete:{readComplete}, entries:{stat.NumEntriesProcessed}");