Пример #1
0
        /// <summary>
        /// Reads Arrow record batches from <paramref name="inputStream"/>, converts each
        /// batch into a data frame, runs the command runner built from
        /// <paramref name="commands"/> over its columns, and writes the resulting record
        /// batches to <paramref name="outputStream"/>.
        /// </summary>
        /// <param name="inputStream">Stream the input record batches are read from.</param>
        /// <param name="outputStream">Stream the result record batches are written to.</param>
        /// <param name="commands">Commands used to create the command runner.</param>
        /// <returns>
        /// Statistics whose <c>NumEntriesProcessed</c> is the sum of the lengths of all
        /// record batches written to the output stream.
        /// </returns>
        private CommandExecutorStat ExecuteDataFrameSqlCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var stat = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            // Created lazily: the schema is only known once the first result batch exists.
            ArrowStreamWriter writer = null;

            try
            {
                foreach (RecordBatch input in GetInputIterator(inputStream))
                {
                    FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);

                    // Size the array from the same collection that fills it so the two
                    // counts can never disagree.
                    int columnCount = dataFrame.Columns.Count;
                    var inputColumns = new DataFrameColumn[columnCount];
                    for (int i = 0; i < columnCount; ++i)
                    {
                        inputColumns[i] = dataFrame.Columns[i];
                    }

                    DataFrameColumn[] results = commandRunner.Run(inputColumns);

                    var resultDataFrame = new FxDataFrame(results);
                    IEnumerable<RecordBatch> recordBatches =
                        resultDataFrame.ToArrowRecordBatches();

                    foreach (RecordBatch result in recordBatches)
                    {
                        stat.NumEntriesProcessed += result.Length;

                        if (writer == null)
                        {
                            writer = new ArrowStreamWriter(
                                outputStream,
                                result.Schema,
                                leaveOpen: true);
                        }

                        // TODO: Remove sync-over-async once WriteRecordBatch exists.
                        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                    }
                }

                // NOTE(review): presumably the zero marks the end of the stream for the
                // reader on the other side - confirm against the protocol.
                SerDe.Write(outputStream, 0);
            }
            finally
            {
                // Dispose even when the UDF or a write throws; leaveOpen keeps the
                // underlying output stream open for the caller.
                if (writer != null)
                {
                    writer.Dispose();
                }
            }

            return stat;
        }
Пример #2
0
        /// <summary>
        /// Reads Arrow record batches from <paramref name="inputStream"/>, converts each
        /// batch into a data frame, applies the grouped map worker function of the single
        /// command to it, and writes the resulting record batches to
        /// <paramref name="outputStream"/>.
        /// </summary>
        /// <param name="inputStream">Stream the input record batches are read from.</param>
        /// <param name="outputStream">Stream the result record batches are written to.</param>
        /// <param name="commands">
        /// Commands to execute; exactly one is expected (asserted in debug builds) and only
        /// the first one is used.
        /// </param>
        /// <returns>
        /// Statistics whose <c>NumEntriesProcessed</c> is the sum of the lengths of all
        /// record batches written to the output stream.
        /// </returns>
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            // Created lazily: the schema is only known once the first result batch exists.
            ArrowStreamWriter writer = null;

            try
            {
                foreach (RecordBatch input in GetInputIterator(inputStream))
                {
                    FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
                    FxDataFrame resultDataFrame = worker.Func(dataFrame);

                    foreach (RecordBatch result in resultDataFrame.ToArrowRecordBatches())
                    {
                        stat.NumEntriesProcessed += result.Length;

                        if (writer == null)
                        {
                            writer = new ArrowStreamWriter(
                                outputStream,
                                result.Schema,
                                leaveOpen: true);
                        }

                        // TODO: Remove sync-over-async once WriteRecordBatch exists.
                        writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                    }
                }

                // NOTE(review): presumably the zero marks the end of the stream for the
                // reader on the other side - confirm against the protocol.
                SerDe.Write(outputStream, 0);
            }
            finally
            {
                // Dispose even when the worker function or a write throws; leaveOpen
                // keeps the underlying output stream open for the caller.
                if (writer != null)
                {
                    writer.Dispose();
                }
            }

            return stat;
        }
Пример #3
0
        /// <summary>
        /// Reads Arrow record batches from <paramref name="inputStream"/>, converts each
        /// batch into a data frame, applies the grouped map worker function of the single
        /// command to it, and writes the resulting record batches (after passing each one
        /// through <c>WrapColumnsInStructIfApplicable</c>) to
        /// <paramref name="outputStream"/>.
        /// </summary>
        /// <param name="inputStream">Stream the input record batches are read from.</param>
        /// <param name="outputStream">Stream the result record batches are written to.</param>
        /// <param name="commands">
        /// Commands to execute; exactly one is expected (asserted in debug builds) and only
        /// the first one is used.
        /// </param>
        /// <returns>
        /// Statistics whose <c>NumEntriesProcessed</c> is the sum of the lengths of all
        /// record batches written to the output stream.
        /// </returns>
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            // The same IPC options are used for the writer and for the end marker.
            IpcOptions ipcOptions = ArrowIpcOptions();

            // Created lazily: the schema is only known once the first result batch exists.
            ArrowStreamWriter writer = null;

            try
            {
                foreach (RecordBatch input in GetInputIterator(inputStream))
                {
                    FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
                    FxDataFrame resultDataFrame = worker.Func(dataFrame);

                    foreach (RecordBatch batch in resultDataFrame.ToArrowRecordBatches())
                    {
                        RecordBatch final = WrapColumnsInStructIfApplicable(batch);
                        stat.NumEntriesProcessed += final.Length;

                        if (writer == null)
                        {
                            writer = new ArrowStreamWriter(
                                outputStream,
                                final.Schema,
                                leaveOpen: true,
                                ipcOptions);
                        }

                        writer.WriteRecordBatch(final);
                    }
                }

                WriteEnd(outputStream, ipcOptions);
            }
            finally
            {
                // Dispose even when the worker function or a write throws; leaveOpen
                // keeps the underlying output stream open for the caller.
                writer?.Dispose();
            }

            return stat;
        }
Пример #4
0
        /// <summary>
        /// Sums the character lengths of all values in the "name" column of
        /// <paramref name="dataFrame"/> and returns a two-column frame holding the "age"
        /// of the first row and that total.
        /// </summary>
        /// <param name="dataFrame">
        /// Input frame; it is read through its "name" (Arrow string) and "age" (Int32)
        /// columns.
        /// </param>
        /// <returns>
        /// A frame with the columns "age" and "nameCharCount". Each column holds a single
        /// value when the input has at least one row, and is empty otherwise.
        /// </returns>
        private static FxDataFrame CountCharacters(FxDataFrame dataFrame)
        {
            var characterCountColumn = new Int32DataFrameColumn("nameCharCount");
            var ageColumn = new Int32DataFrameColumn("age");
            ArrowStringDataFrameColumn nameColumn = dataFrame.Columns.GetArrowStringColumn("name");

            long rowCount = dataFrame.Rows.Count;
            int characterCount = 0;

            for (long i = 0; i < rowCount; ++i)
            {
                // A null name contributes no characters instead of throwing
                // a NullReferenceException.
                characterCount += nameColumn[i]?.Length ?? 0;
            }

            if (rowCount > 0)
            {
                characterCountColumn.Append(characterCount);

                // Only the first row's age is carried over into the result.
                ageColumn.Append(dataFrame.Columns.GetInt32Column("age")[0]);
            }

            return new FxDataFrame(ageColumn, characterCountColumn);
        }
Пример #5
0
        /// <summary>
        /// Sums the character lengths of all values in the "name" column of
        /// <paramref name="dataFrame"/> and returns a two-column frame holding the "age"
        /// of the first row and that total.
        /// </summary>
        /// <param name="dataFrame">
        /// Input frame; its "name" column must be an
        /// <see cref="ArrowStringDataFrameColumn"/> and the first value of its "age"
        /// column must be convertible to <c>int?</c>.
        /// </param>
        /// <returns>
        /// A frame with the columns "age" and "nameCharCount". Each column holds a single
        /// value when the input has at least one row, and is empty otherwise.
        /// </returns>
        /// <exception cref="System.InvalidCastException">
        /// The "name" column is not an <see cref="ArrowStringDataFrameColumn"/>.
        /// </exception>
        private static FxDataFrame CountCharacters(FxDataFrame dataFrame)
        {
            var characterCountColumn = new PrimitiveDataFrameColumn<int>("nameCharCount");
            var ageColumn = new PrimitiveDataFrameColumn<int>("age");

            // Direct cast instead of `as`: a wrong column type fails here with a clear
            // InvalidCastException rather than a NullReferenceException inside the loop.
            var nameColumn = (ArrowStringDataFrameColumn)dataFrame["name"];

            long rowCount = dataFrame.Rows.Count;
            int characterCount = 0;

            for (long i = 0; i < rowCount; ++i)
            {
                // A null name contributes no characters instead of throwing
                // a NullReferenceException.
                characterCount += nameColumn[i]?.Length ?? 0;
            }

            if (rowCount > 0)
            {
                characterCountColumn.Append(characterCount);

                // Only the first row's age is carried over into the result.
                ageColumn.Append((int?)dataFrame["age"][0]);
            }

            return new FxDataFrame(ageColumn, characterCountColumn);
        }