Example #1
        protected override CommandExecutorStat ExecuteCore(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var stat = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            // On the Spark side, each object in the following List<> is considered as a row.
            // See the ICommandRunner comments above for the types for a row.
            var outputRows = new List<object>();

            // If the input is empty (no rows) or all rows have been read, then
            // SpecialLengths.END_OF_DATA_SECTION is sent as the messageLength.
            // For example, no rows:
            //   +---+----+
            //   |age|name|
            //   +---+----+
            //   +---+----+
            int messageLength = 0;

            while ((messageLength = SerDe.ReadInt32(inputStream)) !=
                   (int)SpecialLengths.END_OF_DATA_SECTION)
            {
                if ((messageLength > 0) || (messageLength == (int)SpecialLengths.NULL))
                {
                    // A NULL length marker is not expected for SQL UDF input; routing it
                    // into the check below surfaces it as an explicit error instead of
                    // silently skipping the message.
                    if (messageLength <= 0)
                    {
                        throw new InvalidDataException(
                            $"Invalid message length: {messageLength}");
                    }

                    // Each row in inputRows is of type object[]. If a null is present in a
                    // row, the corresponding element of that object[] is set to null.
                    // For example, (inputRows.Length == 2) and (inputRows[0][0] == null)
                    //   +----+
                    //   | age|
                    //   +----+
                    //   |null|
                    //   |  11|
                    //   +----+
                    object[] inputRows = PythonSerDe.GetUnpickledObjects(inputStream, messageLength);

                    for (int i = 0; i < inputRows.Length; ++i)
                    {
                        // Split id is not used for SQL UDFs, so 0 is passed.
                        outputRows.Add(commandRunner.Run(0, inputRows[i]));
                    }

                    // The initial (estimated) buffer size for pickling rows is set to the
                    // size of the input pickled rows, since the number of rows is the same
                    // for both input and output.
                    WriteOutput(outputStream, outputRows, messageLength);
                    stat.NumEntriesProcessed += inputRows.Length;
                    outputRows.Clear();
                }
            }

            return stat;
        }
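
The loop above follows the length-prefixed framing of the Spark worker protocol: each payload is preceded by a 32-bit length, and special negative values act as sentinels such as END_OF_DATA_SECTION. A minimal, self-contained sketch of that framing pattern (the FramedReader class and its sentinel value are illustrative, not the library's API; endianness is also glossed over here, whereas SerDe reads big-endian):

    using System.Collections.Generic;
    using System.IO;

    static class FramedReader
    {
        // Hypothetical sentinel; Spark's SpecialLengths enum defines the real values.
        private const int EndOfData = -1;

        // Yields each length-prefixed payload until the end-of-data sentinel is read.
        public static IEnumerable<byte[]> ReadFrames(Stream stream)
        {
            var reader = new BinaryReader(stream);
            int length;
            while ((length = reader.ReadInt32()) != EndOfData)
            {
                if (length <= 0)
                {
                    throw new InvalidDataException($"Invalid message length: {length}");
                }
                yield return reader.ReadBytes(length);
            }
        }
    }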
Example #2
        protected override CommandExecutorStat ExecuteCore(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var stat = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            // TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
            // For now, we write to a temporary seekable MemoryStream which we then copy to
            // the actual destination stream.
            MemoryStream tmp = s_writeOutputStream ?? (s_writeOutputStream = new MemoryStream());

            ArrowStreamWriter writer = null;
            Schema resultSchema = null;

            foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
            {
                // Split id is currently not used, so 0 is passed.
                IArrowArray[] results = commandRunner.Run(0, input);

                // Assumes all columns have the same length, so uses 0th for num entries.
                int numEntries = results[0].Length;
                stat.NumEntriesProcessed += numEntries;

                tmp.SetLength(0);

                if (writer == null)
                {
                    Debug.Assert(resultSchema == null);
                    resultSchema = BuildSchema(results);

                    writer = new ArrowStreamWriter(tmp, resultSchema, leaveOpen: true);
                }

                var recordBatch = new RecordBatch(resultSchema, results, numEntries);

                // TODO: Remove sync-over-async once WriteRecordBatch exists.
                writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();

                tmp.Position = 0;
                tmp.CopyTo(outputStream);
                outputStream.Flush();
            }

            SerDe.Write(outputStream, 0);

            if (writer != null)
            {
                writer.Dispose();
            }

            return stat;
        }
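
The temporary MemoryStream above exists because this ArrowStreamWriter needs a seekable stream: each record batch is staged in a reusable in-memory buffer and then copied to the real, possibly non-seekable, output. The buffer-and-copy pattern in isolation (a generic sketch, not the library's code):

    using System.IO;

    class BufferedCopySketch
    {
        // One reusable buffer for all writes, so each batch does not allocate a new stream.
        private static readonly MemoryStream s_buffer = new MemoryStream();

        public static void WriteBuffered(Stream destination, byte[] payload)
        {
            s_buffer.SetLength(0);                       // truncate, keeping the capacity
            s_buffer.Write(payload, 0, payload.Length);  // stage the data in memory
            s_buffer.Position = 0;                       // rewind before copying out
            s_buffer.CopyTo(destination);
            destination.Flush();
        }
    }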
Example #3
        protected override CommandExecutorStat ExecuteCore(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat = new CommandExecutorStat();
            var worker = (ArrowGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            // TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
            // For now, we write to a temporary seekable MemoryStream which we then copy to
            // the actual destination stream.
            MemoryStream tmp = s_writeOutputStream ?? (s_writeOutputStream = new MemoryStream());

            ArrowStreamWriter writer = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                RecordBatch result = worker.Func(input);

                int numEntries = result.Length;
                stat.NumEntriesProcessed += numEntries;

                tmp.SetLength(0);

                if (writer == null)
                {
                    writer = new ArrowStreamWriter(tmp, result.Schema, leaveOpen: true);
                }

                // TODO: Remove sync-over-async once WriteRecordBatch exists.
                writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();

                tmp.Position = 0;
                tmp.CopyTo(outputStream);
                outputStream.Flush();
            }

            SerDe.Write(outputStream, 0);

            if (writer != null)
            {
                writer.Dispose();
            }

            return stat;
        }
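
The GetAwaiter().GetResult() calls in these examples block on the async write while preserving the original exception type; Task.Wait() or Task.Result would wrap any failure in an AggregateException instead. A small sketch of the difference:

    using System;
    using System.Threading.Tasks;

    class SyncOverAsyncSketch
    {
        private static Task FailAsync() =>
            Task.FromException(new InvalidOperationException("boom"));

        public static void Main()
        {
            try
            {
                // Rethrows the original InvalidOperationException.
                FailAsync().GetAwaiter().GetResult();
            }
            catch (InvalidOperationException) { /* original type preserved */ }

            try
            {
                // Wait() wraps the failure in an AggregateException.
                FailAsync().Wait();
            }
            catch (AggregateException) { /* wrapped */ }
        }
    }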
Example #4
        private CommandExecutorStat ExecuteDataFrameSqlCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var stat = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            ArrowStreamWriter writer = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
                var inputColumns = new DataFrameColumn[input.ColumnCount];
                for (int i = 0; i < dataFrame.Columns.Count; ++i)
                {
                    inputColumns[i] = dataFrame.Columns[i];
                }

                DataFrameColumn[] results = commandRunner.Run(inputColumns);

                var resultDataFrame = new FxDataFrame(results);
                IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

                foreach (RecordBatch result in recordBatches)
                {
                    stat.NumEntriesProcessed += result.Length;

                    if (writer == null)
                    {
                        writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true);
                    }

                    // TODO: Remove sync-over-async once WriteRecordBatch exists.
                    writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                }
            }

            SerDe.Write(outputStream, 0);

            if (writer != null)
            {
                writer.Dispose();
            }

            return stat;
        }
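
FxDataFrame here appears to be an alias for Microsoft.Data.Analysis.DataFrame, which can convert to and from Arrow record batches. A minimal round trip, assuming the Microsoft.Data.Analysis and Apache.Arrow packages (the column name and values are made up):

    using System;
    using Apache.Arrow;
    using Microsoft.Data.Analysis;

    class DataFrameArrowSketch
    {
        public static void Main()
        {
            // Build a one-column DataFrame.
            var ages = new Int32DataFrameColumn("age", new int[] { 25, 32, 47 });
            var df = new DataFrame(ages);

            // DataFrame -> Arrow record batches -> DataFrame.
            foreach (RecordBatch batch in df.ToArrowRecordBatches())
            {
                DataFrame roundTripped = DataFrame.FromArrowRecordBatch(batch);
                Console.WriteLine(roundTripped.Rows.Count); // 3
            }
        }
    }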
Example #5
        protected override CommandExecutorStat ExecuteCore(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var stat = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            ArrowStreamWriter writer = null;
            Schema resultSchema = null;

            foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
            {
                IArrowArray[] results = commandRunner.Run(input);

                // Assumes all columns have the same length, so uses 0th for num entries.
                int numEntries = results[0].Length;
                stat.NumEntriesProcessed += numEntries;

                if (writer == null)
                {
                    Debug.Assert(resultSchema == null);
                    resultSchema = BuildSchema(results);

                    writer = new ArrowStreamWriter(outputStream, resultSchema, leaveOpen: true);
                }

                var recordBatch = new RecordBatch(resultSchema, results, numEntries);

                // TODO: Remove sync-over-async once WriteRecordBatch exists.
                writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
            }

            SerDe.Write(outputStream, 0);

            if (writer != null)
            {
                writer.Dispose();
            }

            return stat;
        }
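
The BuildSchema/RecordBatch/ArrowStreamWriter sequence above can be exercised on its own. A sketch using the Apache.Arrow package, with a made-up column (the hand-built schema stands in for BuildSchema, which is not shown in these examples):

    using System.IO;
    using Apache.Arrow;
    using Apache.Arrow.Ipc;
    using Apache.Arrow.Types;

    class ArrowWriteSketch
    {
        public static void Main()
        {
            // Build one Arrow column and a matching single-field schema.
            IArrowArray ages = new Int32Array.Builder()
                .AppendRange(new[] { 25, 32, 47 })
                .Build();
            Schema schema = new Schema.Builder()
                .Field(new Field("age", Int32Type.Default, nullable: false))
                .Build();
            var batch = new RecordBatch(schema, new[] { ages }, 3);

            // Stream the batch in Arrow IPC format.
            using (var stream = new MemoryStream())
            using (var writer = new ArrowStreamWriter(stream, schema))
            {
                writer.WriteRecordBatchAsync(batch).GetAwaiter().GetResult();
            }
        }
    }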
Example #6
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            ArrowStreamWriter writer = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
                FxDataFrame resultDataFrame = worker.Func(dataFrame);
                IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

                foreach (RecordBatch result in recordBatches)
                {
                    stat.NumEntriesProcessed += result.Length;

                    if (writer == null)
                    {
                        writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true);
                    }

                    // TODO: Remove sync-over-async once WriteRecordBatch exists.
                    writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                }
            }

            SerDe.Write(outputStream, 0);

            if (writer != null)
            {
                writer.Dispose();
            }

            return stat;
        }
Example #7
        protected override CommandExecutorStat ExecuteCore(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat = new CommandExecutorStat();
            var worker = (ArrowGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            ArrowStreamWriter writer = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                RecordBatch result = worker.Func(input);

                int numEntries = result.Length;
                stat.NumEntriesProcessed += numEntries;

                if (writer == null)
                {
                    writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true);
                }

                // TODO: Remove sync-over-async once WriteRecordBatch exists.
                writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
            }

            SerDe.Write(outputStream, 0);

            if (writer != null)
            {
                writer.Dispose();
            }

            return stat;
        }
Example #8
        /// <summary>
        /// Executes the command on the input data read from the input stream
        /// and writes results to the output stream.
        /// </summary>
        /// <param name="inputStream">Input stream to read data from</param>
        /// <param name="outputStream">Output stream to write results to</param>
        /// <param name="splitIndex">Split index for this task</param>
        /// <param name="command">Contains the commands to execute</param>
        /// <returns>Statistics captured during the Execute() run</returns>
        internal CommandExecutorStat Execute(
            Stream inputStream,
            Stream outputStream,
            int splitIndex,
            RDDCommand command)
        {
            var stat = new CommandExecutorStat();

            CommandSerDe.SerializedMode serializerMode = command.SerializerMode;
            CommandSerDe.SerializedMode deserializerMode = command.DeserializerMode;

            RDD.WorkerFunction.ExecuteDelegate func = command.WorkerFunction.Func;
            foreach (object output in func(
                         splitIndex,
                         GetInputIterator(inputStream, deserializerMode)))
            {
                WriteOutput(outputStream, serializerMode, output);

                ++stat.NumEntriesProcessed;
            }

            return stat;
        }
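
On the RDD path, the worker function is an iterator transform: it receives the split index and a lazy iterator over the deserialized input, and yields output objects that are serialized one by one. The shape of that delegate as a generic sketch (this declaration and the names in it are illustrative stand-ins for RDD.WorkerFunction.ExecuteDelegate, not the actual API):

    using System;
    using System.Collections.Generic;
    using System.Linq;

    // Hypothetical stand-in: takes the task's split index and a lazy input
    // iterator, yields transformed outputs.
    delegate IEnumerable<object> ExecuteDelegate(int splitIndex, IEnumerable<object> input);

    class WorkerSketch
    {
        public static void Main()
        {
            ExecuteDelegate func = (split, rows) =>
                rows.Select(r => (object)((int)r * 2));

            foreach (object output in func(0, new object[] { 1, 2, 3 }))
            {
                Console.WriteLine(output); // 2, 4, 6
            }
        }
    }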