Example #1
        private CommandExecutorStat ExecuteDataFrameSqlCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var            stat          = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions = ArrowIpcOptions();
            ArrowStreamWriter writer     = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                FxDataFrame dataFrame    = FxDataFrame.FromArrowRecordBatch(input);
                var         inputColumns = new DataFrameColumn[input.ColumnCount];
                for (int i = 0; i < dataFrame.Columns.Count; ++i)
                {
                    inputColumns[i] = dataFrame.Columns[i];
                }

                DataFrameColumn[] results = commandRunner.Run(inputColumns);

                var resultDataFrame = new FxDataFrame(results);
                IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

                foreach (RecordBatch result in recordBatches)
                {
                    stat.NumEntriesProcessed += result.Length;

                    if (writer == null)
                    {
                        writer =
                            new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
                    }

                    // TODO: Remove sync-over-async once WriteRecordBatch exists.
                    writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                }
            }

            WriteEnd(outputStream, ipcOptions);
            writer?.Dispose();

            return stat;
        }
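The WriteEnd helper called here is not included in these examples. A minimal sketch of what it might do, assuming it simply emits the Arrow end-of-stream marker directly to the stream (useful because the lazily created writer can still be null when no batches were produced):
        private void WriteEnd(Stream stream, IpcOptions ipcOptions)
        {
            // The current IPC format prefixes each message length with a
            // continuation token (-1); the legacy format omits it.
            if (!ipcOptions.WriteLegacyIpcFormat)
            {
                SerDe.Write(stream, -1);
            }

            // A zero message length marks the end of the Arrow stream.
            SerDe.Write(stream, 0);
        }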
Example #2
        private static void TestRoundTripRecordBatch(RecordBatch originalBatch, IpcOptions options = null)
        {
            using (MemoryStream stream = new MemoryStream())
            {
                using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true, options))
                {
                    writer.WriteRecordBatch(originalBatch);
                    writer.WriteEnd();
                }

                stream.Position = 0;

                using (var reader = new ArrowStreamReader(stream))
                {
                    RecordBatch newBatch = reader.ReadNextRecordBatch();
                    ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
                }
            }
        }
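For streams that may carry more than one batch, the reader can be drained until ReadNextRecordBatch returns null. A small sketch of that pattern (the helper name is illustrative):
        private static void ReadAllBatches(Stream stream)
        {
            using (var reader = new ArrowStreamReader(stream))
            {
                RecordBatch batch;
                while ((batch = reader.ReadNextRecordBatch()) != null)
                {
                    // Consume each batch; ReadNextRecordBatch returns null
                    // once the end-of-stream marker is reached.
                    Console.WriteLine(batch.Length);
                }
            }
        }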
Example #3
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat   = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions = ArrowIpcOptions();
            ArrowStreamWriter writer     = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                FxDataFrame dataFrame       = FxDataFrame.FromArrowRecordBatch(input);
                FxDataFrame resultDataFrame = worker.Func(dataFrame);

                IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

                foreach (RecordBatch batch in recordBatches)
                {
                    RecordBatch final = WrapColumnsInStructIfApplicable(batch);
                    stat.NumEntriesProcessed += final.Length;

                    if (writer == null)
                    {
                        writer =
                            new ArrowStreamWriter(outputStream, final.Schema, leaveOpen: true, ipcOptions);
                    }

                    writer.WriteRecordBatch(final);
                }
            }

            WriteEnd(outputStream, ipcOptions);
            writer?.Dispose();

            return stat;
        }
Example #4
        private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            Debug.Assert(commands.Length == 1,
                         "Grouped Map UDFs do not support combining multiple UDFs.");

            var stat   = new CommandExecutorStat();
            var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions = ArrowIpcOptions();
            ArrowStreamWriter writer     = null;

            foreach (RecordBatch input in GetInputIterator(inputStream))
            {
                FxDataFrame dataFrame                  = FxDataFrame.FromArrowRecordBatch(input);
                FxDataFrame resultDataFrame            = worker.Func(dataFrame);
                IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

                foreach (RecordBatch result in recordBatches)
                {
                    stat.NumEntriesProcessed += result.Length;

                    if (writer == null)
                    {
                        writer =
                            new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
                    }

                    // TODO: Remove sync-over-async once WriteRecordBatch exists.
                    writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
                }
            }

            WriteEnd(outputStream, ipcOptions);
            writer?.Dispose();

            return stat;
        }
Example #5
        private CommandExecutorStat ExecuteArrowSqlCommand(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var            stat          = new CommandExecutorStat();
            ICommandRunner commandRunner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            IpcOptions        ipcOptions   = ArrowIpcOptions();
            ArrowStreamWriter writer       = null;
            Schema            resultSchema = null;

            foreach (ReadOnlyMemory<IArrowArray> input in GetArrowInputIterator(inputStream))
            {
                IArrowArray[] results = commandRunner.Run(input);

                // Assumes all columns have the same length, so uses 0th for num entries.
                int numEntries = results[0].Length;
                stat.NumEntriesProcessed += numEntries;

                if (writer == null)
                {
                    Debug.Assert(resultSchema == null);
                    resultSchema = BuildSchema(results);

                    writer =
                        new ArrowStreamWriter(outputStream, resultSchema, leaveOpen: true, ipcOptions);
                }

                var recordBatch = new RecordBatch(resultSchema, results, numEntries);

                // TODO: Remove sync-over-async once WriteRecordBatch exists.
                writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
            }

            WriteEnd(outputStream, ipcOptions);
            writer?.Dispose();

            return stat;
        }
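BuildSchema is referenced above but not shown in these examples. A rough sketch, assuming it derives one field per result column from each array's Arrow data type (the column naming below is hypothetical):
        private static Schema BuildSchema(IArrowArray[] results)
        {
            var builder = new Schema.Builder();
            for (int i = 0; i < results.Length; ++i)
            {
                int column = i;
                builder.Field(f => f
                    .Name("_" + column)                      // hypothetical column name
                    .DataType(results[column].Data.DataType)
                    .Nullable(true));
            }

            return builder.Build();
        }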
Example #6
        public void TestPicklingSqlCommandExecutorWithEmptyInput(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            _ = ipcOptions;
            var udfWrapper = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
            var command    = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
                Commands = new[] { command }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            // Write test data to the input stream. For the empty input scenario,
            // only send SpecialLengths.END_OF_DATA_SECTION.
            SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(0, stat.NumEntriesProcessed);

            // Validate the output stream.
            Assert.Equal(0, outputStream.Length);
        }
Example #7
        public async Task WriteLegacyIpcFormatAsync(bool writeLegacyIpcFormat)
        {
            RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);
            var         options       = new IpcOptions()
            {
                WriteLegacyIpcFormat = writeLegacyIpcFormat
            };

            using (MemoryStream stream = new MemoryStream())
            {
                using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true, options))
                {
                    await writer.WriteRecordBatchAsync(originalBatch);

                    await writer.WriteEndAsync();
                }

                stream.Position = 0;

                // Ensure the continuation token is written correctly.
                byte[] buffer        = stream.ToArray();
                int    messageLength = BinaryPrimitives.ReadInt32LittleEndian(buffer);
                int    endOfBuffer1  = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(buffer.Length - 8));
                int    endOfBuffer2  = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(buffer.Length - 4));
                if (writeLegacyIpcFormat)
                {
                    // the legacy IPC format doesn't have a continuation token at the start
                    Assert.NotEqual(-1, messageLength);
                    Assert.NotEqual(-1, endOfBuffer1);
                }
                else
                {
                    // the latest IPC format has a continuation token at the start
                    Assert.Equal(-1, messageLength);
                    Assert.Equal(-1, endOfBuffer1);
                }

                Assert.Equal(0, endOfBuffer2);
            }
        }
        public async Task TestArrowGroupedMapCommandExecutor(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            StringArray ConvertStrings(StringArray strings)
            {
                return (StringArray)ToArrowArray(
                    Enumerable.Range(0, strings.Length)
                        .Select(i => $"udf: {strings.GetString(i)}")
                        .ToArray());
            }

            Int64Array ConvertInt64s(Int64Array int64s)
            {
                return (Int64Array)ToArrowArray(
                    Enumerable.Range(0, int64s.Length)
                        .Select(i => int64s.Values[i] + 100)
                        .ToArray());
            }

            Schema resultSchema = new Schema.Builder()
                                  .Field(b => b.Name("arg1").DataType(StringType.Default))
                                  .Field(b => b.Name("arg2").DataType(Int64Type.Default))
                                  .Build();

            var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
                (batch) => new RecordBatch(
                    resultSchema,
                    new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
                    batch.Length));

            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                Commands = new[] { command }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Field(b => b.Name("arg2").DataType(Int64Type.Default))
                            .Build();
            var arrowWriter =
                new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
            await arrowWriter.WriteRecordBatchAsync(
                new RecordBatch(
                    schema,
                    new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                    .Select(i => i.ToString())
                    .ToArray()),
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                    .Select(i => (long)i)
                    .ToArray())
            },
                    numRows));

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(numRows, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            int arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var         arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

            Assert.Equal(numRows, outputBatch.Length);
            StringArray stringArray;
            Int64Array  longArray;

            if (sparkVersion < new Version(Versions.V3_0_0))
            {
                Assert.Equal(2, outputBatch.ColumnCount);
                stringArray = (StringArray)outputBatch.Column(0);
                longArray   = (Int64Array)outputBatch.Column(1);
            }
            else
            {
                Assert.Equal(1, outputBatch.ColumnCount);
                var structArray = (StructArray)outputBatch.Column(0);
                Assert.Equal(2, structArray.Fields.Count);
                stringArray = (StringArray)structArray.Fields[0];
                longArray   = (Int64Array)structArray.Fields[1];
            }

            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal($"udf: {i}", stringArray.GetString(i));
            }

            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal(100 + i, longArray.Values[i]);
            }

            CheckEOS(outputStream, ipcOptions);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
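The CheckEOS helper used by these tests is also not shown. A sketch of what it plausibly verifies, assuming it reads back the Arrow end-of-stream marker written by WriteEnd:
        private static void CheckEOS(Stream stream, IpcOptions ipcOptions)
        {
            if (!ipcOptions.WriteLegacyIpcFormat)
            {
                // The current IPC format writes a continuation token (-1)
                // before the message length.
                Assert.Equal(-1, SerDe.ReadInt32(stream));
            }

            // A zero message length marks the end of the Arrow stream.
            Assert.Equal(0, SerDe.ReadInt32(stream));
        }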
        public void TestDataFrameSqlCommandExecutorWithEmptyInput(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(cur => $"udf: {cur}"));

            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Build();
            var arrowWriter = new ArrowStreamWriter(inputStream, schema, false, ipcOptions);

            // The .NET ArrowStreamWriter doesn't currently support writing just a
            // schema with no batches - but Java does. We use Reflection to simulate
            // the request Spark sends.
            MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
                "WriteSchemaAsync",
                BindingFlags.NonPublic | BindingFlags.Instance);

            writeSchemaMethod.Invoke(
                arrowWriter,
                new object[] { schema, CancellationToken.None });

            SerDe.Write(inputStream, 0);

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(0, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            int arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var         arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();

            Assert.Equal(1, outputBatch.Schema.Fields.Count);
            Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);

            Assert.Equal(0, outputBatch.Length);
            Assert.Single(outputBatch.Arrays);

            var array = (StringArray)outputBatch.Arrays.ElementAt(0);

            Assert.Equal(0, array.Length);

            CheckEOS(outputStream, ipcOptions);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
        public async Task TestDataFrameSqlCommandExecutorWithMultiCommands(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            var udfWrapper1 = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(cur => $"udf: {cur}"));

            var udfWrapper2 = new Sql.DataFrameUdfWrapper<Int32DataFrameColumn, Int32DataFrameColumn, Int32DataFrameColumn>(
                (arg1, arg2) => arg1 * arg2);

            var command1 = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.DataFrameWorkerFunction(udfWrapper1.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var command2 = new SqlCommand()
            {
                ArgOffsets          = new[] { 1, 2 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.DataFrameWorkerFunction(udfWrapper2.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command1, command2 }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Field(b => b.Name("arg2").DataType(Int32Type.Default))
                            .Field(b => b.Name("arg3").DataType(Int32Type.Default))
                            .Build();
            var arrowWriter =
                new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
            await arrowWriter.WriteRecordBatchAsync(
                new RecordBatch(
                    schema,
                    new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                    .Select(i => i.ToString())
                    .ToArray()),
                ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
            },
                    numRows));

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(numRows, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            var arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var         arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

            Assert.Equal(numRows, outputBatch.Length);
            Assert.Equal(2, outputBatch.Arrays.Count());
            var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
            var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);

            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal($"udf: {i}", array1.GetString(i));
                Assert.Equal(i * i, array2.Values[i]);
            }

            CheckEOS(outputStream, ipcOptions);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
        public void TestPicklingSqlCommandExecutorWithSingleCommand(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            _ = ipcOptions;
            var udfWrapper = new Sql.PicklingUdfWrapper<string, string>(
                (str) => "udf: " + ((str is null) ? "NULL" : str));
            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
                Commands = new[] { command }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            var pickler = new Pickler();

            for (int i = 0; i < numRows; ++i)
            {
                byte[] pickled = pickler.dumps(
                    new[] { new object[] { (i % 2 == 0) ? null : i.ToString() } });
                SerDe.Write(inputStream, pickled.Length);
                SerDe.Write(inputStream, pickled);
            }
            SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(10, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            var unpickler = new Unpickler();

            // Each row above was written as its own batch, so 'numRows' batches need to be read.
            List<object> rows = new List<object>();

            for (int i = 0; i < numRows; ++i)
            {
                int    length       = SerDe.ReadInt32(outputStream);
                byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
                rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object);
            }

            Assert.Equal(numRows, rows.Count);

            // Validate the single command.
            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal(
                    "udf: " + ((i % 2 == 0) ? "NULL" : i.ToString()),
                    (string)rows[i]);
            }

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
        public void TestPicklingSqlCommandExecutorWithMultiCommands(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            _ = ipcOptions;
            var udfWrapper1 = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
            var udfWrapper2 = new Sql.PicklingUdfWrapper<int, int, int>(
                (arg1, arg2) => arg1 * arg2);

            var command1 = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.PicklingWorkerFunction(udfWrapper1.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var command2 = new SqlCommand()
            {
                ArgOffsets          = new[] { 1, 2 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.PicklingWorkerFunction(udfWrapper2.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
                Commands = new[] { command1, command2 }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            var pickler = new Pickler();

            for (int i = 0; i < numRows; ++i)
            {
                byte[] pickled = pickler.dumps(
                    new[] { new object[] { i.ToString(), i, i } });
                SerDe.Write(inputStream, pickled.Length);
                SerDe.Write(inputStream, pickled);
            }
            SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(10, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            var unpickler = new Unpickler();

            // Each row above was written as its own batch, so 'numRows' batches need to be read.
            List<object[]> rows = new List<object[]>();

            for (int i = 0; i < numRows; ++i)
            {
                int    length       = SerDe.ReadInt32(outputStream);
                byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
                rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object[]);
            }

            Assert.Equal(numRows, rows.Count);

            for (int i = 0; i < numRows; ++i)
            {
                // There were two UDFs each of which produces one column.
                object[] columns = rows[i];
                Assert.Equal($"udf: {i}", (string)columns[0]);
                Assert.Equal(i * i, (int)columns[1]);
            }

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
        private static async Task TestRoundTripRecordBatchAsync(RecordBatch originalBatch, IpcOptions options = null)
        {
            await TestRoundTripRecordBatchesAsync(new List<RecordBatch> { originalBatch }, options);
        }
        private static void TestRoundTripRecordBatch(RecordBatch originalBatch, IpcOptions options = null)
        {
            TestRoundTripRecordBatches(new List<RecordBatch> { originalBatch }, options);
        }
        private static async Task TestRoundTripRecordBatchesAsync(List<RecordBatch> originalBatches, IpcOptions options = null)
        {
            using (MemoryStream stream = new MemoryStream())
            {
                using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options))
                {
                    foreach (RecordBatch originalBatch in originalBatches)
                    {
                        await writer.WriteRecordBatchAsync(originalBatch);
                    }
                    await writer.WriteEndAsync();
                }

                stream.Position = 0;

                using (var reader = new ArrowStreamReader(stream))
                {
                    foreach (RecordBatch originalBatch in originalBatches)
                    {
                        RecordBatch newBatch = reader.ReadNextRecordBatch();
                        ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
                    }
                }
            }
        }
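The synchronous TestRoundTripRecordBatches counterpart called above is not shown in these examples. A sketch assuming it mirrors the async version, using the synchronous WriteRecordBatch and WriteEnd calls seen in Example #2:
        private static void TestRoundTripRecordBatches(List<RecordBatch> originalBatches, IpcOptions options = null)
        {
            using (MemoryStream stream = new MemoryStream())
            {
                using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options))
                {
                    foreach (RecordBatch originalBatch in originalBatches)
                    {
                        writer.WriteRecordBatch(originalBatch);
                    }
                    writer.WriteEnd();
                }

                stream.Position = 0;

                using (var reader = new ArrowStreamReader(stream))
                {
                    foreach (RecordBatch originalBatch in originalBatches)
                    {
                        RecordBatch newBatch = reader.ReadNextRecordBatch();
                        ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
                    }
                }
            }
        }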