private CommandExecutorStat ExecuteDataFrameSqlCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
        var inputColumns = new DataFrameColumn[input.ColumnCount];
        for (int i = 0; i < dataFrame.Columns.Count; ++i)
        {
            inputColumns[i] = dataFrame.Columns[i];
        }

        DataFrameColumn[] results = commandRunner.Run(inputColumns);

        var resultDataFrame = new FxDataFrame(results);
        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch result in recordBatches)
        {
            stat.NumEntriesProcessed += result.Length;

            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
            }

            // TODO: Remove sync-over-async once WriteRecordBatch exists.
            writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
        }
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
private static void TestRoundTripRecordBatch(RecordBatch originalBatch, IpcOptions options = null)
{
    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true, options))
        {
            writer.WriteRecordBatch(originalBatch);
            writer.WriteEnd();
        }

        stream.Position = 0;

        using (var reader = new ArrowStreamReader(stream))
        {
            RecordBatch newBatch = reader.ReadNextRecordBatch();
            ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
        }
    }
}
private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    Debug.Assert(commands.Length == 1,
        "Grouped Map UDFs do not support combining multiple UDFs.");

    var stat = new CommandExecutorStat();
    var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
        FxDataFrame resultDataFrame = worker.Func(dataFrame);

        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch batch in recordBatches)
        {
            RecordBatch final = WrapColumnsInStructIfApplicable(batch);
            stat.NumEntriesProcessed += final.Length;

            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, final.Schema, leaveOpen: true, ipcOptions);
            }

            writer.WriteRecordBatch(final);
        }
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
private CommandExecutorStat ExecuteDataFrameGroupedMapCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    Debug.Assert(commands.Length == 1,
        "Grouped Map UDFs do not support combining multiple UDFs.");

    var stat = new CommandExecutorStat();
    var worker = (DataFrameGroupedMapWorkerFunction)commands[0].WorkerFunction;

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    foreach (RecordBatch input in GetInputIterator(inputStream))
    {
        FxDataFrame dataFrame = FxDataFrame.FromArrowRecordBatch(input);
        FxDataFrame resultDataFrame = worker.Func(dataFrame);

        IEnumerable<RecordBatch> recordBatches = resultDataFrame.ToArrowRecordBatches();

        foreach (RecordBatch result in recordBatches)
        {
            stat.NumEntriesProcessed += result.Length;

            if (writer == null)
            {
                writer = new ArrowStreamWriter(outputStream, result.Schema, leaveOpen: true, ipcOptions);
            }

            // TODO: Remove sync-over-async once WriteRecordBatch exists.
            writer.WriteRecordBatchAsync(result).GetAwaiter().GetResult();
        }
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
private CommandExecutorStat ExecuteArrowSqlCommand(
    Stream inputStream,
    Stream outputStream,
    SqlCommand[] commands)
{
    var stat = new CommandExecutorStat();
    ICommandRunner commandRunner = CreateCommandRunner(commands);

    SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

    IpcOptions ipcOptions = ArrowIpcOptions();
    ArrowStreamWriter writer = null;
    Schema resultSchema = null;
    foreach (ReadOnlyMemory<IArrowArray> input in GetArrowInputIterator(inputStream))
    {
        IArrowArray[] results = commandRunner.Run(input);

        // Assumes all columns have the same length, so uses the 0th column for the entry count.
        int numEntries = results[0].Length;
        stat.NumEntriesProcessed += numEntries;

        if (writer == null)
        {
            Debug.Assert(resultSchema == null);
            resultSchema = BuildSchema(results);
            writer = new ArrowStreamWriter(outputStream, resultSchema, leaveOpen: true, ipcOptions);
        }

        var recordBatch = new RecordBatch(resultSchema, results, numEntries);

        // TODO: Remove sync-over-async once WriteRecordBatch exists.
        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
    }

    WriteEnd(outputStream, ipcOptions);
    writer?.Dispose();

    return stat;
}
public void TestPicklingSqlCommandExecutorWithEmptyInput(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    _ = ipcOptions;
    var udfWrapper = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    // Write test data to the input stream. For the empty input scenario,
    // only send SpecialLengths.END_OF_DATA_SECTION.
    SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(0, stat.NumEntriesProcessed);

    // Validate the output stream.
    Assert.Equal(0, outputStream.Length);
}
public async Task WriteLegacyIpcFormatAsync(bool writeLegacyIpcFormat)
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);
    var options = new IpcOptions() { WriteLegacyIpcFormat = writeLegacyIpcFormat };

    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true, options))
        {
            await writer.WriteRecordBatchAsync(originalBatch);
            await writer.WriteEndAsync();
        }

        stream.Position = 0;

        // Ensure the continuation is written correctly.
        byte[] buffer = stream.ToArray();
        int messageLength = BinaryPrimitives.ReadInt32LittleEndian(buffer);
        int endOfBuffer1 = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(buffer.Length - 8));
        int endOfBuffer2 = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(buffer.Length - 4));
        if (writeLegacyIpcFormat)
        {
            // The legacy IPC format doesn't have a continuation token at the start.
            Assert.NotEqual(-1, messageLength);
            Assert.NotEqual(-1, endOfBuffer1);
        }
        else
        {
            // The latest IPC format has a continuation token at the start.
            Assert.Equal(-1, messageLength);
            Assert.Equal(-1, endOfBuffer1);
        }

        Assert.Equal(0, endOfBuffer2);
    }
}
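// For context on the assertions above: the current IPC format prefixes each
// encapsulated message with a 0xFFFFFFFF continuation marker followed by the
// metadata length, while the legacy (pre-0.15) format writes the metadata length
// alone, so the first four bytes of the stream distinguish the two. The helper
// below is a minimal illustrative sketch (hypothetical, not part of the
// Apache.Arrow API).
private static bool StartsWithContinuationMarker(ReadOnlySpan<byte> streamBytes) =>
    BinaryPrimitives.ReadInt32LittleEndian(streamBytes) == -1;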
public async Task TestDataFrameGroupedMapCommandExecutor(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
public async Task TestArrowGroupedMapCommandExecutor(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    StringArray ConvertStrings(StringArray strings)
    {
        return (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray());
    }

    Int64Array ConvertInt64s(Int64Array int64s)
    {
        return (Int64Array)ToArrowArray(
            Enumerable.Range(0, int64s.Length)
                .Select(i => int64s.Values[i] + 100)
                .ToArray());
    }

    Schema resultSchema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int64Type.Default))
        .Build();

    var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
        (batch) => new RecordBatch(
            resultSchema,
            new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
            batch.Length));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int64Type.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray()),
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => (long)i)
                        .ToArray())
            },
            numRows));

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

    Assert.Equal(numRows, outputBatch.Length);
    StringArray stringArray;
    Int64Array longArray;
    if (sparkVersion < new Version(Versions.V3_0_0))
    {
        Assert.Equal(2, outputBatch.ColumnCount);
        stringArray = (StringArray)outputBatch.Column(0);
        longArray = (Int64Array)outputBatch.Column(1);
    }
    else
    {
        Assert.Equal(1, outputBatch.ColumnCount);
        var structArray = (StructArray)outputBatch.Column(0);
        Assert.Equal(2, structArray.Fields.Count);
        stringArray = (StringArray)structArray.Fields[0];
        longArray = (Int64Array)structArray.Fields[1];
    }

    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", stringArray.GetString(i));
    }

    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal(100 + i, longArray.Values[i]);
    }

    CheckEOS(outputStream, ipcOptions);

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
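// CheckEOS is referenced above but not shown here. As a hedged sketch of what such
// a check could assert (based on the Arrow IPC stream spec, not on the actual
// helper): the current format ends a stream with a 0xFFFFFFFF continuation marker
// followed by a zero length, while the legacy format ends with a zero length alone.
private static void CheckEosSketch(Stream stream, IpcOptions ipcOptions)
{
    if (!ipcOptions.WriteLegacyIpcFormat)
    {
        // Continuation marker precedes the end-of-stream length in the current format.
        Assert.Equal(-1, SerDe.ReadInt32(stream));
    }

    // A zero length terminates the stream in both formats.
    Assert.Equal(0, SerDe.ReadInt32(stream));
}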
public void TestDataFrameSqlCommandExecutorWithEmptyInput(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"udf: {cur}"));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, false, ipcOptions);

    // The .NET ArrowStreamWriter doesn't currently support writing just a
    // schema with no batches - but Java does. We use reflection to simulate
    // the request Spark sends.
    MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
        "WriteSchemaAsync",
        BindingFlags.NonPublic | BindingFlags.Instance);

    writeSchemaMethod.Invoke(
        arrowWriter,
        new object[] { schema, CancellationToken.None });

    SerDe.Write(inputStream, 0);

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(0, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();

    Assert.Equal(1, outputBatch.Schema.Fields.Count);
    Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);

    Assert.Equal(0, outputBatch.Length);
    Assert.Single(outputBatch.Arrays);

    var array = (StringArray)outputBatch.Arrays.ElementAt(0);
    Assert.Equal(0, array.Length);

    CheckEOS(outputStream, ipcOptions);

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
public async Task TestDataFrameSqlCommandExecutorWithMultiCommands(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    var udfWrapper1 = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"udf: {cur}"));

    var udfWrapper2 = new Sql.DataFrameUdfWrapper<Int32DataFrameColumn, Int32DataFrameColumn, Int32DataFrameColumn>(
        (arg1, arg2) => arg1 * arg2);

    var command1 = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper1.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var command2 = new SqlCommand()
    {
        ArgOffsets = new[] { 1, 2 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper2.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command1, command2 }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int32Type.Default))
        .Field(b => b.Name("arg3").DataType(Int32Type.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray()),
                ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
            },
            numRows));

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    var arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

    Assert.Equal(numRows, outputBatch.Length);
    Assert.Equal(2, outputBatch.Arrays.Count());
    var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
    var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", array1.GetString(i));
        Assert.Equal(i * i, array2.Values[i]);
    }

    CheckEOS(outputStream, ipcOptions);

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
public void TestPicklingSqlCommandExecutorWithSingleCommand(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    _ = ipcOptions;
    var udfWrapper = new Sql.PicklingUdfWrapper<string, string>(
        (str) => "udf: " + ((str is null) ? "NULL" : str));
    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    var pickler = new Pickler();
    for (int i = 0; i < numRows; ++i)
    {
        byte[] pickled = pickler.dumps(
            new[] { new object[] { (i % 2 == 0) ? null : i.ToString() } });
        SerDe.Write(inputStream, pickled.Length);
        SerDe.Write(inputStream, pickled);
    }

    SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(10, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    var unpickler = new Unpickler();

    // Each row above was written as its own batch, so read back 'numRows' batches.
    List<object> rows = new List<object>();
    for (int i = 0; i < numRows; ++i)
    {
        int length = SerDe.ReadInt32(outputStream);
        byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
        rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object);
    }

    Assert.Equal(numRows, rows.Count);

    // Validate the single command.
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal(
            "udf: " + ((i % 2 == 0) ? "NULL" : i.ToString()),
            (string)rows[i]);
    }

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
public void TestPicklingSqlCommandExecutorWithMultiCommands(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    _ = ipcOptions;
    var udfWrapper1 = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
    var udfWrapper2 = new Sql.PicklingUdfWrapper<int, int, int>(
        (arg1, arg2) => arg1 * arg2);

    var command1 = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper1.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var command2 = new SqlCommand()
    {
        ArgOffsets = new[] { 1, 2 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper2.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
        Commands = new[] { command1, command2 }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    var pickler = new Pickler();
    for (int i = 0; i < numRows; ++i)
    {
        byte[] pickled = pickler.dumps(
            new[] { new object[] { i.ToString(), i, i } });
        SerDe.Write(inputStream, pickled.Length);
        SerDe.Write(inputStream, pickled);
    }

    SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(10, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    var unpickler = new Unpickler();

    // Each row above was written as its own batch, so read back 'numRows' batches.
    List<object[]> rows = new List<object[]>();
    for (int i = 0; i < numRows; ++i)
    {
        int length = SerDe.ReadInt32(outputStream);
        byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
        rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object[]);
    }

    Assert.Equal(numRows, rows.Count);

    for (int i = 0; i < numRows; ++i)
    {
        // There were two UDFs, each of which produces one column.
        object[] columns = rows[i];
        Assert.Equal($"udf: {i}", (string)columns[0]);
        Assert.Equal(i * i, (int)columns[1]);
    }

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
private static async Task TestRoundTripRecordBatchAsync(RecordBatch originalBatch, IpcOptions options = null)
{
    await TestRoundTripRecordBatchesAsync(new List<RecordBatch> { originalBatch }, options);
}
private static void TestRoundTripRecordBatch(RecordBatch originalBatch, IpcOptions options = null)
{
    TestRoundTripRecordBatches(new List<RecordBatch> { originalBatch }, options);
}
private static async Task TestRoundTripRecordBatchesAsync(List<RecordBatch> originalBatches, IpcOptions options = null)
{
    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options))
        {
            foreach (RecordBatch originalBatch in originalBatches)
            {
                await writer.WriteRecordBatchAsync(originalBatch);
            }

            await writer.WriteEndAsync();
        }

        stream.Position = 0;

        using (var reader = new ArrowStreamReader(stream))
        {
            foreach (RecordBatch originalBatch in originalBatches)
            {
                RecordBatch newBatch = reader.ReadNextRecordBatch();
                ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
            }
        }
    }
}
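// A minimal usage sketch for the round-trip helpers above (assumptions: xUnit's
// [Fact] attribute and the TestData.CreateSampleRecordBatch helper used in
// WriteLegacyIpcFormatAsync), exercising both the default and the legacy IPC format.
[Fact]
public async Task RoundTripSampleBatchSketchAsync()
{
    RecordBatch batch = TestData.CreateSampleRecordBatch(length: 10);

    // Default (current) IPC format.
    await TestRoundTripRecordBatchAsync(batch);

    // Legacy IPC format, i.e. no continuation markers before message metadata.
    await TestRoundTripRecordBatchAsync(
        batch, new IpcOptions() { WriteLegacyIpcFormat = true });
}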