public async Task CanWriteToNetworkStreamAsync()
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);

    const int port = 32154;
    TcpListener listener = new TcpListener(IPAddress.Loopback, port);
    listener.Start();

    using (TcpClient sender = new TcpClient())
    {
        sender.Connect(IPAddress.Loopback, port);
        NetworkStream stream = sender.GetStream();

        using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema))
        {
            await writer.WriteRecordBatchAsync(originalBatch);
            await writer.WriteEndAsync();
            stream.Flush();
        }
    }

    using (TcpClient receiver = listener.AcceptTcpClient())
    {
        NetworkStream stream = receiver.GetStream();

        using (var reader = new ArrowStreamReader(stream))
        {
            RecordBatch newBatch = reader.ReadNextRecordBatch();
            ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
        }
    }
}
public async Task Ctor_MemoryPool_AllocatesFromPool(bool shouldLeaveOpen, bool createDictionaryArray, int expectedAllocations)
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100, createDictionaryArray: createDictionaryArray);

    using (MemoryStream stream = new MemoryStream())
    {
        ArrowStreamWriter writer = new ArrowStreamWriter(stream, originalBatch.Schema);
        await writer.WriteRecordBatchAsync(originalBatch);
        await writer.WriteEndAsync();
        stream.Position = 0;

        var memoryPool = new TestMemoryAllocator();
        ArrowStreamReader reader = new ArrowStreamReader(stream, memoryPool, shouldLeaveOpen);
        reader.ReadNextRecordBatch();

        Assert.Equal(expectedAllocations, memoryPool.Statistics.Allocations);
        Assert.True(memoryPool.Statistics.BytesAllocated > 0);

        reader.Dispose();

        if (shouldLeaveOpen)
        {
            Assert.True(stream.Position > 0);
        }
        else
        {
            Assert.Throws<ObjectDisposedException>(() => stream.Position);
        }
    }
}
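// TestMemoryAllocator is not shown in this listing. A minimal sketch of what
// such a tracking allocator could look like, assuming Apache.Arrow's
// MemoryAllocator base class exposes AllocateInternal(int, out int) returning
// IMemoryOwner<byte> and updates Statistics from the out value (the abstract
// surface has changed across versions, so verify against your Apache.Arrow
// release). Requires System and System.Buffers.
public class TestMemoryAllocator : MemoryAllocator
{
    protected override IMemoryOwner<byte> AllocateInternal(int length, out int bytesAllocated)
    {
        // Back each allocation with a plain managed array; the base class is
        // assumed to record bytesAllocated in Statistics.
        bytesAllocated = length;
        return new ArrayMemoryOwner(new byte[length]);
    }

    // Trivial IMemoryOwner over an array; nothing to release on Dispose.
    private sealed class ArrayMemoryOwner : IMemoryOwner<byte>
    {
        public ArrayMemoryOwner(byte[] array) => Memory = array;
        public Memory<byte> Memory { get; }
        public void Dispose() { }
    }
}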
private static async Task TestRoundTripRecordBatchesAsync(List<RecordBatch> originalBatches, IpcOptions options = null)
{
    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatches[0].Schema, leaveOpen: true, options))
        {
            foreach (RecordBatch originalBatch in originalBatches)
            {
                await writer.WriteRecordBatchAsync(originalBatch);
            }
            await writer.WriteEndAsync();
        }

        stream.Position = 0;

        using (var reader = new ArrowStreamReader(stream))
        {
            foreach (RecordBatch originalBatch in originalBatches)
            {
                RecordBatch newBatch = reader.ReadNextRecordBatch();
                ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
            }
        }
    }
}
public void TestEmptyDataFrameRecordBatch()
{
    PrimitiveDataFrameColumn<int> ageColumn = new PrimitiveDataFrameColumn<int>("Age");
    PrimitiveDataFrameColumn<int> lengthColumn = new PrimitiveDataFrameColumn<int>("CharCount");
    ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("Empty");
    DataFrame df = new DataFrame(new List<DataFrameColumn>() { ageColumn, lengthColumn, stringColumn });

    IEnumerable<RecordBatch> recordBatches = df.ToArrowRecordBatches();
    bool foundARecordBatch = false;
    foreach (RecordBatch recordBatch in recordBatches)
    {
        foundARecordBatch = true;

        MemoryStream stream = new MemoryStream();
        ArrowStreamWriter writer = new ArrowStreamWriter(stream, recordBatch.Schema);
        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();

        stream.Position = 0;
        ArrowStreamReader reader = new ArrowStreamReader(stream);
        RecordBatch readRecordBatch = reader.ReadNextRecordBatch();
        while (readRecordBatch != null)
        {
            RecordBatchComparer.CompareBatches(recordBatch, readRecordBatch);
            readRecordBatch = reader.ReadNextRecordBatch();
        }
    }
    Assert.True(foundARecordBatch);
}
public async Task GlobalSetup()
{
    RecordBatch batch = TestData.CreateSampleRecordBatch(length: Count);
    _memoryStream = new MemoryStream();

    ArrowStreamWriter writer = new ArrowStreamWriter(_memoryStream, batch.Schema);
    await writer.WriteRecordBatchAsync(batch);
}
public async Task WriteBatchWithCorrectPadding()
{
    byte value1 = 0x04;
    byte value2 = 0x14;
    var batch = new RecordBatch(
        new Schema.Builder()
            .Field(f => f.Name("age").DataType(Int32Type.Default))
            .Field(f => f.Name("characterCount").DataType(Int32Type.Default))
            .Build(),
        new IArrowArray[]
        {
            new Int32Array(
                new ArrowBuffer(new byte[] { value1, value1, 0x00, 0x00 }),
                ArrowBuffer.Empty,
                length: 1,
                nullCount: 0,
                offset: 0),
            new Int32Array(
                new ArrowBuffer(new byte[] { value2, value2, 0x00, 0x00 }),
                ArrowBuffer.Empty,
                length: 1,
                nullCount: 0,
                offset: 0)
        },
        length: 1);

    await TestRoundTripRecordBatch(batch);

    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, batch.Schema, leaveOpen: true))
        {
            await writer.WriteRecordBatchAsync(batch);
        }

        byte[] writtenBytes = stream.ToArray();

        // Ensure that the data buffers at the end are 8-byte aligned.
        Assert.Equal(value1, writtenBytes[writtenBytes.Length - 16]);
        Assert.Equal(value1, writtenBytes[writtenBytes.Length - 15]);
        for (int i = 14; i > 8; i--)
        {
            Assert.Equal(0, writtenBytes[writtenBytes.Length - i]);
        }

        Assert.Equal(value2, writtenBytes[writtenBytes.Length - 8]);
        Assert.Equal(value2, writtenBytes[writtenBytes.Length - 7]);
        for (int i = 6; i > 0; i--)
        {
            Assert.Equal(0, writtenBytes[writtenBytes.Length - i]);
        }
    }
}
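// For reference, the assertions above pin down this trailing byte layout.
// Arrow pads each buffer to an 8-byte boundary, so each 4-byte little-endian
// Int32 value buffer is followed by 4 bytes of zero padding:
//
//   ... value1, value1, 0, 0, 0, 0, 0, 0, value2, value2, 0, 0, 0, 0, 0, 0
//       |------ first data buffer ------| |----- second data buffer ------|
//       |-- 4 value bytes --|-- padding--|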
/// <summary>
/// Verifies that the stream reader reads multiple times when a stream
/// only returns a subset of the data from each Read.
/// </summary>
private static async Task TestReaderFromPartialReadStream(Func<ArrowStreamReader, RecordBatch, Task> verificationFunc)
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);

    using (PartialReadStream stream = new PartialReadStream())
    {
        ArrowStreamWriter writer = new ArrowStreamWriter(stream, originalBatch.Schema);
        await writer.WriteRecordBatchAsync(originalBatch);

        stream.Position = 0;

        ArrowStreamReader reader = new ArrowStreamReader(stream);
        await verificationFunc(reader, originalBatch);
    }
}
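// PartialReadStream is not shown in this listing. A minimal sketch of such a
// test double: a MemoryStream that caps how many bytes any single read
// returns, forcing the reader to issue multiple reads. The 20-byte cap is an
// arbitrary illustrative choice, and the Span/Memory overloads assume a
// .NET Core 2.1+/.NET 5+ target. Requires System, System.IO,
// System.Threading, and System.Threading.Tasks.
public class PartialReadStream : MemoryStream
{
    // Maximum number of bytes a single read call may return.
    public int PartialReadLength { get; set; } = 20;

    // Route the classic array overload through the span-based one so all
    // synchronous reads share the cap.
    public override int Read(byte[] buffer, int offset, int count) =>
        Read(buffer.AsSpan(offset, count));

    public override int Read(Span<byte> buffer)
    {
        if (buffer.Length > PartialReadLength)
        {
            buffer = buffer.Slice(0, PartialReadLength);
        }
        return base.Read(buffer);
    }

    public override ValueTask<int> ReadAsync(Memory<byte> buffer, CancellationToken cancellationToken = default)
    {
        if (buffer.Length > PartialReadLength)
        {
            buffer = buffer.Slice(0, PartialReadLength);
        }
        return base.ReadAsync(buffer, cancellationToken);
    }
}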
private static async Task TestReaderFromMemory(Func<ArrowStreamReader, RecordBatch, Task> verificationFunc)
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);

    byte[] buffer;
    using (MemoryStream stream = new MemoryStream())
    {
        ArrowStreamWriter writer = new ArrowStreamWriter(stream, originalBatch.Schema);
        await writer.WriteRecordBatchAsync(originalBatch);
        buffer = stream.GetBuffer();
    }

    ArrowStreamReader reader = new ArrowStreamReader(buffer);
    await verificationFunc(reader, originalBatch);
}
private static async Task TestRoundTripRecordBatch(RecordBatch originalBatch)
{
    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true))
        {
            await writer.WriteRecordBatchAsync(originalBatch);
        }

        stream.Position = 0;

        using (var reader = new ArrowStreamReader(stream))
        {
            RecordBatch newBatch = reader.ReadNextRecordBatch();
            ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
        }
    }
}
public async Task WriteEmptyBatch()
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 0);

    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true))
        {
            await writer.WriteRecordBatchAsync(originalBatch);
        }

        stream.Position = 0;

        using (var reader = new ArrowStreamReader(stream))
        {
            RecordBatch newBatch = reader.ReadNextRecordBatch();
            ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
        }
    }
}
public async Task WriteLegacyIpcFormatAsync(bool writeLegacyIpcFormat)
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);
    var options = new IpcOptions() { WriteLegacyIpcFormat = writeLegacyIpcFormat };

    using (MemoryStream stream = new MemoryStream())
    {
        using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema, leaveOpen: true, options))
        {
            await writer.WriteRecordBatchAsync(originalBatch);
            await writer.WriteEndAsync();
        }

        stream.Position = 0;

        // Ensure the continuation is written correctly.
        byte[] buffer = stream.ToArray();
        int messageLength = BinaryPrimitives.ReadInt32LittleEndian(buffer);
        int endOfBuffer1 = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(buffer.Length - 8));
        int endOfBuffer2 = BinaryPrimitives.ReadInt32LittleEndian(buffer.AsSpan(buffer.Length - 4));
        if (writeLegacyIpcFormat)
        {
            // The legacy IPC format doesn't have a continuation token at the start.
            Assert.NotEqual(-1, messageLength);
            Assert.NotEqual(-1, endOfBuffer1);
        }
        else
        {
            // The latest IPC format has a continuation token at the start.
            Assert.Equal(-1, messageLength);
            Assert.Equal(-1, endOfBuffer1);
        }

        Assert.Equal(0, endOfBuffer2);
    }
}
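// For reference (per the Arrow IPC format spec): in the current format every
// message is prefixed with a 4-byte continuation marker (0xFFFFFFFF, which
// reads as -1 above) followed by a 4-byte metadata length, and the stream
// ends with the marker plus a zero length (8 bytes total). The legacy format
// omits the marker, so messages begin directly with the length and the
// stream ends with just 4 zero bytes, which is what the assertions above
// distinguish.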
public async Task ReadRecordBatch()
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);

    byte[] buffer;
    using (MemoryStream stream = new MemoryStream())
    {
        ArrowStreamWriter writer = new ArrowStreamWriter(stream, originalBatch.Schema);
        await writer.WriteRecordBatchAsync(originalBatch);
        buffer = stream.GetBuffer();
    }

    ArrowStreamReader reader = new ArrowStreamReader(buffer);
    RecordBatch readBatch = reader.ReadNextRecordBatch();
    CompareBatches(originalBatch, readBatch);

    // There should only be one batch; calling ReadNextRecordBatch again should return null.
    Assert.Null(reader.ReadNextRecordBatch());
    Assert.Null(reader.ReadNextRecordBatch());
}
private static async Task TestReaderFromStream(
    Func<ArrowStreamReader, RecordBatch, Task> verificationFunc,
    bool writeEnd,
    bool createDictionaryArray)
{
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100, createDictionaryArray: createDictionaryArray);

    using (MemoryStream stream = new MemoryStream())
    {
        ArrowStreamWriter writer = new ArrowStreamWriter(stream, originalBatch.Schema);
        await writer.WriteRecordBatchAsync(originalBatch);
        if (writeEnd)
        {
            await writer.WriteEndAsync();
        }

        stream.Position = 0;

        ArrowStreamReader reader = new ArrowStreamReader(stream);
        await verificationFunc(reader, originalBatch);
    }
}
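// A hypothetical caller of the helper above, showing the typical shape of a
// verification delegate (the method name and argument values are
// illustrative, not taken from the source):
public async Task ReadRecordBatch_FromStream()
{
    await TestReaderFromStream(
        async (reader, originalBatch) =>
        {
            // Read the single batch back and compare it to what was written.
            RecordBatch readBatch = await reader.ReadNextRecordBatchAsync();
            ArrowReaderVerifier.CompareBatches(originalBatch, readBatch);
        },
        writeEnd: true,
        createDictionaryArray: false);
}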
public async Task TestArrowGroupedMapCommandExecutor()
{
    StringArray ConvertStrings(StringArray strings)
    {
        return (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray());
    }

    Int64Array ConvertInt64s(Int64Array int64s)
    {
        return (Int64Array)ToArrowArray(
            Enumerable.Range(0, int64s.Length)
                .Select(i => int64s.Values[i] + 100)
                .ToArray());
    }

    Schema resultSchema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int64Type.Default))
        .Build();

    var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
        (batch) => new RecordBatch(
            resultSchema,
            new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
            batch.Length));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        Commands = new[] { command }
    };

    using (var inputStream = new MemoryStream())
    using (var outputStream = new MemoryStream())
    {
        int numRows = 10;

        // Write test data to the input stream.
        Schema schema = new Schema.Builder()
            .Field(b => b.Name("arg1").DataType(StringType.Default))
            .Field(b => b.Name("arg2").DataType(Int64Type.Default))
            .Build();
        var arrowWriter = new ArrowStreamWriter(inputStream, schema);
        await arrowWriter.WriteRecordBatchAsync(
            new RecordBatch(
                schema,
                new[]
                {
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => (long)i)
                            .ToArray())
                },
                numRows));

        inputStream.Seek(0, SeekOrigin.Begin);

        CommandExecutorStat stat = new CommandExecutor().Execute(
            inputStream,
            outputStream,
            0,
            commandPayload);

        // Validate that all the data on the stream is read.
        Assert.Equal(inputStream.Length, inputStream.Position);
        Assert.Equal(numRows, stat.NumEntriesProcessed);

        // Validate the output stream.
        outputStream.Seek(0, SeekOrigin.Begin);
        int arrowLength = SerDe.ReadInt32(outputStream);
        Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
        var arrowReader = new ArrowStreamReader(outputStream);
        RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

        Assert.Equal(numRows, outputBatch.Length);
        Assert.Equal(2, outputBatch.ColumnCount);

        var stringArray = (StringArray)outputBatch.Column(0);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal($"udf: {i}", stringArray.GetString(i));
        }

        var longArray = (Int64Array)outputBatch.Column(1);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal(100 + i, longArray.Values[i]);
        }

        int end = SerDe.ReadInt32(outputStream);
        Assert.Equal(0, end);

        // Validate that all the data on the stream is read.
        Assert.Equal(outputStream.Length, outputStream.Position);
    }
}
public async Task TestArrowSqlCommandExecutorWithMultiCommands()
{
    var udfWrapper1 = new Sql.ArrowUdfWrapper<StringArray, StringArray>(
        (strings) => (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray()));

    var udfWrapper2 = new Sql.ArrowUdfWrapper<Int32Array, Int32Array, Int32Array>(
        (arg1, arg2) => (Int32Array)ToArrowArray(
            Enumerable.Range(0, arg1.Length)
                .Select(i => arg1.Values[i] * arg2.Values[i])
                .ToArray()));

    var command1 = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper1.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var command2 = new SqlCommand()
    {
        ArgOffsets = new[] { 1, 2 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper2.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command1, command2 }
    };

    using (var inputStream = new MemoryStream())
    using (var outputStream = new MemoryStream())
    {
        int numRows = 10;

        // Write test data to the input stream.
        Schema schema = new Schema.Builder()
            .Field(b => b.Name("arg1").DataType(StringType.Default))
            .Field(b => b.Name("arg2").DataType(Int32Type.Default))
            .Field(b => b.Name("arg3").DataType(Int32Type.Default))
            .Build();
        var arrowWriter = new ArrowStreamWriter(inputStream, schema);
        await arrowWriter.WriteRecordBatchAsync(
            new RecordBatch(
                schema,
                new[]
                {
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                    ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                    ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                },
                numRows));

        inputStream.Seek(0, SeekOrigin.Begin);

        CommandExecutorStat stat = new CommandExecutor().Execute(
            inputStream,
            outputStream,
            0,
            commandPayload);

        // Validate that all the data on the stream is read.
        Assert.Equal(inputStream.Length, inputStream.Position);
        Assert.Equal(numRows, stat.NumEntriesProcessed);

        // Validate the output stream.
        outputStream.Seek(0, SeekOrigin.Begin);
        var arrowLength = SerDe.ReadInt32(outputStream);
        Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
        var arrowReader = new ArrowStreamReader(outputStream);
        RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

        Assert.Equal(numRows, outputBatch.Length);
        Assert.Equal(2, outputBatch.Arrays.Count());
        var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
        var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal($"udf: {i}", array1.GetString(i));
            Assert.Equal(i * i, array2.Values[i]);
        }

        int end = SerDe.ReadInt32(outputStream);
        Assert.Equal(0, end);

        // Validate that all the data on the stream is read.
        Assert.Equal(outputStream.Length, outputStream.Position);
    }
}
public async Task WriteBatch()
{
    ArrowStreamWriter writer = new ArrowStreamWriter(_memoryStream, _batch.Schema);
    await writer.WriteRecordBatchAsync(_batch);
}
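// GlobalSetup (earlier in this listing) and WriteBatch (above) read like
// BenchmarkDotNet members. A hypothetical sketch of the scaffolding they
// assume; the field names match the snippets, while the class name,
// attributes, and Count values are illustrative guesses rather than from the
// source:
public class ArrowStreamWriterBenchmark
{
    // Row count used when building the sample batch.
    [Params(10_000)]
    public int Count { get; set; }

    private MemoryStream _memoryStream;
    private RecordBatch _batch;

    [GlobalSetup]
    public void Setup()
    {
        _batch = TestData.CreateSampleRecordBatch(length: Count);
        _memoryStream = new MemoryStream();
    }

    // Rewind between iterations so each run writes over the same buffer
    // instead of growing the stream.
    [IterationSetup]
    public void ResetStream() => _memoryStream.Position = 0;

    [Benchmark]
    public async Task WriteBatch()
    {
        var writer = new ArrowStreamWriter(_memoryStream, _batch.Schema);
        await writer.WriteRecordBatchAsync(_batch);
    }
}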
public async Task TestDataFrameSqlCommandExecutorWithSingleCommand(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"udf: {cur}"));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray())
            },
            numRows));

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

    Assert.Equal(numRows, outputBatch.Length);
    Assert.Single(outputBatch.Arrays);
    var array = (StringArray)outputBatch.Arrays.ElementAt(0);

    // Validate the single command.
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", array.GetString(i));
    }

    CheckEOS(outputStream, ipcOptions);

    // Validate that all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
private void WriteRecordBatch(RecordBatch batch)
{
    inputBuffer.Position = 0;
    writer.WriteRecordBatchAsync(batch).GetAwaiter().GetResult();
}
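// The helper above depends on enclosing state that isn't shown. A
// hypothetical sketch of the members it assumes; the field names are
// inferred from the snippet, everything else (type name, constructor) is
// illustrative:
private sealed class RecordBatchSender
{
    private readonly MemoryStream inputBuffer = new MemoryStream();
    private readonly ArrowStreamWriter writer;

    public RecordBatchSender(Schema schema)
    {
        // leaveOpen so disposing the writer does not close the reusable buffer.
        writer = new ArrowStreamWriter(inputBuffer, schema, leaveOpen: true);
    }

    private void WriteRecordBatch(RecordBatch batch)
    {
        // Rewind so each batch overwrites the previous contents of the buffer.
        inputBuffer.Position = 0;
        writer.WriteRecordBatchAsync(batch).GetAwaiter().GetResult();
    }
}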