/// <summary>
/// Passes the raw values of the "Values" float column of <paramref name="batch"/> to FindMinMax.
/// </summary>
/// <param name="batch">Input batch; its "Values" column is cast to FloatArray.</param>
/// <param name="batchBuilder">Not used by this implementation.</param>
public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            FloatArray valuesColumn = (FloatArray)batch.Column("Values");

            // Nothing is appended to batchBuilder; only the scan itself is performed.
            FindMinMax(valuesColumn.Values);
        }
        /// <summary>
        /// Builds a DataFrame from three freshly constructed columns (no values appended),
        /// converts it to Arrow record batches, and checks that at least one batch is produced
        /// and that every batch survives a write/read round trip through an in-memory Arrow stream.
        /// </summary>
        public void TestEmptyDataFrameRecordBatch()
        {
            PrimitiveDataFrameColumn<int> ageColumn    = new PrimitiveDataFrameColumn<int>("Age");
            PrimitiveDataFrameColumn<int> lengthColumn = new PrimitiveDataFrameColumn<int>("CharCount");
            ArrowStringDataFrameColumn    stringColumn = new ArrowStringDataFrameColumn("Empty");
            DataFrame df = new DataFrame(new List<DataFrameColumn>()
            {
                ageColumn, lengthColumn, stringColumn
            });

            IEnumerable<RecordBatch> recordBatches = df.ToArrowRecordBatches();
            bool foundARecordBatch = false;

            foreach (RecordBatch recordBatch in recordBatches)
            {
                foundARecordBatch = true;

                using (MemoryStream stream = new MemoryStream())
                {
                    // leaveOpen keeps the stream usable for the read-back after the writer is disposed.
                    using (ArrowStreamWriter writer = new ArrowStreamWriter(stream, recordBatch.Schema, leaveOpen: true))
                    {
                        // The test method is synchronous, so the async write is blocked on here.
                        writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
                    }

                    stream.Position = 0;

                    using (ArrowStreamReader reader = new ArrowStreamReader(stream))
                    {
                        // Compare every batch read back against the batch that was written.
                        RecordBatch readRecordBatch = reader.ReadNextRecordBatch();
                        while (readRecordBatch != null)
                        {
                            RecordBatchComparer.CompareBatches(recordBatch, readRecordBatch);
                            readRecordBatch = reader.ReadNextRecordBatch();
                        }
                    }
                }
            }

            Assert.True(foundARecordBatch);
        }
        /// <summary>
        /// Creates a reader over <paramref name="recordBatch"/>: stores the batch schema,
        /// builds the column decoders from that schema, then loads the batch itself.
        /// </summary>
        /// <param name="recordBatch">The first record batch to read.</param>
        public UnsafeRecordBatchDataReader(RecordBatch recordBatch)
        {
            var batchSchema = recordBatch.Schema;

            _schema         = batchSchema;
            _columnDecoders = SchemaToDecoder.SchemaToDecoders(batchSchema);

            ReadNewRecordBatch(recordBatch);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Writes a single record batch with the legacy IPC format to an Arrow file and
        /// verifies the file can be read back: exactly one batch, equal to the original.
        /// </summary>
        public async Task WritesFooterAlignedMulitpleOf8()
        {
            RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);

            // use WriteLegacyIpcFormat, which only uses a 4-byte length prefix
            // which causes the length prefix to not be 8-byte aligned by default
            var legacyOptions = new IpcOptions()
            {
                WriteLegacyIpcFormat = true
            };

            using (var stream = new MemoryStream())
            {
                // leaveOpen: true keeps the stream alive once the writer is disposed.
                using (var writer = new ArrowFileWriter(
                    stream,
                    originalBatch.Schema,
                    leaveOpen: true,
                    legacyOptions))
                {
                    await writer.WriteRecordBatchAsync(originalBatch);

                    await writer.WriteEndAsync();
                }

                stream.Position = 0;

                using (var reader = new ArrowFileReader(stream))
                {
                    int count = await reader.RecordBatchCountAsync();

                    Assert.Equal(1, count);
                    RecordBatch readBatch = await reader.ReadRecordBatchAsync(0);

                    ArrowReaderVerifier.CompareBatches(originalBatch, readBatch);
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Writes a record batch asynchronously over a loopback TCP connection and verifies
        /// that the batch read on the receiving side equals the original.
        /// </summary>
        public async Task CanWriteToNetworkStreamAsync()
        {
            RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100);

            const int   port     = 32154;
            TcpListener listener = new TcpListener(IPAddress.Loopback, port);

            listener.Start();

            try
            {
                // Sender side: connect, write the batch and the end marker, then flush.
                using (TcpClient sender = new TcpClient())
                {
                    sender.Connect(IPAddress.Loopback, port);
                    NetworkStream stream = sender.GetStream();

                    using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema))
                    {
                        await writer.WriteRecordBatchAsync(originalBatch);

                        await writer.WriteEndAsync();

                        stream.Flush();
                    }
                }

                // Receiver side: accept the pending connection and read the batch back.
                using (TcpClient receiver = listener.AcceptTcpClient())
                {
                    NetworkStream stream = receiver.GetStream();
                    using (var reader = new ArrowStreamReader(stream))
                    {
                        RecordBatch newBatch = reader.ReadNextRecordBatch();
                        ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
                    }
                }
            }
            finally
            {
                // Always release the listening socket, even when an assertion or I/O call throws,
                // so the fixed port is free for later runs.
                listener.Stop();
            }
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Verifies that an ArrowFileReader constructed with a custom allocator takes its
        /// buffers from that allocator, and that disposing the reader honors
        /// <paramref name="shouldLeaveOpen"/> for the underlying stream.
        /// </summary>
        /// <param name="shouldLeaveOpen">Value passed as the reader's leaveOpen argument.</param>
        public async Task Ctor_MemoryPool_AllocatesFromPool(bool shouldLeaveOpen)
        {
            RecordBatch sampleBatch = TestData.CreateSampleRecordBatch(length: 100);

            using (var fileStream = new MemoryStream())
            {
                // Produce a complete Arrow file containing one batch.
                var fileWriter = new ArrowFileWriter(fileStream, sampleBatch.Schema);
                await fileWriter.WriteRecordBatchAsync(sampleBatch);
                await fileWriter.WriteEndAsync();

                fileStream.Position = 0;

                var allocator  = new TestMemoryAllocator();
                var fileReader = new ArrowFileReader(fileStream, allocator, leaveOpen: shouldLeaveOpen);
                fileReader.ReadNextRecordBatch();

                // Reading one batch is expected to make exactly one allocation of non-zero size.
                Assert.Equal(1, allocator.Statistics.Allocations);
                Assert.True(allocator.Statistics.BytesAllocated > 0);

                fileReader.Dispose();

                if (!shouldLeaveOpen)
                {
                    // The reader owned the stream, so touching it now must throw.
                    Assert.Throws<ObjectDisposedException>(() => fileStream.Position);
                    return;
                }

                Assert.True(fileStream.Position > 0);
            }
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Calls WriteDictionaryAsync for each field of the record batch's schema, awaiting
 /// each call before starting the next.
 /// </summary>
 /// <param name="recordBatch">Batch whose schema fields are enumerated.</param>
 /// <param name="cancellationToken">Token forwarded to every WriteDictionaryAsync call.</param>
 private protected async Task WriteDictionariesAsync(RecordBatch recordBatch, CancellationToken cancellationToken)
 {
     var schemaFields = recordBatch.Schema.Fields.Values;

     foreach (Field schemaField in schemaFields)
     {
         await WriteDictionaryAsync(schemaField, cancellationToken).ConfigureAwait(false);
     }
 }
Ejemplo n.º 8
0
        /// <summary>
        /// Checks that a Table built from two 10-row sample batches reports 20 rows and
        /// 21 columns, and that building it with a schema whose field was replaced
        /// (index 20 or index 16) throws ArgumentException.
        /// </summary>
        public void TestTableFromRecordBatches()
        {
            RecordBatch firstBatch  = TestData.CreateSampleRecordBatch(length: 10, true);
            RecordBatch secondBatch = TestData.CreateSampleRecordBatch(length: 10, true);
            IList<RecordBatch> batches = new List<RecordBatch> { firstBatch, secondBatch };

            Table table = Table.TableFromRecordBatches(firstBatch.Schema, batches);

            Assert.Equal(20, table.RowCount);
            Assert.Equal(21, table.ColumnCount);

            // Schema variant 1: field 20 replaced by a fixed-size binary field of width 17.
            var    binaryType   = new FixedSizeBinaryType(17);
            Schema binarySchema = firstBatch.Schema.SetField(20, new Field(binaryType.Name, binaryType, false));

            Assert.Throws<ArgumentException>(() => Table.TableFromRecordBatches(binarySchema, batches));

            // Schema variant 2: field 16 replaced by a struct of a nullable int and a nullable string.
            var structType = new StructType(new List<Field>
            {
                new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build(),
                new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build()
            });
            Schema structSchema = firstBatch.Schema.SetField(16, new Field(structType.Name, structType, false));

            Assert.Throws<ArgumentException>(() => Table.TableFromRecordBatches(structSchema, batches));
        }
Ejemplo n.º 9
0
        /// <summary>
        /// Sums the "Cost" values of every row whose "Purchase" is not "Drink" and returns
        /// a two-column batch: the input "Name" column and the computed total.
        /// </summary>
        /// <param name="records">
        /// Input batch; must contain "Name", a string "Purchase" column and a float "Cost" column.
        /// </param>
        /// <returns>
        /// A batch whose declared length is 1 when <paramref name="records"/> has rows, 0 otherwise.
        /// </returns>
        /// <exception cref="InvalidCastException">
        /// "Purchase" is not a StringArray or "Cost" is not a FloatArray.
        /// </exception>
        private static RecordBatch TotalCostOfAllowableExpenses(RecordBatch records)
        {
            // Direct casts fail at the cast with a descriptive exception; the previous
            // "as" casts produced a NullReferenceException on first use instead.
            var purchaseColumn = (StringArray)records.Column("Purchase");
            var costColumn     = (FloatArray)records.Column("Cost");

            float totalCost = 0F;

            for (int i = 0; i < purchaseColumn.Length; i++)
            {
                var cost = costColumn.GetValue(i);

                var purchase = purchaseColumn.GetString(i);

                // Rows without a cost value contribute nothing to the total.
                if (purchase != "Drink" && cost.HasValue)
                {
                    totalCost += cost.Value;
                }
            }

            // Return 1 record, if we were given any. 0, otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(f => f.Name("Name").DataType(ArrowStringType.Default))
                    .Field(f => f.Name("TotalCostOfAllowableExpenses").DataType(Apache.Arrow.Types.FloatType.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column("Name"),
                    new FloatArray.Builder().Append(totalCost).Build()
                },
                returnLength);
        }
        /// <summary>
        /// Reads the next message from <c>BaseStream</c> and converts it into a record batch.
        /// </summary>
        /// <returns>
        /// The object produced by <c>CreateArrowObjectFromMessage</c>, or <c>null</c> when the
        /// message length read from the stream is 0 (end of stream).
        /// </returns>
        protected RecordBatch ReadRecordBatch()
        {
            // Called on every read; presumably a no-op once the schema has been read — confirm in ReadSchema.
            ReadSchema();

            int messageLength = ReadMessageLength(throwOnFullRead: false);

            if (messageLength == 0)
            {
                // reached end
                return(null);
            }

            RecordBatch result = null;

            // The metadata buffer comes from the shared array pool; RentReturn presumably hands it
            // back after the callback finishes (helper not visible here — confirm).
            ArrayPool <byte> .Shared.RentReturn(messageLength, messageBuff =>
            {
                // Step 1: read the flatbuffer-encoded message metadata.
                int bytesRead = BaseStream.ReadFullBuffer(messageBuff);
                EnsureFullRead(messageBuff, bytesRead);

                Flatbuf.Message message = Flatbuf.Message.GetRootAsMessage(CreateByteBuffer(messageBuff));

                // BodyLength is narrowed to int; checked makes an oversized body throw instead of wrapping.
                int bodyLength = checked ((int)message.BodyLength);

                // Step 2: read the message body into memory obtained from the configured allocator.
                // The slice trims the allocation to exactly bodyLength bytes.
                IMemoryOwner <byte> bodyBuffOwner = _allocator.Allocate(bodyLength);
                Memory <byte> bodyBuff            = bodyBuffOwner.Memory.Slice(0, bodyLength);
                bytesRead = BaseStream.ReadFullBuffer(bodyBuff);
                EnsureFullRead(bodyBuff, bytesRead);

                // Step 3: build the Arrow object; the memory owner is handed over to the callee.
                // NOTE(review): if anything above throws after Allocate, bodyBuffOwner is not disposed here.
                FlatBuffers.ByteBuffer bodybb = CreateByteBuffer(bodyBuff);
                result = CreateArrowObjectFromMessage(message, bodybb, bodyBuffOwner);
            });

            return(result);
        }
Ejemplo n.º 11
0
        /// <summary>
        /// Counts the total number of characters in the string column named
        /// <paramref name="stringFieldName"/> and returns a two-column batch: the grouping
        /// column unchanged, plus a "&lt;stringFieldName&gt;_CharCount" Int32 column with the total.
        /// </summary>
        /// <param name="records">Input batch.</param>
        /// <param name="groupFieldName">Name of the column passed through to the result.</param>
        /// <param name="stringFieldName">Name of the string column whose characters are counted.</param>
        /// <returns>
        /// A batch whose declared length is 1 when <paramref name="records"/> has rows, 0 otherwise.
        /// </returns>
        /// <exception cref="InvalidCastException">The string column is not a StringArray.</exception>
        private static RecordBatch CountCharacters(
            RecordBatch records,
            string groupFieldName,
            string stringFieldName)
        {
            int stringFieldIndex = records.Schema.GetFieldIndex(stringFieldName);

            // A direct cast reports a wrong column type at the cast, where the previous
            // "as" cast only failed later with a NullReferenceException.
            var stringValues = (StringArray)records.Column(stringFieldIndex);

            int characterCount = 0;

            for (int i = 0; i < stringValues.Length; ++i)
            {
                // A null string (presumably returned for null slots — confirm against
                // StringArray.GetString) counts as zero characters instead of throwing.
                string current = stringValues.GetString(i);
                characterCount += current?.Length ?? 0;
            }

            int   groupFieldIndex = records.Schema.GetFieldIndex(groupFieldName);
            Field groupField      = records.Schema.GetFieldByIndex(groupFieldIndex);

            // Return 1 record, if we were given any. 0, otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(groupField)
                    .Field(f => f.Name(stringFieldName + "_CharCount").DataType(Int32Type.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column(groupFieldIndex),
                    new Int32Array.Builder().Append(characterCount).Build()
                },
                returnLength);
        }
Ejemplo n.º 12
0
        /// <summary>
        /// Round-trips a sample batch whose schema carries custom metadata at schema level,
        /// on each field, and on a field nested inside a struct.
        /// </summary>
        public void WritesMetadataCorrectly()
        {
            // Struct column: one plain inner field and one inner field with its own metadata.
            var innerMetadata = new Dictionary<string, string>()
            {
                { "customInner", "1" }, { "customInner2", "3" }
            };
            var structType = new StructType(new[]
            {
                new Field("Inner1", FloatType.Default, nullable: false),
                new Field("Inner2", DoubleType.Default, nullable: true, innerMetadata)
            });

            var schema = new Schema.Builder()
                .Metadata("index", "1, 2, 3, 4, 5")
                .Metadata("reverseIndex", "5, 4, 3, 2, 1")
                .Field(f => f
                    .Name("IntCol")
                    .DataType(UInt32Type.Default)
                    .Metadata("custom1", "false")
                    .Metadata("custom2", "true"))
                .Field(f => f
                    .Name("StringCol")
                    .DataType(StringType.Default)
                    .Metadata("custom2", "false")
                    .Metadata("custom3", "4"))
                .Field(f => f
                    .Name("StructCol")
                    .DataType(structType)
                    .Metadata("custom4", "6.4")
                    .Metadata("custom1", "true"))
                .Build();

            RecordBatch originalBatch = TestData.CreateSampleRecordBatch(schema, length: 10);

            TestRoundTripRecordBatch(originalBatch);
        }
Ejemplo n.º 13
0
        /// <summary>
        /// Writes a record batch synchronously over a loopback TCP connection and verifies
        /// that the batch read on the receiving side equals the original.
        /// </summary>
        /// <param name="createDictionaryArray">Forwarded to the sample-batch factory.</param>
        /// <param name="port">Loopback port used for both the listener and the sender.</param>
        public void CanWriteToNetworkStream(bool createDictionaryArray, int port)
        {
            RecordBatch originalBatch = TestData.CreateSampleRecordBatch(length: 100, createDictionaryArray: createDictionaryArray);

            TcpListener listener = new TcpListener(IPAddress.Loopback, port);

            listener.Start();

            try
            {
                // Sender side: connect, write the batch and the end marker, then flush.
                using (TcpClient sender = new TcpClient())
                {
                    sender.Connect(IPAddress.Loopback, port);
                    NetworkStream stream = sender.GetStream();

                    using (var writer = new ArrowStreamWriter(stream, originalBatch.Schema))
                    {
                        writer.WriteRecordBatch(originalBatch);
                        writer.WriteEnd();

                        stream.Flush();
                    }
                }

                // Receiver side: accept the pending connection and read the batch back.
                using (TcpClient receiver = listener.AcceptTcpClient())
                {
                    NetworkStream stream = receiver.GetStream();
                    using (var reader = new ArrowStreamReader(stream))
                    {
                        RecordBatch newBatch = reader.ReadNextRecordBatch();
                        ArrowReaderVerifier.CompareBatches(originalBatch, newBatch);
                    }
                }
            }
            finally
            {
                // Always release the listening socket, even when an assertion or I/O call throws,
                // so the port can be reused by other test cases.
                listener.Stop();
            }
        }
Ejemplo n.º 14
0
        /// <summary>
        /// Writes every batch in <paramref name="originalBatches"/> to an in-memory Arrow stream
        /// (using the schema of the first batch), reads them back in order, and compares each
        /// batch read with its original.
        /// </summary>
        /// <param name="originalBatches">Batches to round-trip; the first one supplies the schema.</param>
        /// <param name="options">IPC options forwarded to the writer; may be null.</param>
        private static async Task TestRoundTripRecordBatchesAsync(List <RecordBatch> originalBatches, IpcOptions options = null)
        {
            using (var buffer = new MemoryStream())
            {
                // Write phase: the stream is left open so it can be rewound and read afterwards.
                using (var streamWriter = new ArrowStreamWriter(buffer, originalBatches[0].Schema, leaveOpen: true, options))
                {
                    for (int i = 0; i < originalBatches.Count; i++)
                    {
                        await streamWriter.WriteRecordBatchAsync(originalBatches[i]);
                    }

                    await streamWriter.WriteEndAsync();
                }

                buffer.Position = 0;

                // Read phase: batches must come back in the order they were written.
                using (var streamReader = new ArrowStreamReader(buffer))
                {
                    for (int i = 0; i < originalBatches.Count; i++)
                    {
                        RecordBatch roundTripped = streamReader.ReadNextRecordBatch();
                        ArrowReaderVerifier.CompareBatches(originalBatches[i], roundTripped);
                    }
                }
            }
        }
Ejemplo n.º 15
0
        /// <summary>
        /// Writes two batches (100 and 50 rows) to an Arrow file and reads them back by index,
        /// expecting index 0 to yield the second batch written and index 1 the first,
        /// then re-reads index 0 to exercise random access.
        /// </summary>
        public async Task TestReadMultipleRecordBatchAsync()
        {
            RecordBatch firstWritten  = TestData.CreateSampleRecordBatch(length: 100);
            RecordBatch secondWritten = TestData.CreateSampleRecordBatch(length: 50);

            using (var fileStream = new MemoryStream())
            {
                var fileWriter = new ArrowFileWriter(fileStream, firstWritten.Schema);
                await fileWriter.WriteRecordBatchAsync(firstWritten);
                await fileWriter.WriteRecordBatchAsync(secondWritten);
                await fileWriter.WriteFooterAsync();

                fileStream.Position = 0;

                // the recordbatches by index are in reverse order - back to front.
                // TODO: is this a bug??
                var fileReader = new ArrowFileReader(fileStream);

                RecordBatch atIndex0 = await fileReader.ReadRecordBatchAsync(0);
                ArrowReaderVerifier.CompareBatches(secondWritten, atIndex0);

                RecordBatch atIndex1 = await fileReader.ReadRecordBatchAsync(1);
                ArrowReaderVerifier.CompareBatches(firstWritten, atIndex1);

                // now read the first again, for random access
                RecordBatch atIndex0Again = await fileReader.ReadRecordBatchAsync(0);
                ArrowReaderVerifier.CompareBatches(secondWritten, atIndex0Again);
            }
        }
Ejemplo n.º 16
0
        /// <summary>
        /// Adds up the values of every Int64, Double and Decimal128 column in
        /// <paramref name="recordBatch"/>; columns of any other type are ignored.
        /// </summary>
        /// <param name="recordBatch">Batch whose numeric columns are summed.</param>
        /// <returns>The combined total of the per-column sums.</returns>
        private static double SumAllNumbers(RecordBatch recordBatch)
        {
            double total = 0;
            int columnIndex = 0;

            while (columnIndex < recordBatch.ColumnCount)
            {
                var column = recordBatch.Arrays.ElementAt(columnIndex);

                // Dispatch on the type declared in the schema, not on the runtime array type.
                var typeId = recordBatch.Schema.GetFieldByIndex(columnIndex).DataType.TypeId;

                if (typeId == ArrowTypeId.Int64)
                {
                    total += Sum((Int64Array)column);
                }
                else if (typeId == ArrowTypeId.Double)
                {
                    total += Sum((DoubleArray)column);
                }
                else if (typeId == ArrowTypeId.Decimal128)
                {
                    total += Sum((Decimal128Array)column);
                }

                columnIndex++;
            }

            return total;
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Calls WriteDictionary for each field of the record batch's schema.
 /// </summary>
 /// <param name="recordBatch">Batch whose schema fields are enumerated.</param>
 private protected void WriteDictionaries(RecordBatch recordBatch)
 {
     var schemaFields = recordBatch.Schema.Fields.Values;

     foreach (Field schemaField in schemaFields)
     {
         WriteDictionary(schemaField);
     }
 }
Ejemplo n.º 18
0
        /// <summary>
        /// Writes two batches (100 and 50 rows) to an Arrow file and reads them back by index,
        /// expecting them in write order, then re-reads index 0 to exercise random access.
        /// </summary>
        public async Task TestReadMultipleRecordBatchAsync()
        {
            RecordBatch firstWritten  = TestData.CreateSampleRecordBatch(length: 100);
            RecordBatch secondWritten = TestData.CreateSampleRecordBatch(length: 50);

            using (var fileStream = new MemoryStream())
            {
                var fileWriter = new ArrowFileWriter(fileStream, firstWritten.Schema);
                await fileWriter.WriteRecordBatchAsync(firstWritten);
                await fileWriter.WriteRecordBatchAsync(secondWritten);
                await fileWriter.WriteEndAsync();

                fileStream.Position = 0;

                var fileReader = new ArrowFileReader(fileStream);

                RecordBatch atIndex0 = await fileReader.ReadRecordBatchAsync(0);
                ArrowReaderVerifier.CompareBatches(firstWritten, atIndex0);

                RecordBatch atIndex1 = await fileReader.ReadRecordBatchAsync(1);
                ArrowReaderVerifier.CompareBatches(secondWritten, atIndex1);

                // now read the first again, for random access
                RecordBatch atIndex0Again = await fileReader.ReadRecordBatchAsync(0);
                ArrowReaderVerifier.CompareBatches(firstWritten, atIndex0Again);
            }
        }
Ejemplo n.º 19
0
        /// <summary>
        /// Counts the total number of characters in the "name" string column and returns a
        /// two-column batch: the "age" column unchanged, plus a "name_CharCount" Int32 column
        /// holding the total.
        /// </summary>
        /// <param name="records">Input batch; must contain a string "name" column and an "age" column.</param>
        /// <returns>
        /// A batch whose declared length is 1 when <paramref name="records"/> has rows, 0 otherwise.
        /// </returns>
        /// <exception cref="InvalidCastException">The "name" column is not a StringArray.</exception>
        private static RecordBatch ArrowBasedCountCharacters(RecordBatch records)
        {
            // A direct cast reports a wrong column type at the cast, where the previous
            // "as" cast only failed later with a NullReferenceException.
            var nameColumn = (StringArray)records.Column("name");

            int characterCount = 0;

            for (int i = 0; i < nameColumn.Length; ++i)
            {
                // A null string (presumably returned for null slots — confirm against
                // StringArray.GetString) counts as zero characters instead of throwing.
                string current = nameColumn.GetString(i);
                characterCount += current?.Length ?? 0;
            }

            int   ageFieldIndex = records.Schema.GetFieldIndex("age");
            Field ageField      = records.Schema.GetFieldByIndex(ageFieldIndex);

            // Return 1 record, if we were given any. 0, otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(ageField)
                    .Field(f => f.Name("name_CharCount").DataType(Int32Type.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column(ageFieldIndex),
                    new Int32Array.Builder().Append(characterCount).Build()
                },
                returnLength);
        }
Ejemplo n.º 20
0
        /// <summary>
        /// Serializes one record batch: writes the schema first if it has not been written yet,
        /// then the record-batch message metadata, then each non-empty data buffer followed by
        /// padding to an 8-byte boundary.
        /// </summary>
        /// <param name="recordBatch">The batch to serialize.</param>
        private protected void WriteRecordBatchInternal(RecordBatch recordBatch)
        {
            // TODO: Truncate buffers with extraneous padding / unused capacity

            // The schema is emitted once, ahead of the first batch.
            if (!HasWrittenSchema)
            {
                WriteSchema(Schema);
                HasWrittenSchema = true;
            }

            (ArrowRecordBatchFlatBufferBuilder recordBatchBuilder, VectorOffset fieldNodesVectorOffset) =
                PreparingWritingRecordBatch(recordBatch);

            // NOTE(review): presumably closes the buffers vector started inside
            // PreparingWritingRecordBatch — confirm against that helper.
            VectorOffset buffersVectorOffset = Builder.EndVector();

            // Serialize record batch

            StartingWritingRecordBatch();

            Offset <Flatbuf.RecordBatch> recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length,
                                                                                                   fieldNodesVectorOffset,
                                                                                                   buffersVectorOffset);

            long metadataLength = WriteMessage(Flatbuf.MessageHeader.RecordBatch,
                                               recordBatchOffset, recordBatchBuilder.TotalLength);

            // Write buffer data

            IReadOnlyList <ArrowRecordBatchFlatBufferBuilder.Buffer> buffers = recordBatchBuilder.Buffers;

            // Running total of bytes written for the body, padding included.
            long bodyLength = 0;

            for (int i = 0; i < buffers.Count; i++)
            {
                ArrowBuffer buffer = buffers[i].DataBuffer;
                // Empty buffers write nothing and add nothing to the body length.
                if (buffer.IsEmpty)
                {
                    continue;
                }

                WriteBuffer(buffer);

                // Pad each buffer individually up to the next multiple of 8 bytes.
                int paddedLength = checked ((int)BitUtility.RoundUpToMultipleOf8(buffer.Length));
                int padding      = paddedLength - buffer.Length;
                if (padding > 0)
                {
                    WritePadding(padding);
                }

                bodyLength += paddedLength;
            }

            // Write padding so the record batch message body length is a multiple of 8 bytes

            int bodyPaddingLength = CalculatePadding(bodyLength);

            WritePadding(bodyPaddingLength);

            // Report the final body size (with trailing padding) and the metadata size.
            FinishedWritingRecordBatch(bodyLength + bodyPaddingLength, metadataLength);
        }
Ejemplo n.º 21
0
        /// <summary>
        /// Writes <paramref name="recordBatch"/> to the output: calls WriteStart, then
        /// serializes the batch via WriteRecordBatchInternal.
        /// </summary>
        /// <param name="recordBatch">The batch to write.</param>
        public override void WriteRecordBatch(RecordBatch recordBatch)
        {
            // TODO: Compare record batch schema

            // WriteStart must run before any batch content is emitted.
            WriteStart();
            WriteRecordBatchInternal(recordBatch);
        }
Ejemplo n.º 22
0
        /// <summary>
        /// Disposing a writer created with leaveOpen: true must leave the stream usable;
        /// the stream position is still readable and equal to 0.
        /// </summary>
        public void Ctor_LeaveOpenTrue_StreamValidOnDispose()
        {
            RecordBatch sampleBatch = TestData.CreateSampleRecordBatch(length: 100);
            var target = new MemoryStream();

            var writer = new ArrowFileWriter(target, sampleBatch.Schema, leaveOpen: true);
            writer.Dispose();

            Assert.Equal(0, target.Position);
        }
Ejemplo n.º 23
0
        /// <summary>
        /// Disposing a writer created with leaveOpen: false must also dispose the stream;
        /// reading the stream position afterwards throws ObjectDisposedException.
        /// </summary>
        public void Ctor_LeaveOpenFalse_StreamClosedOnDispose()
        {
            RecordBatch sampleBatch = TestData.CreateSampleRecordBatch(length: 100);
            var target = new MemoryStream();

            var writer = new ArrowFileWriter(target, sampleBatch.Schema, leaveOpen: false);
            writer.Dispose();

            Assert.Throws<ObjectDisposedException>(() => target.Position);
        }
        /// <summary>
        /// Appends a "Force" float column to <paramref name="batchBuilder"/>, filled with
        /// generated values, one per row of the first array in <paramref name="batch"/>.
        /// </summary>
        /// <param name="batch">Input batch; only the length of its first array is used.</param>
        /// <param name="batchBuilder">Builder that receives the new column.</param>
        public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            var forceValues = Generator.Float(batch.Arrays.First().Length);

            batchBuilder.Append(
                "Force",
                false,
                columnBuilder => columnBuilder.Float(
                    floatBuilder => floatBuilder.AppendRange(forceValues)));
        }
Ejemplo n.º 25
0
        /// <summary>
        /// Round-trips a 100-row sample batch asynchronously with WriteLegacyIpcFormat enabled.
        /// </summary>
        public async Task LegacyIpcFormatRoundTripsAsync()
        {
            RecordBatch sampleBatch = TestData.CreateSampleRecordBatch(length: 100);
            var legacyOptions = new IpcOptions() { WriteLegacyIpcFormat = true };

            await TestRoundTripRecordBatchAsync(sampleBatch, legacyOptions);
        }
Ejemplo n.º 26
0
        /// <summary>
        /// Prepares the shared memory stream: creates a sample batch of Count rows and writes
        /// it to a fresh MemoryStream stored in _memoryStream.
        /// </summary>
        public async Task GlobalSetup()
        {
            var sampleBatch = TestData.CreateSampleRecordBatch(length: Count);

            _memoryStream = new MemoryStream();

            // The writer is intentionally not disposed here; _memoryStream stays open after setup.
            var streamWriter = new ArrowStreamWriter(_memoryStream, sampleBatch.Schema);
            await streamWriter.WriteRecordBatchAsync(sampleBatch);
        }
Ejemplo n.º 27
0
        /// <summary>
        /// Passes the raw values of the "Values" and "Values2" float columns of
        /// <paramref name="batch"/> to FindSum.
        /// </summary>
        /// <param name="batch">Input batch; both columns are cast to FloatArray.</param>
        /// <param name="batchBuilder">Not used by this implementation.</param>
        public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            FloatArray firstColumn  = (FloatArray)batch.Column("Values");
            FloatArray secondColumn = (FloatArray)batch.Column("Values2");

            // Nothing is appended to batchBuilder; only the computation itself is performed.
            FindSum(firstColumn.Values, secondColumn.Values);
        }
Ejemplo n.º 28
0
        /// <summary>
        /// Round-trips a 100-row sample batch synchronously with WriteLegacyIpcFormat enabled.
        /// </summary>
        public void LegacyIpcFormatRoundTrips()
        {
            RecordBatch sampleBatch = TestData.CreateSampleRecordBatch(length: 100);
            var legacyOptions = new IpcOptions() { WriteLegacyIpcFormat = true };

            TestRoundTripRecordBatch(sampleBatch, legacyOptions);
        }
 /// <summary>
 /// Releases the record batch currently held by the reader when called from Dispose().
 /// </summary>
 /// <param name="disposing">
 /// True when called from Dispose(); when false nothing is released here.
 /// </param>
 protected override void Dispose(bool disposing)
 {
     if (!disposing)
     {
         return;
     }

     // Dispose the batch that was read last and drop the reference to it.
     _currentBatch?.Dispose();
     _currentBatch = null;
 }
Ejemplo n.º 30
0
        /// <summary>
        /// Runs the commands over every input batch read from <paramref name="inputStream"/>
        /// and writes each result to <paramref name="outputStream"/> as an Arrow record batch,
        /// framed by a START_ARROW_STREAM marker before the batches and a 0 after them.
        /// </summary>
        /// <param name="inputStream">Stream the input batches are read from.</param>
        /// <param name="outputStream">Stream the Arrow-encoded results are written to.</param>
        /// <param name="commands">Commands used to create the command runner.</param>
        /// <returns>Statistics holding the number of entries processed.</returns>
        protected override CommandExecutorStat ExecuteCore(
            Stream inputStream,
            Stream outputStream,
            SqlCommand[] commands)
        {
            var            stat   = new CommandExecutorStat();
            ICommandRunner runner = CreateCommandRunner(commands);

            SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);

            // TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
            // For now, we write to a temporary seekable MemoryStream which we then copy to
            // the actual destination stream.
            if (s_writeOutputStream == null)
            {
                s_writeOutputStream = new MemoryStream();
            }

            MemoryStream scratch = s_writeOutputStream;

            // Both are created lazily from the first result set and reused for later batches.
            ArrowStreamWriter arrowWriter  = null;
            Schema            outputSchema = null;

            foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
            {
                // Split id is currently not used, so 0 is passed.
                IArrowArray[] results = runner.Run(0, input);

                // Assumes all columns have the same length, so uses 0th for num entries.
                int numEntries = results[0].Length;
                stat.NumEntriesProcessed += numEntries;

                // Reset the scratch stream so it only holds the bytes of this batch.
                scratch.SetLength(0);

                if (arrowWriter == null)
                {
                    Debug.Assert(outputSchema == null);
                    outputSchema = BuildSchema(results);

                    arrowWriter = new ArrowStreamWriter(scratch, outputSchema, leaveOpen: true);
                }

                var recordBatch = new RecordBatch(outputSchema, results, numEntries);

                // TODO: Remove sync-over-async once WriteRecordBatch exists.
                arrowWriter.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();

                // Forward the freshly written bytes to the real destination.
                scratch.Position = 0;
                scratch.CopyTo(outputStream);
                outputStream.Flush();
            }

            SerDe.Write(outputStream, 0);

            arrowWriter?.Dispose();

            return stat;
        }