Example #1
        private static RecordBatch ArrowBasedCountCharacters(RecordBatch records)
        {
            StringArray nameColumn = records.Column("name") as StringArray;

            int characterCount = 0;

            for (int i = 0; i < nameColumn.Length; ++i)
            {
                string current = nameColumn.GetString(i);
                characterCount += current.Length;
            }

            int   ageFieldIndex = records.Schema.GetFieldIndex("age");
            Field ageField      = records.Schema.GetFieldByIndex(ageFieldIndex);

            // Return one record if we were given any; zero otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(ageField)
                    .Field(f => f.Name("name_CharCount").DataType(Int32Type.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column(ageFieldIndex),
                    new Int32Array.Builder().Append(characterCount).Build()
                },
                returnLength);
        }
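This is the shape of an Arrow-backed grouped-map UDF for Spark .NET. A minimal call-site sketch, assuming Microsoft.Spark's GroupBy(...).Apply overload that takes a return schema and a Func<RecordBatch, RecordBatch> (the DataFrame df and its columns are hypothetical):

        // Sketch only: assumes Microsoft.Spark.Sql and Microsoft.Spark.Sql.Types are imported
        // and that `df` is a hypothetical DataFrame with "name" (string) and "age" (int) columns.
        DataFrame result = df
            .GroupBy("age")
            .Apply(
                new StructType(new[]
                {
                    new StructField("age", new IntegerType()),
                    new StructField("name_CharCount", new IntegerType())
                }),
                ArrowBasedCountCharacters);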
Example #2
        private static RecordBatch CountCharacters(
            RecordBatch records,
            string groupFieldName,
            string stringFieldName)
        {
            int         stringFieldIndex = records.Schema.GetFieldIndex(stringFieldName);
            StringArray stringValues     = records.Column(stringFieldIndex) as StringArray;

            int characterCount = 0;

            for (int i = 0; i < stringValues.Length; ++i)
            {
                string current = stringValues.GetString(i);
                characterCount += current.Length;
            }

            int   groupFieldIndex = records.Schema.GetFieldIndex(groupFieldName);
            Field groupField      = records.Schema.GetFieldByIndex(groupFieldIndex);

            // Return one record if we were given any; zero otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(groupField)
                    .Field(f => f.Name(stringFieldName + "_CharCount").DataType(Int32Type.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column(groupFieldIndex),
                    new Int32Array.Builder().Append(characterCount).Build()
                },
                returnLength);
        }
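Example #2 generalizes Example #1 over the group and string field names; a hypothetical call site inside a grouped-map UDF:

        // Hypothetical call site: count the characters of "name", keyed by "age".
        RecordBatch summarized = CountCharacters(records, "age", "name");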
Example #3
        private static RecordBatch TotalCostOfAllowableExpenses(RecordBatch records)
        {
            var purchaseColumn = records.Column("Purchase") as StringArray;
            var costColumn     = records.Column("Cost") as FloatArray;

            float totalCost = 0F;

            for (int i = 0; i < purchaseColumn.Length; i++)
            {
                var cost = costColumn.GetValue(i);

                var purchase = purchaseColumn.GetString(i);

                if (purchase != "Drink" && cost.HasValue)
                {
                    totalCost += cost.Value;
                }
            }

            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(f => f.Name("Name").DataType(ArrowStringType.Default))
                    .Field(f => f.Name("TotalCostOfAllowableExpenses").DataType(Apache.Arrow.Types.FloatType.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column("Name"),
                    new FloatArray.Builder().Append(totalCost).Build()
                },
                returnLength);
        }
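To exercise this function outside Spark, an input batch can be assembled directly with the Apache.Arrow builders. A minimal sketch with made-up values (the "Drink" row is excluded, so the expected total is 32.5):

        // Sketch only: build a three-row batch and total the allowable (non-drink) costs.
        Schema schema = new Schema.Builder()
            .Field(f => f.Name("Name").DataType(ArrowStringType.Default))
            .Field(f => f.Name("Purchase").DataType(ArrowStringType.Default))
            .Field(f => f.Name("Cost").DataType(Apache.Arrow.Types.FloatType.Default))
            .Build();

        var batch = new RecordBatch(
            schema,
            new IArrowArray[]
            {
                new StringArray.Builder().Append("Ann").Append("Ann").Append("Ann").Build(),
                new StringArray.Builder().Append("Meal").Append("Drink").Append("Taxi").Build(),
                new FloatArray.Builder().Append(12.5F).Append(4.0F).Append(20.0F).Build()
            },
            length: 3);

        RecordBatch totals = TotalCostOfAllowableExpenses(batch);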
Example #4
        public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            var array   = (FloatArray)batch.Column("Values");
            var array2  = (FloatArray)batch.Column("Values2");
            var values  = array.Values;
            var values2 = array2.Values;

            FindSum(values, values2);
        }
Example #5
        public void TestRecordBatchBasics()
        {
            RecordBatch recordBatch = TestData.CreateSampleRecordBatch(length: 1);

            Assert.Throws<ArgumentOutOfRangeException>(() => new RecordBatch(recordBatch.Schema, recordBatch.Arrays, -1));

            var col1 = recordBatch.Column(0);
            var col2 = recordBatch.Column("list0");

            ArrowReaderVerifier.CompareArrays(col1, col2);

            recordBatch.Dispose();
        }
Example #6
        public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            var array  = (FloatArray)batch.Column("Values");
            var values = array.Values;

            FindMinMax(values);
        }
Example #7
        public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            var velocity = (FloatArray)batch.Column("Velocity");
            var force    = (FloatArray)batch.Column("Force");
            var mass     = (FloatArray)batch.Column("Mass");

            var length  = velocity.Length;
            var results = new float[length];

            // Division binds tighter than addition, so each result is velocity + (force / mass).
            for (var i = 0; i < length; i++)
            {
                results[i] = velocity.Values[i] + force.Values[i] / mass.Values[i];
            }

            batchBuilder.Append("Velocity", false,
                                arrayBuilder => arrayBuilder.Float(builder => builder.AppendRange(results)));
        }
Example #8
        private Tuple<ArrowRecordBatchFlatBufferBuilder, VectorOffset> PreparingWritingRecordBatch(RecordBatch recordBatch)
        {
            Builder.Clear();

            // Serialize field nodes

            int fieldCount = Schema.Fields.Count;

            Flatbuf.RecordBatch.StartNodesVector(Builder, CountAllNodes());

            // flatbuffer struct vectors have to be created in reverse order
            for (int i = fieldCount - 1; i >= 0; i--)
            {
                CreateSelfAndChildrenFieldNodes(recordBatch.Column(i).Data);
            }

            VectorOffset fieldNodesVectorOffset = Builder.EndVector();

            // Serialize buffers

            var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();

            for (int i = 0; i < fieldCount; i++)
            {
                IArrowArray fieldArray = recordBatch.Column(i);
                fieldArray.Accept(recordBatchBuilder);
            }

            IReadOnlyList<ArrowRecordBatchFlatBufferBuilder.Buffer> buffers = recordBatchBuilder.Buffers;

            Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count);

            // flatbuffer struct vectors have to be created in reverse order
            for (int i = buffers.Count - 1; i >= 0; i--)
            {
                Flatbuf.Buffer.CreateBuffer(Builder,
                                            buffers[i].Offset, buffers[i].DataBuffer.Length);
            }

            return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset);
        }
Example #9
        public void ReadNewRecordBatch(RecordBatch recordBatch)
        {
            // Dispose the previous batch.
            _currentBatch?.Dispose();
            _currentBatch = recordBatch;
            _currentIndex = -1;

            for (int i = 0; i < _columnDecoders.Count; i++)
            {
                _columnDecoders[i].NewBatch(recordBatch.Column(i));
            }
        }
Example #10
        private static RecordBatch CountCharacters(RecordBatch records)
        {
            int         stringFieldIndex = records.Schema.GetFieldIndex("name");
            StringArray stringValues     = records.Column(stringFieldIndex) as StringArray;

            int characterCount = 0;

            for (int i = 0; i < stringValues.Length; ++i)
            {
                string current = stringValues.GetString(i);
                characterCount += current.Length;
            }

            int   groupFieldIndex = records.Schema.GetFieldIndex("age");
            Field groupField      = records.Schema.GetFieldByIndex(groupFieldIndex);

            // Return one record if we were given any; zero otherwise.
            int returnLength = records.Length > 0 ? 1 : 0;

            return new RecordBatch(
                new Schema.Builder()
                    .Field(f => f.Name(groupField.Name).DataType(groupField.DataType))
                    .Field(f => f.Name("name_CharCount").DataType(Int32Type.Default))
                    .Build(),
                new IArrowArray[]
                {
                    records.Column(groupFieldIndex),
                    // Equivalent to new Int32Array.Builder().Append(characterCount).Build(),
                    // but constructed directly from a raw ArrowBuffer.
                    new Int32Array(
                        new ArrowBuffer.Builder<int>().Append(characterCount).Build(),
                        ArrowBuffer.Empty,
                        length: 1,
                        nullCount: 0,
                        offset: 0)
                },
                returnLength);
        }
Example #11
        private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBatch,
                                                                   CancellationToken cancellationToken = default)
        {
            // TODO: Truncate buffers with extraneous padding / unused capacity

            if (!HasWrittenSchema)
            {
                await WriteSchemaAsync(Schema, cancellationToken).ConfigureAwait(false);

                HasWrittenSchema = true;
            }

            Builder.Clear();

            // Serialize field nodes

            var fieldCount = Schema.Fields.Count;

            Flatbuf.RecordBatch.StartNodesVector(Builder, fieldCount);

            // flatbuffer struct vectors have to be created in reverse order
            for (var i = fieldCount - 1; i >= 0; i--)
            {
                var fieldArray = recordBatch.Column(i);
                Flatbuf.FieldNode.CreateFieldNode(Builder, fieldArray.Length, fieldArray.NullCount);
            }

            var fieldNodesVectorOffset = Builder.EndVector();

            // Serialize buffers

            var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();

            for (var i = 0; i < fieldCount; i++)
            {
                var fieldArray = recordBatch.Column(i);
                fieldArray.Accept(recordBatchBuilder);
            }

            var buffers = recordBatchBuilder.Buffers;

            Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count);

            // flatbuffer struct vectors have to be created in reverse order
            for (var i = buffers.Count - 1; i >= 0; i--)
            {
                Flatbuf.Buffer.CreateBuffer(Builder,
                                            buffers[i].Offset, buffers[i].DataBuffer.Length);
            }

            var buffersVectorOffset = Builder.EndVector();

            // Serialize record batch

            StartingWritingRecordBatch();

            var recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length,
                                                                          fieldNodesVectorOffset,
                                                                          buffersVectorOffset);

            long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch,
                                                          recordBatchOffset, recordBatchBuilder.TotalLength,
                                                          cancellationToken).ConfigureAwait(false);

            // Write buffer data

            long bodyLength = 0;

            for (var i = 0; i < buffers.Count; i++)
            {
                ArrowBuffer buffer = buffers[i].DataBuffer;
                if (buffer.IsEmpty)
                {
                    continue;
                }

                await WriteBufferAsync(buffer, cancellationToken).ConfigureAwait(false);

                int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(buffer.Length));
                int padding      = paddedLength - buffer.Length;
                if (padding > 0)
                {
                    await WritePaddingAsync(padding).ConfigureAwait(false);
                }

                bodyLength += paddedLength;
            }

            // Write padding so the record batch message body length is a multiple of 8 bytes

            int bodyPaddingLength = CalculatePadding(bodyLength);

            await WritePaddingAsync(bodyPaddingLength).ConfigureAwait(false);

            FinishedWritingRecordBatch(bodyLength + bodyPaddingLength, metadataLength);
        }
Example #12
        public async Task TestArrowGroupedMapCommandExecutor()
        {
            StringArray ConvertStrings(StringArray strings)
            {
                return (StringArray)ToArrowArray(
                    Enumerable.Range(0, strings.Length)
                        .Select(i => $"udf: {strings.GetString(i)}")
                        .ToArray());
            }

            Int64Array ConvertInt64s(Int64Array int64s)
            {
                return (Int64Array)ToArrowArray(
                    Enumerable.Range(0, int64s.Length)
                        .Select(i => int64s.Values[i] + 100)
                        .ToArray());
            }

            Schema resultSchema = new Schema.Builder()
                                  .Field(b => b.Name("arg1").DataType(StringType.Default))
                                  .Field(b => b.Name("arg2").DataType(Int64Type.Default))
                                  .Build();

            var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
                (batch) => new RecordBatch(
                    resultSchema,
                    new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
                    batch.Length));

            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                Commands = new[] { command }
            };

            using (var inputStream = new MemoryStream())
                using (var outputStream = new MemoryStream())
                {
                    int numRows = 10;

                    // Write test data to the input stream.
                    Schema schema = new Schema.Builder()
                                    .Field(b => b.Name("arg1").DataType(StringType.Default))
                                    .Field(b => b.Name("arg2").DataType(Int64Type.Default))
                                    .Build();
                    var arrowWriter = new ArrowStreamWriter(inputStream, schema);
                    await arrowWriter.WriteRecordBatchAsync(
                        new RecordBatch(
                            schema,
                            new[]
                    {
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                            .Select(i => (long)i)
                            .ToArray())
                    },
                            numRows));

                    inputStream.Seek(0, SeekOrigin.Begin);

                    CommandExecutorStat stat = new CommandExecutor().Execute(
                        inputStream,
                        outputStream,
                        0,
                        commandPayload);

                    // Validate that all the data on the stream is read.
                    Assert.Equal(inputStream.Length, inputStream.Position);
                    Assert.Equal(numRows, stat.NumEntriesProcessed);

                    // Validate the output stream.
                    outputStream.Seek(0, SeekOrigin.Begin);
                    int arrowLength = SerDe.ReadInt32(outputStream);
                    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
                    var         arrowReader = new ArrowStreamReader(outputStream);
                    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

                    Assert.Equal(numRows, outputBatch.Length);
                    Assert.Equal(2, outputBatch.ColumnCount);

                    var stringArray = (StringArray)outputBatch.Column(0);
                    for (int i = 0; i < numRows; ++i)
                    {
                        Assert.Equal($"udf: {i}", stringArray.GetString(i));
                    }

                    var longArray = (Int64Array)outputBatch.Column(1);
                    for (int i = 0; i < numRows; ++i)
                    {
                        Assert.Equal(100 + i, longArray.Values[i]);
                    }

                    int end = SerDe.ReadInt32(outputStream);
                    Assert.Equal(0, end);

                    // Validate all the data on the stream is read.
                    Assert.Equal(outputStream.Length, outputStream.Position);
                }
        }
Example #13
        protected virtual async Task<Block> WriteRecordBatchInternalAsync(RecordBatch recordBatch,
                                                                          CancellationToken cancellationToken = default)
        {
            // TODO: Truncate buffers with extraneous padding / unused capacity

            if (!HasWrittenSchema)
            {
                await WriteSchemaAsync(Schema, cancellationToken).ConfigureAwait(false);

                HasWrittenSchema = true;
            }

            var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();

            Builder.Clear();

            // Serialize field nodes

            var fieldCount       = Schema.Fields.Count;
            var fieldNodeOffsets = new Offset<Flatbuf.FieldNode>[fieldCount];

            Flatbuf.RecordBatch.StartNodesVector(Builder, fieldCount);

            for (var i = 0; i < fieldCount; i++)
            {
                var fieldArray = recordBatch.Column(i);
                fieldNodeOffsets[i] =
                    Flatbuf.FieldNode.CreateFieldNode(Builder, fieldArray.Length, fieldArray.NullCount);
            }

            var fieldNodesVectorOffset = Builder.EndVector();

            // Serialize buffers

            for (var i = 0; i < fieldCount; i++)
            {
                var fieldArray = recordBatch.Column(i);
                fieldArray.Accept(recordBatchBuilder);
            }

            var buffers       = recordBatchBuilder.Buffers;
            var bufferOffsets = new Offset<Flatbuf.Buffer>[buffers.Count];

            Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count);

            // flatbuffer struct vectors have to be created in reverse order
            for (var i = buffers.Count - 1; i >= 0; i--)
            {
                bufferOffsets[i] = Flatbuf.Buffer.CreateBuffer(Builder,
                                                               buffers[i].Offset, buffers[i].Length);
            }

            var buffersVectorOffset = Builder.EndVector();

            // Serialize record batch

            var recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length,
                                                                          fieldNodesVectorOffset,
                                                                          buffersVectorOffset);

            var metadataOffset = BaseStream.Position;

            await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch,
                                    recordBatchOffset, recordBatchBuilder.TotalLength,
                                    cancellationToken).ConfigureAwait(false);

            var metadataLength = BaseStream.Position - metadataOffset;

            // Write buffer data

            var lengthOffset = BaseStream.Position;

            for (var i = 0; i < buffers.Count; i++)
            {
                if (buffers[i].DataBuffer.IsEmpty)
                {
                    continue;
                }

                await WriteBufferAsync(buffers[i].DataBuffer, cancellationToken).ConfigureAwait(false);
            }

            // Write padding so the record batch message body length is a multiple of 8 bytes

            var bodyLength        = Convert.ToInt32(BaseStream.Position - lengthOffset);
            var bodyPaddingLength = CalculatePadding(bodyLength);

            await WritePaddingAsync(bodyPaddingLength).ConfigureAwait(false);

            return new Block(
                offset: Convert.ToInt32(metadataOffset),
                length: bodyLength + bodyPaddingLength,
                metadataLength: Convert.ToInt32(metadataLength));
        }
Example #14
        public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
        {
            batchBuilder.Append("Mass", false, batch.Column("Mass"));
        }
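A short sketch of driving this pass-through step end to end, assuming Apache.Arrow's RecordBatch.Builder and a hypothetical MassStep class hosting the Execute method above:

        // Sketch only: MassStep is a hypothetical host of the Execute method above.
        var input = new RecordBatch.Builder()
            .Append("Mass", false, col => col.Float(b => b.Append(1.5F).Append(3.0F)))
            .Build();

        var outputBuilder = new RecordBatch.Builder();
        new MassStep().Execute(input, outputBuilder);
        RecordBatch output = outputBuilder.Build(); // contains the copied "Mass" column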
Example #15
        public int FilterLandRegistryRecordsArrow()
        {
            var recordCount = recordBatch.Length;
            var selectMask  = new bool[recordCount];

            const long MillisecondsPerDay = 86400000;

            var dateFilter = (int)(DateTimeOffset.Parse("2019-01-01").ToUnixTimeMilliseconds() / MillisecondsPerDay);
            var dateValues = (recordBatch.Column(0) as Date32Array).Values;

            for (var i = 0; i < recordCount; i++)
            {
                selectMask[i] = dateValues[i] >= dateFilter;
            }

            var priceValues = (recordBatch.Column(1) as FloatArray).Values;

            for (var i = 0; i < recordCount; i++)
            {
                selectMask[i] = selectMask[i] && priceValues[i] > 5000000;
            }

            // Each property type is a single ASCII character, so the StringArray's raw value
            // buffer holds exactly one byte per record and can be compared byte-for-byte.
            var stringEncoding = Encoding.ASCII;
            var propertyTypeFilter = new string[] { "D", "S", "T" }.Select(x => stringEncoding.GetBytes(x)[0]).ToArray();
            var propertyTypeValues = (recordBatch.Column(2) as StringArray).Values;

            for (var i = 0; i < recordCount; i++)
            {
                selectMask[i] = selectMask[i] && propertyTypeFilter.Contains(propertyTypeValues[i]);
            }

            var tenureFilter = stringEncoding.GetBytes("F")[0];
            var tenureValues = (recordBatch.Column(3) as StringArray).Values;

            for (var i = 0; i < recordCount; i++)
            {
                selectMask[i] = selectMask[i] && tenureValues[i] == tenureFilter;
            }

            var itemCount = selectMask.Count(v => v);

#if LOG
            Console.WriteLine("Found {0} records", itemCount);
#endif
            return itemCount;
        }
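The four passes over selectMask can also be fused into a single loop, which reads each column value once and skips the intermediate bool[]; a sketch under the same field and column-layout assumptions as above:

        // Sketch only: assumes the same recordBatch field, columns, and encodings as the method above.
        public int FilterLandRegistryRecordsArrowSinglePass()
        {
            const long MillisecondsPerDay = 86400000;

            var dateFilter         = (int)(DateTimeOffset.Parse("2019-01-01").ToUnixTimeMilliseconds() / MillisecondsPerDay);
            var propertyTypeFilter = new[] { (byte)'D', (byte)'S', (byte)'T' };
            var tenureFilter       = (byte)'F';

            var dateValues         = (recordBatch.Column(0) as Date32Array).Values;
            var priceValues        = (recordBatch.Column(1) as FloatArray).Values;
            var propertyTypeValues = (recordBatch.Column(2) as StringArray).Values;
            var tenureValues       = (recordBatch.Column(3) as StringArray).Values;

            var itemCount = 0;

            for (var i = 0; i < recordBatch.Length; i++)
            {
                if (dateValues[i] >= dateFilter &&
                    priceValues[i] > 5000000 &&
                    propertyTypeFilter.Contains(propertyTypeValues[i]) &&
                    tenureValues[i] == tenureFilter)
                {
                    itemCount++;
                }
            }

            return itemCount;
        }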