private static RecordBatch ArrowBasedCountCharacters(RecordBatch records)
{
    StringArray nameColumn = records.Column("name") as StringArray;

    int characterCount = 0;
    for (int i = 0; i < nameColumn.Length; ++i)
    {
        string current = nameColumn.GetString(i);
        characterCount += current.Length;
    }

    int ageFieldIndex = records.Schema.GetFieldIndex("age");
    Field ageField = records.Schema.GetFieldByIndex(ageFieldIndex);

    // Return 1 record, if we were given any. 0, otherwise.
    int returnLength = records.Length > 0 ? 1 : 0;

    return new RecordBatch(
        new Schema.Builder()
            .Field(ageField)
            .Field(f => f.Name("name_CharCount").DataType(Int32Type.Default))
            .Build(),
        new IArrowArray[]
        {
            records.Column(ageFieldIndex),
            new Int32Array.Builder().Append(characterCount).Build()
        },
        returnLength);
}
private static RecordBatch CountCharacters(
    RecordBatch records,
    string groupFieldName,
    string stringFieldName)
{
    int stringFieldIndex = records.Schema.GetFieldIndex(stringFieldName);
    StringArray stringValues = records.Column(stringFieldIndex) as StringArray;

    int characterCount = 0;
    for (int i = 0; i < stringValues.Length; ++i)
    {
        string current = stringValues.GetString(i);
        characterCount += current.Length;
    }

    int groupFieldIndex = records.Schema.GetFieldIndex(groupFieldName);
    Field groupField = records.Schema.GetFieldByIndex(groupFieldIndex);

    // Return 1 record, if we were given any. 0, otherwise.
    int returnLength = records.Length > 0 ? 1 : 0;

    return new RecordBatch(
        new Schema.Builder()
            .Field(groupField)
            .Field(f => f.Name(stringFieldName + "_CharCount").DataType(Int32Type.Default))
            .Build(),
        new IArrowArray[]
        {
            records.Column(groupFieldIndex),
            new Int32Array.Builder().Append(characterCount).Build()
        },
        returnLength);
}
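Helpers like this CountCharacters overload are typically wired into .NET for Apache Spark as Arrow grouped-map UDFs. A minimal sketch of that wiring, assuming a Microsoft.Spark DataFrame df with "age" and "name" columns and the RelationalGroupedDataset.Apply overload that takes a Func<RecordBatch, RecordBatch>:

using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

// Group by "age" and return one (age, name_CharCount) row per group.
DataFrame charCounts = df
    .GroupBy("age")
    .Apply(
        new StructType(new[]
        {
            new StructField("age", new IntegerType()),
            new StructField("name_CharCount", new IntegerType())
        }),
        batch => CountCharacters(batch, "age", "name"));

charCounts.Show();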
private static RecordBatch TotalCostOfAllowableExpenses(RecordBatch records)
{
    var purchaseColumn = records.Column("Purchase") as StringArray;
    var costColumn = records.Column("Cost") as FloatArray;

    float totalCost = 0F;
    for (int i = 0; i < purchaseColumn.Length; i++)
    {
        var cost = costColumn.GetValue(i);
        var purchase = purchaseColumn.GetString(i);

        if (purchase != "Drink" && cost.HasValue)
        {
            totalCost += cost.Value;
        }
    }

    int returnLength = records.Length > 0 ? 1 : 0;

    return new RecordBatch(
        new Schema.Builder()
            .Field(f => f.Name("Name").DataType(ArrowStringType.Default))
            .Field(f => f.Name("TotalCostOfAllowableExpenses").DataType(Apache.Arrow.Types.FloatType.Default))
            .Build(),
        new IArrowArray[]
        {
            records.Column("Name"),
            new FloatArray.Builder().Append(totalCost).Build()
        },
        returnLength);
}
public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
{
    var array = (FloatArray)batch.Column("Values");
    var array2 = (FloatArray)batch.Column("Values2");
    var values = array.Values;
    var values2 = array2.Values;
    FindSum(values, values2);
}
public void TestRecordBatchBasics()
{
    RecordBatch recordBatch = TestData.CreateSampleRecordBatch(length: 1);
    Assert.Throws<ArgumentOutOfRangeException>(
        () => new RecordBatch(recordBatch.Schema, recordBatch.Arrays, -1));

    var col1 = recordBatch.Column(0);
    var col2 = recordBatch.Column("list0");
    ArrowReaderVerifier.CompareArrays(col1, col2);

    recordBatch.Dispose();
}
public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
{
    var array = (FloatArray)batch.Column("Values");
    var values = array.Values;
    FindMinMax(values);
}
public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
{
    var velocity = (FloatArray)batch.Column("Velocity");
    var force = (FloatArray)batch.Column("Force");
    var mass = (FloatArray)batch.Column("Mass");

    var length = velocity.Length;
    var results = new float[length];
    for (var i = 0; i < length; i++)
    {
        // Operator precedence: this computes velocity + (force / mass).
        results[i] = velocity.Values[i] + force.Values[i] / mass.Values[i];
    }

    batchBuilder.Append("Velocity", false,
        arrayBuilder => arrayBuilder.Float(builder => builder.AppendRange(results)));
}
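A minimal sketch of driving an Execute method like the one above. The RecordBatch.Builder fluent calls mirror the ones already used in these snippets; the sample values and the VelocityExecutor type hosting Execute are assumptions for illustration:

using Apache.Arrow;

float[] velocity = { 1f, 2f, 3f };
float[] force = { 10f, 20f, 30f };
float[] mass = { 2f, 4f, 6f };

// Build an input batch with the three float columns Execute reads.
RecordBatch input = new RecordBatch.Builder()
    .Append("Velocity", false, col => col.Float(b => b.AppendRange(velocity)))
    .Append("Force", false, col => col.Float(b => b.AppendRange(force)))
    .Append("Mass", false, col => col.Float(b => b.AppendRange(mass)))
    .Build();

// Execute appends its result column to this builder.
var outputBuilder = new RecordBatch.Builder();
new VelocityExecutor().Execute(input, outputBuilder); // hypothetical class containing Execute
RecordBatch output = outputBuilder.Build();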
private Tuple<ArrowRecordBatchFlatBufferBuilder, VectorOffset> PreparingWritingRecordBatch(RecordBatch recordBatch)
{
    Builder.Clear();

    // Serialize field nodes
    int fieldCount = Schema.Fields.Count;

    Flatbuf.RecordBatch.StartNodesVector(Builder, CountAllNodes());

    // flatbuffer struct vectors have to be created in reverse order
    for (int i = fieldCount - 1; i >= 0; i--)
    {
        CreateSelfAndChildrenFieldNodes(recordBatch.Column(i).Data);
    }

    VectorOffset fieldNodesVectorOffset = Builder.EndVector();

    // Serialize buffers
    var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();
    for (int i = 0; i < fieldCount; i++)
    {
        IArrowArray fieldArray = recordBatch.Column(i);
        fieldArray.Accept(recordBatchBuilder);
    }

    IReadOnlyList<ArrowRecordBatchFlatBufferBuilder.Buffer> buffers = recordBatchBuilder.Buffers;

    Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count);

    // flatbuffer struct vectors have to be created in reverse order
    for (int i = buffers.Count - 1; i >= 0; i--)
    {
        Flatbuf.Buffer.CreateBuffer(Builder, buffers[i].Offset, buffers[i].DataBuffer.Length);
    }

    return Tuple.Create(recordBatchBuilder, fieldNodesVectorOffset);
}
public void ReadNewRecordBatch(RecordBatch recordBatch)
{
    // Dispose the previous batch
    _currentBatch?.Dispose();
    _currentBatch = recordBatch;
    _currentIndex = -1;

    for (int i = 0; i < _columnDecoders.Count; i++)
    {
        _columnDecoders[i].NewBatch(recordBatch.Column(i));
    }
}
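For context, a decoder like this is usually fed from an Arrow IPC stream. A minimal sketch, assuming a decoder object exposing the ReadNewRecordBatch method above and an already-open inputStream:

using Apache.Arrow;
using Apache.Arrow.Ipc;

using var reader = new ArrowStreamReader(inputStream);

// Push each incoming batch into the decoder; it disposes the previous batch itself.
RecordBatch batch;
while ((batch = await reader.ReadNextRecordBatchAsync()) != null)
{
    decoder.ReadNewRecordBatch(batch);
    // ... read rows from the decoder here ...
}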
private static RecordBatch CountCharacters(RecordBatch records)
{
    int stringFieldIndex = records.Schema.GetFieldIndex("name");
    StringArray stringValues = records.Column(stringFieldIndex) as StringArray;

    int characterCount = 0;
    for (int i = 0; i < stringValues.Length; ++i)
    {
        string current = stringValues.GetString(i);
        characterCount += current.Length;
    }

    int groupFieldIndex = records.Schema.GetFieldIndex("age");
    Field groupField = records.Schema.GetFieldByIndex(groupFieldIndex);

    // Return 1 record, if we were given any. 0, otherwise.
    int returnLength = records.Length > 0 ? 1 : 0;

    return new RecordBatch(
        new Schema.Builder()
            .Field(f => f.Name(groupField.Name).DataType(groupField.DataType))
            .Field(f => f.Name("name_CharCount").DataType(Int32Type.Default))
            .Build(),
        new IArrowArray[]
        {
            records.Column(groupFieldIndex),
            new Int32Array(
                new ArrowBuffer.Builder<int>().Append(characterCount).Build(),
                ArrowBuffer.Empty,
                length: 1,
                nullCount: 0,
                offset: 0)
        },
        returnLength);
}
private protected async Task WriteRecordBatchInternalAsync(RecordBatch recordBatch,
    CancellationToken cancellationToken = default)
{
    // TODO: Truncate buffers with extraneous padding / unused capacity
    if (!HasWrittenSchema)
    {
        await WriteSchemaAsync(Schema, cancellationToken).ConfigureAwait(false);
        HasWrittenSchema = true;
    }

    Builder.Clear();

    // Serialize field nodes
    var fieldCount = Schema.Fields.Count;

    Flatbuf.RecordBatch.StartNodesVector(Builder, fieldCount);

    // flatbuffer struct vectors have to be created in reverse order
    for (var i = fieldCount - 1; i >= 0; i--)
    {
        var fieldArray = recordBatch.Column(i);
        Flatbuf.FieldNode.CreateFieldNode(Builder, fieldArray.Length, fieldArray.NullCount);
    }

    var fieldNodesVectorOffset = Builder.EndVector();

    // Serialize buffers
    var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();
    for (var i = 0; i < fieldCount; i++)
    {
        var fieldArray = recordBatch.Column(i);
        fieldArray.Accept(recordBatchBuilder);
    }

    var buffers = recordBatchBuilder.Buffers;

    Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count);

    // flatbuffer struct vectors have to be created in reverse order
    for (var i = buffers.Count - 1; i >= 0; i--)
    {
        Flatbuf.Buffer.CreateBuffer(Builder, buffers[i].Offset, buffers[i].DataBuffer.Length);
    }

    var buffersVectorOffset = Builder.EndVector();

    // Serialize record batch
    StartingWritingRecordBatch();

    var recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length,
        fieldNodesVectorOffset,
        buffersVectorOffset);

    long metadataLength = await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch,
        recordBatchOffset, recordBatchBuilder.TotalLength,
        cancellationToken).ConfigureAwait(false);

    // Write buffer data
    long bodyLength = 0;

    for (var i = 0; i < buffers.Count; i++)
    {
        ArrowBuffer buffer = buffers[i].DataBuffer;
        if (buffer.IsEmpty)
        {
            continue;
        }

        await WriteBufferAsync(buffer, cancellationToken).ConfigureAwait(false);

        int paddedLength = checked((int)BitUtility.RoundUpToMultipleOf8(buffer.Length));
        int padding = paddedLength - buffer.Length;
        if (padding > 0)
        {
            await WritePaddingAsync(padding).ConfigureAwait(false);
        }

        bodyLength += paddedLength;
    }

    // Write padding so the record batch message body length is a multiple of 8 bytes
    int bodyPaddingLength = CalculatePadding(bodyLength);
    await WritePaddingAsync(bodyPaddingLength).ConfigureAwait(false);

    FinishedWritingRecordBatch(bodyLength + bodyPaddingLength, metadataLength);
}
public async Task TestArrowGroupedMapCommandExecutor()
{
    StringArray ConvertStrings(StringArray strings)
    {
        return (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray());
    }

    Int64Array ConvertInt64s(Int64Array int64s)
    {
        return (Int64Array)ToArrowArray(
            Enumerable.Range(0, int64s.Length)
                .Select(i => int64s.Values[i] + 100)
                .ToArray());
    }

    Schema resultSchema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int64Type.Default))
        .Build();

    var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
        (batch) => new RecordBatch(
            resultSchema,
            new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
            batch.Length));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        Commands = new[] { command }
    };

    using (var inputStream = new MemoryStream())
    using (var outputStream = new MemoryStream())
    {
        int numRows = 10;

        // Write test data to the input stream.
        Schema schema = new Schema.Builder()
            .Field(b => b.Name("arg1").DataType(StringType.Default))
            .Field(b => b.Name("arg2").DataType(Int64Type.Default))
            .Build();

        var arrowWriter = new ArrowStreamWriter(inputStream, schema);
        await arrowWriter.WriteRecordBatchAsync(
            new RecordBatch(
                schema,
                new[]
                {
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => (long)i)
                            .ToArray())
                },
                numRows));

        inputStream.Seek(0, SeekOrigin.Begin);

        CommandExecutorStat stat = new CommandExecutor().Execute(
            inputStream,
            outputStream,
            0,
            commandPayload);

        // Validate that all the data on the stream is read.
        Assert.Equal(inputStream.Length, inputStream.Position);
        Assert.Equal(numRows, stat.NumEntriesProcessed);

        // Validate the output stream.
        outputStream.Seek(0, SeekOrigin.Begin);
        int arrowLength = SerDe.ReadInt32(outputStream);
        Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
        var arrowReader = new ArrowStreamReader(outputStream);
        RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

        Assert.Equal(numRows, outputBatch.Length);
        Assert.Equal(2, outputBatch.ColumnCount);

        var stringArray = (StringArray)outputBatch.Column(0);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal($"udf: {i}", stringArray.GetString(i));
        }

        var longArray = (Int64Array)outputBatch.Column(1);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal(100 + i, longArray.Values[i]);
        }

        int end = SerDe.ReadInt32(outputStream);
        Assert.Equal(0, end);

        // Validate all the data on the stream is read.
        Assert.Equal(outputStream.Length, outputStream.Position);
    }
}
protected virtual async Task<Block> WriteRecordBatchInternalAsync(RecordBatch recordBatch,
    CancellationToken cancellationToken = default)
{
    // TODO: Truncate buffers with extraneous padding / unused capacity
    if (!HasWrittenSchema)
    {
        await WriteSchemaAsync(Schema, cancellationToken).ConfigureAwait(false);
        HasWrittenSchema = true;
    }

    var recordBatchBuilder = new ArrowRecordBatchFlatBufferBuilder();

    Builder.Clear();

    // Serialize field nodes
    var fieldCount = Schema.Fields.Count;
    var fieldNodeOffsets = new Offset<Flatbuf.FieldNode>[fieldCount];

    Flatbuf.RecordBatch.StartNodesVector(Builder, fieldCount);

    for (var i = 0; i < fieldCount; i++)
    {
        var fieldArray = recordBatch.Column(i);
        fieldNodeOffsets[i] =
            Flatbuf.FieldNode.CreateFieldNode(Builder, fieldArray.Length, fieldArray.NullCount);
    }

    var fieldNodesVectorOffset = Builder.EndVector();

    // Serialize buffers
    for (var i = 0; i < fieldCount; i++)
    {
        var fieldArray = recordBatch.Column(i);
        fieldArray.Accept(recordBatchBuilder);
    }

    var buffers = recordBatchBuilder.Buffers;
    var bufferOffsets = new Offset<Flatbuf.Buffer>[buffers.Count];

    Flatbuf.RecordBatch.StartBuffersVector(Builder, buffers.Count);

    for (var i = buffers.Count - 1; i >= 0; i--)
    {
        bufferOffsets[i] = Flatbuf.Buffer.CreateBuffer(Builder,
            buffers[i].Offset, buffers[i].Length);
    }

    var buffersVectorOffset = Builder.EndVector();

    // Serialize record batch
    var recordBatchOffset = Flatbuf.RecordBatch.CreateRecordBatch(Builder, recordBatch.Length,
        fieldNodesVectorOffset,
        buffersVectorOffset);

    var metadataOffset = BaseStream.Position;

    await WriteMessageAsync(Flatbuf.MessageHeader.RecordBatch,
        recordBatchOffset, recordBatchBuilder.TotalLength,
        cancellationToken).ConfigureAwait(false);

    var metadataLength = BaseStream.Position - metadataOffset;

    // Write buffer data
    var lengthOffset = BaseStream.Position;

    for (var i = 0; i < buffers.Count; i++)
    {
        if (buffers[i].DataBuffer.IsEmpty)
        {
            continue;
        }

        await WriteBufferAsync(buffers[i].DataBuffer, cancellationToken).ConfigureAwait(false);
    }

    // Write padding so the record batch message body length is a multiple of 8 bytes
    var bodyLength = Convert.ToInt32(BaseStream.Position - lengthOffset);
    var bodyPaddingLength = CalculatePadding(bodyLength);

    await WritePaddingAsync(bodyPaddingLength).ConfigureAwait(false);

    return new Block(
        offset: Convert.ToInt32(metadataOffset),
        length: bodyLength + bodyPaddingLength,
        metadataLength: Convert.ToInt32(metadataLength));
}
public void Execute(RecordBatch batch, RecordBatch.Builder batchBuilder)
{
    batchBuilder.Append("Mass", false, batch.Column("Mass"));
}
public int FilterLandRegistryRecordsArrow()
{
    var recordCount = recordBatch.Length;
    var selectMask = new bool[recordCount];

    // Keep rows on or after 2019-01-01 (Date32 values are days since the Unix epoch).
    const long MillisecondsPerDay = 86400000;
    var dateFilter = (int)(DateTimeOffset.Parse("2019-01-01").ToUnixTimeMilliseconds() / MillisecondsPerDay);
    var dateValues = (recordBatch.Column(0) as Date32Array).Values;
    for (var i = 0; i < recordCount; i++)
    {
        selectMask[i] = dateValues[i] >= dateFilter;
    }

    // ... and priced above 5,000,000.
    var priceValues = (recordBatch.Column(1) as FloatArray).Values;
    for (var i = 0; i < recordCount; i++)
    {
        selectMask[i] = selectMask[i] && priceValues[i] > 5000000;
    }

    // Property type must be "D", "S", or "T". Every value in this column is a single
    // ASCII byte, so the raw Values buffer can be indexed directly by row.
    var stringEncoding = Encoding.ASCII;
    var propertyTypeFilter = new string[] { "D", "S", "T" }.Select(x => stringEncoding.GetBytes(x)[0]).ToArray();
    var propertyTypeValues = (recordBatch.Column(2) as StringArray).Values;
    for (var i = 0; i < recordCount; i++)
    {
        selectMask[i] = selectMask[i] && propertyTypeFilter.Contains(propertyTypeValues[i]);
    }

    // Tenure must be "F" (again a single ASCII byte per row).
    var tenureFilter = stringEncoding.GetBytes("F")[0];
    var tenureValues = (recordBatch.Column(3) as StringArray).Values;
    for (var i = 0; i < recordCount; i++)
    {
        selectMask[i] = selectMask[i] && tenureValues[i] == tenureFilter;
    }

    var itemCount = selectMask.Count(v => v);

#if LOG
    Console.WriteLine("Found {0} records", itemCount);
#endif

    return itemCount;
}
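The benchmark above assumes recordBatch is a field whose columns are, in order: a Date32Array of transfer dates, a FloatArray of prices, and two StringArrays holding single-character property-type and tenure codes. A minimal sketch of building such a batch; the field names and sample values are illustrative, and the Date32Array.Builder.Append(DateTime) overload is an assumption:

using System;
using Apache.Arrow;
using Apache.Arrow.Types;

var dates = new Date32Array.Builder()
    .Append(new DateTime(2019, 6, 1))
    .Append(new DateTime(2018, 3, 15))
    .Build();
var prices = new FloatArray.Builder().Append(7_500_000f).Append(250_000f).Build();
var propertyTypes = new StringArray.Builder().Append("D").Append("F").Build();
var tenures = new StringArray.Builder().Append("F").Append("L").Build();

var recordBatch = new RecordBatch(
    new Schema.Builder()
        .Field(f => f.Name("Date").DataType(Date32Type.Default))
        .Field(f => f.Name("Price").DataType(FloatType.Default))
        .Field(f => f.Name("PropertyType").DataType(StringType.Default))
        .Field(f => f.Name("Tenure").DataType(StringType.Default))
        .Build(),
    new IArrowArray[] { dates, prices, propertyTypes, tenures },
    2);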