/// <summary>
/// Initializes the builder, falling back to the process-wide default
/// allocator when none is supplied.
/// </summary>
/// <param name="allocator">Optional memory allocator; null selects <c>MemoryAllocator.Default.Value</c>.</param>
public Builder(MemoryAllocator allocator = default)
{
    MemoryAllocator chosen = allocator ?? MemoryAllocator.Default.Value;
    _allocator = chosen;
    _arrayBuilder = new ArrayBuilder(chosen);
    _schemaBuilder = new Schema.Builder();
    _arrays = new List<IArrowArray>();
}
// Compiles the given SQL against the schema read from the output buffer,
// derives the Arrow schema of the query result, and prepares the Arrow
// writer so the result schema is sent to the peer immediately.
public void getOutputSchemaInner(string sql)
{
    reader = new ArrowStreamReader(this.outputBuffer, leaveOpen: false);
    // Read one batch first so the reader materializes the Arrow schema.
    reader.ReadNextRecordBatch();
    this.schema = ArrowSchemaToASARecordSchema(reader.Schema);
    // Compile the SQL with a single streaming input named "input".
    var result = SqlCompiler.Compile(
        sql,
        new QueryBindings(
            new Dictionary<string, InputDescription> { { "input", new InputDescription(this.schema, InputType.Stream) } }));
    var step = result.Steps.First();
    // Rebuild an Arrow schema from the compiled step's payload schema,
    // ordered by column ordinal; all fields are marked non-nullable.
    Schema.Builder builder = new Schema.Builder();
    foreach (KeyValuePair<string, int> kv in step.Output.PayloadSchema.Ordinals.OrderBy(kv => kv.Value))
    {
        builder = builder.Field(f => f.Name(kv.Key).DataType(ASATypeToArrowType(step.Output.PayloadSchema[kv.Value].Schema)).Nullable(false));
    }
    this.outputArrowSchema = builder.Build();
    // NOTE(review): the writer targets this.inputBuffer while the reader used
    // this.outputBuffer — presumably the buffers are named from the peer's
    // perspective; confirm this pairing is intentional.
    this.writer = new ArrowStreamWriter(this.inputBuffer, this.outputArrowSchema);
    // Write an empty batch so the schema reaches the Java side immediately.
    var emptyRecordBatch = createOutputRecordBatch(new List<IRecord>());
    WriteRecordBatch(emptyRecordBatch);
}
/// <summary>
/// Builds a sample record batch with one column per supported primitive
/// Arrow type, each populated with <paramref name="length"/> rows.
/// </summary>
public static RecordBatch CreateSampleRecordBatch(int length)
{
    // One field per supported type, in this fixed order.
    IArrowType[] columnTypes =
    {
        BooleanType.Default,
        UInt8Type.Default,
        Int8Type.Default,
        UInt16Type.Default,
        Int16Type.Default,
        UInt32Type.Default,
        Int32Type.Default,
        UInt64Type.Default,
        Int64Type.Default,
        FloatType.Default,
        DoubleType.Default,
    };
    // Not yet exercised by this helper: DecimalType, HalfFloatType,
    // StringType, Date32/64Type, Time32/64Type, TimestampType.

    Schema.Builder builder = new Schema.Builder();
    foreach (IArrowType columnType in columnTypes)
    {
        builder.Field(CreateField(columnType));
    }

    Schema schema = builder.Build();
    IEnumerable<IArrowArray> arrays = CreateArrays(schema, length);
    return new RecordBatch(schema, arrays, length);
}
/// <summary>
/// Round-trips a record batch whose schema carries metadata at the schema,
/// field, and nested-struct-field levels.
/// </summary>
public void WritesMetadataCorrectly()
{
    var innerMetadata = new Dictionary<string, string>() { { "customInner", "1" }, { "customInner2", "3" } };
    var structType = new StructType(new[]
    {
        new Field("Inner1", FloatType.Default, nullable: false),
        new Field("Inner2", DoubleType.Default, nullable: true, innerMetadata)
    });

    // Schema-level metadata first, then the three fields in declaration order.
    Schema.Builder schemaBuilder = new Schema.Builder()
        .Metadata("index", "1, 2, 3, 4, 5")
        .Metadata("reverseIndex", "5, 4, 3, 2, 1");
    schemaBuilder = schemaBuilder.Field(f => f
        .Name("IntCol")
        .DataType(UInt32Type.Default)
        .Metadata("custom1", "false")
        .Metadata("custom2", "true"));
    schemaBuilder = schemaBuilder.Field(f => f
        .Name("StringCol")
        .DataType(StringType.Default)
        .Metadata("custom2", "false")
        .Metadata("custom3", "4"));
    schemaBuilder = schemaBuilder.Field(f => f
        .Name("StructCol")
        .DataType(structType)
        .Metadata("custom4", "6.4")
        .Metadata("custom1", "true"));

    var schema = schemaBuilder.Build();
    RecordBatch originalBatch = TestData.CreateSampleRecordBatch(schema, length: 10);

    TestRoundTripRecordBatch(originalBatch);
}
/// <summary>
/// Builds a sample record batch containing <paramref name="columnSetCount"/>
/// repetitions of one column per supported Arrow type, each with
/// <paramref name="length"/> rows.
/// </summary>
public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount)
{
    // One field of each supported type, added once per column set.
    IArrowType[] columnTypes =
    {
        BooleanType.Default,
        UInt8Type.Default,
        Int8Type.Default,
        UInt16Type.Default,
        Int16Type.Default,
        UInt32Type.Default,
        Int32Type.Default,
        UInt64Type.Default,
        Int64Type.Default,
        FloatType.Default,
        DoubleType.Default,
        Date32Type.Default,
        Date64Type.Default,
        TimestampType.Default,
        StringType.Default,
    };
    // Not yet exercised: FixedSizeBinaryType, DecimalType, HalfFloatType,
    // Time32/64Type.

    Schema.Builder builder = new Schema.Builder();
    for (int setIndex = 0; setIndex < columnSetCount; setIndex++)
    {
        foreach (IArrowType columnType in columnTypes)
        {
            builder.Field(CreateField(columnType, setIndex));
        }
    }

    Schema schema = builder.Build();
    IEnumerable<IArrowArray> arrays = CreateArrays(schema, length);
    return new RecordBatch(schema, arrays, length);
}
/// <summary>
/// A field built without an explicit DataType(...) call must default to
/// the Arrow null type.
/// </summary>
public void FieldsHaveNullTypeByDefault()
{
    var builder = new Schema.Builder();
    builder = builder.Field(f => f.Name("f0"));
    var schema = builder.Build();

    var actualType = schema.Fields["f0"].DataType.GetType();
    Assert.True(actualType == typeof(NullType));
}
/// <summary>
/// Building a field without calling Name(...) must throw.
/// </summary>
public void FieldNameIsRequired()
{
    Assert.Throws<ArgumentNullException>(() =>
    {
        var builder = new Schema.Builder();
        builder = builder.Field(f => f.DataType(Int32Type.Default));
        var schema = builder.Build();
    });
}
/// <summary>
/// Field indices must follow declaration order.
/// </summary>
public void GetFieldIndex()
{
    var schema = new Schema.Builder()
        .Field(f => f.Name("f0").DataType(Int32Type.Default))
        .Field(f => f.Name("f1").DataType(Int8Type.Default))
        .Build();

    bool indicesMatchDeclarationOrder =
        schema.GetFieldIndex("f0") == 0 && schema.GetFieldIndex("f1") == 1;
    Assert.True(indicesMatchDeclarationOrder);
}
// Converts a JSON-integration-test schema into an Arrow Schema, preserving
// field order.
private static Schema CreateSchema(JsonSchema jsonSchema)
{
    Schema.Builder builder = new Schema.Builder();
    foreach (var jsonField in jsonSchema.Fields)
    {
        builder.Field(f => CreateField(f, jsonField));
    }
    return builder.Build();
}
/// <summary>
/// A field built without an explicit Nullable(...) call must default to
/// nullable.
/// </summary>
public void FieldsAreNullableByDefault()
{
    // Fix: removed an unused second Schema.Builder local that was never read.
    var schema = new Schema.Builder()
        .Field(f => f.Name("f0").DataType(Int32Type.Default))
        .Build();

    Assert.True(schema.Fields["f0"].IsNullable);
}
// Converts the transport column descriptions into an Arrow Schema,
// preserving column order.
private Schema GetSchema(IImmutableList<Column> columns)
{
    var schemaBuilder = new Schema.Builder();
    foreach (var column in columns)
    {
        var field = new Field(column.Name, TypeConverter.Convert(column), column.IsNullable);
        schemaBuilder.Field(field);
    }
    return schemaBuilder.Build();
}
/// <summary>
/// A table built from a single record batch must expose the batch's data
/// as a chunked Int64 column.
/// </summary>
public void TestBuildFromRecordBatch()
{
    Field int64Field = new Field("A", Int64Type.Default, nullable: false);
    Schema schema = new Schema.Builder().Field(int64Field).Build();
    RecordBatch batch = TestData.CreateSampleRecordBatch(schema, 10);

    Table table = Table.TableFromRecordBatches(schema, new[] { batch });

    Assert.NotNull(table.Column(0).Data.Array(0) as Int64Array);
}
/// <summary>
/// Fields must be retrievable by name and resolve to the same instances
/// that were added.
/// </summary>
public void GetFieldByName()
{
    Field f0 = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build();
    Field f1 = new Field.Builder().Name("f1").DataType(Int8Type.Default).Build();
    var schema = new Schema.Builder().Field(f0).Field(f1).Build();

    bool bothFieldsResolved =
        schema.GetFieldByName("f0") == f0 && schema.GetFieldByName("f1") == f1;
    Assert.True(bothFieldsResolved);
}
/// <summary>
/// A field built with explicit name, type, and nullability must report
/// exactly those values.
/// </summary>
public void FieldsHaveExpectedValues(string name, IArrowType type, bool nullable)
{
    var builder = new Schema.Builder();
    builder = builder.Field(f => f.Name(name).DataType(type).Nullable(nullable));
    var schema = builder.Build();

    var field = schema.Fields[name];
    Assert.Equal(name, field.Name);
    Assert.Equal(type.Name, field.DataType.Name);
    Assert.Equal(nullable, field.IsNullable);
}
// Converts a Flatbuffers schema into an Arrow Schema by translating each
// field in order.
internal static Schema GetSchema(Flatbuf.Schema schema)
{
    var schemaBuilder = new Schema.Builder();
    int fieldCount = schema.FieldsLength;
    for (int index = 0; index < fieldCount; index++)
    {
        // Fields(i) returns a nullable struct; GetValueOrDefault unwraps it.
        Flatbuf.Field flatbufField = schema.Fields(index).GetValueOrDefault();
        schemaBuilder.Field(FieldFromFlatbuffer(flatbufField));
    }
    return schemaBuilder.Build();
}
// Converts a Flatbuffers schema into an Arrow Schema, mapping each
// Flatbuffers field to an Arrow Field in order.
internal static Schema GetSchema(Flatbuf.Schema schema)
{
    var schemaBuilder = new Schema.Builder();
    int fieldCount = schema.FieldsLength;
    for (var index = 0; index < fieldCount; index++)
    {
        // Fields(i) returns a nullable struct; GetValueOrDefault unwraps it.
        var flatbufField = schema.Fields(index).GetValueOrDefault();
        var convertedField = new Field(flatbufField.Name, GetFieldArrowType(flatbufField), flatbufField.Nullable);
        schemaBuilder.Field(convertedField);
    }
    return schemaBuilder.Build();
}
// Verifies that schema-level and field-level metadata round-trip through the
// builders with order preserved, and that SchemaComparer treats any metadata
// difference (schema- or field-level) as an inequality.
public void MetadataConstruction()
{
    var metadata0 = new Dictionary<string, string> { { "foo", "bar" }, { "bizz", "buzz" } };
    var metadata1 = new Dictionary<string, string> { { "foo", "bar" } };
    // Copies guard against the builders aliasing or mutating the originals.
    var metadata0Copy = new Dictionary<string, string>(metadata0);
    var metadata1Copy = new Dictionary<string, string>(metadata1);
    Field f0 = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build();
    Field f1 = new Field.Builder().Name("f1").DataType(UInt8Type.Default).Nullable(false).Build();
    Field f2 = new Field.Builder().Name("f2").DataType(StringType.Default).Build();
    // NOTE(review): f3 reuses the name "f2" — apparently deliberate so that
    // schema3 differs from schema2 only by this field's metadata; confirm.
    Field f3 = new Field.Builder().Name("f2").DataType(StringType.Default).Metadata(metadata1Copy).Build();

    var schema0 = new Schema.Builder()
        .Field(f0)
        .Field(f1)
        .Field(f2)
        .Metadata(metadata0)
        .Build();
    var schema1 = new Schema.Builder()
        .Field(f0)
        .Field(f1)
        .Field(f2)
        .Metadata(metadata1)
        .Build();
    var schema2 = new Schema.Builder()
        .Field(f0)
        .Field(f1)
        .Field(f2)
        .Metadata(metadata0Copy)
        .Build();
    var schema3 = new Schema.Builder()
        .Field(f0)
        .Field(f1)
        .Field(f3)
        .Metadata(metadata0Copy)
        .Build();

    // Metadata survives Build() with key/value order intact.
    Assert.True(metadata0.Keys.SequenceEqual(schema0.Metadata.Keys) && metadata0.Values.SequenceEqual(schema0.Metadata.Values));
    Assert.True(metadata1.Keys.SequenceEqual(schema1.Metadata.Keys) && metadata1.Values.SequenceEqual(schema1.Metadata.Values));
    Assert.True(metadata0.Keys.SequenceEqual(schema2.Metadata.Keys) && metadata0.Values.SequenceEqual(schema2.Metadata.Values));

    // Equal metadata compares clean; any difference raises EqualException.
    SchemaComparer.Compare(schema0, schema2);
    Assert.Throws<EqualException>(() => SchemaComparer.Compare(schema0, schema1));
    Assert.Throws<EqualException>(() => SchemaComparer.Compare(schema2, schema1));
    Assert.Throws<EqualException>(() => SchemaComparer.Compare(schema2, schema3));
}
/// <summary>
/// Builds a single-column table whose column is chunked into two equal-length
/// int arrays.
/// </summary>
public static Table MakeTableWithOneColumnOfTwoIntArrays(int lengthOfEachArray)
{
    Array firstChunk = ColumnTests.MakeIntArray(lengthOfEachArray);
    Array secondChunk = ColumnTests.MakeIntArray(lengthOfEachArray);
    Field field = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build();
    Schema s0 = new Schema.Builder().Field(field).Build();

    var chunks = new List<Array> { firstChunk, secondChunk };
    Column column = new Column(field, chunks);
    var columns = new List<Column> { column };
    return new Table(s0, columns);
}
/// <summary>
/// Builds a sample record batch with <paramref name="columnSetCount"/>
/// repetitions of one column per supported type (including nested list,
/// struct, and decimal types), optionally adding a dictionary-encoded column
/// per set.
/// </summary>
public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount, bool createDictionaryArray)
{
    Schema.Builder builder = new Schema.Builder();
    for (int setIndex = 0; setIndex < columnSetCount; setIndex++)
    {
        // All types for this set, in the fixed declaration order.
        var columnTypes = new List<IArrowType>
        {
            new ListType(Int64Type.Default),
            BooleanType.Default,
            UInt8Type.Default,
            Int8Type.Default,
            UInt16Type.Default,
            Int16Type.Default,
            UInt32Type.Default,
            Int32Type.Default,
            UInt64Type.Default,
            Int64Type.Default,
            FloatType.Default,
            DoubleType.Default,
            Date32Type.Default,
            Date64Type.Default,
            TimestampType.Default,
            StringType.Default,
            new StructType(new List<Field> { CreateField(StringType.Default, setIndex), CreateField(Int32Type.Default, setIndex) }),
            new Decimal128Type(10, 6),
            new Decimal256Type(16, 8),
        };
        if (createDictionaryArray)
        {
            columnTypes.Add(new DictionaryType(Int32Type.Default, StringType.Default, false));
        }
        // Not yet exercised: FixedSizeBinaryType, HalfFloatType, Time32/64Type.

        foreach (IArrowType columnType in columnTypes)
        {
            builder.Field(CreateField(columnType, setIndex));
        }
    }

    Schema schema = builder.Build();
    return CreateSampleRecordBatch(schema, length);
}
// From version 3.0.0 onward, wraps all of a batch's columns into a single
// non-nullable struct column named "Struct"; older versions pass the batch
// through unchanged.
private RecordBatch WrapColumnsInStructIfApplicable(RecordBatch batch)
{
    if (_version < new Version(Versions.V3_0_0))
    {
        return batch;
    }

    int fieldCount = batch.Schema.Fields.Count;
    var fields = new Field[fieldCount];
    for (int i = 0; i < fieldCount; ++i)
    {
        fields[i] = batch.Schema.GetFieldByIndex(i);
    }

    var structType = new StructType(fields);
    var structArray = new StructArray(
        structType,
        batch.Length,
        batch.Arrays.Cast<Apache.Arrow.Array>(),
        ArrowBuffer.Empty);
    Schema schema = new Schema.Builder().Field(new Field("Struct", structType, false)).Build();
    return new RecordBatch(schema, new[] { structArray }, batch.Length);
}
// Builds a non-nullable result schema from the result columns: a single
// column is named "Result"; multiple columns are named "Result0",
// "Result1", ... by position.
private static Schema BuildSchema(IArrowArray[] resultColumns)
{
    var schemaBuilder = new Schema.Builder();
    if (resultColumns.Length == 1)
    {
        IArrowType onlyType = resultColumns[0].Data.DataType;
        schemaBuilder = schemaBuilder.Field(f => f
            .Name("Result")
            .DataType(onlyType)
            .Nullable(false));
    }
    else
    {
        for (int i = 0; i < resultColumns.Length; ++i)
        {
            int columnIndex = i;
            schemaBuilder = schemaBuilder.Field(f => f
                .Name("Result" + columnIndex)
                .DataType(resultColumns[columnIndex].Data.DataType)
                .Nullable(false));
        }
    }
    return schemaBuilder.Build();
}
// Builds the FlightInfo response for a SQL command: resolves partitions,
// converts the result columns into an Arrow schema, and maps every partition
// location to a gRPC endpoint carrying the partition's SQL as its ticket.
private FlightInfo GetFlightInfo(string sql, ServerCallContext context)
{
    // NOTE(review): blocking on .Result inside a gRPC handler risks
    // thread-pool starvation; consider making this method async and awaiting.
    var partitionsResult = _koraliumTransportService.GetPartitions(IsPartitionsEnabled(context), sql, new Shared.SqlParameters(), context.GetHttpContext()).Result;
    var schemaBuilder = new Schema.Builder();
    foreach (var column in partitionsResult.Columns)
    {
        schemaBuilder.Field(new Field(column.Name, TypeConverter.Convert(column), column.IsNullable));
    }
    // The SQL text itself serves as the flight command descriptor.
    var descriptor = FlightDescriptor.CreateCommandDescriptor(sql);
    List<FlightEndpoint> endpoints = new List<FlightEndpoint>();
    foreach (var partition in partitionsResult.Partitions)
    {
        List<FlightLocation> locations = new List<FlightLocation>();
        foreach (var location in partition.Locations)
        {
            string uri = null;
            // Scheme depends on whether the location requires TLS.
            if (location.Tls)
            {
                uri = $"grpc+tls://{location.Host}";
            }
            else
            {
                uri = $"grpc+tcp://{location.Host}";
            }
            locations.Add(new FlightLocation(uri));
        }
        // Each partition is fetched via its own SQL ticket.
        endpoints.Add(new FlightEndpoint(new FlightTicket(partition.Sql), locations));
    }
    return (new FlightInfo(schemaBuilder.Build(), descriptor, endpoints));
}
// Round-trips a record batch containing a list-of-structs column to verify
// nested-type IPC serialization.
public void TestListOfStructArray()
{
    Schema.Builder builder = new Schema.Builder();
    // struct<name: utf8, age: int64>, wrapped in a non-nullable list field.
    Field structField = new Field(
        "struct",
        new StructType(
            new[]
            {
                new Field("name", StringType.Default, nullable: false),
                new Field("age", Int64Type.Default, nullable: false),
            }),
        nullable: false);
    Field listField = new Field("listOfStructs", new ListType(structField), nullable: false);
    builder.Field(listField);
    Schema schema = builder.Build();

    // Six struct children; index 2 is null at the struct level (validity
    // bitmap below), and the child arrays also carry their own nulls.
    StringArray stringArray = new StringArray.Builder()
        .Append("joe").AppendNull().AppendNull().Append("mark").Append("abe").Append("phil").Build();
    Int64Array intArray = new Int64Array.Builder()
        .Append(1).Append(2).AppendNull().Append(4).Append(10).Append(55).Build();
    ArrowBuffer nullBitmapBuffer = new ArrowBuffer.BitmapBuilder()
        .Append(true).Append(true).Append(false).Append(true).Append(true).Append(true).Build();
    StructArray structs = new StructArray(structField.DataType, 6, new IArrowArray[] { stringArray, intArray }, nullBitmapBuffer, nullCount: 1);

    // Three lists with offsets [0, 2, 5, 6]: lengths 2, 3, and 1.
    ArrowBuffer offsetsBuffer = new ArrowBuffer.Builder<int>()
        .Append(0).Append(2).Append(5).Append(6).Build();
    ListArray listArray = new ListArray(listField.DataType, 3, offsetsBuffer, structs, ArrowBuffer.Empty);

    RecordBatch batch = new RecordBatch(schema, new[] { listArray }, 3);
    TestRoundTripRecordBatch(batch);
}
// End-to-end test of the grouped-map Arrow command executor: streams one
// record batch through a grouped-map UDF and validates the transformed
// output batch (strings prefixed with "udf: ", int64s incremented by 100).
public async Task TestArrowGroupedMapCommandExecutor()
{
    // Local UDF helper: prefixes every string with "udf: ".
    StringArray ConvertStrings(StringArray strings)
    {
        return (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray());
    }

    // Local UDF helper: adds 100 to every int64 value.
    Int64Array ConvertInt64s(Int64Array int64s)
    {
        return (Int64Array)ToArrowArray(
            Enumerable.Range(0, int64s.Length)
                .Select(i => int64s.Values[i] + 100)
                .ToArray());
    }

    Schema resultSchema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int64Type.Default))
        .Build();

    // The grouped-map UDF receives a whole batch and returns a new batch.
    var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
        (batch) => new RecordBatch(
            resultSchema,
            new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
            batch.Length));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        Commands = new[] { command }
    };

    using (var inputStream = new MemoryStream())
    using (var outputStream = new MemoryStream())
    {
        int numRows = 10;

        // Write test data to the input stream.
        Schema schema = new Schema.Builder()
            .Field(b => b.Name("arg1").DataType(StringType.Default))
            .Field(b => b.Name("arg2").DataType(Int64Type.Default))
            .Build();
        var arrowWriter = new ArrowStreamWriter(inputStream, schema);
        await arrowWriter.WriteRecordBatchAsync(
            new RecordBatch(
                schema,
                new[]
                {
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => (long)i)
                            .ToArray())
                },
                numRows));
        inputStream.Seek(0, SeekOrigin.Begin);

        CommandExecutorStat stat = new CommandExecutor().Execute(
            inputStream,
            outputStream,
            0,
            commandPayload);

        // Validate that all the data on the stream is read.
        Assert.Equal(inputStream.Length, inputStream.Position);
        Assert.Equal(numRows, stat.NumEntriesProcessed);

        // Validate the output stream: a START_ARROW_STREAM marker followed
        // by an Arrow stream holding the UDF's output batch.
        outputStream.Seek(0, SeekOrigin.Begin);
        int arrowLength = SerDe.ReadInt32(outputStream);
        Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
        var arrowReader = new ArrowStreamReader(outputStream);
        RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

        Assert.Equal(numRows, outputBatch.Length);
        Assert.Equal(2, outputBatch.ColumnCount);

        var stringArray = (StringArray)outputBatch.Column(0);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal($"udf: {i}", stringArray.GetString(i));
        }

        var longArray = (Int64Array)outputBatch.Column(1);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal(100 + i, longArray.Values[i]);
        }

        // A trailing zero marks end-of-stream.
        int end = SerDe.ReadInt32(outputStream);
        Assert.Equal(0, end);

        // Validate all the data on the stream is read.
        Assert.Equal(outputStream.Length, outputStream.Position);
    }
}
// Verifies the scalar Arrow command executor handles an input stream that
// carries a schema but zero record batches, still emitting a valid (empty)
// Arrow output stream with the expected single string column.
public void TestArrowSqlCommandExecutorWithEmptyInput()
{
    var udfWrapper = new Sql.ArrowUdfWrapper<StringArray, StringArray>(
        (strings) => (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray()));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command }
    };

    using (var inputStream = new MemoryStream())
    using (var outputStream = new MemoryStream())
    {
        // Write test data to the input stream.
        Schema schema = new Schema.Builder()
            .Field(b => b.Name("arg1").DataType(StringType.Default))
            .Build();
        var arrowWriter = new ArrowStreamWriter(inputStream, schema);

        // The .NET ArrowStreamWriter doesn't currently support writing just a
        // schema with no batches - but Java does. We use Reflection to simulate
        // the request Spark sends.
        // NOTE(review): the Task returned by the reflected WriteSchemaAsync is
        // not awaited; this works because MemoryStream completes synchronously,
        // but it would be fragile with a truly asynchronous stream.
        MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
            "WriteSchemaAsync",
            BindingFlags.NonPublic | BindingFlags.Instance);
        writeSchemaMethod.Invoke(
            arrowWriter,
            new object[] { schema, CancellationToken.None });

        SerDe.Write(inputStream, 0);
        inputStream.Seek(0, SeekOrigin.Begin);

        CommandExecutorStat stat = new CommandExecutor().Execute(
            inputStream,
            outputStream,
            0,
            commandPayload);

        // Validate that all the data on the stream is read.
        Assert.Equal(inputStream.Length, inputStream.Position);
        Assert.Equal(0, stat.NumEntriesProcessed);

        // Validate the output stream.
        outputStream.Seek(0, SeekOrigin.Begin);
        int arrowLength = SerDe.ReadInt32(outputStream);
        Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
        var arrowReader = new ArrowStreamReader(outputStream);
        RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();

        // The output mirrors the input schema but holds no rows.
        Assert.Equal(1, outputBatch.Schema.Fields.Count);
        Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);
        Assert.Equal(0, outputBatch.Length);
        Assert.Single(outputBatch.Arrays);

        var array = (StringArray)outputBatch.Arrays.ElementAt(0);
        Assert.Equal(0, array.Length);

        int end = SerDe.ReadInt32(outputStream);
        Assert.Equal(0, end);

        // Validate all the data on the stream is read.
        Assert.Equal(outputStream.Length, outputStream.Position);
    }
}
// Verifies the scalar Arrow command executor with two chained commands — a
// string UDF on column 0 and a two-argument int UDF on columns 1 and 2 —
// producing one output column per command.
public async Task TestArrowSqlCommandExecutorWithMultiCommands()
{
    var udfWrapper1 = new Sql.ArrowUdfWrapper<StringArray, StringArray>(
        (strings) => (StringArray)ToArrowArray(
            Enumerable.Range(0, strings.Length)
                .Select(i => $"udf: {strings.GetString(i)}")
                .ToArray()));

    var udfWrapper2 = new Sql.ArrowUdfWrapper<Int32Array, Int32Array, Int32Array>(
        (arg1, arg2) => (Int32Array)ToArrowArray(
            Enumerable.Range(0, arg1.Length)
                .Select(i => arg1.Values[i] * arg2.Values[i])
                .ToArray()));

    var command1 = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper1.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var command2 = new SqlCommand()
    {
        ArgOffsets = new[] { 1, 2 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper2.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command1, command2 }
    };

    using (var inputStream = new MemoryStream())
    using (var outputStream = new MemoryStream())
    {
        int numRows = 10;

        // Write test data to the input stream.
        Schema schema = new Schema.Builder()
            .Field(b => b.Name("arg1").DataType(StringType.Default))
            .Field(b => b.Name("arg2").DataType(Int32Type.Default))
            .Field(b => b.Name("arg3").DataType(Int32Type.Default))
            .Build();
        var arrowWriter = new ArrowStreamWriter(inputStream, schema);
        await arrowWriter.WriteRecordBatchAsync(
            new RecordBatch(
                schema,
                new[]
                {
                    ToArrowArray(
                        Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                    ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                    ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                },
                numRows));
        inputStream.Seek(0, SeekOrigin.Begin);

        CommandExecutorStat stat = new CommandExecutor().Execute(
            inputStream,
            outputStream,
            0,
            commandPayload);

        // Validate all the data on the stream is read.
        Assert.Equal(inputStream.Length, inputStream.Position);
        Assert.Equal(numRows, stat.NumEntriesProcessed);

        // Validate the output stream.
        outputStream.Seek(0, SeekOrigin.Begin);
        var arrowLength = SerDe.ReadInt32(outputStream);
        Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
        var arrowReader = new ArrowStreamReader(outputStream);
        RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

        Assert.Equal(numRows, outputBatch.Length);
        // One output column per command.
        Assert.Equal(2, outputBatch.Arrays.Count());
        var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
        var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
        for (int i = 0; i < numRows; ++i)
        {
            Assert.Equal($"udf: {i}", array1.GetString(i));
            Assert.Equal(i * i, array2.Values[i]);
        }

        int end = SerDe.ReadInt32(outputStream);
        Assert.Equal(0, end);

        // Validate all the data on the stream is read.
        Assert.Equal(outputStream.Length, outputStream.Position);
    }
}
/// <summary>
/// Builds the supplied schema builder and assigns the result to the
/// producer's schema; returns this builder to allow chaining.
/// </summary>
public Builder SetSchema(Schema.Builder builderForValue)
{
    Schema builtSchema = builderForValue.Build();
    _producer.Schema = builtSchema;
    return this;
}
// Verifies the DataFrame-based scalar command executor for a single string
// UDF under the given Spark version and IPC options.
public async Task TestDataFrameSqlCommandExecutorWithSingleCommand(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"udf: {cur}"));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();
    int numRows = 10;

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray())
            },
            numRows));
    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

    Assert.Equal(numRows, outputBatch.Length);
    Assert.Single(outputBatch.Arrays);
    var array = (StringArray)outputBatch.Arrays.ElementAt(0);
    // Validate the single command.
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", array.GetString(i));
    }

    CheckEOS(outputStream, ipcOptions);

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
// Builds a record batch with one column per supported primitive type, wraps
// all of its columns into a single struct column, converts that to a
// DataFrame, and checks the round trip back to record batches (columns come
// back flattened with a "Struct_" name prefix).
public void TestRecordBatchWithStructArrays()
{
    // Local factory so the same batch can be rebuilt with prefixed column
    // names for the post-round-trip comparison.
    RecordBatch CreateRecordBatch(string prependColumnNamesWith = "")
    {
        RecordBatch ret = new RecordBatch.Builder()
            .Append(prependColumnNamesWith + "Column1", false,
                col => col.Int32(array => array.AppendRange(Enumerable.Range(0, 10))))
            // Null bitmap 0xfd 0xff: only row 1 is null.
            .Append(prependColumnNamesWith + "Column2", true, new Int32Array(
                valueBuffer: new ArrowBuffer.Builder<int>().AppendRange(Enumerable.Range(0, 10)).Build(),
                nullBitmapBuffer: new ArrowBuffer.Builder<byte>().Append(0xfd).Append(0xff).Build(),
                length: 10,
                nullCount: 1,
                offset: 0))
            // All-zero bitmap: every row is null.
            .Append(prependColumnNamesWith + "Column3", true, new Int32Array(
                valueBuffer: new ArrowBuffer.Builder<int>().AppendRange(Enumerable.Range(0, 10)).Build(),
                nullBitmapBuffer: new ArrowBuffer.Builder<byte>().Append(0x00).Append(0x00).Build(),
                length: 10,
                nullCount: 10,
                offset: 0))
            .Append(prependColumnNamesWith + "NullableBooleanColumn", true, new BooleanArray(
                valueBuffer: new ArrowBuffer.Builder<byte>().Append(0xfd).Append(0xff).Build(),
                nullBitmapBuffer: new ArrowBuffer.Builder<byte>().Append(0xed).Append(0xff).Build(),
                length: 10,
                nullCount: 2,
                offset: 0))
            .Append(prependColumnNamesWith + "StringDataFrameColumn", false,
                new StringArray.Builder().AppendRange(Enumerable.Range(0, 10).Select(x => x.ToString())).Build())
            .Append(prependColumnNamesWith + "DoubleColumn", false,
                new DoubleArray.Builder().AppendRange(Enumerable.Repeat(1.0, 10)).Build())
            .Append(prependColumnNamesWith + "FloatColumn", false,
                new FloatArray.Builder().AppendRange(Enumerable.Repeat(1.0f, 10)).Build())
            .Append(prependColumnNamesWith + "ShortColumn", false,
                new Int16Array.Builder().AppendRange(Enumerable.Repeat((short)1, 10)).Build())
            .Append(prependColumnNamesWith + "LongColumn", false,
                new Int64Array.Builder().AppendRange(Enumerable.Repeat((long)1, 10)).Build())
            .Append(prependColumnNamesWith + "UIntColumn", false,
                new UInt32Array.Builder().AppendRange(Enumerable.Repeat((uint)1, 10)).Build())
            .Append(prependColumnNamesWith + "UShortColumn", false,
                new UInt16Array.Builder().AppendRange(Enumerable.Repeat((ushort)1, 10)).Build())
            .Append(prependColumnNamesWith + "ULongColumn", false,
                new UInt64Array.Builder().AppendRange(Enumerable.Repeat((ulong)1, 10)).Build())
            .Append(prependColumnNamesWith + "ByteColumn", false,
                new Int8Array.Builder().AppendRange(Enumerable.Repeat((sbyte)1, 10)).Build())
            .Append(prependColumnNamesWith + "UByteColumn", false,
                new UInt8Array.Builder().AppendRange(Enumerable.Repeat((byte)1, 10)).Build())
            .Build();
        return ret;
    }

    RecordBatch originalBatch = CreateRecordBatch();

    // Every struct row is valid.
    ArrowBuffer.BitmapBuilder validityBitmapBuilder = new ArrowBuffer.BitmapBuilder();
    for (int i = 0; i < originalBatch.Length; i++)
    {
        validityBitmapBuilder.Append(true);
    }
    ArrowBuffer validityBitmap = validityBitmapBuilder.Build();

    StructType structType = new StructType(originalBatch.Schema.Fields.Select((KeyValuePair<string, Field> pair) => pair.Value).ToList());
    StructArray structArray = new StructArray(structType, originalBatch.Length, originalBatch.Arrays.Cast<Apache.Arrow.Array>(), validityBitmap);
    Schema schema = new Schema.Builder().Field(new Field("Struct", structType, false)).Build();
    RecordBatch recordBatch = new RecordBatch(schema, new[] { structArray }, originalBatch.Length);

    DataFrame df = DataFrame.FromArrowRecordBatch(recordBatch);
    DataFrameIOTests.VerifyColumnTypes(df, testArrowStringColumn: true);

    IEnumerable<RecordBatch> recordBatches = df.ToArrowRecordBatches();
    // Columns converted back from the DataFrame carry a "Struct_" prefix.
    RecordBatch expected = CreateRecordBatch("Struct_");
    foreach (RecordBatch batch in recordBatches)
    {
        RecordBatchComparer.CompareBatches(expected, batch);
    }
}
/// <summary>
/// Builds the supplied schema builder and assigns the result to the
/// subscribe message's schema; returns this builder to allow chaining.
/// </summary>
public Builder SetSchema(Schema.Builder builderForValue)
{
    Schema builtSchema = builderForValue.Build();
    _subscribe.Schema = builtSchema;
    return this;
}