public Builder(MemoryAllocator allocator = default)
 {
     _allocator     = allocator ?? MemoryAllocator.Default.Value;
     _arrayBuilder  = new ArrayBuilder(_allocator);
     _schemaBuilder = new Schema.Builder();
     _arrays        = new List <IArrowArray>();
 }
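For orientation, nearly every example on this page drives the same fluent Schema.Builder API. A minimal sketch of the shared pattern (the field names here are illustrative, not taken from any example below):

// Assumes: using Apache.Arrow; using Apache.Arrow.Types;
Schema schema = new Schema.Builder()
    .Field(f => f.Name("id").DataType(Int32Type.Default).Nullable(false))
    .Field(f => f.Name("name").DataType(StringType.Default))
    .Build();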
Example 2
    public void getOutputSchemaInner(string sql)
    {
        reader = new ArrowStreamReader(this.outputBuffer, leaveOpen: false);
        // Read one batch first to get the Arrow schema.
        reader.ReadNextRecordBatch();

        this.schema = ArrowSchemaToASARecordSchema(reader.Schema);

        var result =
            SqlCompiler.Compile(
                sql,
                new QueryBindings(
                    new Dictionary <string, InputDescription> {
            { "input", new InputDescription(this.schema, InputType.Stream) }
        }));

        var step = result.Steps.First();
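        // The loop below rebuilds the output Arrow schema from the compiled step's payload schema, one field per ordinal.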

        Schema.Builder builder = new Schema.Builder();
        foreach (KeyValuePair <string, int> kv in step.Output.PayloadSchema.Ordinals.OrderBy(kv => kv.Value))
        {
            builder = builder.Field(f => f.Name(kv.Key).DataType(ASATypeToArrowType(step.Output.PayloadSchema[kv.Value].Schema)).Nullable(false));
        }

        this.outputArrowSchema = builder.Build();

        this.writer = new ArrowStreamWriter(this.inputBuffer, this.outputArrowSchema);
        // Write an empty batch to send the schema to the Java side.
        var emptyRecordBatch = createOutputRecordBatch(new List <IRecord>());

        WriteRecordBatch(emptyRecordBatch);
    }
Example 3
        public static RecordBatch CreateSampleRecordBatch(int length)
        {
            Schema.Builder builder = new Schema.Builder();
            builder.Field(CreateField(BooleanType.Default));
            builder.Field(CreateField(UInt8Type.Default));
            builder.Field(CreateField(Int8Type.Default));
            builder.Field(CreateField(UInt16Type.Default));
            builder.Field(CreateField(Int16Type.Default));
            builder.Field(CreateField(UInt32Type.Default));
            builder.Field(CreateField(Int32Type.Default));
            builder.Field(CreateField(UInt64Type.Default));
            builder.Field(CreateField(Int64Type.Default));
            builder.Field(CreateField(FloatType.Default));
            builder.Field(CreateField(DoubleType.Default));
            //builder.Field(CreateField(new DecimalType(19, 2)));
            //builder.Field(CreateField(HalfFloatType.Default));
            //builder.Field(CreateField(StringType.Default));
            //builder.Field(CreateField(Date32Type.Default));
            //builder.Field(CreateField(Date64Type.Default));
            //builder.Field(CreateField(Time32Type.Default));
            //builder.Field(CreateField(Time64Type.Default));
            //builder.Field(CreateField(TimestampType.Default));

            Schema schema = builder.Build();

            IEnumerable <IArrowArray> arrays = CreateArrays(schema, length);

            return(new RecordBatch(schema, arrays, length));
        }
        public void WritesMetadataCorrectly()
        {
            Schema.Builder schemaBuilder = new Schema.Builder()
                                           .Metadata("index", "1, 2, 3, 4, 5")
                                           .Metadata("reverseIndex", "5, 4, 3, 2, 1")
                                           .Field(f => f
                                                  .Name("IntCol")
                                                  .DataType(UInt32Type.Default)
                                                  .Metadata("custom1", "false")
                                                  .Metadata("custom2", "true"))
                                           .Field(f => f
                                                  .Name("StringCol")
                                                  .DataType(StringType.Default)
                                                  .Metadata("custom2", "false")
                                                  .Metadata("custom3", "4"))
                                           .Field(f => f
                                                  .Name("StructCol")
                                                  .DataType(new StructType(new[] {
                new Field("Inner1", FloatType.Default, nullable: false),
                new Field("Inner2", DoubleType.Default, nullable: true, new Dictionary <string, string>()
                {
                    { "customInner", "1" }, { "customInner2", "3" }
                })
            }))
                                                  .Metadata("custom4", "6.4")
                                                  .Metadata("custom1", "true"));

            var         schema        = schemaBuilder.Build();
            RecordBatch originalBatch = TestData.CreateSampleRecordBatch(schema, length: 10);

            TestRoundTripRecordBatch(originalBatch);
        }
Example 5
        public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount)
        {
            Schema.Builder builder = new Schema.Builder();
            for (int i = 0; i < columnSetCount; i++)
            {
                builder.Field(CreateField(BooleanType.Default, i));
                builder.Field(CreateField(UInt8Type.Default, i));
                builder.Field(CreateField(Int8Type.Default, i));
                builder.Field(CreateField(UInt16Type.Default, i));
                builder.Field(CreateField(Int16Type.Default, i));
                builder.Field(CreateField(UInt32Type.Default, i));
                builder.Field(CreateField(Int32Type.Default, i));
                builder.Field(CreateField(UInt64Type.Default, i));
                builder.Field(CreateField(Int64Type.Default, i));
                builder.Field(CreateField(FloatType.Default, i));
                builder.Field(CreateField(DoubleType.Default, i));
                builder.Field(CreateField(Date32Type.Default, i));
                builder.Field(CreateField(Date64Type.Default, i));
                builder.Field(CreateField(TimestampType.Default, i));
                builder.Field(CreateField(StringType.Default, i));
                //builder.Field(CreateField(new FixedSizeBinaryType(16), i));
                //builder.Field(CreateField(new DecimalType(19, 2)));
                //builder.Field(CreateField(HalfFloatType.Default));
                //builder.Field(CreateField(StringType.Default));
                //builder.Field(CreateField(Time32Type.Default));
                //builder.Field(CreateField(Time64Type.Default));
            }

            Schema schema = builder.Build();

            IEnumerable <IArrowArray> arrays = CreateArrays(schema, length);

            return(new RecordBatch(schema, arrays, length));
        }
Example 6
            public void FieldsHaveNullTypeByDefault()
            {
                var schema = new Schema.Builder()
                             .Field(f => f.Name("f0"))
                             .Build();

                Assert.True(schema.Fields["f0"].DataType.GetType() == typeof(NullType));
            }
Example 7
 public void FieldNameIsRequired()
 {
     Assert.Throws <ArgumentNullException>(() =>
     {
         var schema = new Schema.Builder()
                      .Field(f => f.DataType(Int32Type.Default))
                      .Build();
     });
 }
            public void GetFieldIndex()
            {
                var schema = new Schema.Builder()
                             .Field(f => f.Name("f0").DataType(Int32Type.Default))
                             .Field(f => f.Name("f1").DataType(Int8Type.Default))
                             .Build();

                Assert.True(schema.GetFieldIndex("f0") == 0 && schema.GetFieldIndex("f1") == 1);
            }
 private static Schema CreateSchema(JsonSchema jsonSchema)
 {
     Schema.Builder builder = new Schema.Builder();
     for (int i = 0; i < jsonSchema.Fields.Count; i++)
     {
         builder.Field(f => CreateField(f, jsonSchema.Fields[i]));
     }
     return(builder.Build());
 }
Example 10
            public void FieldsAreNullableByDefault()
            {
                var schema = new Schema.Builder()
                             .Field(f => f.Name("f0").DataType(Int32Type.Default))
                             .Build();

                Assert.True(schema.Fields["f0"].IsNullable);
            }
Example 11
        private Schema GetSchema(IImmutableList <Column> columns)
        {
            var schemaBuilder = new Schema.Builder();

            foreach (var column in columns)
            {
                schemaBuilder.Field(new Field(column.Name, TypeConverter.Convert(column), column.IsNullable));
            }

            return(schemaBuilder.Build());
        }
Example 12
        public void TestBuildFromRecordBatch()
        {
            Schema.Builder builder = new Schema.Builder();
            builder.Field(new Field("A", Int64Type.Default, nullable: false));
            Schema schema = builder.Build();

            RecordBatch batch = TestData.CreateSampleRecordBatch(schema, 10);
            Table       table = Table.TableFromRecordBatches(schema, new[] { batch });

            Assert.NotNull(table.Column(0).Data.Array(0) as Int64Array);
        }
Example 13
            public void GetFieldByName()
            {
                Field f0 = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build();
                Field f1 = new Field.Builder().Name("f1").DataType(Int8Type.Default).Build();

                var schema = new Schema.Builder()
                             .Field(f0)
                             .Field(f1)
                             .Build();

                Assert.True(schema.GetFieldByName("f0") == f0 && schema.GetFieldByName("f1") == f1);
            }
Example 14
            public void FieldsHaveExpectedValues(string name, IArrowType type, bool nullable)
            {
                var schema = new Schema.Builder()
                             .Field(f => f.Name(name).DataType(type).Nullable(nullable))
                             .Build();

                var field = schema.Fields[name];

                Assert.Equal(name, field.Name);
                Assert.Equal(type.Name, field.DataType.Name);
                Assert.Equal(nullable, field.IsNullable);
            }
Example 15
        internal static Schema GetSchema(Flatbuf.Schema schema)
        {
            var schemaBuilder = new Schema.Builder();

            for (int i = 0; i < schema.FieldsLength; i++)
            {
                Flatbuf.Field field = schema.Fields(i).GetValueOrDefault();

                schemaBuilder.Field(FieldFromFlatbuffer(field));
            }

            return(schemaBuilder.Build());
        }
Example 16
        internal static Schema GetSchema(Flatbuf.Schema schema)
        {
            var schemaBuilder = new Schema.Builder();

            for (var i = 0; i < schema.FieldsLength; i++)
            {
                var field = schema.Fields(i).GetValueOrDefault();

                schemaBuilder.Field(
                    new Field(field.Name, GetFieldArrowType(field), field.Nullable));
            }

            return(schemaBuilder.Build());
        }
Example 17
            public void MetadataConstruction()
            {
                var metadata0 = new Dictionary <string, string> {
                    { "foo", "bar" }, { "bizz", "buzz" }
                };
                var metadata1 = new Dictionary <string, string> {
                    { "foo", "bar" }
                };
                var   metadata0Copy = new Dictionary <string, string>(metadata0);
                var   metadata1Copy = new Dictionary <string, string>(metadata1);
                Field f0            = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build();
                Field f1            = new Field.Builder().Name("f1").DataType(UInt8Type.Default).Nullable(false).Build();
                Field f2            = new Field.Builder().Name("f2").DataType(StringType.Default).Build();
                Field f3            = new Field.Builder().Name("f2").DataType(StringType.Default).Metadata(metadata1Copy).Build();

                var schema0 = new Schema.Builder()
                              .Field(f0)
                              .Field(f1)
                              .Field(f2)
                              .Metadata(metadata0)
                              .Build();
                var schema1 = new Schema.Builder()
                              .Field(f0)
                              .Field(f1)
                              .Field(f2)
                              .Metadata(metadata1)
                              .Build();
                var schema2 = new Schema.Builder()
                              .Field(f0)
                              .Field(f1)
                              .Field(f2)
                              .Metadata(metadata0Copy)
                              .Build();
                var schema3 = new Schema.Builder()
                              .Field(f0)
                              .Field(f1)
                              .Field(f3)
                              .Metadata(metadata0Copy)
                              .Build();

                Assert.True(metadata0.Keys.SequenceEqual(schema0.Metadata.Keys) && metadata0.Values.SequenceEqual(schema0.Metadata.Values));
                Assert.True(metadata1.Keys.SequenceEqual(schema1.Metadata.Keys) && metadata1.Values.SequenceEqual(schema1.Metadata.Values));
                Assert.True(metadata0.Keys.SequenceEqual(schema2.Metadata.Keys) && metadata0.Values.SequenceEqual(schema2.Metadata.Values));
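                // schema0 and schema2 share the same fields and equal metadata content, so the comparison passes;
                // schema1 differs in schema-level metadata and schema3 replaces f2 with f3 (which carries field-level metadata), so those comparisons throw.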
                SchemaComparer.Compare(schema0, schema2);
                Assert.Throws <EqualException>(() => SchemaComparer.Compare(schema0, schema1));
                Assert.Throws <EqualException>(() => SchemaComparer.Compare(schema2, schema1));
                Assert.Throws <EqualException>(() => SchemaComparer.Compare(schema2, schema3));
            }
Example 18
        public static Table MakeTableWithOneColumnOfTwoIntArrays(int lengthOfEachArray)
        {
            Array intArray     = ColumnTests.MakeIntArray(lengthOfEachArray);
            Array intArrayCopy = ColumnTests.MakeIntArray(lengthOfEachArray);

            Field  field = new Field.Builder().Name("f0").DataType(Int32Type.Default).Build();
            Schema s0    = new Schema.Builder().Field(field).Build();
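            // The single column below is chunked: it is backed by two equal-length int arrays.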

            Column column = new Column(field, new List <Array> {
                intArray, intArrayCopy
            });
            Table table = new Table(s0, new List <Column> {
                column
            });

            return(table);
        }
Example 19
        public static RecordBatch CreateSampleRecordBatch(int length, int columnSetCount, bool createDictionaryArray)
        {
            Schema.Builder builder = new Schema.Builder();
            for (int i = 0; i < columnSetCount; i++)
            {
                builder.Field(CreateField(new ListType(Int64Type.Default), i));
                builder.Field(CreateField(BooleanType.Default, i));
                builder.Field(CreateField(UInt8Type.Default, i));
                builder.Field(CreateField(Int8Type.Default, i));
                builder.Field(CreateField(UInt16Type.Default, i));
                builder.Field(CreateField(Int16Type.Default, i));
                builder.Field(CreateField(UInt32Type.Default, i));
                builder.Field(CreateField(Int32Type.Default, i));
                builder.Field(CreateField(UInt64Type.Default, i));
                builder.Field(CreateField(Int64Type.Default, i));
                builder.Field(CreateField(FloatType.Default, i));
                builder.Field(CreateField(DoubleType.Default, i));
                builder.Field(CreateField(Date32Type.Default, i));
                builder.Field(CreateField(Date64Type.Default, i));
                builder.Field(CreateField(TimestampType.Default, i));
                builder.Field(CreateField(StringType.Default, i));
                builder.Field(CreateField(new StructType(new List <Field> {
                    CreateField(StringType.Default, i), CreateField(Int32Type.Default, i)
                }), i));
                builder.Field(CreateField(new Decimal128Type(10, 6), i));
                builder.Field(CreateField(new Decimal256Type(16, 8), i));

                if (createDictionaryArray)
                {
                    builder.Field(CreateField(new DictionaryType(Int32Type.Default, StringType.Default, false), i));
                }

                //builder.Field(CreateField(new FixedSizeBinaryType(16), i));
                //builder.Field(CreateField(HalfFloatType.Default));
                //builder.Field(CreateField(StringType.Default));
                //builder.Field(CreateField(Time32Type.Default));
                //builder.Field(CreateField(Time64Type.Default));
            }

            Schema schema = builder.Build();

            return(CreateSampleRecordBatch(schema, length));
        }
Example 20
        private RecordBatch WrapColumnsInStructIfApplicable(RecordBatch batch)
        {
            if (_version >= new Version(Versions.V3_0_0))
            {
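                // For Spark 3.0.0 and later, wrap all of the batch's columns into a single struct-typed column.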
                var fields = new Field[batch.Schema.Fields.Count];
                for (int i = 0; i < batch.Schema.Fields.Count; ++i)
                {
                    fields[i] = batch.Schema.GetFieldByIndex(i);
                }

                var structType  = new StructType(fields);
                var structArray = new StructArray(
                    structType,
                    batch.Length,
                    batch.Arrays.Cast <Apache.Arrow.Array>(),
                    ArrowBuffer.Empty);
                Schema schema = new Schema.Builder().Field(new Field("Struct", structType, false)).Build();
                return(new RecordBatch(schema, new[] { structArray }, batch.Length));
            }

            return(batch);
        }
Example 21
        private static Schema BuildSchema(IArrowArray[] resultColumns)
        {
            var schemaBuilder = new Schema.Builder();

            if (resultColumns.Length == 1)
            {
                schemaBuilder = schemaBuilder
                                .Field(f => f.Name("Result")
                                       .DataType(resultColumns[0].Data.DataType)
                                       .Nullable(false));
            }
            else
            {
                for (int i = 0; i < resultColumns.Length; ++i)
                {
                    schemaBuilder = schemaBuilder
                                    .Field(f => f.Name("Result" + i)
                                           .DataType(resultColumns[i].Data.DataType)
                                           .Nullable(false));
                }
            }
            return(schemaBuilder.Build());
        }
Example 22
        private FlightInfo GetFlightInfo(string sql, ServerCallContext context)
        {
            var partitionsResult = _koraliumTransportService.GetPartitions(IsPartitionsEnabled(context), sql, new Shared.SqlParameters(), context.GetHttpContext()).Result;

            var schemaBuilder = new Schema.Builder();

            foreach (var column in partitionsResult.Columns)
            {
                schemaBuilder.Field(new Field(column.Name, TypeConverter.Convert(column), column.IsNullable));
            }
            var descriptor = FlightDescriptor.CreateCommandDescriptor(sql);

            List <FlightEndpoint> endpoints = new List <FlightEndpoint>();

            foreach (var partition in partitionsResult.Partitions)
            {
                List <FlightLocation> locations = new List <FlightLocation>();

                foreach (var location in partition.Locations)
                {
                    string uri = null;

                    if (location.Tls)
                    {
                        uri = $"grpc+tls://{location.Host}";
                    }
                    else
                    {
                        uri = $"grpc+tcp://{location.Host}";
                    }

                    locations.Add(new FlightLocation(uri));
                }
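                // Each endpoint carries the partition's SQL as its ticket, plus the gRPC locations where that partition can be fetched.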
                endpoints.Add(new FlightEndpoint(new FlightTicket(partition.Sql), locations));
            }
            return(new FlightInfo(schemaBuilder.Build(), descriptor, endpoints));
        }
Example 23
        public void TestListOfStructArray()
        {
            Schema.Builder builder     = new Schema.Builder();
            Field          structField = new Field(
                "struct",
                new StructType(
                    new[]
            {
                new Field("name", StringType.Default, nullable: false),
                new Field("age", Int64Type.Default, nullable: false),
            }),
                nullable: false);

            Field listField = new Field("listOfStructs", new ListType(structField), nullable: false);

            builder.Field(listField);
            Schema schema = builder.Build();

            StringArray stringArray = new StringArray.Builder()
                                      .Append("joe").AppendNull().AppendNull().Append("mark").Append("abe").Append("phil").Build();
            Int64Array intArray = new Int64Array.Builder()
                                  .Append(1).Append(2).AppendNull().Append(4).Append(10).Append(55).Build();

            ArrowBuffer nullBitmapBuffer = new ArrowBuffer.BitmapBuilder()
                                           .Append(true).Append(true).Append(false).Append(true).Append(true).Append(true).Build();

            StructArray structs = new StructArray(structField.DataType, 6, new IArrowArray[] { stringArray, intArray }, nullBitmapBuffer, nullCount: 1);

            ArrowBuffer offsetsBuffer = new ArrowBuffer.Builder <int>()
                                        .Append(0).Append(2).Append(5).Append(6).Build();
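            // The offsets [0, 2, 5, 6] split the six struct rows into three lists of lengths 2, 3, and 1.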
            ListArray listArray = new ListArray(listField.DataType, 3, offsetsBuffer, structs, ArrowBuffer.Empty);

            RecordBatch batch = new RecordBatch(schema, new[] { listArray }, 3);

            TestRoundTripRecordBatch(batch);
        }
Example 24
        public async Task TestArrowGroupedMapCommandExecutor()
        {
            StringArray ConvertStrings(StringArray strings)
            {
                return((StringArray)ToArrowArray(
                           Enumerable.Range(0, strings.Length)
                           .Select(i => $"udf: {strings.GetString(i)}")
                           .ToArray()));
            }

            Int64Array ConvertInt64s(Int64Array int64s)
            {
                return((Int64Array)ToArrowArray(
                           Enumerable.Range(0, int64s.Length)
                           .Select(i => int64s.Values[i] + 100)
                           .ToArray()));
            }

            Schema resultSchema = new Schema.Builder()
                                  .Field(b => b.Name("arg1").DataType(StringType.Default))
                                  .Field(b => b.Name("arg2").DataType(Int64Type.Default))
                                  .Build();
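            // resultSchema describes the batches returned by the grouped-map UDF below.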

            var udfWrapper = new Sql.ArrowGroupedMapUdfWrapper(
                (batch) => new RecordBatch(
                    resultSchema,
                    new IArrowArray[]
            {
                ConvertStrings((StringArray)batch.Column(0)),
                ConvertInt64s((Int64Array)batch.Column(1)),
            },
                    batch.Length));

            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.ArrowGroupedMapWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                Commands = new[] { command }
            };

            using (var inputStream = new MemoryStream())
                using (var outputStream = new MemoryStream())
                {
                    int numRows = 10;

                    // Write test data to the input stream.
                    Schema schema = new Schema.Builder()
                                    .Field(b => b.Name("arg1").DataType(StringType.Default))
                                    .Field(b => b.Name("arg2").DataType(Int64Type.Default))
                                    .Build();
                    var arrowWriter = new ArrowStreamWriter(inputStream, schema);
                    await arrowWriter.WriteRecordBatchAsync(
                        new RecordBatch(
                            schema,
                            new[]
                    {
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                            .Select(i => (long)i)
                            .ToArray())
                    },
                            numRows));

                    inputStream.Seek(0, SeekOrigin.Begin);

                    CommandExecutorStat stat = new CommandExecutor().Execute(
                        inputStream,
                        outputStream,
                        0,
                        commandPayload);

                    // Validate that all the data on the stream is read.
                    Assert.Equal(inputStream.Length, inputStream.Position);
                    Assert.Equal(numRows, stat.NumEntriesProcessed);

                    // Validate the output stream.
                    outputStream.Seek(0, SeekOrigin.Begin);
                    int arrowLength = SerDe.ReadInt32(outputStream);
                    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
                    var         arrowReader = new ArrowStreamReader(outputStream);
                    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

                    Assert.Equal(numRows, outputBatch.Length);
                    Assert.Equal(2, outputBatch.ColumnCount);

                    var stringArray = (StringArray)outputBatch.Column(0);
                    for (int i = 0; i < numRows; ++i)
                    {
                        Assert.Equal($"udf: {i}", stringArray.GetString(i));
                    }

                    var longArray = (Int64Array)outputBatch.Column(1);
                    for (int i = 0; i < numRows; ++i)
                    {
                        Assert.Equal(100 + i, longArray.Values[i]);
                    }

                    int end = SerDe.ReadInt32(outputStream);
                    Assert.Equal(0, end);

                    // Validate all the data on the stream is read.
                    Assert.Equal(outputStream.Length, outputStream.Position);
                }
        }
Example 25
        public void TestArrowSqlCommandExecutorWithEmptyInput()
        {
            var udfWrapper = new Sql.ArrowUdfWrapper <StringArray, StringArray>(
                (strings) => (StringArray)ToArrowArray(
                    Enumerable.Range(0, strings.Length)
                    .Select(i => $"udf: {strings.GetString(i)}")
                    .ToArray()));

            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.ArrowWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command }
            };

            using (var inputStream = new MemoryStream())
                using (var outputStream = new MemoryStream())
                {
                    // Write test data to the input stream.
                    Schema schema = new Schema.Builder()
                                    .Field(b => b.Name("arg1").DataType(StringType.Default))
                                    .Build();
                    var arrowWriter = new ArrowStreamWriter(inputStream, schema);

                    // The .NET ArrowStreamWriter doesn't currently support writing just a
                    // schema with no batches - but Java does. We use Reflection to simulate
                    // the request Spark sends.
                    MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
                        "WriteSchemaAsync",
                        BindingFlags.NonPublic | BindingFlags.Instance);

                    writeSchemaMethod.Invoke(
                        arrowWriter,
                        new object[] { schema, CancellationToken.None });

                    SerDe.Write(inputStream, 0);

                    inputStream.Seek(0, SeekOrigin.Begin);

                    CommandExecutorStat stat = new CommandExecutor().Execute(
                        inputStream,
                        outputStream,
                        0,
                        commandPayload);

                    // Validate that all the data on the stream is read.
                    Assert.Equal(inputStream.Length, inputStream.Position);
                    Assert.Equal(0, stat.NumEntriesProcessed);

                    // Validate the output stream.
                    outputStream.Seek(0, SeekOrigin.Begin);
                    int arrowLength = SerDe.ReadInt32(outputStream);
                    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
                    var         arrowReader = new ArrowStreamReader(outputStream);
                    RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();

                    Assert.Equal(1, outputBatch.Schema.Fields.Count);
                    Assert.IsType <StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);

                    Assert.Equal(0, outputBatch.Length);
                    Assert.Single(outputBatch.Arrays);

                    var array = (StringArray)outputBatch.Arrays.ElementAt(0);
                    Assert.Equal(0, array.Length);

                    int end = SerDe.ReadInt32(outputStream);
                    Assert.Equal(0, end);

                    // Validate all the data on the stream is read.
                    Assert.Equal(outputStream.Length, outputStream.Position);
                }
        }
Example 26
        public async Task TestArrowSqlCommandExecutorWithMultiCommands()
        {
            var udfWrapper1 = new Sql.ArrowUdfWrapper <StringArray, StringArray>(
                (strings) => (StringArray)ToArrowArray(
                    Enumerable.Range(0, strings.Length)
                    .Select(i => $"udf: {strings.GetString(i)}")
                    .ToArray()));
            var udfWrapper2 = new Sql.ArrowUdfWrapper <Int32Array, Int32Array, Int32Array>(
                (arg1, arg2) => (Int32Array)ToArrowArray(
                    Enumerable.Range(0, arg1.Length)
                    .Select(i => arg1.Values[i] * arg2.Values[i])
                    .ToArray()));
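            // command1 below applies the string UDF to column 0; command2 multiplies columns 1 and 2 element-wise.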

            var command1 = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.ArrowWorkerFunction(udfWrapper1.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var command2 = new SqlCommand()
            {
                ArgOffsets          = new[] { 1, 2 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.ArrowWorkerFunction(udfWrapper2.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command1, command2 }
            };

            using (var inputStream = new MemoryStream())
                using (var outputStream = new MemoryStream())
                {
                    int numRows = 10;

                    // Write test data to the input stream.
                    Schema schema = new Schema.Builder()
                                    .Field(b => b.Name("arg1").DataType(StringType.Default))
                                    .Field(b => b.Name("arg2").DataType(Int32Type.Default))
                                    .Field(b => b.Name("arg3").DataType(Int32Type.Default))
                                    .Build();
                    var arrowWriter = new ArrowStreamWriter(inputStream, schema);
                    await arrowWriter.WriteRecordBatchAsync(
                        new RecordBatch(
                            schema,
                            new[]
                    {
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                            .Select(i => i.ToString())
                            .ToArray()),
                        ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                        ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                    },
                            numRows));

                    inputStream.Seek(0, SeekOrigin.Begin);

                    CommandExecutorStat stat = new CommandExecutor().Execute(
                        inputStream,
                        outputStream,
                        0,
                        commandPayload);

                    // Validate all the data on the stream is read.
                    Assert.Equal(inputStream.Length, inputStream.Position);
                    Assert.Equal(numRows, stat.NumEntriesProcessed);

                    // Validate the output stream.
                    outputStream.Seek(0, SeekOrigin.Begin);
                    var arrowLength = SerDe.ReadInt32(outputStream);
                    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
                    var         arrowReader = new ArrowStreamReader(outputStream);
                    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

                    Assert.Equal(numRows, outputBatch.Length);
                    Assert.Equal(2, outputBatch.Arrays.Count());
                    var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
                    var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
                    for (int i = 0; i < numRows; ++i)
                    {
                        Assert.Equal($"udf: {i}", array1.GetString(i));
                        Assert.Equal(i * i, array2.Values[i]);
                    }

                    int end = SerDe.ReadInt32(outputStream);
                    Assert.Equal(0, end);

                    // Validate all the data on the stream is read.
                    Assert.Equal(outputStream.Length, outputStream.Position);
                }
        }
Example 27
 public Builder SetSchema(Schema.Builder builderForValue)
 {
     _producer.Schema = builderForValue.Build();
     return(this);
 }
        public async Task TestDataFrameSqlCommandExecutorWithSingleCommand(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            var udfWrapper = new Sql.DataFrameUdfWrapper <ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(cur => $"udf: {cur}"));

            var command = new SqlCommand()
            {
                ArgOffsets          = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction      = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
                SerializerMode      = CommandSerDe.SerializedMode.Row,
                DeserializerMode    = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command }
            };

            using var inputStream  = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Build();
            var arrowWriter =
                new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
            await arrowWriter.WriteRecordBatchAsync(
                new RecordBatch(
                    schema,
                    new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                    .Select(i => i.ToString())
                    .ToArray())
            },
                    numRows));

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(numRows, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            int arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var         arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

            Assert.Equal(numRows, outputBatch.Length);
            Assert.Single(outputBatch.Arrays);
            var array = (StringArray)outputBatch.Arrays.ElementAt(0);

            // Validate the single command.
            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal($"udf: {i}", array.GetString(i));
            }

            CheckEOS(outputStream, ipcOptions);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
        public void TestRecordBatchWithStructArrays()
        {
            RecordBatch CreateRecordBatch(string prependColumnNamesWith = "")
            {
                RecordBatch ret = new RecordBatch.Builder()
                                  .Append(prependColumnNamesWith + "Column1", false, col => col.Int32(array => array.AppendRange(Enumerable.Range(0, 10))))
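                                  // Column2 and Column3 use explicit validity bitmaps: 0xfd clears bit 1 (one null value), while an all-zero bitmap marks every value null.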
                                  .Append(prependColumnNamesWith + "Column2", true, new Int32Array(
                                              valueBuffer: new ArrowBuffer.Builder <int>().AppendRange(Enumerable.Range(0, 10)).Build(),
                                              nullBitmapBuffer: new ArrowBuffer.Builder <byte>().Append(0xfd).Append(0xff).Build(),
                                              length: 10,
                                              nullCount: 1,
                                              offset: 0))
                                  .Append(prependColumnNamesWith + "Column3", true, new Int32Array(
                                              valueBuffer: new ArrowBuffer.Builder <int>().AppendRange(Enumerable.Range(0, 10)).Build(),
                                              nullBitmapBuffer: new ArrowBuffer.Builder <byte>().Append(0x00).Append(0x00).Build(),
                                              length: 10,
                                              nullCount: 10,
                                              offset: 0))
                                  .Append(prependColumnNamesWith + "NullableBooleanColumn", true, new BooleanArray(
                                              valueBuffer: new ArrowBuffer.Builder <byte>().Append(0xfd).Append(0xff).Build(),
                                              nullBitmapBuffer: new ArrowBuffer.Builder <byte>().Append(0xed).Append(0xff).Build(),
                                              length: 10,
                                              nullCount: 2,
                                              offset: 0))
                                  .Append(prependColumnNamesWith + "StringDataFrameColumn", false, new StringArray.Builder().AppendRange(Enumerable.Range(0, 10).Select(x => x.ToString())).Build())
                                  .Append(prependColumnNamesWith + "DoubleColumn", false, new DoubleArray.Builder().AppendRange(Enumerable.Repeat(1.0, 10)).Build())
                                  .Append(prependColumnNamesWith + "FloatColumn", false, new FloatArray.Builder().AppendRange(Enumerable.Repeat(1.0f, 10)).Build())
                                  .Append(prependColumnNamesWith + "ShortColumn", false, new Int16Array.Builder().AppendRange(Enumerable.Repeat((short)1, 10)).Build())
                                  .Append(prependColumnNamesWith + "LongColumn", false, new Int64Array.Builder().AppendRange(Enumerable.Repeat((long)1, 10)).Build())
                                  .Append(prependColumnNamesWith + "UIntColumn", false, new UInt32Array.Builder().AppendRange(Enumerable.Repeat((uint)1, 10)).Build())
                                  .Append(prependColumnNamesWith + "UShortColumn", false, new UInt16Array.Builder().AppendRange(Enumerable.Repeat((ushort)1, 10)).Build())
                                  .Append(prependColumnNamesWith + "ULongColumn", false, new UInt64Array.Builder().AppendRange(Enumerable.Repeat((ulong)1, 10)).Build())
                                  .Append(prependColumnNamesWith + "ByteColumn", false, new Int8Array.Builder().AppendRange(Enumerable.Repeat((sbyte)1, 10)).Build())
                                  .Append(prependColumnNamesWith + "UByteColumn", false, new UInt8Array.Builder().AppendRange(Enumerable.Repeat((byte)1, 10)).Build())
                                  .Build();

                return(ret);
            }

            RecordBatch originalBatch = CreateRecordBatch();

            ArrowBuffer.BitmapBuilder validityBitmapBuilder = new ArrowBuffer.BitmapBuilder();
            for (int i = 0; i < originalBatch.Length; i++)
            {
                validityBitmapBuilder.Append(true);
            }
            ArrowBuffer validityBitmap = validityBitmapBuilder.Build();

            StructType  structType  = new StructType(originalBatch.Schema.Fields.Select((KeyValuePair <string, Field> pair) => pair.Value).ToList());
            StructArray structArray = new StructArray(structType, originalBatch.Length, originalBatch.Arrays.Cast <Apache.Arrow.Array>(), validityBitmap);
            Schema      schema      = new Schema.Builder().Field(new Field("Struct", structType, false)).Build();
            RecordBatch recordBatch = new RecordBatch(schema, new[] { structArray }, originalBatch.Length);

            DataFrame df = DataFrame.FromArrowRecordBatch(recordBatch);

            DataFrameIOTests.VerifyColumnTypes(df, testArrowStringColumn: true);

            IEnumerable <RecordBatch> recordBatches = df.ToArrowRecordBatches();

            RecordBatch expected = CreateRecordBatch("Struct_");

            foreach (RecordBatch batch in recordBatches)
            {
                RecordBatchComparer.CompareBatches(expected, batch);
            }
        }
Example 30
 public Builder SetSchema(Schema.Builder builderForValue)
 {
     _subscribe.Schema = builderForValue.Build();
     return(this);
 }