Example #1
        public void TestCommandSerDeForSqlArrowDataFrame()
        {
            var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(cur => $"hello {cur}"));

            var workerFunction = new DataFrameWorkerFunction(udfWrapper.Execute);

            // Serialize the wrapped UDF, then deserialize it below to verify the
            // round-trip preserves the serializer/deserializer modes and behavior.
            byte[] serializedCommand = Utils.CommandSerDe.Serialize(
                workerFunction.Func,
                Utils.CommandSerDe.SerializedMode.Row,
                Utils.CommandSerDe.SerializedMode.Row);

            using var ms = new MemoryStream(serializedCommand);
            var deserializedWorkerFunction = new DataFrameWorkerFunction(
                Utils.CommandSerDe.Deserialize<DataFrameWorkerFunction.ExecuteDelegate>(
                    ms,
                    out Utils.CommandSerDe.SerializedMode serializerMode,
                    out Utils.CommandSerDe.SerializedMode deserializerMode,
                    out var runMode));

            Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
            Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
            Assert.Equal("N", runMode);

            // Apply the deserialized UDF to a single-row column and verify the result.
            var column = (StringArray)ToArrowArray(new[] { "spark" });

            ArrowStringDataFrameColumn arrowStringColumn = ToArrowStringDataFrameColumn(column);
            DataFrameColumn result =
                deserializedWorkerFunction.Func(new[] { arrowStringColumn }, new[] { 0 });

            AssertEquals("hello spark", result);
        }
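
Example #1 relies on an AssertEquals helper that the snippet does not show (Example #2 calls it through ArrowTestUtils). A minimal sketch, assuming the helper simply compares every element of the resulting DataFrameColumn against the expected string:

        // Hypothetical sketch of the AssertEquals helper assumed by these tests;
        // the real ArrowTestUtils implementation is not shown in the examples.
        internal static void AssertEquals(string expectedValue, DataFrameColumn column)
        {
            for (long i = 0; i < column.Length; ++i)
            {
                Assert.Equal(expectedValue, column[i]);
            }
        }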
Example #2
        public void TestCommandSerDeForSqlArrowDataFrame()
        {
            var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) =>
                {
                    var stringColumn = (StringArray)ToArrowArray(
                        Enumerable.Range(0, (int)strings.Length)
                            .Select(i => $"hello {strings[i]}")
                            .ToArray());
                    return ToArrowStringDataFrameColumn(stringColumn);
                });

            var workerFunction = new DataFrameWorkerFunction(udfWrapper.Execute);

            byte[] serializedCommand = Utils.CommandSerDe.Serialize(
                workerFunction.Func,
                Utils.CommandSerDe.SerializedMode.Row,
                Utils.CommandSerDe.SerializedMode.Row);

            using (var ms = new MemoryStream(serializedCommand))
            {
                var deserializedWorkerFunction = new DataFrameWorkerFunction(
                    Utils.CommandSerDe.Deserialize<DataFrameWorkerFunction.ExecuteDelegate>(
                        ms,
                        out Utils.CommandSerDe.SerializedMode serializerMode,
                        out Utils.CommandSerDe.SerializedMode deserializerMode,
                        out var runMode));

                Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
                Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
                Assert.Equal("N", runMode);

                var column = (StringArray)ToArrowArray(new[] { "spark" });

                ArrowStringDataFrameColumn arrowStringColumn = ToArrowStringDataFrameColumn(column);
                DataFrameColumn result =
                    deserializedWorkerFunction.Func(new[] { arrowStringColumn }, new[] { 0 });
                ArrowTestUtils.AssertEquals("hello spark", result);
            }
        }
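
Both examples also assume ToArrowArray and ToArrowStringDataFrameColumn helpers. A plausible sketch follows: the builder calls are standard Apache.Arrow API, while the exact ArrowStringDataFrameColumn constructor overload (wrapping the Arrow buffers without a copy) is an assumption, not confirmed by the snippets:

        // Sketch: build an Arrow StringArray from managed strings.
        internal static IArrowArray ToArrowArray(string[] values)
        {
            var builder = new StringArray.Builder();
            foreach (string value in values)
            {
                builder.Append(value);
            }
            return builder.Build();
        }

        // Sketch: wrap the StringArray's value/offset/null buffers in a
        // Microsoft.Data.Analysis ArrowStringDataFrameColumn. The constructor
        // arguments here are assumed, not shown in the examples above.
        internal static ArrowStringDataFrameColumn ToArrowStringDataFrameColumn(StringArray array)
        {
            return new ArrowStringDataFrameColumn(
                "String",
                array.ValueBuffer.Memory,
                array.ValueOffsetsBuffer.Memory,
                array.NullBitmapBuffer.Memory,
                array.Length,
                array.NullCount);
        }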
Example #3
        public void TestDataFrameSqlCommandExecutorWithEmptyInput(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(cur => $"udf: {cur}"));

            var command = new SqlCommand()
            {
                ArgOffsets = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
                SerializerMode = CommandSerDe.SerializedMode.Row,
                DeserializerMode = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command }
            };

            using var inputStream = new MemoryStream();
            using var outputStream = new MemoryStream();
            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Build();
            var arrowWriter = new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);

            // The .NET ArrowStreamWriter doesn't currently support writing just a
            // schema with no batches - but Java does. We use Reflection to simulate
            // the request Spark sends.
            MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
                "WriteSchemaAsync",
                BindingFlags.NonPublic | BindingFlags.Instance);

            // WriteSchemaAsync returns a Task, but it completes synchronously
            // against a MemoryStream, so the result is not awaited here.
            writeSchemaMethod.Invoke(
                arrowWriter,
                new object[] { schema, CancellationToken.None });

            SerDe.Write(inputStream, 0);

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(0, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            int arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();

            Assert.Equal(1, outputBatch.Schema.Fields.Count);
            Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);

            Assert.Equal(0, outputBatch.Length);
            Assert.Single(outputBatch.Arrays);

            var array = (StringArray)outputBatch.Arrays.ElementAt(0);

            Assert.Equal(0, array.Length);

            CheckEOS(outputStream, ipcOptions);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
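
This test also depends on a CheckEOS helper that is not shown. A sketch under the assumption that it validates the Arrow end-of-stream marker, with the continuation token only present in the non-legacy IPC format:

        // Sketch of the assumed CheckEOS helper: the current Arrow IPC format
        // writes a 0xFFFFFFFF continuation token before the zero-length
        // end-of-stream marker; the legacy format writes only the zero.
        private static void CheckEOS(Stream stream, IpcOptions ipcOptions)
        {
            if (!ipcOptions.WriteLegacyIpcFormat)
            {
                int continuationToken = SerDe.ReadInt32(stream);
                Assert.Equal(-1, continuationToken);
            }

            int end = SerDe.ReadInt32(stream);
            Assert.Equal(0, end);
        }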
Example #4
        public async Task TestDataFrameSqlCommandExecutorWithMultiCommands(
            Version sparkVersion,
            IpcOptions ipcOptions)
        {
            var udfWrapper1 = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) => strings.Apply(cur => $"udf: {cur}"));

            var udfWrapper2 = new Sql.DataFrameUdfWrapper<Int32DataFrameColumn, Int32DataFrameColumn, Int32DataFrameColumn>(
                (arg1, arg2) => arg1 * arg2);

            var command1 = new SqlCommand()
            {
                ArgOffsets = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper1.Execute),
                SerializerMode = CommandSerDe.SerializedMode.Row,
                DeserializerMode = CommandSerDe.SerializedMode.Row
            };

            var command2 = new SqlCommand()
            {
                ArgOffsets = new[] { 1, 2 },
                NumChainedFunctions = 1,
                WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper2.Execute),
                SerializerMode = CommandSerDe.SerializedMode.Row,
                DeserializerMode = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command1, command2 }
            };

            using var inputStream = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Field(b => b.Name("arg2").DataType(Int32Type.Default))
                            .Field(b => b.Name("arg3").DataType(Int32Type.Default))
                            .Build();
            var arrowWriter =
                new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
            await arrowWriter.WriteRecordBatchAsync(
                new RecordBatch(
                    schema,
                    new[]
                    {
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                                .Select(i => i.ToString())
                                .ToArray()),
                        ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                        ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                    },
                    numRows));

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(numRows, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            var arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

            Assert.Equal(numRows, outputBatch.Length);
            Assert.Equal(2, outputBatch.Arrays.Count());
            var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
            var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);

            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal($"udf: {i}", array1.GetString(i));
                Assert.Equal(i * i, array2.Values[i]);
            }

            CheckEOS(outputStream, ipcOptions);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
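
The two integer columns in this example go through an Int32 overload of ToArrowArray, again not shown. A minimal sketch using Apache.Arrow's primitive array builder:

        // Sketch: build an Arrow Int32Array from managed ints.
        internal static IArrowArray ToArrowArray(int[] values)
        {
            var builder = new Int32Array.Builder();
            builder.AppendRange(values);
            return builder.Build();
        }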
Example #5
        public async Task TestDataFrameSqlCommandExecutorWithSingleCommand()
        {
            var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
                (strings) =>
                {
                    var stringArray = (StringArray)ToArrowArray(
                        Enumerable.Range(0, (int)strings.Length)
                            .Select(i => $"udf: {strings[i]}")
                            .ToArray());
                    return ToArrowStringDataFrameColumn(stringArray);
                });

            var command = new SqlCommand()
            {
                ArgOffsets = new[] { 0 },
                NumChainedFunctions = 1,
                WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
                SerializerMode = CommandSerDe.SerializedMode.Row,
                DeserializerMode = CommandSerDe.SerializedMode.Row
            };

            var commandPayload = new Worker.CommandPayload()
            {
                EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                Commands = new[] { command }
            };

            using var inputStream = new MemoryStream();
            using var outputStream = new MemoryStream();
            int numRows = 10;

            // Write test data to the input stream.
            Schema schema = new Schema.Builder()
                            .Field(b => b.Name("arg1").DataType(StringType.Default))
                            .Build();
            var arrowWriter = new ArrowStreamWriter(inputStream, schema);
            await arrowWriter.WriteRecordBatchAsync(
                new RecordBatch(
                    schema,
                    new[]
                    {
                        ToArrowArray(
                            Enumerable.Range(0, numRows)
                                .Select(i => i.ToString())
                                .ToArray())
                    },
                    numRows));

            inputStream.Seek(0, SeekOrigin.Begin);

            CommandExecutorStat stat = new CommandExecutor().Execute(
                inputStream,
                outputStream,
                0,
                commandPayload);

            // Validate that all the data on the stream is read.
            Assert.Equal(inputStream.Length, inputStream.Position);
            Assert.Equal(numRows, stat.NumEntriesProcessed);

            // Validate the output stream.
            outputStream.Seek(0, SeekOrigin.Begin);
            int arrowLength = SerDe.ReadInt32(outputStream);

            Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
            var arrowReader = new ArrowStreamReader(outputStream);
            RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

            Assert.Equal(numRows, outputBatch.Length);
            Assert.Single(outputBatch.Arrays);
            var array = (StringArray)outputBatch.Arrays.ElementAt(0);

            // Validate the single command.
            for (int i = 0; i < numRows; ++i)
            {
                Assert.Equal($"udf: {i}", array.GetString(i));
            }

            // The stream ends with a zero-length end-of-stream marker.
            int end = SerDe.ReadInt32(outputStream);
            Assert.Equal(0, end);

            // Validate all the data on the stream is read.
            Assert.Equal(outputStream.Length, outputStream.Position);
        }
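
The trailing zero read above is the end-of-stream token that follows the Arrow payload, matching the SpecialLengths markers checked throughout these examples. Only START_ARROW_STREAM is exercised here; a plausible definition of the enum, with the remaining values assumed from PySpark's SpecialLengths protocol constants rather than confirmed by these snippets:

        // Assumed to mirror PySpark's SpecialLengths protocol constants;
        // only START_ARROW_STREAM appears in the examples above.
        internal enum SpecialLengths
        {
            END_OF_DATA_SECTION = -1,
            PYTHON_EXCEPTION_THROWN = -2,
            TIMING_DATA = -3,
            END_OF_STREAM = -4,
            NULL = -5,
            START_ARROW_STREAM = -6,
        }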