// Round-trips a worker function built from an Arrow-array-based string UDF through
// Utils.CommandSerDe and checks that both serialization modes and the run mode are
// read back, and that the restored delegate still produces the expected greeting.
public void TestCommandSerDeForSqlArrow()
{
    var wrapper = new Sql.ArrowUdfWrapper<StringArray, StringArray>(
        names =>
        {
            // Build one greeting per row of the incoming string column.
            string[] greetings = Enumerable.Range(0, names.Length)
                .Select(row => $"hello {names.GetString(row)}")
                .ToArray();
            return (StringArray)ToArrowArray(greetings);
        });
    var sourceFunction = new ArrowWorkerFunction(wrapper.Execute);

    byte[] payload = Utils.CommandSerDe.Serialize(
        sourceFunction.Func,
        Utils.CommandSerDe.SerializedMode.Row,
        Utils.CommandSerDe.SerializedMode.Row);

    using (var payloadStream = new MemoryStream(payload))
    {
        var restoredDelegate =
            Utils.CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
                payloadStream,
                out Utils.CommandSerDe.SerializedMode serializerMode,
                out Utils.CommandSerDe.SerializedMode deserializerMode,
                out var runMode);
        var restoredFunction = new ArrowWorkerFunction(restoredDelegate);

        // The metadata written by Serialize must come back unchanged.
        Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
        Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
        Assert.Equal("N", runMode);

        // The restored delegate must behave like the original UDF.
        Apache.Arrow.IArrowArray sparkColumn = ToArrowArray(new[] { "spark" });
        Apache.Arrow.IArrowArray greeting =
            restoredFunction.Func(new[] { sparkColumn }, new[] { 0 });
        ArrowTestUtils.AssertEquals("hello spark", greeting);
    }
}
// Chains two Arrow-array-based UDFs in the wrong order and expects invoking the
// resulting function to throw.
public void TestInvalidChainingArrow()
{
    // Takes an int column and a string column, yields "<string>:<int>" per row.
    var joinFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<Int32Array, StringArray, StringArray>(
            (ids, labels) =>
            {
                string[] joined = Enumerable.Range(0, labels.Length)
                    .Select(row => $"{labels.GetString(row)}:{ids.Values[row]}")
                    .ToArray();
                return (StringArray)ToArrowArray(joined);
            }).Execute);

    // Takes a single string column and prefixes each row with "outer1:".
    var prefixFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<StringArray, StringArray>(
            labels =>
            {
                string[] prefixed = Enumerable.Range(0, labels.Length)
                    .Select(row => $"outer1:{labels.GetString(row)}")
                    .ToArray();
                return (StringArray)ToArrowArray(prefixed);
            }).Execute);

    var numberColumn = ToArrowArray(new[] { 100 });
    var nameColumn = ToArrowArray(new[] { "name" });
    IArrowArray[] columns = new[] { numberColumn, nameColumn };

    // The single-column prefix function is placed first in the chain, so the
    // argument order does not align with the (int, string) input.
    ArrowWorkerFunction misorderedChain = ArrowWorkerFunction.Chain(prefixFunc, joinFunc);
    Assert.ThrowsAny<Exception>(() => misorderedChain.Func(columns, new[] { 0, 1 }));
}
// Round-trips a worker function built from a scalar string UDF through
// Utils.CommandSerDe and checks that both serialization modes and the run mode are
// read back, and that the restored delegate still produces the expected greeting.
public void TestCommandSerDeForSqlArrow()
{
    var wrapper = new ArrowUdfWrapper<string, string>(name => $"hello {name}");
    var sourceFunction = new ArrowWorkerFunction(wrapper.Execute);

    byte[] payload = Utils.CommandSerDe.Serialize(
        sourceFunction.Func,
        Utils.CommandSerDe.SerializedMode.Row,
        Utils.CommandSerDe.SerializedMode.Row);

    using (var payloadStream = new MemoryStream(payload))
    {
        var restoredDelegate =
            Utils.CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
                payloadStream,
                out Utils.CommandSerDe.SerializedMode serializerMode,
                out Utils.CommandSerDe.SerializedMode deserializerMode,
                out var runMode);
        var restoredFunction = new ArrowWorkerFunction(restoredDelegate);

        // The metadata written by Serialize must come back unchanged.
        Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
        Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
        Assert.Equal("N", runMode);

        // The restored delegate must behave like the original UDF.
        Apache.Arrow.IArrowArray sparkColumn =
            ArrowArrayHelpers.ToArrowArray(new[] { "spark" });
        Apache.Arrow.IArrowArray greeting =
            restoredFunction.Func(0, new[] { sparkColumn }, new[] { 0 });
        ArrowTestUtils.AssertEquals("hello spark", greeting);
    }
}
// Chains scalar-UDF-backed worker functions one and two levels deep and checks
// that each later function is applied to the output of the earlier ones.
public void TestChainingArrowWorkerFunction()
{
    var joinFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<int, string, string>(
            (id, label) => $"{label}:{id}").Execute);

    var firstPrefixFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<string, string>(
            label => $"outer1:{label}").Execute);

    var secondPrefixFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<string, string>(
            label => $"outer2:{label}").Execute);

    var numberColumn = ToArrowArray(new[] { 100 });
    var nameColumn = ToArrowArray(new[] { "name" });
    Apache.Arrow.IArrowArray[] columns = new[] { numberColumn, nameColumn };

    // One level: joinFunc feeds firstPrefixFunc.
    ArrowWorkerFunction oneLevelChain = ArrowWorkerFunction.Chain(joinFunc, firstPrefixFunc);
    var oneLevelResult = oneLevelChain.Func(0, columns, new[] { 0, 1 });
    ArrowTestUtils.AssertEquals("outer1:name:100", oneLevelResult);

    // Two levels: the one-level chain feeds secondPrefixFunc.
    ArrowWorkerFunction twoLevelChain =
        ArrowWorkerFunction.Chain(oneLevelChain, secondPrefixFunc);
    var twoLevelResult = twoLevelChain.Func(0, columns, new[] { 0, 1 });
    ArrowTestUtils.AssertEquals("outer2:outer1:name:100", twoLevelResult);
}
// Runs an identity UDF over a single string column and checks that the value
// comes back unchanged.
public void TestArrowWorkerFunction()
{
    var identityFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<StringArray, StringArray>(
            column => column).Execute);

    string[] values = { "arg1" };
    var valueColumn = ToArrowArray(values);

    var output = identityFunc.Func(new[] { valueColumn }, new[] { 0 });

    ArrowTestUtils.AssertEquals(values[0], output);
}
// Chains Arrow-array-based worker functions one and two levels deep and checks
// that each later function is applied to the output of the earlier ones.
public void TestChainingArrowWorkerFunction()
{
    // Takes an int column and a string column, yields "<string>:<int>" per row.
    var joinFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<Int32Array, StringArray, StringArray>(
            (ids, labels) =>
            {
                string[] joined = Enumerable.Range(0, labels.Length)
                    .Select(row => $"{labels.GetString(row)}:{ids.Values[row]}")
                    .ToArray();
                return (StringArray)ToArrowArray(joined);
            }).Execute);

    // Prefixes each row of a string column with "outer1:".
    var firstPrefixFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<StringArray, StringArray>(
            labels =>
            {
                string[] prefixed = Enumerable.Range(0, labels.Length)
                    .Select(row => $"outer1:{labels.GetString(row)}")
                    .ToArray();
                return (StringArray)ToArrowArray(prefixed);
            }).Execute);

    // Prefixes each row of a string column with "outer2:".
    var secondPrefixFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<StringArray, StringArray>(
            labels =>
            {
                string[] prefixed = Enumerable.Range(0, labels.Length)
                    .Select(row => $"outer2:{labels.GetString(row)}")
                    .ToArray();
                return (StringArray)ToArrowArray(prefixed);
            }).Execute);

    IArrowArray[] columns =
    {
        ToArrowArray(new[] { 100 }),
        ToArrowArray(new[] { "name" })
    };

    // One level: joinFunc feeds firstPrefixFunc.
    var oneLevelChain = ArrowWorkerFunction.Chain(joinFunc, firstPrefixFunc);
    AssertEquals(
        "outer1:name:100",
        oneLevelChain.Func(columns, new[] { 0, 1 }));

    // Two levels: the one-level chain feeds secondPrefixFunc.
    var twoLevelChain = ArrowWorkerFunction.Chain(oneLevelChain, secondPrefixFunc);
    AssertEquals(
        "outer2:outer1:name:100",
        twoLevelChain.Func(columns, new[] { 0, 1 }));
}
// Runs a scalar (string, bool) -> bool UDF over four rows and checks the boolean
// result for every row.
public void TestArrowWorkerFunctionForBool()
{
    var orFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<string, bool, bool>(
            (text, isSet) => isSet || text.Contains("true")).Execute);

    var textColumn = ToArrowArray(
        new[] { "arg1_true", "arg1_true", "arg1_false", "arg1_false" });
    var flagColumn = ToArrowArray(new[] { true, false, true, false });
    IArrowArray[] columns = new[] { textColumn, flagColumn };

    var output = (BooleanArray)orFunc.Func(0, columns, new[] { 0, 1 });

    // Only the last row has neither the flag set nor "true" in its text.
    Assert.Equal(4, output.Length);
    Assert.True(output.GetBoolean(0));
    Assert.True(output.GetBoolean(1));
    Assert.True(output.GetBoolean(2));
    Assert.False(output.GetBoolean(3));
}
// Chains two scalar-UDF-backed worker functions in the wrong order and expects
// invoking the resulting function to throw.
public void TestInvalidChainingArrow()
{
    var joinFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<int, string, string>(
            (id, label) => $"{label}:{id}").Execute);

    var prefixFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<string, string>(
            label => $"outer1:{label}").Execute);

    var numberColumn = ToArrowArray(new[] { 100 });
    var nameColumn = ToArrowArray(new[] { "name" });
    Apache.Arrow.IArrowArray[] columns = new[] { numberColumn, nameColumn };

    // The single-argument prefix function is placed first in the chain, so the
    // argument order does not align with the (int, string) input.
    var misorderedChain = ArrowWorkerFunction.Chain(prefixFunc, joinFunc);
    Assert.ThrowsAny<Exception>(() => misorderedChain.Func(0, columns, new[] { 0, 1 }));
}
// Runs an Arrow-array-based (string, bool) -> bool UDF over four rows and checks
// the boolean result for every row.
public void TestArrowWorkerFunctionForBool()
{
    var orFunc = new ArrowWorkerFunction(
        new ArrowUdfWrapper<StringArray, BooleanArray, BooleanArray>(
            (texts, flagValues) =>
            {
                // A row is true when its flag is set or its text contains "true".
                bool[] combined = Enumerable.Range(0, texts.Length)
                    .Select(row =>
                        flagValues.GetBoolean(row) || texts.GetString(row).Contains("true"))
                    .ToArray();
                return (BooleanArray)ToArrowArray(combined);
            }).Execute);

    var textColumn = ToArrowArray(
        new[] { "arg1_true", "arg1_true", "arg1_false", "arg1_false" });
    var flagColumn = ToArrowArray(new[] { true, false, true, false });
    IArrowArray[] columns = new[] { textColumn, flagColumn };

    var output = (BooleanArray)orFunc.Func(columns, new[] { 0, 1 });

    // Only the last row has neither the flag set nor "true" in its text.
    Assert.Equal(4, output.Length);
    Assert.True(output.GetBoolean(0));
    Assert.True(output.GetBoolean(1));
    Assert.True(output.GetBoolean(2));
    Assert.False(output.GetBoolean(3));
}
/// <summary>
/// Reads the SQL UDF commands from the stream and builds one
/// <see cref="SqlCommand"/> per UDF. For each UDF the stream supplies the argument
/// offsets, then the number of chained functions, then each serialized function
/// prefixed by its byte count.
/// </summary>
/// <param name="evalType">Evaluation type for the current commands; selects which
/// kind of worker function is deserialized</param>
/// <param name="stream">Stream to read from</param>
/// <returns>SqlCommand objects, one per UDF read from the stream</returns>
/// <exception cref="InvalidDataException">
/// Thrown when a command's byte count is not positive, or when a grouped map UDF
/// is combined with other UDFs.
/// </exception>
private static SqlCommand[] ReadSqlCommands(
    PythonEvalType evalType,
    Stream stream)
{
    int udfCount = SerDe.ReadInt32(stream);
    var result = new SqlCommand[udfCount];

    for (int udfIndex = 0; udfIndex < udfCount; ++udfIndex)
    {
        var current = new SqlCommand();

        // Argument offsets: a count followed by that many Int32 values.
        int offsetCount = SerDe.ReadInt32(stream);
        var offsets = new int[offsetCount];
        for (int k = 0; k < offsetCount; ++k)
        {
            offsets[k] = SerDe.ReadInt32(stream);
        }
        current.ArgOffsets = offsets;

        int chainLength = SerDe.ReadInt32(stream);
        current.NumChainedFunctions = chainLength;

        for (int link = 0; link < chainLength; ++link)
        {
            int commandBytesCount = SerDe.ReadInt32(stream);
            if (commandBytesCount <= 0)
            {
                throw new InvalidDataException(
                    $"Invalid command size: {commandBytesCount}");
            }

            CommandSerDe.SerializedMode serializerMode;
            CommandSerDe.SerializedMode deserializerMode;

            if (evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF)
            {
                var arrowFunction = new ArrowWorkerFunction(
                    CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
                        stream,
                        out serializerMode,
                        out deserializerMode,
                        out string _));

                // The first function is stored as-is; later ones are chained onto it.
                if (current.WorkerFunction == null)
                {
                    current.WorkerFunction = arrowFunction;
                }
                else
                {
                    current.WorkerFunction = ArrowWorkerFunction.Chain(
                        (ArrowWorkerFunction)current.WorkerFunction,
                        arrowFunction);
                }
            }
            else if (evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF)
            {
                // Exactly one UDF with exactly one function is accepted here.
                if ((udfCount != 1) || (current.WorkerFunction != null))
                {
                    throw new InvalidDataException(
                        "Grouped map UDFs do not support combining multiple UDFs");
                }

                current.WorkerFunction = new ArrowGroupedMapWorkerFunction(
                    CommandSerDe.Deserialize<ArrowGroupedMapWorkerFunction.ExecuteDelegate>(
                        stream,
                        out serializerMode,
                        out deserializerMode,
                        out string _));
            }
            else
            {
                var picklingFunction = new PicklingWorkerFunction(
                    CommandSerDe.Deserialize<PicklingWorkerFunction.ExecuteDelegate>(
                        stream,
                        out serializerMode,
                        out deserializerMode,
                        out string _));

                // The first function is stored as-is; later ones are chained onto it.
                if (current.WorkerFunction == null)
                {
                    current.WorkerFunction = picklingFunction;
                }
                else
                {
                    current.WorkerFunction = PicklingWorkerFunction.Chain(
                        (PicklingWorkerFunction)current.WorkerFunction,
                        picklingFunction);
                }
            }

            // Each iteration overwrites the modes, so the command ends up with
            // those of the last function read.
            current.SerializerMode = serializerMode;
            current.DeserializerMode = deserializerMode;
        }

        result[udfIndex] = current;
    }

    return result;
}