public void TestCommandSerDeForSqlArrowDataFrame()
{
    var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"hello {cur}"));

    var workerFunction = new DataFrameWorkerFunction(udfWrapper.Execute);

    byte[] serializedCommand = Utils.CommandSerDe.Serialize(
        workerFunction.Func,
        Utils.CommandSerDe.SerializedMode.Row,
        Utils.CommandSerDe.SerializedMode.Row);

    using var ms = new MemoryStream(serializedCommand);
    var deserializedWorkerFunction = new DataFrameWorkerFunction(
        Utils.CommandSerDe.Deserialize<DataFrameWorkerFunction.ExecuteDelegate>(
            ms,
            out Utils.CommandSerDe.SerializedMode serializerMode,
            out Utils.CommandSerDe.SerializedMode deserializerMode,
            out var runMode));

    Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
    Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
    Assert.Equal("N", runMode);

    var column = (StringArray)ToArrowArray(new[] { "spark" });
    ArrowStringDataFrameColumn arrowStringColumn = ToArrowStringDataFrameColumn(column);
    DataFrameColumn result =
        deserializedWorkerFunction.Func(new[] { arrowStringColumn }, new[] { 0 });
    ArrowTestUtils.AssertEquals("hello spark", result);
}
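// The helpers used above (ToArrowArray, ToArrowStringDataFrameColumn, and
// ArrowTestUtils.AssertEquals) are not defined in this section. The sketch
// below is a hypothetical, minimal implementation of them, assuming the
// Apache.Arrow builder APIs and the Microsoft.Data.Analysis
// ArrowStringDataFrameColumn constructor that accepts raw Arrow buffers;
// it illustrates the shape of the helpers, not the repository's actual code.
// Requires: Apache.Arrow, Microsoft.Data.Analysis, Xunit.
internal static class ArrowTestUtils
{
    public static IArrowArray ToArrowArray(string[] values)
    {
        // Build an Arrow StringArray element by element.
        var builder = new StringArray.Builder();
        foreach (string value in values)
        {
            builder.Append(value);
        }
        return builder.Build();
    }

    public static IArrowArray ToArrowArray(int[] values)
    {
        var builder = new Int32Array.Builder();
        builder.AppendRange(values);
        return builder.Build();
    }

    public static ArrowStringDataFrameColumn ToArrowStringDataFrameColumn(StringArray array)
    {
        // Wrap the Arrow buffers in a DataFrame column without copying.
        // The constructor shape is an assumption about Microsoft.Data.Analysis.
        return new ArrowStringDataFrameColumn(
            "String",
            array.ValueBuffer.Memory,
            array.ValueOffsetsBuffer.Memory,
            array.NullBitmapBuffer.Memory,
            array.Length,
            array.NullCount);
    }

    public static void AssertEquals(string expectedValue, DataFrameColumn column)
    {
        // The UDF output is expected to contain a single matching value.
        Assert.Equal(1, column.Length);
        Assert.Equal(expectedValue, column[0]);
    }
}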
public void TestDataFrameSqlCommandExecutorWithEmptyInput(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"udf: {cur}"));

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, false, ipcOptions);

    // The .NET ArrowStreamWriter doesn't currently support writing just a
    // schema with no batches - but Java does. We use Reflection to simulate
    // the request Spark sends.
    MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
        "WriteSchemaAsync",
        BindingFlags.NonPublic | BindingFlags.Instance);
    object writeTask = writeSchemaMethod.Invoke(
        arrowWriter,
        new object[] { schema, CancellationToken.None });
    // Block until the schema has been written; for a MemoryStream this
    // completes synchronously.
    (writeTask as Task)?.Wait();

    SerDe.Write(inputStream, 0);

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(0, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();

    Assert.Equal(1, outputBatch.Schema.Fields.Count);
    Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);

    Assert.Equal(0, outputBatch.Length);
    Assert.Single(outputBatch.Arrays);

    var array = (StringArray)outputBatch.Arrays.ElementAt(0);
    Assert.Equal(0, array.Length);

    CheckEOS(outputStream, ipcOptions);

    // Validate that all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
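// CheckEOS is referenced above but not defined in this section. The sketch
// below is a plausible implementation, assuming the Arrow IPC framing Spark
// uses: the non-legacy format prefixes the end-of-stream marker with a -1
// continuation token, and both formats end with a zero-length marker.
private static void CheckEOS(Stream stream, IpcOptions ipcOptions)
{
    if (!ipcOptions.WriteLegacyIpcFormat)
    {
        // The modern IPC format writes a continuation token before the
        // end-of-stream marker.
        int continuationToken = SerDe.ReadInt32(stream);
        Assert.Equal(-1, continuationToken);
    }

    // Both formats terminate the stream with a zero-length marker.
    int end = SerDe.ReadInt32(stream);
    Assert.Equal(0, end);
}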
public async Task TestDataFrameSqlCommandExecutorWithMultiCommands(
    Version sparkVersion,
    IpcOptions ipcOptions)
{
    var udfWrapper1 = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"udf: {cur}"));

    var udfWrapper2 = new Sql.DataFrameUdfWrapper<Int32DataFrameColumn, Int32DataFrameColumn, Int32DataFrameColumn>(
        (arg1, arg2) => arg1 * arg2);

    var command1 = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper1.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var command2 = new SqlCommand()
    {
        ArgOffsets = new[] { 1, 2 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper2.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command1, command2 }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int32Type.Default))
        .Field(b => b.Name("arg3").DataType(Int32Type.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema, leaveOpen: false, ipcOptions);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray()),
                ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
                ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
            },
            numRows));

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor(sparkVersion).Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
    Assert.Equal(numRows, outputBatch.Length);
    Assert.Equal(2, outputBatch.Arrays.Count());

    var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
    var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", array1.GetString(i));
        Assert.Equal(i * i, array2.Values[i]);
    }

    CheckEOS(outputStream, ipcOptions);

    // Validate that all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
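// SpecialLengths.START_ARROW_STREAM, asserted in the tests above, is a
// negative sentinel written before the Arrow stream begins. The enum itself
// lives elsewhere in the codebase; the values below are an assumption,
// mirroring PySpark's SpecialLengths in python/pyspark/serializers.py.
internal enum SpecialLengths
{
    END_OF_DATA_SECTION = -1,
    PYTHON_EXCEPTION_THROWN = -2,
    TIMING_DATA = -3,
    END_OF_STREAM = -4,
    NULL = -5,
    START_ARROW_STREAM = -6  // Assumed value; marks the start of Arrow data.
}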
public async Task TestDataFrameSqlCommandExecutorWithSingleCommand()
{
    var udfWrapper = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) =>
        {
            var stringArray = (StringArray)ToArrowArray(
                Enumerable.Range(0, (int)strings.Length)
                    .Select(i => $"udf: {strings[i]}")
                    .ToArray());
            return ToArrowStringDataFrameColumn(stringArray);
        });

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();

    int numRows = 10;

    // Write test data to the input stream.
    Schema schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray())
            },
            numRows));

    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor().Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
    Assert.Equal(numRows, outputBatch.Length);
    Assert.Single(outputBatch.Arrays);

    // Validate the single command's output.
    var array = (StringArray)outputBatch.Arrays.ElementAt(0);
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", array.GetString(i));
    }

    // Validate that the stream ends with the zero-length end-of-stream marker.
    int end = SerDe.ReadInt32(outputStream);
    Assert.Equal(0, end);

    // Validate that all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
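// The tests above that take (Version sparkVersion, IpcOptions ipcOptions)
// would be wired up as parameterized xUnit theories, e.g.:
//
//     [Theory]
//     [MemberData(nameof(CommandExecutorData.Data), MemberType = typeof(CommandExecutorData))]
//
// The data source below is a hypothetical sketch: the class name, member
// name, and the exact version/option pairs are assumptions, not the
// repository's actual test data. It illustrates the pairing of a Spark
// version with matching Arrow IPC options (older Spark versions consume the
// legacy IPC format, which omits the continuation token checked by CheckEOS).
public static class CommandExecutorData
{
    public static IEnumerable<object[]> Data =>
        new List<object[]>
        {
            new object[]
            {
                new Version(2, 4, 0),
                new IpcOptions() { WriteLegacyIpcFormat = true }
            },
            new object[]
            {
                new Version(3, 0, 0),
                new IpcOptions() { WriteLegacyIpcFormat = false }
            }
        };
}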