// Builds a DataFrame from three columns that contain no rows, converts it to Arrow
// record batches, and round-trips every batch through an Arrow stream. Each batch read
// back is compared with the batch that was written. The test also requires that the
// DataFrame produced at least one record batch.
public void TestEmptyDataFrameRecordBatch()
{
    PrimitiveDataFrameColumn<int> ageColumn = new PrimitiveDataFrameColumn<int>("Age");
    PrimitiveDataFrameColumn<int> lengthColumn = new PrimitiveDataFrameColumn<int>("CharCount");
    ArrowStringDataFrameColumn stringColumn = new ArrowStringDataFrameColumn("Empty");
    DataFrame df = new DataFrame(new List<DataFrameColumn>() { ageColumn, lengthColumn, stringColumn });

    IEnumerable<RecordBatch> recordBatches = df.ToArrowRecordBatches();
    bool foundARecordBatch = false;
    foreach (RecordBatch recordBatch in recordBatches)
    {
        foundARecordBatch = true;

        // The stream, writer and reader are scoped to a single batch, so dispose them at
        // the end of every iteration instead of leaving them for the finalizer.
        using (MemoryStream stream = new MemoryStream())
        using (ArrowStreamWriter writer = new ArrowStreamWriter(stream, recordBatch.Schema))
        {
            writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();

            // Rewind so the reader starts at the beginning of what was just written.
            stream.Position = 0;
            using (ArrowStreamReader reader = new ArrowStreamReader(stream))
            {
                RecordBatch readRecordBatch = reader.ReadNextRecordBatch();
                while (readRecordBatch != null)
                {
                    RecordBatchComparer.CompareBatches(recordBatch, readRecordBatch);
                    readRecordBatch = reader.ReadNextRecordBatch();
                }
            }
        }
    }

    Assert.True(foundARecordBatch);
}
// Serializes a DataFrame UDF with CommandSerDe, deserializes it again, and checks that
// the serialization modes, the run mode, and the behavior of the restored function
// (prefixing each input string with "hello ") are as expected.
public void TestCommandSerDeForSqlArrowDataFrame()
{
    var helloUdf = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(cur => $"hello {cur}"));
    var originalFunction = new DataFrameWorkerFunction(helloUdf.Execute);

    byte[] payload = Utils.CommandSerDe.Serialize(
        originalFunction.Func,
        Utils.CommandSerDe.SerializedMode.Row,
        Utils.CommandSerDe.SerializedMode.Row);

    using var payloadStream = new MemoryStream(payload);
    var restoredDelegate =
        Utils.CommandSerDe.Deserialize<DataFrameWorkerFunction.ExecuteDelegate>(
            payloadStream,
            out Utils.CommandSerDe.SerializedMode serializerMode,
            out Utils.CommandSerDe.SerializedMode deserializerMode,
            out var runMode);
    var restoredFunction = new DataFrameWorkerFunction(restoredDelegate);

    // The metadata written by Serialize must come back unchanged.
    Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
    Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
    Assert.Equal("N", runMode);

    // Run the restored function on a single-row string column.
    var sparkArray = (StringArray)ToArrowArray(new[] { "spark" });
    ArrowStringDataFrameColumn sparkColumn = ToArrowStringDataFrameColumn(sparkArray);
    DataFrameColumn output = restoredFunction.Func(new[] { sparkColumn }, new[] { 0 });

    AssertEquals("hello spark", output);
}
// Chains two DataFrame worker functions in the wrong order (the single-column function
// first, the two-column function second) and expects invoking the chain to throw.
public void TestInvalidChainingDataFrame()
{
    // Two-column UDF: formats each string as "<string>:<number>" using the row's number.
    var twoColumnUdf = new DataFrameUdfWrapper<Int32DataFrameColumn, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (numbers, strings) =>
        {
            long row = 0;
            return strings.Apply(text => $"{text}:{numbers[row++]}");
        });
    var func1 = new DataFrameWorkerFunction(twoColumnUdf.Execute);

    // Single-column UDF: prefixes each string with "outer1:".
    var oneColumnUdf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(text => $"outer1:{text}"));
    var func2 = new DataFrameWorkerFunction(oneColumnUdf.Execute);

    string[] names = { "name" };
    var nameArray = (StringArray)ToArrowArray(names);
    ArrowStringDataFrameColumn nameColumn = ToArrowStringDataFrameColumn(nameArray);
    var numberColumn = new Int32DataFrameColumn("Int", new List<int>() { 100 });
    var input = new DataFrameColumn[] { numberColumn, nameColumn };

    // The order does not align since func2 is executed first.
    DataFrameWorkerFunction misorderedChain = DataFrameWorkerFunction.Chain(func2, func1);
    Assert.ThrowsAny<Exception>(() => misorderedChain.Func(input, new[] { 0, 1 }));
}
// Runs a worker function whose UDF ORs each boolean flag with "does the string in the
// same row contain 'true'" and verifies the resulting boolean column row by row.
public void TestDataFrameWorkerFunctionForBool()
{
    var orWithTrueUdf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, PrimitiveDataFrameColumn<bool>, PrimitiveDataFrameColumn<bool>>(
        (strings, flags) =>
        {
            long row = 0;
            while (row < strings.Length)
            {
                // Short-circuits: the string is only inspected when the flag is false.
                flags[row] = flags[row].Value || strings[row].Contains("true");
                ++row;
            }

            return flags;
        });
    var func = new DataFrameWorkerFunction(orWithTrueUdf.Execute);

    string[] texts = { "arg1_true", "arg1_true", "arg1_false", "arg1_false" };
    var textArray = (StringArray)ToArrowArray(texts);
    ArrowStringDataFrameColumn textColumn = ToArrowStringDataFrameColumn(textArray);

    // Flags alternate true, false, true, false.
    var flagColumn = new PrimitiveDataFrameColumn<bool>(
        "Bool",
        Enumerable.Range(0, 4).Select(x => x % 2 == 0));

    var input = new DataFrameColumn[] { textColumn, flagColumn };
    var results = (PrimitiveDataFrameColumn<bool>)func.Func(input, new[] { 0, 1 });

    Assert.Equal(4, results.Length);

    bool[] expected = { true, true, true, false };
    for (int i = 0; i < expected.Length; ++i)
    {
        Assert.Equal<bool?>(expected[i], results[i]);
    }
}
// Constructs an ArrowStringDataFrameColumn directly from raw data, offset and null-bitmap
// buffers holding "foo" and "bar", then checks the column length and both values.
public void TestBasicArrowStringColumn()
{
    // Reference Arrow array; only its Length and NullCount are used below.
    StringArray reference = new StringArray.Builder()
        .Append("foo")
        .Append("bar")
        .Build();

    // UTF-8 bytes of "foobar".
    var dataMemory = new Memory<byte>(new byte[] { 102, 111, 111, 98, 97, 114 });
    var nullMemory = new Memory<byte>(new byte[] { 0, 0, 0, 0 });
    // Three little-endian int32 offsets: 0, 3, 6.
    var offsetMemory = new Memory<byte>(new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 });

    var stringColumn = new ArrowStringDataFrameColumn(
        "String",
        dataMemory,
        offsetMemory,
        nullMemory,
        reference.Length,
        reference.NullCount);

    Assert.Equal(2, stringColumn.Length);
    Assert.Equal("foo", stringColumn[0]);
    Assert.Equal("bar", stringColumn[1]);
}
// Wraps an identity UDF in a DataFrameWorkerFunction and checks that invoking it on a
// single-row string column yields the original value.
public void TestDataFrameWorkerFunction()
{
    var identityUdf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (str) => str);
    var func = new DataFrameWorkerFunction(identityUdf.Execute);

    string[] input = { "arg1" };
    var inputArray = (StringArray)ToArrowArray(input);
    ArrowStringDataFrameColumn inputColumn = ToArrowStringDataFrameColumn(inputArray);

    var output = func.Func(new[] { inputColumn }, new[] { 0 });

    ArrowTestUtils.AssertEquals(input[0], output);
}
// Verifies DataFrameWorkerFunction.Chain for one and two levels of chaining: the output
// of the inner function is fed to the outer function(s), each adding its own prefix.
public void TestChainingDataFrameWorkerFunction()
{
    // Innermost UDF: formats each string as "<string>:<number>" using the row's number.
    var joinUdf = new DataFrameUdfWrapper<Int32DataFrameColumn, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (numbers, strings) =>
        {
            long row = 0;
            return strings.Apply(text => $"{text}:{numbers[row++]}");
        });
    var func1 = new DataFrameWorkerFunction(joinUdf.Execute);

    var outer1Udf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(text => $"outer1:{text}"));
    var func2 = new DataFrameWorkerFunction(outer1Udf.Execute);

    var outer2Udf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) => strings.Apply(text => $"outer2:{text}"));
    var func3 = new DataFrameWorkerFunction(outer2Udf.Execute);

    string[] names = { "name" };
    var nameArray = (StringArray)ToArrowArray(names);
    ArrowStringDataFrameColumn nameColumn = ToArrowStringDataFrameColumn(nameArray);
    var numberColumn = new Int32DataFrameColumn("Int", new List<int>() { 100 });
    var input = new DataFrameColumn[] { numberColumn, nameColumn };

    // Validate one-level chaining.
    DataFrameWorkerFunction oneLevel = DataFrameWorkerFunction.Chain(func1, func2);
    var oneLevelResult = oneLevel.Func(input, new[] { 0, 1 });
    ArrowTestUtils.AssertEquals("outer1:name:100", oneLevelResult);

    // Validate two-level chaining.
    DataFrameWorkerFunction twoLevel = DataFrameWorkerFunction.Chain(oneLevel, func3);
    var twoLevelResult = twoLevel.Func(input, new[] { 0, 1 });
    ArrowTestUtils.AssertEquals("outer2:outer1:name:100", twoLevelResult);
}
// Clones a two-value ArrowStringDataFrameColumn while asking for five nulls to be
// appended, then checks the clone's length, the two copied values and the five nulls.
public void TestArrowStringColumnClone()
{
    // Reference Arrow array; only its Length and NullCount are used below.
    StringArray reference = new StringArray.Builder()
        .Append("foo")
        .Append("bar")
        .Build();

    // UTF-8 bytes of "foobar".
    var dataMemory = new Memory<byte>(new byte[] { 102, 111, 111, 98, 97, 114 });
    var nullMemory = new Memory<byte>(new byte[] { 0, 0, 0, 0 });
    // Three little-endian int32 offsets: 0, 3, 6.
    var offsetMemory = new Memory<byte>(new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 });

    var stringColumn = new ArrowStringDataFrameColumn(
        "String",
        dataMemory,
        offsetMemory,
        nullMemory,
        reference.Length,
        reference.NullCount);

    DataFrameColumn clone = stringColumn.Clone(numberOfNullsToAppend: 5);

    Assert.Equal(7, clone.Length);
    Assert.Equal(stringColumn[0], clone[0]);
    Assert.Equal(stringColumn[1], clone[1]);

    // Rows 2..6 are the appended nulls.
    int index = 2;
    while (index < 7)
    {
        Assert.Null(clone[index]);
        index++;
    }
}
// Builds an ArrowStringDataFrameColumn over raw buffers holding "foo" and "bar", then
// walks the read-only data, null-bitmap and offsets buffers it exposes in lockstep and
// checks that each string's bytes in the exposed data buffer match the bytes at the same
// position in the memory the column was constructed from.
public void TestArrowStringColumnGetReadOnlyBuffers()
{
    // Test ArrowStringDataFrameColumn.
    StringArray strArray = new StringArray.Builder().Append("foo").Append("bar").Build();
    // UTF-8 bytes of "foobar".
    Memory<byte> dataMemory = new byte[] { 102, 111, 111, 98, 97, 114 };
    Memory<byte> nullMemory = new byte[] { 1 };
    // Three little-endian int32 offsets: 0, 3, 6.
    Memory<byte> offsetMemory = new byte[] { 0, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0 };

    ArrowStringDataFrameColumn column = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, strArray.Length, strArray.NullCount);

    IEnumerable<ReadOnlyMemory<byte>> dataBuffers = column.GetReadOnlyDataBuffers();
    IEnumerable<ReadOnlyMemory<byte>> nullBitMaps = column.GetReadOnlyNullBitMapBuffers();
    IEnumerable<ReadOnlyMemory<int>> offsetsBuffers = column.GetReadOnlyOffsetsBuffers();

    using (IEnumerator<ReadOnlyMemory<byte>> bufferEnumerator = dataBuffers.GetEnumerator())
    using (IEnumerator<ReadOnlyMemory<int>> offsetsEnumerator = offsetsBuffers.GetEnumerator())
    using (IEnumerator<ReadOnlyMemory<byte>> nullBitMapsEnumerator = nullBitMaps.GetEnumerator())
    {
        // The null-bitmap enumerator is advanced together with the other two so all three
        // sequences are consumed in step; its contents are not inspected here.
        while (bufferEnumerator.MoveNext() && nullBitMapsEnumerator.MoveNext() && offsetsEnumerator.MoveNext())
        {
            ReadOnlyMemory<byte> dataBuffer = bufferEnumerator.Current;
            ReadOnlyMemory<int> offsets = offsetsEnumerator.Current;

            ReadOnlySpan<byte> dataSpan = dataBuffer.Span;
            ReadOnlySpan<int> offsetsSpan = offsets.Span;

            int dataStart = 0;
            for (int j = 1; j < offsetsSpan.Length; j++)
            {
                int length = offsetsSpan[j] - offsetsSpan[j - 1];
                ReadOnlySpan<byte> str = dataSpan.Slice(dataStart, length);
                ReadOnlySpan<byte> columnStr = dataMemory.Span.Slice(dataStart, length);

                Assert.Equal(str.Length, columnStr.Length);
                for (int s = 0; s < str.Length; s++)
                {
                    Assert.Equal(str[s], columnStr[s]);
                }

                // Advance past the string just checked. Assigning `length` here (as the
                // code used to) only lands on the right start when every string has the
                // same length.
                dataStart += length;
            }
        }
    }
}
// Chains two DataFrame worker functions in the wrong order (the single-column function
// first, the two-column function second) and expects invoking the chain to throw.
public void TestInvalidChainingDataFrame()
{
    // Two-column UDF: produces "<string>:<number>" for every row.
    var twoColumnUdf = new DataFrameUdfWrapper<PrimitiveDataFrameColumn<int>, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (numbers, strings) =>
        {
            string[] joined = Enumerable.Range(0, (int)strings.Length)
                .Select(row => $"{strings[row]}:{numbers[row]}")
                .ToArray();
            return ToArrowStringDataFrameColumn((StringArray)ToArrowArray(joined));
        });
    var func1 = new DataFrameWorkerFunction(twoColumnUdf.Execute);

    // Single-column UDF: prefixes every row with "outer1:".
    var oneColumnUdf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) =>
        {
            string[] prefixed = Enumerable.Range(0, (int)strings.Length)
                .Select(row => $"outer1:{strings[row]}")
                .ToArray();
            return ToArrowStringDataFrameColumn((StringArray)ToArrowArray(prefixed));
        });
    var func2 = new DataFrameWorkerFunction(oneColumnUdf.Execute);

    string[] names = { "name" };
    var nameArray = (StringArray)ToArrowArray(names);
    ArrowStringDataFrameColumn nameColumn = ToArrowStringDataFrameColumn(nameArray);
    var numberColumn = new PrimitiveDataFrameColumn<int>("Int", new List<int>() { 100 });
    var input = new DataFrameColumn[] { numberColumn, nameColumn };

    // The order does not align since func2 is executed first.
    DataFrameWorkerFunction misorderedChain = DataFrameWorkerFunction.Chain(func2, func1);
    Assert.ThrowsAny<Exception>(() => misorderedChain.Func(input, new[] { 0, 1 }));
}
// Sums the lengths of all values in the "name" column of the given data frame.
// Returns a data frame with two columns, "age" and "nameCharCount". When the input has
// at least one row, the result has a single row holding the first "age" value and the
// total character count; when the input has no rows, both result columns are empty.
private static FxDataFrame CountCharacters(FxDataFrame dataFrame)
{
    int characterCount = 0;

    var characterCountColumn = new PrimitiveDataFrameColumn<int>("nameCharCount");
    var ageColumn = new PrimitiveDataFrameColumn<int>("age");

    // Direct cast instead of `as`: if "name" is not an ArrowStringDataFrameColumn this
    // fails right here with an InvalidCastException naming the types, rather than with a
    // NullReferenceException on first use inside the loop.
    var fieldColumn = (ArrowStringDataFrameColumn)dataFrame["name"];
    for (long i = 0; i < dataFrame.Rows.Count; ++i)
    {
        characterCount += fieldColumn[i].Length;
    }

    if (dataFrame.Rows.Count > 0)
    {
        characterCountColumn.Append(characterCount);
        ageColumn.Append((int?)dataFrame["age"][0]);
    }

    return new FxDataFrame(ageColumn, characterCountColumn);
}
// Sums the lengths of all values in the "name" column of the given data frame.
// Returns a data frame with two columns, "age" and "nameCharCount". When the input has
// at least one row, the result has a single row holding the first "age" value and the
// total character count; when the input has no rows, both result columns are empty.
private static FxDataFrame CountCharacters(FxDataFrame dataFrame)
{
    var ageColumn = new Int32DataFrameColumn("age");
    var characterCountColumn = new Int32DataFrameColumn("nameCharCount");

    ArrowStringDataFrameColumn nameColumn = dataFrame.Columns.GetArrowStringColumn("name");

    int totalCharacters = 0;
    long row = 0;
    while (row < dataFrame.Rows.Count)
    {
        totalCharacters += nameColumn[row].Length;
        ++row;
    }

    // Only emit a result row when there was any input.
    if (dataFrame.Rows.Count > 0)
    {
        characterCountColumn.Append(totalCharacters);
        ageColumn.Append(dataFrame.Columns.GetInt32Column("age")[0]);
    }

    return new FxDataFrame(ageColumn, characterCountColumn);
}
// Serializes a DataFrame UDF with CommandSerDe, deserializes it again, and checks that
// the serialization modes, the run mode, and the behavior of the restored function
// (prefixing each input string with "hello ") are as expected.
public void TestCommandSerDeForSqlArrowDataFrame()
{
    var helloUdf = new Sql.DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) =>
        {
            string[] greetings = Enumerable.Range(0, (int)strings.Length)
                .Select(row => $"hello {strings[row]}")
                .ToArray();
            var greetingArray = (StringArray)ToArrowArray(greetings);
            return ToArrowStringDataFrameColumn(greetingArray);
        });
    var originalFunction = new DataFrameWorkerFunction(helloUdf.Execute);

    byte[] payload = Utils.CommandSerDe.Serialize(
        originalFunction.Func,
        Utils.CommandSerDe.SerializedMode.Row,
        Utils.CommandSerDe.SerializedMode.Row);

    using (var payloadStream = new MemoryStream(payload))
    {
        var restoredDelegate =
            Utils.CommandSerDe.Deserialize<DataFrameWorkerFunction.ExecuteDelegate>(
                payloadStream,
                out Utils.CommandSerDe.SerializedMode serializerMode,
                out Utils.CommandSerDe.SerializedMode deserializerMode,
                out var runMode);
        var restoredFunction = new DataFrameWorkerFunction(restoredDelegate);

        // The metadata written by Serialize must come back unchanged.
        Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
        Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
        Assert.Equal("N", runMode);

        // Run the restored function on a single-row string column.
        var sparkArray = (StringArray)ToArrowArray(new[] { "spark" });
        ArrowStringDataFrameColumn sparkColumn = ToArrowStringDataFrameColumn(sparkArray);
        DataFrameColumn output = restoredFunction.Func(new[] { sparkColumn }, new[] { 0 });

        ArrowTestUtils.AssertEquals("hello spark", output);
    }
}
// Builds a four-row ArrowStringDataFrameColumn from raw buffers where row 1 is marked
// null in the bitmap and row 3 is an empty string, then checks the values through both
// the single-row indexer and the (start, length) range indexer.
public void TestArrowStringColumnWithNulls()
{
    byte[] utf8 = Encoding.UTF8.GetBytes("joemark");
    var dataMemory = new Memory<byte>(utf8);
    // Validity bits, lowest bit = row 0: rows 0, 2 and 3 are set, row 1 is clear.
    var nullMemory = new Memory<byte>(new byte[] { 0b1101 });
    // Five little-endian int32 offsets: 0, 3, 3, 7, 7.
    var offsetMemory = new Memory<byte>(new byte[]
    {
        0, 0, 0, 0,
        3, 0, 0, 0,
        3, 0, 0, 0,
        7, 0, 0, 0,
        7, 0, 0, 0
    });

    var stringColumn = new ArrowStringDataFrameColumn("String", dataMemory, offsetMemory, nullMemory, 4, 1);

    Assert.Equal(4, stringColumn.Length);

    string[] expected = { "joe", null, "mark", "" };

    // Single-row indexer.
    for (int row = 0; row < expected.Length; row++)
    {
        Assert.Equal(expected[row], stringColumn[row]);
    }

    // Range indexer over all four rows.
    List<string> ret = stringColumn[0, 4];
    for (int row = 0; row < expected.Length; row++)
    {
        Assert.Equal(expected[row], ret[row]);
    }
}
// Returns the result of applying a "udf: " prefix to every value of the given column.
static ArrowStringDataFrameColumn ConvertStrings(ArrowStringDataFrameColumn strings) =>
    strings.Apply(value => $"udf: {value}");
// Runs a grouped-map DataFrame UDF through CommandExecutor end to end:
//   1. writes one Arrow record batch (a string column and a long column) to an input stream,
//   2. executes a command whose UDF prefixes the strings with "udf: " and adds 100 to the
//      second column,
//   3. reads the Arrow stream written to the output stream and validates its framing,
//      row/column counts and values.
public async Task TestDataFrameGroupedMapCommandExecutor()
{
    // Prefixes every value of the given column with "udf: " and returns the values as an
    // Arrow StringArray.
    StringArray ConvertStrings(DataFrameColumn strings)
    {
        return (StringArray)ToArrowArray(
            Enumerable.Range(0, (int)strings.Length)
                .Select(i => $"udf: {strings[i]}")
                .ToArray());
    }

    // Note: a local `resultSchema` used to be built here but was never read, so it has
    // been removed.
    var udfWrapper = new Sql.DataFrameGroupedMapUdfWrapper(
        (dataFrame) =>
        {
            StringArray strings = ConvertStrings(dataFrame.Columns[0]);
            var stringColumn = new ArrowStringDataFrameColumn(
                dataFrame.Columns[0].Name,
                strings.ValueBuffer.Memory,
                strings.ValueOffsetsBuffer.Memory,
                strings.NullBitmapBuffer.Memory,
                strings.Length,
                strings.NullCount);

            // The output of this addition is read back below as a DoubleArray.
            DataFrameColumn doubles = dataFrame.Columns[1] + 100;

            return new DataFrame(new List<DataFrameColumn>() { stringColumn, doubles });
        });

    var command = new SqlCommand()
    {
        ArgOffsets = new[] { 0 },
        NumChainedFunctions = 1,
        WorkerFunction = new Sql.DataFrameGroupedMapWorkerFunction(udfWrapper.Execute),
        SerializerMode = CommandSerDe.SerializedMode.Row,
        DeserializerMode = CommandSerDe.SerializedMode.Row
    };

    var commandPayload = new Worker.CommandPayload()
    {
        EvalType = UdfUtils.PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
        Commands = new[] { command }
    };

    using var inputStream = new MemoryStream();
    using var outputStream = new MemoryStream();
    int numRows = 10;

    // Write test data to the input stream.
    var schema = new Schema.Builder()
        .Field(b => b.Name("arg1").DataType(StringType.Default))
        .Field(b => b.Name("arg2").DataType(Int64Type.Default))
        .Build();
    var arrowWriter = new ArrowStreamWriter(inputStream, schema);
    await arrowWriter.WriteRecordBatchAsync(
        new RecordBatch(
            schema,
            new[]
            {
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => i.ToString())
                        .ToArray()),
                ToArrowArray(
                    Enumerable.Range(0, numRows)
                        .Select(i => (long)i)
                        .ToArray())
            },
            numRows));

    // Rewind so the executor reads the batch from the start.
    inputStream.Seek(0, SeekOrigin.Begin);

    CommandExecutorStat stat = new CommandExecutor().Execute(
        inputStream,
        outputStream,
        0,
        commandPayload);

    // Validate that all the data on the stream is read.
    Assert.Equal(inputStream.Length, inputStream.Position);
    Assert.Equal(numRows, stat.NumEntriesProcessed);

    // Validate the output stream.
    outputStream.Seek(0, SeekOrigin.Begin);
    int arrowLength = SerDe.ReadInt32(outputStream);
    Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);

    var arrowReader = new ArrowStreamReader(outputStream);
    RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();

    Assert.Equal(numRows, outputBatch.Length);
    Assert.Equal(2, outputBatch.ColumnCount);

    var stringArray = (StringArray)outputBatch.Column(0);
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal($"udf: {i}", stringArray.GetString(i));
    }

    var doubleArray = (DoubleArray)outputBatch.Column(1);
    for (int i = 0; i < numRows; ++i)
    {
        Assert.Equal(100 + i, doubleArray.Values[i]);
    }

    // A trailing int32 of 0 is expected after the record batch.
    int end = SerDe.ReadInt32(outputStream);
    Assert.Equal(0, end);

    // Validate all the data on the stream is read.
    Assert.Equal(outputStream.Length, outputStream.Position);
}
// Verifies DataFrameWorkerFunction.Chain for one and two levels of chaining: the output
// of the inner function is fed to the outer function(s), each adding its own prefix.
public void TestChainingDataFrameWorkerFunction()
{
    // Innermost UDF: produces "<string>:<number>" for every row.
    var joinUdf = new DataFrameUdfWrapper<PrimitiveDataFrameColumn<int>, ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (numbers, strings) =>
        {
            string[] joined = Enumerable.Range(0, (int)strings.Length)
                .Select(row => $"{strings[row]}:{numbers[row]}")
                .ToArray();
            return ToArrowStringDataFrameColumn((StringArray)ToArrowArray(joined));
        });
    var func1 = new DataFrameWorkerFunction(joinUdf.Execute);

    // First outer UDF: prefixes every row with "outer1:".
    var outer1Udf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) =>
        {
            string[] prefixed = Enumerable.Range(0, (int)strings.Length)
                .Select(row => $"outer1:{strings[row]}")
                .ToArray();
            return ToArrowStringDataFrameColumn((StringArray)ToArrowArray(prefixed));
        });
    var func2 = new DataFrameWorkerFunction(outer1Udf.Execute);

    // Second outer UDF: prefixes every row with "outer2:".
    var outer2Udf = new DataFrameUdfWrapper<ArrowStringDataFrameColumn, ArrowStringDataFrameColumn>(
        (strings) =>
        {
            string[] prefixed = Enumerable.Range(0, (int)strings.Length)
                .Select(row => $"outer2:{strings[row]}")
                .ToArray();
            return ToArrowStringDataFrameColumn((StringArray)ToArrowArray(prefixed));
        });
    var func3 = new DataFrameWorkerFunction(outer2Udf.Execute);

    string[] names = { "name" };
    var nameArray = (StringArray)ToArrowArray(names);
    ArrowStringDataFrameColumn nameColumn = ToArrowStringDataFrameColumn(nameArray);
    var numberColumn = new PrimitiveDataFrameColumn<int>("Int", new List<int>() { 100 });
    var input = new DataFrameColumn[] { numberColumn, nameColumn };

    // Validate one-level chaining.
    DataFrameWorkerFunction oneLevel = DataFrameWorkerFunction.Chain(func1, func2);
    var oneLevelResult = oneLevel.Func(input, new[] { 0, 1 });
    ArrowTestUtils.AssertEquals("outer1:name:100", oneLevelResult);

    // Validate two-level chaining.
    DataFrameWorkerFunction twoLevel = DataFrameWorkerFunction.Chain(oneLevel, func3);
    var twoLevelResult = twoLevel.Func(input, new[] { 0, 1 });
    ArrowTestUtils.AssertEquals("outer2:outer1:name:100", twoLevelResult);
}