/// <summary>
/// Returns an <see cref="IEnumerable{RecordBatch}"/> without copying data
/// </summary>
/// <remarks>
/// This is an iterator method: batches are produced lazily as the result is enumerated.
/// At least one <see cref="RecordBatch"/> is always yielded, even when there are no rows,
/// so that callers still receive the schema.
/// </remarks>
/// <returns>
/// A sequence of record batches that share one schema built from the Arrow field of every column.
/// </returns>
public IEnumerable<RecordBatch> ToArrowRecordBatches()
{
    // Build the schema once; every yielded batch shares it.
    Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();

    int columnCount = Columns.Count;
    for (int i = 0; i < columnCount; i++)
    {
        DataFrameColumn column = Columns[i];
        Field field = column.GetArrowField();
        schemaBuilder.Field(field);
    }

    Schema schema = schemaBuilder.Build();

    int recordBatchLength = Int32.MaxValue;
    // NOTE(review): this cap is carried over from batch to batch and only ever shrinks,
    // exactly as before. Confirm against GetMaxRecordBatchLength whether it should be
    // reset to the number of remaining rows for each batch.
    int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, RowCount);
    long numberOfRowsProcessed = 0;

    // Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows
    do
    {
        // A batch can only be as long as the shortest length any column allows at this offset.
        for (int i = 0; i < columnCount; i++)
        {
            DataFrameColumn column = Columns[i];
            numberOfRowsInThisRecordBatch = (int)Math.Min(numberOfRowsInThisRecordBatch, column.GetMaxRecordBatchLength(numberOfRowsProcessed));
        }

        // Each batch gets its own list so that entry i corresponds to schema field i.
        // Sharing one list across iterations would hand the arrays of all earlier
        // batches to every later RecordBatch as well.
        List<Apache.Arrow.Array> arrays = new List<Apache.Arrow.Array>(columnCount);
        for (int i = 0; i < columnCount; i++)
        {
            DataFrameColumn column = Columns[i];
            arrays.Add(column.ToArrowArray(numberOfRowsProcessed, numberOfRowsInThisRecordBatch));
        }

        numberOfRowsProcessed += numberOfRowsInThisRecordBatch;
        yield return new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch);
    } while (numberOfRowsProcessed < RowCount);
}
/// <summary>
/// Builds an Arrow <see cref="Table"/> from a list of rows, producing one column per public
/// instance property of <typeparamref name="TDataLayout"/> that carries an
/// <see cref="ArrowPropertyAttribute"/>.
/// </summary>
/// <param name="data">The rows to convert. Each row is enumerated to obtain its column values.</param>
/// <returns>A table whose schema and columns follow the attributed properties.</returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
public static Table BuildTable(List<TDataLayout> data)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    PropertyInfo[] properties = typeof(TDataLayout)
        .GetProperties(BindingFlags.Public | BindingFlags.Instance)
        .Where(p => p.GetCustomAttribute<ArrowPropertyAttribute>() != null)
        .ToArray();

    // Describe every attributed property as an Arrow field, in declaration order of the reflection result.
    List<Field> fields = new List<Field>(properties.Length);
    Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();
    foreach (PropertyInfo property in properties)
    {
        ArrowPropertyAttribute arrowPropertyAttribute = property.GetCustomAttribute<ArrowPropertyAttribute>();
        Field field = new Field.Builder()
            .Name(arrowPropertyAttribute.Name)
            .DataType(arrowPropertyAttribute.ArrowType())
            .Build();
        schemaBuilder.Field(field);
        fields.Add(field);
    }

    // Transpose the row-oriented input into one value list per column.
    List<List<object>> transformData = new List<List<object>>(properties.Length);
    for (int i = 0; i < properties.Length; ++i)
    {
        // The constructor argument only reserves capacity; the list starts out empty.
        transformData.Add(new List<object>(data.Count));
    }

    for (int i = 0; i < data.Count; ++i)
    {
        // NOTE(review): assumes enumerating a row yields its values in the same order as
        // the attributed properties, and no more values than there are properties. Confirm
        // against the TDataLayout constraint.
        int j = 0;
        foreach (object property in data[i])
        {
            // Rows are visited in order, so appending places this value at index i.
            // Assigning through the indexer would throw because the list's Count is still 0.
            transformData[j++].Add(property);
        }
    }

    List<Column> columns = new List<Column>(properties.Length);
    for (int i = 0; i < properties.Length; ++i)
    {
        Array array = ArrowUtilities.MakeArrayBuffer(fields[i].DataType, transformData[i]);
        Column column = new Column(fields[i], new Array[] { array });
        columns.Add(column);
    }

    Table table = new Table(schemaBuilder.Build(), columns);
    return table;
}