/// <summary>
/// Returns an <see cref="IEnumerable{RecordBatch}"/> without copying data
/// </summary>
/// <remarks>
/// The sequence is produced lazily by an iterator: each <see cref="RecordBatch"/> is built only when
/// the caller advances the enumeration. At least one batch is always yielded, so a frame with no rows
/// still produces a single batch carrying the schema and a length of zero.
/// </remarks>
/// <returns>
/// One <see cref="RecordBatch"/> per contiguous run of rows that every column can expose as a single
/// Arrow array, in row order.
/// </returns>
public IEnumerable<RecordBatch> ToArrowRecordBatches()
{
    // Build the Arrow schema once; every batch yielded below shares it.
    Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();

    int columnCount = Columns.Count;
    for (int i = 0; i < columnCount; i++)
    {
        DataFrameColumn column = Columns[i];
        Field field = column.GetArrowField();
        schemaBuilder.Field(field);
    }

    Schema schema = schemaBuilder.Build();

    int recordBatchLength = Int32.MaxValue;
    int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, RowCount);
    long numberOfRowsProcessed = 0;

    // Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows
    do
    {
        // A batch can only be as long as the shortest run any column can hand out starting at the
        // current row. The cap is carried over from earlier iterations, so batch sizes never grow.
        for (int i = 0; i < columnCount; i++)
        {
            DataFrameColumn column = Columns[i];
            numberOfRowsInThisRecordBatch = (int)Math.Min(numberOfRowsInThisRecordBatch, column.GetMaxRecordBatchLength(numberOfRowsProcessed));
        }

        // Each batch needs its own list. Reusing one list across iterations would leave the arrays of
        // earlier batches at the front, so later batches would hold more arrays than the schema has
        // fields and would resolve column i to stale data from the first batch.
        List<Apache.Arrow.Array> arrays = new List<Apache.Arrow.Array>(columnCount);
        for (int i = 0; i < columnCount; i++)
        {
            DataFrameColumn column = Columns[i];
            arrays.Add(column.ToArrowArray(numberOfRowsProcessed, numberOfRowsInThisRecordBatch));
        }

        numberOfRowsProcessed += numberOfRowsInThisRecordBatch;
        yield return new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch);
    } while (numberOfRowsProcessed < RowCount);
}