예제 #1
0
        /// <summary>
        /// Returns an <see cref="IEnumerable{RecordBatch}"/> without copying data
        /// </summary>
        /// <remarks>
        /// All batches share a single <see cref="Schema"/> assembled from each column's Arrow field.
        /// The row count of a batch is the smallest value any column reports from
        /// <c>GetMaxRecordBatchLength</c> for the current row offset, capped at the number of rows
        /// that are still unprocessed. A DataFrame with no rows still yields exactly one batch that
        /// carries the schema and zero rows. Because this is an iterator, no work happens until the
        /// returned sequence is enumerated.
        /// </remarks>
        /// <returns>One <see cref="RecordBatch"/> per contiguous run of rows, in row order.</returns>
        /// <exception cref="InvalidOperationException">
        /// Thrown during enumeration when the columns report a non-positive batch length while rows
        /// are still pending, which would otherwise make the enumeration never terminate.
        /// </exception>
        public IEnumerable<RecordBatch> ToArrowRecordBatches()
        {
            Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();

            int columnCount = Columns.Count;

            for (int i = 0; i < columnCount; i++)
            {
                DataFrameColumn column = Columns[i];
                Field field = column.GetArrowField();
                schemaBuilder.Field(field);
            }

            Schema schema = schemaBuilder.Build();

            long numberOfRowsProcessed = 0;

            // Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows
            do
            {
                // Recompute the budget for every batch: start from the rows still pending (a batch
                // length must fit in an int) and let each column shrink it further.
                long remainingRows = RowCount - numberOfRowsProcessed;
                int numberOfRowsInThisRecordBatch = (int)Math.Min((long)Int32.MaxValue, remainingRows);

                for (int i = 0; i < columnCount; i++)
                {
                    DataFrameColumn column = Columns[i];
                    numberOfRowsInThisRecordBatch = (int)Math.Min(numberOfRowsInThisRecordBatch, column.GetMaxRecordBatchLength(numberOfRowsProcessed));
                }

                // A fresh list per batch: reusing one list across iterations would hand every
                // later batch the arrays of all earlier batches in addition to its own.
                List<Apache.Arrow.Array> arrays = new List<Apache.Arrow.Array>(columnCount);

                for (int i = 0; i < columnCount; i++)
                {
                    DataFrameColumn column = Columns[i];
                    arrays.Add(column.ToArrowArray(numberOfRowsProcessed, numberOfRowsInThisRecordBatch));
                }

                numberOfRowsProcessed += numberOfRowsInThisRecordBatch;
                yield return new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch);

                // Without forward progress the loop condition can never become false; fail loudly
                // instead of producing an endless stream of empty batches.
                if (numberOfRowsInThisRecordBatch <= 0 && numberOfRowsProcessed < RowCount)
                {
                    throw new InvalidOperationException("Unable to make progress while creating Arrow record batches: the columns reported a non-positive maximum record batch length while rows remain.");
                }
            } while (numberOfRowsProcessed < RowCount);
        }