Beispiel #1
0
        /// <summary>
        /// Returns an <see cref="IEnumerable{RecordBatch}"/> without copying data
        /// </summary>
        /// <remarks>
        /// The sequence is produced lazily. At least one <see cref="RecordBatch"/> is always yielded,
        /// so a frame with no rows still produces a single batch that carries the schema.
        /// </remarks>
        /// <returns>
        /// One <see cref="RecordBatch"/> per contiguous run of rows; every batch shares the same schema,
        /// built from the Arrow field of each column in column order.
        /// </returns>
        public IEnumerable<RecordBatch> ToArrowRecordBatches()
        {
            Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();

            int columnCount = Columns.Count;

            for (int i = 0; i < columnCount; i++)
            {
                DataFrameColumn column = Columns[i];
                Field field = column.GetArrowField();
                schemaBuilder.Field(field);
            }

            Schema schema = schemaBuilder.Build();

            int recordBatchLength = Int32.MaxValue;
            int numberOfRowsInThisRecordBatch = (int)Math.Min(recordBatchLength, RowCount);
            long numberOfRowsProcessed = 0;

            // Sometimes .NET for Spark passes in DataFrames with no rows. In those cases, we just return a RecordBatch with the right Schema and no rows
            do
            {
                // Shrink the batch to the largest length every column can serve from the current offset.
                for (int i = 0; i < columnCount; i++)
                {
                    DataFrameColumn column = Columns[i];
                    numberOfRowsInThisRecordBatch = (int)Math.Min(numberOfRowsInThisRecordBatch, column.GetMaxRecordBatchLength(numberOfRowsProcessed));
                }

                // A fresh list per batch: reusing one list across iterations would leave the previous
                // batches' arrays at the front, so later batches would expose stale columns and hold
                // more arrays than the schema has fields.
                List<Apache.Arrow.Array> arrays = new List<Apache.Arrow.Array>(columnCount);

                for (int i = 0; i < columnCount; i++)
                {
                    DataFrameColumn column = Columns[i];
                    arrays.Add(column.ToArrowArray(numberOfRowsProcessed, numberOfRowsInThisRecordBatch));
                }

                numberOfRowsProcessed += numberOfRowsInThisRecordBatch;
                yield return new RecordBatch(schema, arrays, numberOfRowsInThisRecordBatch);
            } while (numberOfRowsProcessed < RowCount);
        }
Beispiel #2
0
        /// <summary>
        /// Builds an Arrow <see cref="Table"/> from a list of <c>TDataLayout</c> rows.
        /// </summary>
        /// <remarks>
        /// One column is created for every public instance property of <c>TDataLayout</c> that is
        /// decorated with <see cref="ArrowPropertyAttribute"/>; the attribute supplies the field name
        /// and Arrow data type. Each row is enumerated and its values are distributed, in enumeration
        /// order, over the columns.
        /// NOTE(review): this assumes enumerating a row yields exactly one value per decorated property,
        /// in the same order as the reflected properties - confirm against the <c>TDataLayout</c> enumerator.
        /// </remarks>
        /// <param name="data">The rows to convert; each element is enumerated to obtain its column values.</param>
        /// <returns>A table whose schema and columns follow the decorated properties, with a single array per column.</returns>
        public static Table BuildTable(List<TDataLayout> data)
        {
            PropertyInfo[] properties = typeof(TDataLayout)
                .GetProperties(BindingFlags.Public | BindingFlags.Instance)
                .Where(p => p.GetCustomAttribute<ArrowPropertyAttribute>() != null)
                .ToArray();

            List<Field> fields = new List<Field>(properties.Length);

            Apache.Arrow.Schema.Builder schemaBuilder = new Apache.Arrow.Schema.Builder();

            foreach (PropertyInfo property in properties)
            {
                ArrowPropertyAttribute arrowPropertyAttribute = property.GetCustomAttribute<ArrowPropertyAttribute>();

                Field field = new Field.Builder()
                    .Name(arrowPropertyAttribute.Name)
                    .DataType(arrowPropertyAttribute.ArrowType())
                    .Build();

                schemaBuilder.Field(field);
                fields.Add(field);
            }

            // Transpose the row-oriented input into one value list per column.
            List<List<object>> transformData = new List<List<object>>(properties.Length);

            for (int i = 0; i < properties.Length; ++i)
            {
                // The constructor argument only reserves capacity; the list itself starts empty.
                transformData.Add(new List<object>(data.Count));
            }

            for (int i = 0; i < data.Count; ++i)
            {
                int j = 0;

                foreach (object value in data[i])
                {
                    // Append instead of assigning by index: the column lists are empty, so an indexed
                    // write would throw ArgumentOutOfRangeException. Rows are visited in order, so the
                    // value of row i still ends up at index i.
                    transformData[j++].Add(value);
                }
            }

            List<Column> columns = new List<Column>(properties.Length);

            for (int i = 0; i < properties.Length; ++i)
            {
                Array array = ArrowUtilities.MakeArrayBuffer(fields[i].DataType, transformData[i]);

                // Each column is backed by exactly one array.
                Column column = new Column(fields[i], new Array[] { array });

                columns.Add(column);
            }

            Table table = new Table(schemaBuilder.Build(), columns);

            return table;
        }