/// <summary>
/// Creates a cursor over the parquet stream held by <paramref name="parent"/>.
/// The constructor builds the active-column maps, configures the reader options,
/// decides the order in which blocks are visited, and creates one getter delegate
/// per active column.
/// </summary>
/// <param name="parent">Loader supplying the host, parquet stream, chunk read size and loaded columns.</param>
/// <param name="predicate">Column-index predicate handed to <c>Utils.BuildSubsetMaps</c> to produce the active-column maps.</param>
/// <param name="rand">Random source for the block order; when null the blocks are visited in identity order.</param>
public Cursor(ParquetLoader parent, Func<int, bool> predicate, IRandom rand)
                : base(parent._host)
            {
                Ch.AssertValue(predicate);

                _loader = parent;
                _fileStream = parent._parquetStream;
                _parquetConversions = new ParquetConversions(Ch);
                _rand = rand;

                // Work out which columns are active and how column indices map onto them.
                Utils.BuildSubsetMaps(Schema.ColumnCount, predicate, out _actives, out _colToActivesIndex);

                _readerOptions = new ReaderOptions
                {
                    Count = _loader._columnChunkReadSize,
                    Columns = _loader._columnsLoaded.Select(col => col.Name).ToArray()
                };

                // Number of blocks = ceil(rowCount / rowsPerBlock), computed in decimal arithmetic.
                decimal totalRows = (decimal)parent.GetRowCount();
                int blockCount = (int)Math.Ceiling(totalRows / _readerOptions.Count);

                // Without a random source the blocks are read in their natural order;
                // otherwise a random permutation of the block indices is used.
                int[] visitOrder;
                if (_rand != null)
                {
                    visitOrder = Utils.GetRandomPermutation(rand, blockCount);
                }
                else
                {
                    visitOrder = Utils.GetIdentityPermutation(blockCount);
                }

                _blockEnumerator = visitOrder.GetEnumerator();

                // Start with an enumerator over an empty array so nothing is yielded yet.
                int[] noRows = new int[0];
                _dataSetEnumerator = noRows.GetEnumerator();

                int activeCount = _actives.Length;
                _columnValues = new IList[activeCount];
                _getters = new Delegate[activeCount];

                // One getter delegate per active column, stored at the active slot.
                for (int slot = 0; slot < activeCount; slot++)
                {
                    _getters[slot] = CreateGetterDelegate(_actives[slot]);
                }
            }
Beispiel #2
0
            /// <summary>
            /// Creates a cursor over the parquet stream held by <paramref name="parent"/>.
            /// The constructor builds the active-column maps, configures the reader options,
            /// obtains the block visiting order from <c>CreateOrderSequence</c>, and creates
            /// one getter delegate per active column.
            /// </summary>
            /// <param name="parent">Loader supplying the host, parquet stream, chunk read size and loaded columns.</param>
            /// <param name="predicate">Column-index predicate handed to <c>Utils.BuildSubsetMaps</c> to produce the active-column maps.</param>
            /// <param name="rand">Random source stored on the cursor for later use.</param>
            public Cursor(ParquetLoader parent, Func<int, bool> predicate, IRandom rand)
                : base(parent._host)
            {
                Ch.AssertValue(predicate);
                Ch.AssertValue(parent._parquetStream);

                _loader = parent;
                _fileStream = parent._parquetStream;
                _parquetConversions = new ParquetConversions(Ch);
                _rand = rand;

                // Work out which columns are active and how column indices map onto them.
                Utils.BuildSubsetMaps(Schema.ColumnCount, predicate, out _actives, out _colToActivesIndex);

                _readerOptions = new ReaderOptions
                {
                    Count = _loader._columnChunkReadSize,
                    Columns = _loader._columnsLoaded.Select(col => col.Name).ToArray()
                };

                // Block count is derived from the configured rows per block (defaults to 1M).
                // Blocks are ordered as well as the rows inside them, so the count has to fit
                // in an int before an order sequence can be produced for it.
                var blockCount = MathUtils.DivisionCeiling((long)parent.GetRowCount(), _readerOptions.Count);
                if (blockCount > int.MaxValue)
                {
                    throw _loader._host.ExceptParam(nameof(Arguments.ColumnChunkReadSize), "Error due to too many blocks. Try increasing block size.");
                }

                var visitOrder = CreateOrderSequence((int)blockCount);
                _blockEnumerator = visitOrder.GetEnumerator();

                // Start with an empty enumerator so nothing is yielded yet.
                _dataSetEnumerator = Enumerable.Empty<int>().GetEnumerator();

                int activeCount = _actives.Length;
                _columnValues = new IList[activeCount];
                _getters = new Delegate[activeCount];

                // One getter delegate per active column, stored at the active slot.
                for (int slot = 0; slot < activeCount; slot++)
                {
                    _getters[slot] = CreateGetterDelegate(_actives[slot]);
                }
            }