/// <summary>
        /// Reads column data from <paramref name="reader"/> using the decoder that matches
        /// <paramref name="encoding"/>. Only one of the two outputs is populated per call;
        /// the other one is set to null.
        /// </summary>
        /// <param name="reader">Reader the encoded data is consumed from</param>
        /// <param name="encoding">Encoding of the data; PLAIN, RLE and PLAIN_DICTIONARY are handled</param>
        /// <param name="maxValues">Forwarded to the plain dictionary decoder; not used by the other encodings</param>
        /// <param name="values">Set for PLAIN encoding, null otherwise</param>
        /// <param name="indexes">Set for RLE and PLAIN_DICTIONARY encodings, null otherwise</param>
        /// <exception cref="ParquetException">Thrown for any other encoding</exception>
        private void ReadColumn(BinaryReader reader, Thrift.Encoding encoding, long maxValues,
                                out IList values,
                                out List<int> indexes)
        {
            // both outputs start out empty; the matching branch below fills in exactly one of them
            values  = null;
            indexes = null;

            //dictionary encoding uses RLE to encode data
            switch (encoding)
            {
                case Thrift.Encoding.PLAIN:
                    values = _dataTypeHandler.Read(_thriftSchemaElement, reader, _parquetOptions);
                    return;

                case Thrift.Encoding.RLE:
                    indexes = RunLengthBitPackingHybridValuesReader.Read(reader, _thriftSchemaElement.Type_length);
                    return;

                case Thrift.Encoding.PLAIN_DICTIONARY:
                    indexes = ReadPlainDictionary(reader, maxValues);
                    return;

                default:
                    throw new ParquetException($"encoding {encoding} is not supported.");
            }
        }
        /// <summary>
        /// Reads plain-dictionary indexes into <paramref name="dest"/>. The first byte taken from
        /// <paramref name="reader"/> is the bit width of the entries that follow.
        /// </summary>
        /// <param name="reader">Reader the encoded indexes are consumed from</param>
        /// <param name="maxReadCount">Number of zeros written when there is nothing to decode; otherwise forwarded to the hybrid decoder</param>
        /// <param name="dest">Array receiving the indexes</param>
        /// <param name="offset">Position in <paramref name="dest"/> of the first index written</param>
        /// <returns>
        /// How far the write position moved: the number of zeros written, or the value reported
        /// by the hybrid decoder (presumably the number of entries it wrote - confirm).
        /// </returns>
        private static int ReadPlainDictionary(BinaryReader reader, int maxReadCount, int[] dest, int offset)
        {
            int cursor    = offset;
            int bitWidth  = reader.ReadByte();
            int remaining = GetRemainingLength(reader);

            if (bitWidth != 0 && remaining != 0)
            {
                // there is encoded data to decode
                cursor += RunLengthBitPackingHybridValuesReader.ReadRleBitpackedHybrid(reader, bitWidth, remaining, dest, cursor, maxReadCount);
            }
            else
            {
                //when bit width is zero reader must stop and just repeat zero maxValue number of times
                int written = 0;
                while (written < maxReadCount)
                {
                    dest[cursor] = 0;
                    cursor++;
                    written++;
                }
            }

            return cursor - offset;
        }
Exemple #3
0
        /// <summary>
        /// Reads levels, suitable for both repetition levels and definition levels
        /// </summary>
        /// <param name="reader">Reader the encoded levels are consumed from</param>
        /// <param name="maxLevel">Maximum level value, depends on level type</param>
        /// <returns>A newly created list that the hybrid decoder was asked to fill</returns>
        private List<int> ReadLevels(BinaryReader reader, int maxLevel)
        {
            List<int> levels = new List<int>();

            // width of a single entry is obtained from the maximum level value
            int levelBitWidth = PEncoding.GetWidthFromMaxInt(maxLevel);

            //todo: there might be more data on larger files, therefore line below need to be called in a loop until valueCount is satisfied
            // NOTE(review): the literal 0 sits in the "length" argument position - presumably "length unknown"; confirm
            RunLengthBitPackingHybridValuesReader.ReadRleBitpackedHybrid(reader, levelBitWidth, 0, levels);

            return levels;
        }
        /// <summary>
        /// Reads definition levels and hands them back only when they carry information.
        /// </summary>
        /// <param name="reader">Reader the encoded levels are consumed from</param>
        /// <param name="valueCount">Value count forwarded to the decoder and compared against the number of max-level entries</param>
        /// <returns>
        /// null when the number of decoded levels equal to the column's maximum definition level
        /// is exactly <paramref name="valueCount"/> (i.e. the computed null count is zero);
        /// otherwise the decoded levels.
        /// </returns>
        private List<int> ReadDefinitionLevels(BinaryReader reader, int valueCount)
        {
            const int maxDefinitionLevel = 1; //todo: for nested columns this needs to be calculated properly
            int levelBitWidth = PEncoding.GetWidthFromMaxInt(maxDefinitionLevel);
            List<int> levels = new List<int>();

            //todo: there might be more data on larger files, therefore line below need to be called in a loop until valueCount is satisfied
            RunLengthBitPackingHybridValuesReader.ReadRleBitpackedHybrid(reader, levelBitWidth, 0, levels, valueCount);

            // NOTE(review): the bit width above comes from the hard-coded constant while the
            // comparison below uses the schema's maximum level - confirm this is intended
            int definedLevel = _schema.GetMaxDefinitionLevel(_thriftChunk);

            int definedCount = 0;
            foreach (int level in levels)
            {
                if (level == definedLevel)
                {
                    definedCount++;
                }
            }

            // no entry below the maximum level means the null count is zero
            if (definedCount == valueCount)
            {
                return null;
            }

            return levels;
        }
        /// <summary>
        /// Reads plain-dictionary indexes into a new list. The first byte taken from
        /// <paramref name="reader"/> is the bit width of the entries that follow.
        /// </summary>
        /// <param name="reader">Reader the encoded indexes are consumed from</param>
        /// <param name="maxValues">Number of zero indexes produced when the bit width is zero</param>
        /// <returns>The list of indexes</returns>
        private static List<int> ReadPlainDictionary(BinaryReader reader, long maxValues)
        {
            List<int> indexes = new List<int>();
            int bitWidth = reader.ReadByte();

            if (bitWidth != 0)
            {
                // regular case: decode whatever GetRemainingLength reports for this reader
                int remaining = GetRemainingLength(reader);
                RunLengthBitPackingHybridValuesReader.ReadRleBitpackedHybrid(reader, bitWidth, remaining, indexes);
                return indexes;
            }

            //when bit width is zero reader must stop and just repeat zero maxValue number of times
            int added = 0;
            while (added < maxValues)
            {
                indexes.Add(0);
                added++;
            }

            return indexes;
        }
Exemple #6
0
        /// <summary>
        /// Reads column data from <paramref name="reader"/> into <paramref name="cd"/> and advances
        /// <c>cd.valuesOffset</c> by the number of values that were added.
        /// </summary>
        /// <param name="reader">Reader the encoded data is consumed from</param>
        /// <param name="encoding">Encoding of the data; PLAIN, RLE and PLAIN_DICTIONARY are handled</param>
        /// <param name="totalValues">Size used when the values and indexes arrays are first allocated</param>
        /// <param name="maxReadCount">Read limit forwarded to the index decoders</param>
        /// <param name="cd">Holder of the values array, index buffer, dictionary and current values offset</param>
        /// <exception cref="ParquetException">Thrown for any other encoding</exception>
        private void ReadColumn(BinaryReader reader, Thrift.Encoding encoding, long totalValues, int maxReadCount, ColumnRawData cd)
        {
            // values array is created lazily, on the first call for this column
            if (cd.values == null)
            {
                cd.values = _dataTypeHandler.GetArray((int)totalValues, false, false);
            }

            if (encoding == Thrift.Encoding.PLAIN)
            {
                // plain values are read directly into the values array
                cd.valuesOffset += _dataTypeHandler.Read(reader, _thriftSchemaElement, cd.values, cd.valuesOffset);
                return;
            }

            bool isRle = encoding == Thrift.Encoding.RLE;
            if (!isRle && encoding != Thrift.Encoding.PLAIN_DICTIONARY)
            {
                throw new ParquetException($"encoding {encoding} is not supported.");
            }

            //dictionary encoding uses RLE to encode data
            if (cd.indexes == null)
            {
                cd.indexes = new int[(int)totalValues];
            }

            // indexes are always written from position 0 of the index buffer
            int indexCount = isRle
                ? RunLengthBitPackingHybridValuesReader.Read(reader, _thriftSchemaElement.Type_length, cd.indexes, 0, maxReadCount)
                : ReadPlainDictionary(reader, maxReadCount, cd.indexes, 0);

            // presumably resolves the indexes against the dictionary into cd.values - confirm in the data type handler
            _dataTypeHandler.MergeDictionary(cd.dictionary, cd.indexes, cd.values, cd.valuesOffset, indexCount);
            cd.valuesOffset += indexCount;
        }
        /// <summary>
        /// Reads column data from <paramref name="reader"/> into caller-owned buffers, creating a
        /// buffer on first use and advancing the matching offset by the decoder's return value.
        /// </summary>
        /// <param name="reader">Reader the encoded data is consumed from</param>
        /// <param name="encoding">Encoding of the data; PLAIN, RLE and PLAIN_DICTIONARY are handled</param>
        /// <param name="maxValues">Size used when a buffer has to be allocated; also forwarded to the plain dictionary decoder</param>
        /// <param name="values">Values buffer, used by PLAIN encoding; allocated here when null</param>
        /// <param name="valuesOffset">Write position in <paramref name="values"/>, advanced for PLAIN encoding</param>
        /// <param name="indexes">Index buffer, used by RLE and PLAIN_DICTIONARY encodings; allocated here when null</param>
        /// <param name="indexesOffset">Write position in <paramref name="indexes"/>, advanced for RLE and PLAIN_DICTIONARY encodings</param>
        /// <exception cref="ParquetException">Thrown for any other encoding</exception>
        private void ReadColumn(BinaryReader reader, Thrift.Encoding encoding, long maxValues,
                                ref Array values, ref int valuesOffset,
                                ref int[] indexes, ref int indexesOffset)
        {
            if (encoding == Thrift.Encoding.PLAIN)
            {
                if (values == null)
                {
                    values = _dataTypeHandler.GetArray((int)maxValues, false, false);
                }

                valuesOffset += _dataTypeHandler.Read(reader, _thriftSchemaElement, values, valuesOffset, _parquetOptions);
            }
            else if (encoding == Thrift.Encoding.RLE || encoding == Thrift.Encoding.PLAIN_DICTIONARY)
            {
                //dictionary encoding uses RLE to encode data
                if (indexes == null)
                {
                    indexes = new int[(int)maxValues];
                }

                if (encoding == Thrift.Encoding.RLE)
                {
                    indexesOffset += RunLengthBitPackingHybridValuesReader.Read(reader, _thriftSchemaElement.Type_length, indexes, indexesOffset);
                }
                else
                {
                    indexesOffset += ReadPlainDictionary(reader, maxValues, indexes, indexesOffset);
                }
            }
            else
            {
                throw new ParquetException($"encoding {encoding} is not supported.");
            }
        }
Exemple #8
0
        /// <summary>
        /// Reads plain-dictionary indexes into <paramref name="dest"/>. The first byte taken from
        /// <paramref name="reader"/> is the bit width of the entries that follow.
        /// </summary>
        /// <param name="reader">Reader the encoded indexes are consumed from</param>
        /// <param name="maxValues">Number of zeros written when the bit width is zero, and upper bound of the returned count</param>
        /// <param name="dest">Array receiving the indexes</param>
        /// <param name="offset">Position in <paramref name="dest"/> of the first index written</param>
        /// <returns>How far the write position moved, capped at <paramref name="maxValues"/></returns>
        private static int ReadPlainDictionary(BinaryReader reader, long maxValues, int[] dest, int offset)
        {
            int cursor   = offset;
            int bitWidth = reader.ReadByte();

            if (bitWidth != 0)
            {
                int remaining = GetRemainingLength(reader);
                cursor += RunLengthBitPackingHybridValuesReader.ReadRleBitpackedHybrid(reader, bitWidth, remaining, dest, cursor);
            }
            else
            {
                //when bit width is zero reader must stop and just repeat zero maxValue number of times
                for (int n = 0; n < maxValues; n++)
                {
                    dest[cursor] = 0;
                    cursor++;
                }
            }

            // the above might end up reading a few more elements than what we should read
            // Let's just fix the offset if we're over the total length
            // todo: longs, ints - maybe return long?
            int advanced = cursor - offset;
            if (advanced > maxValues)
            {
                return (int)maxValues;
            }

            return advanced;
        }
        /// <summary>
        /// Reads levels into <paramref name="dest"/> starting at <paramref name="offset"/>, using
        /// the bit width obtained from <paramref name="maxLevel"/>.
        /// </summary>
        /// <param name="reader">Reader the encoded levels are consumed from</param>
        /// <param name="maxLevel">Maximum level value; its bit width is passed to the decoder</param>
        /// <param name="dest">Array receiving the levels</param>
        /// <param name="offset">Position in <paramref name="dest"/> handed to the decoder</param>
        /// <returns>The value reported by the hybrid decoder (presumably the number of levels written - confirm)</returns>
        private int ReadLevels(BinaryReader reader, int maxLevel, int[] dest, int offset)
        {
            // NOTE(review): the literal 0 sits in the "length" argument position - presumably "length unknown"; confirm
            return RunLengthBitPackingHybridValuesReader.ReadRleBitpackedHybrid(
                reader, maxLevel.GetBitWidth(), 0, dest, offset);
        }