Example #1
0
        /// <summary>
        /// Reads the raw (still compressed) page payload for the given page header from the input
        /// stream, using the reader registered for this chunk's compression codec.
        /// </summary>
        private byte[] ReadRawBytes(Thrift.PageHeader ph, Stream inputStream)
        {
            IDataReader codecReader = DataFactory.GetReader(_thriftChunk.Meta_data.Codec);

            return codecReader.Read(inputStream, ph.Compressed_page_size);
        }
        /// <summary>
        /// Fully buffers one page from <paramref name="nakedStream"/> into a pooled byte array and
        /// decompresses it according to <paramref name="compressionCodec"/>.
        /// </summary>
        /// <param name="nakedStream">Source stream positioned at the start of the page payload.</param>
        /// <param name="compressionCodec">Thrift compression codec the page was written with.</param>
        /// <param name="compressedLength">Exact number of bytes to consume from the stream.</param>
        /// <param name="uncompressedLength">Expected size of the page once decompressed.</param>
        /// <returns>A <see cref="BytesOwner"/> wrapping a pooled buffer with the uncompressed page bytes.</returns>
        /// <exception cref="NotSupportedException">The codec has no registered decompression method.</exception>
        /// <exception cref="ParquetException">The stream ended before <paramref name="compressedLength"/> bytes could be read.</exception>
        public static BytesOwner ReadPageData(Stream nakedStream, Thrift.CompressionCodec compressionCodec,
                                              int compressedLength, int uncompressedLength)
        {
            if (!_codecToCompressionMethod.TryGetValue(compressionCodec, out CompressionMethod compressionMethod))
            {
                throw new NotSupportedException($"reader for compression '{compressionCodec}' is not supported.");
            }

            int totalBytesRead   = 0;
            int currentBytesRead = int.MinValue;

            byte[] data = BytesPool.Rent(compressedLength);

            // Some storage solutions (like Azure blobs) might require more than one 'Read' action to read the requested length.
            while (totalBytesRead < compressedLength && currentBytesRead != 0)
            {
                currentBytesRead = nakedStream.Read(data, totalBytesRead, compressedLength - totalBytesRead);
                totalBytesRead  += currentBytesRead;
            }

            if (totalBytesRead != compressedLength)
            {
                // Return the rented buffer before bailing out, otherwise it leaks from the pool.
                BytesPool.Return(data);
                throw new ParquetException($"expected {compressedLength} bytes in source stream but could read only {totalBytesRead}");
            }

            switch (compressionMethod)
            {
            case CompressionMethod.None:
                //nothing to do, original data is the raw data
                break;

            case CompressionMethod.Gzip:
                using (var source = new MemoryStream(data, 0, compressedLength))
                {
                    byte[] unGzData = BytesPool.Rent(uncompressedLength);
                    using (var dest = new MemoryStream(unGzData, 0, uncompressedLength))
                    {
                        using (var gz = new GZipStream(source, CompressionMode.Decompress))
                        {
                            gz.CopyTo(dest);
                        }
                    }
                    // Compressed buffer is no longer needed; swap to the decompressed one.
                    BytesPool.Return(data);
                    data = unGzData;
                }
                break;

            case CompressionMethod.Snappy:
                var    snappy     = new SnappyDecompressor();
                byte[] unSnapData = snappy.Decompress(BytesPool, data, 0, compressedLength);
                BytesPool.Return(data);
                data = unSnapData;
                break;

            default:
                throw new NotSupportedException("method: " + compressionMethod);
            }

            // BytesOwner takes over the pooled buffer and returns it to the pool on dispose.
            return new BytesOwner(data, 0, data.AsMemory(0, uncompressedLength), d => BytesPool.Return(d));
        }
Example #3
0
        /// <summary>
        /// Creates a decompressing stream wrapper over <paramref name="nakedStream"/> for the
        /// given thrift compression codec.
        /// </summary>
        /// <exception cref="NotSupportedException">No decompression method is mapped to the codec.</exception>
        public static Stream CreateReader(Stream nakedStream, Thrift.CompressionCodec compressionCodec, long knownLength)
        {
            if (_codecToCompressionMethod.TryGetValue(compressionCodec, out CompressionMethod method))
            {
                return CreateReader(nakedStream, method, knownLength);
            }

            throw new NotSupportedException($"reader for compression '{compressionCodec}' is not supported.");
        }
        /// <summary>
        /// Resolves the <see cref="IDataReader"/> for a thrift compression codec by reverse lookup
        /// in the method-to-codec map.
        /// </summary>
        /// <exception cref="NotSupportedException">No compression method maps to <paramref name="thriftCodec"/>.</exception>
        public static IDataReader GetReader(Thrift.CompressionCodec thriftCodec)
        {
            // Single pass over the map; the previous ContainsValue + First combination scanned it twice.
            foreach (KeyValuePair<CompressionMethod, Thrift.CompressionCodec> kv in CompressionMethodToCodec)
            {
                if (kv.Value == thriftCodec)
                {
                    return GetReader(kv.Key);
                }
            }

            throw new NotSupportedException($"reader for compression '{thriftCodec}' is not supported.");
        }
Example #5
0
        /// <summary>
        /// Fully buffers one page from <paramref name="nakedStream"/> into a pooled byte array and
        /// decompresses it according to <paramref name="compressionCodec"/>.
        /// </summary>
        /// <param name="nakedStream">Source stream positioned at the start of the page payload.</param>
        /// <param name="compressionCodec">Thrift compression codec the page was written with.</param>
        /// <param name="compressedLength">Exact number of bytes to consume from the stream.</param>
        /// <param name="uncompressedLength">Expected size of the page once decompressed.</param>
        /// <returns>A <see cref="BytesOwner"/> wrapping a pooled buffer with the uncompressed page bytes.</returns>
        /// <exception cref="NotSupportedException">The codec has no registered decompression method.</exception>
        /// <exception cref="ParquetException">The stream ended before <paramref name="compressedLength"/> bytes could be read.</exception>
        public static BytesOwner ReadPageData(Stream nakedStream, Thrift.CompressionCodec compressionCodec,
                                              int compressedLength, int uncompressedLength)
        {
            if (!_codecToCompressionMethod.TryGetValue(compressionCodec, out CompressionMethod compressionMethod))
            {
                throw new NotSupportedException($"reader for compression '{compressionCodec}' is not supported.");
            }

            byte[] data = BytesPool.Rent(compressedLength);

            // A single Stream.Read call may legally return fewer bytes than requested (e.g. Azure
            // blob streams), so loop until the full page is buffered or the stream is exhausted.
            int totalBytesRead   = 0;
            int currentBytesRead = int.MinValue;

            while (totalBytesRead < compressedLength && currentBytesRead != 0)
            {
                currentBytesRead = nakedStream.Read(data, totalBytesRead, compressedLength - totalBytesRead);
                totalBytesRead  += currentBytesRead;
            }

            if (totalBytesRead != compressedLength)
            {
                // Return the rented buffer before bailing out, otherwise it leaks from the pool.
                BytesPool.Return(data);
                throw new ParquetException($"expected {compressedLength} bytes in source stream but could read only {totalBytesRead}");
            }

            switch (compressionMethod)
            {
            case CompressionMethod.None:
                //nothing to do, original data is the raw data
                break;

            case CompressionMethod.Gzip:
                using (var source = new MemoryStream(data, 0, compressedLength))
                {
                    byte[] unGzData = BytesPool.Rent(uncompressedLength);
                    using (var dest = new MemoryStream(unGzData, 0, uncompressedLength))
                    {
                        using (var gz = new GZipStream(source, CompressionMode.Decompress))
                        {
                            gz.CopyTo(dest);
                        }
                    }
                    // Compressed buffer is no longer needed; swap to the decompressed one.
                    BytesPool.Return(data);
                    data = unGzData;
                }
                break;

            case CompressionMethod.Snappy:
                var    snappy     = new SnappyDecompressor();
                byte[] unSnapData = snappy.Decompress(BytesPool, data, 0, compressedLength);
                BytesPool.Return(data);
                data = unSnapData;
                break;

            default:
                throw new NotSupportedException("method: " + compressionMethod);
            }

            // BytesOwner takes over the pooled buffer and returns it to the pool on dispose.
            return new BytesOwner(data, data.AsMemory(0, uncompressedLength), d => BytesPool.Return(d));
        }
Example #6
0
        /// <summary>
        /// Creates a thrift column chunk anchored at the current position of <paramref name="output"/>
        /// and populates its metadata from the schema element.
        /// </summary>
        public Thrift.ColumnChunk AddColumnChunk(CompressionMethod compression, Stream output, SchemaElement schema, int valuesCount)
        {
            long position = output.Position;

            var metaData = new Thrift.ColumnMetaData
            {
                Num_values       = valuesCount,
                Type             = schema.Thrift.Type,
                Codec            = DataFactory.GetThriftCompression(compression),
                Data_page_offset = position,
                Encodings        = new List<Thrift.Encoding>
                {
                    Thrift.Encoding.RLE,
                    Thrift.Encoding.BIT_PACKED,
                    Thrift.Encoding.PLAIN
                },
                Path_in_schema = new List<string>(schema.Path.Split(Schema.PathSeparatorChar))
            };

            return new Thrift.ColumnChunk
            {
                File_offset = position,
                Meta_data   = metaData
            };
        }
        /// <summary>
        /// Creates a thrift column chunk anchored at the current position of <paramref name="output"/>
        /// for a column identified by an explicit type and schema path.
        /// </summary>
        public Thrift.ColumnChunk CreateColumnChunk(CompressionMethod compression, Stream output, Thrift.Type columnType, List <string> path, int valuesCount)
        {
            long position = output.Position;

            var metaData = new Thrift.ColumnMetaData
            {
                Num_values       = valuesCount,
                Type             = columnType,
                Codec            = DataFactory.GetThriftCompression(compression),
                Data_page_offset = position,
                Encodings        = new List<Thrift.Encoding>
                {
                    Thrift.Encoding.RLE,
                    Thrift.Encoding.BIT_PACKED,
                    Thrift.Encoding.PLAIN
                },
                Path_in_schema = path
            };

            return new Thrift.ColumnChunk
            {
                File_offset = position,
                Meta_data   = metaData
            };
        }