private byte[] ReadRawBytes(Thrift.PageHeader ph, Stream inputStream)
{
   Thrift.CompressionCodec thriftCodec = _thriftChunk.Meta_data.Codec;
   IDataReader reader = DataFactory.GetReader(thriftCodec);

   return reader.Read(inputStream, ph.Compressed_page_size);
}
public static BytesOwner ReadPageData(Stream nakedStream, Thrift.CompressionCodec compressionCodec,
   int compressedLength, int uncompressedLength)
{
   if (!_codecToCompressionMethod.TryGetValue(compressionCodec, out CompressionMethod compressionMethod))
   {
      throw new NotSupportedException($"reader for compression '{compressionCodec}' is not supported.");
   }

   int totalBytesRead = 0;
   int currentBytesRead = int.MinValue;
   byte[] data = BytesPool.Rent(compressedLength);

   // Some storage solutions (like Azure blobs) might require more than one 'Read' action to read the requested length.
   while (totalBytesRead < compressedLength && currentBytesRead != 0)
   {
      currentBytesRead = nakedStream.Read(data, totalBytesRead, compressedLength - totalBytesRead);
      totalBytesRead += currentBytesRead;
   }

   if (totalBytesRead != compressedLength)
   {
      throw new ParquetException($"expected {compressedLength} bytes in source stream but could read only {totalBytesRead}");
   }

   switch (compressionMethod)
   {
      case CompressionMethod.None:
         // nothing to do, original data is the raw data
         break;
      case CompressionMethod.Gzip:
         // decompress into a pooled buffer sized by the page header's known uncompressed length
         using (var source = new MemoryStream(data, 0, compressedLength))
         {
            byte[] unGzData = BytesPool.Rent(uncompressedLength);
            using (var dest = new MemoryStream(unGzData, 0, uncompressedLength))
            {
               using (var gz = new GZipStream(source, CompressionMode.Decompress))
               {
                  gz.CopyTo(dest);
               }
            }
            BytesPool.Return(data);
            data = unGzData;
         }
         break;
      case CompressionMethod.Snappy:
         var snappy = new SnappyDecompressor();
         byte[] unSnapData = snappy.Decompress(BytesPool, data, 0, compressedLength);
         BytesPool.Return(data);
         data = unSnapData;
         break;
      default:
         throw new NotSupportedException("method: " + compressionMethod);
   }

   // the dispose callback hands the final buffer back to the pool once the caller is done with it
   return new BytesOwner(data, 0, data.AsMemory(0, uncompressedLength), d => BytesPool.Return(d));
}
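// Hedged usage sketch (not in the source): how ReadPageData might be called while decoding a single
// page. 'DecodePageExample', the 'meta' parameter and the 'Uncompressed_page_size' field name are
// assumptions for illustration; BytesOwner is assumed to be IDisposable and to return its pooled
// buffer via the callback passed to its constructor above.
private static void DecodePageExample(Thrift.PageHeader ph, Thrift.ColumnMetaData meta, Stream inputStream)
{
   using (BytesOwner pageBytes = ReadPageData(inputStream, meta.Codec, ph.Compressed_page_size, ph.Uncompressed_page_size))
   {
      // pageBytes.Memory (assumed accessor) now spans the uncompressed page; consume it here,
      // before Dispose hands the buffer back to BytesPool.
   }
}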
public static Stream CreateReader(Stream nakedStream, Thrift.CompressionCodec compressionCodec, long knownLength)
{
   if (!_codecToCompressionMethod.TryGetValue(compressionCodec, out CompressionMethod compressionMethod))
   {
      throw new NotSupportedException($"reader for compression '{compressionCodec}' is not supported.");
   }

   return CreateReader(nakedStream, compressionMethod, knownLength);
}
public static IDataReader GetReader(Thrift.CompressionCodec thriftCodec)
{
   if (!CompressionMethodToCodec.ContainsValue(thriftCodec))
   {
      throw new NotSupportedException($"reader for compression '{thriftCodec}' is not supported.");
   }

   CompressionMethod method = CompressionMethodToCodec.First(kv => kv.Value == thriftCodec).Key;

   return GetReader(method);
}
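// Hedged sketch (not in the source): GetReader scans CompressionMethodToCodec twice (ContainsValue,
// then First). The '_codecToCompressionMethod' map used by ReadPageData and CreateReader above could
// plausibly be built once as the reverse of that dictionary, avoiding both scans; this initializer is
// an assumption for illustration, not the library's actual code.
private static readonly Dictionary<Thrift.CompressionCodec, CompressionMethod> _codecToCompressionMethod =
   CompressionMethodToCodec.ToDictionary(kv => kv.Value, kv => kv.Key);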
public static BytesOwner ReadPageData(Stream nakedStream, Thrift.CompressionCodec compressionCodec,
   int compressedLength, int uncompressedLength)
{
   if (!_codecToCompressionMethod.TryGetValue(compressionCodec, out CompressionMethod compressionMethod))
   {
      throw new NotSupportedException($"reader for compression '{compressionCodec}' is not supported.");
   }

   byte[] data = BytesPool.Rent(compressedLength);

   // Stream.Read is not guaranteed to return all requested bytes in one call, so keep reading
   // until the full compressed page is buffered or the stream is exhausted.
   int totalBytesRead = 0;
   int currentBytesRead = int.MinValue;
   while (totalBytesRead < compressedLength && currentBytesRead != 0)
   {
      currentBytesRead = nakedStream.Read(data, totalBytesRead, compressedLength - totalBytesRead);
      totalBytesRead += currentBytesRead;
   }

   if (totalBytesRead != compressedLength)
   {
      throw new ParquetException($"expected {compressedLength} bytes in source stream but could read only {totalBytesRead}");
   }

   switch (compressionMethod)
   {
      case CompressionMethod.None:
         // nothing to do, original data is the raw data
         break;
      case CompressionMethod.Gzip:
         using (var source = new MemoryStream(data, 0, compressedLength))
         {
            byte[] unGzData = BytesPool.Rent(uncompressedLength);
            using (var dest = new MemoryStream(unGzData, 0, uncompressedLength))
            {
               using (var gz = new GZipStream(source, CompressionMode.Decompress))
               {
                  gz.CopyTo(dest);
               }
            }
            BytesPool.Return(data);
            data = unGzData;
         }
         break;
      case CompressionMethod.Snappy:
         var snappy = new SnappyDecompressor();
         byte[] unSnapData = snappy.Decompress(BytesPool, data, 0, compressedLength);
         BytesPool.Return(data);
         data = unSnapData;
         break;
      default:
         throw new NotSupportedException("method: " + compressionMethod);
   }

   return new BytesOwner(data, data.AsMemory(0, uncompressedLength), d => BytesPool.Return(d));
}
public Thrift.ColumnChunk AddColumnChunk(CompressionMethod compression, Stream output, SchemaElement schema, int valuesCount)
{
   Thrift.CompressionCodec codec = DataFactory.GetThriftCompression(compression);

   var chunk = new Thrift.ColumnChunk();
   long startPos = output.Position;
   chunk.File_offset = startPos;
   chunk.Meta_data = new Thrift.ColumnMetaData();
   chunk.Meta_data.Num_values = valuesCount;
   chunk.Meta_data.Type = schema.Thrift.Type;
   chunk.Meta_data.Codec = codec;
   chunk.Meta_data.Data_page_offset = startPos;
   chunk.Meta_data.Encodings = new List<Thrift.Encoding>
   {
      Thrift.Encoding.RLE,
      Thrift.Encoding.BIT_PACKED,
      Thrift.Encoding.PLAIN
   };
   chunk.Meta_data.Path_in_schema = new List<string>(schema.Path.Split(Schema.PathSeparatorChar));

   return chunk;
}
public Thrift.ColumnChunk CreateColumnChunk(CompressionMethod compression, Stream output, Thrift.Type columnType, List<string> path, int valuesCount)
{
   Thrift.CompressionCodec codec = DataFactory.GetThriftCompression(compression);

   var chunk = new Thrift.ColumnChunk();
   long startPos = output.Position;
   chunk.File_offset = startPos;
   chunk.Meta_data = new Thrift.ColumnMetaData();
   chunk.Meta_data.Num_values = valuesCount;
   chunk.Meta_data.Type = columnType;
   chunk.Meta_data.Codec = codec;
   chunk.Meta_data.Data_page_offset = startPos;
   chunk.Meta_data.Encodings = new List<Thrift.Encoding>
   {
      Thrift.Encoding.RLE,
      Thrift.Encoding.BIT_PACKED,
      Thrift.Encoding.PLAIN
   };
   chunk.Meta_data.Path_in_schema = path;

   return chunk;
}
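// Hedged usage sketch (not in the source): creating a chunk descriptor before writing its pages.
// The column name, type and 'rowCount' are hypothetical. Note that File_offset and Data_page_offset
// are taken from the current stream position inside CreateColumnChunk, so the stream must already be
// positioned where the chunk's data will start.
private Thrift.ColumnChunk CreateExampleChunk(Stream output, int rowCount)
{
   return CreateColumnChunk(
      CompressionMethod.Gzip,
      output,
      Thrift.Type.INT32,
      new List<string> { "my_column" },   // hypothetical path in the schema
      rowCount);
}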