/// <summary>
/// Try every enabled codec on the data, then compress the data with the
/// codec that produced the fewest bits. Also records statistics on which
/// codec won and how many bits it saved over the worst and the runner-up.
/// </summary>
/// <param name="label">Name of the stream, used in statistic names and log output</param>
/// <param name="data">Values to compress</param>
/// <returns>The type of the winning codec and the data compressed with it</returns>
/// <exception cref="InvalidOperationException">No codec was tried, so there is no winner</exception>
/// <exception cref="NotImplementedException">The winning codec type is not one this method can instantiate</exception>
private Tuple<Type, Bitstream> GetBestCompressor(string label, List<uint> data)
{
    // use this to check each stream
    var cc = new CompressionChecker { Options = CompressionOptions };
    var stream = new Datastream(data);
    var results = cc.TestAll(label, stream, internalFlags);

    // without this guard an empty list fails below with an unhelpful index error
    if (results.Count == 0)
    {
        throw new InvalidOperationException(
            $"No codecs were tried for '{label}'; check CompressionOptions");
    }

    // smallest output first
    results.Sort((a, b) => a.CompressedBitLength.CompareTo(b.CompressedBitLength));
    var best = results[0];
    var worst = results[results.Count - 1];

    // recreate the winning codec so the data can be compressed for real
    CodecBase codec;
    if (best.CompressorType == typeof(FixedSizeCodec))
    {
        codec = new FixedSizeCodec();
    }
    else if (best.CompressorType == typeof(ArithmeticCodec))
    {
        codec = new ArithmeticCodec();
    }
    else if (best.CompressorType == typeof(HuffmanCodec))
    {
        codec = new HuffmanCodec();
    }
    else if (best.CompressorType == typeof(GolombCodec))
    {
        // Golomb needs the parameter that was recorded with the test result
        codec = new GolombCodec { Parameter = best.Parameters[0] };
    }
    else
    {
        throw new NotImplementedException("Unknown codec type");
    }

    var bitstream = codec.CompressToStream(stream, internalFlags);

    var codecName = codec.GetType().Name;
    StatRecorder.AddStat("codec win: " + label + " " + codecName, 1);
    StatRecorder.AddStat($"codec win {codecName} saved high ",
        worst.CompressedBitLength - best.CompressedBitLength);
    if (results.Count > 1)
    {
        StatRecorder.AddStat($"codec win {codecName} saved low ",
            results[1].CompressedBitLength - best.CompressedBitLength);
    }

    if (Options.HasFlag(OptionFlags.DumpCompressorSelections))
    {
        WriteLine($"{label} using {codecName}");
    }

    return new Tuple<Type, Bitstream>(codec.GetType(), bitstream);
}
/// <summary>
/// Run every codec enabled in Options over the data and report how many
/// bits each one needed. Each size is also recorded as a statistic.
/// </summary>
/// <param name="statPrefix">Prefix for the name of each statistic recorded</param>
/// <param name="data">Values to compress</param>
/// <param name="headerFlags">Header flags handed to each codec when compressing</param>
/// <returns>One result (name, compressed bit length, codec type, optional parameters)
/// per codec tried, in the order they were tried</returns>
public List<Result> TestAll(string statPrefix, Datastream data, Header.HeaderFlags headerFlags)
{
    var outcomes = new List<Result>();

    // compress with a full codec object and record the resulting size
    Action<string, CodecBase, Type> measureCodec = (name, candidate, candidateType) =>
    {
        var compressed = candidate.CompressToStream(data, headerFlags);
        var outcome = new Result(name, compressed.Length, candidateType);

        // for a Golomb codec, keep the parameter with the result and show it in the name
        if (candidate.GetType() == typeof(GolombCodec))
        {
            var golomb = (GolombCodec)candidate;
            outcome.Parameters.Add(golomb.Parameter);
            outcome.CompressorName += $"({golomb.Parameter})";
        }

        outcomes.Add(outcome);
    };

    // codec objects
    if (Options.HasFlag(OptionFlags.UseFixed))
    {
        measureCodec("Fixed size", new FixedSizeCodec(), typeof(FixedSizeCodec));
    }

    if (Options.HasFlag(OptionFlags.UseArithmetic))
    {
        measureCodec("Arithmetic", new ArithmeticCodec(), typeof(ArithmeticCodec));
    }

    if (Options.HasFlag(OptionFlags.UseHuffman))
    {
        measureCodec("Huffman", new HuffmanCodec(), typeof(HuffmanCodec));
    }

    // Golomb is only attempted when every value is below the codec's threshold
    if (Options.HasFlag(OptionFlags.UseGolomb) && data.Max() < GolombCodec.GolombThreshold)
    {
        measureCodec("Golomb", new GolombCodec(), typeof(GolombCodec));
    }

    /* // try Golomb encoding
     * if (Options.HasFlag(OptionFlags.UseGolomb))
     * {
     *     var bestg = UniversalCodec.Optimize(UniversalCodec.Golomb.Encode, data, 1, Math.Min(data.Max(), 256));
     *     var bitstream = bestg.Item1;
     *     var gname = $"Golomb({bestg.Item2,2})";
     *     results.Add(new Result(gname, bitstream.Length, typeof(GolombCodec), bestg.Item2));
     *     //results.Add(new Result(gname,bitstream.Length, typeof(UniversalCodec.Golomb), bestg.Item2));
     * }
     */

    // compress with a bare universal-code encoder and record the resulting size;
    // every value is shifted up by one first - presumably because these codes
    // cannot represent 0 (TODO confirm)
    Action<string, UniversalCodec.UniversalCodeDelegate, Type> measureEncoder = (name, encode, encoderType) =>
    {
        var shifted = data.Select(v => v + 1).ToList();
        var compressed = UniversalCodec.CompressStream(encode, shifted);
        outcomes.Add(new Result(name, compressed.Length, encoderType));
    };

    // Elias codes - all perform poorly - todo - need way to pass this back as type?
    if (Options.HasFlag(OptionFlags.UseEliasDelta))
    {
        measureEncoder("EliasDelta", UniversalCodec.Elias.EncodeDelta, typeof(UniversalCodec.Elias));
    }

    if (Options.HasFlag(OptionFlags.UseEliasGamma))
    {
        measureEncoder("EliasGamma", UniversalCodec.Elias.EncodeGamma, typeof(UniversalCodec.Elias));
    }

    if (Options.HasFlag(OptionFlags.UseEliasOmega))
    {
        measureEncoder("EliasOmega", UniversalCodec.Elias.EncodeOmega, typeof(UniversalCodec.Elias));
    }

    // Stout
    if (Options.HasFlag(OptionFlags.UseStout))
    {
        measureEncoder("Stout", (b, v) => UniversalCodec.Stout.Encode(b, v, 3), typeof(UniversalCodec.Stout));
    }

    // BinaryAdaptiveSequentialEncode works on the unshifted data and writes into its own stream
    if (Options.HasFlag(OptionFlags.UseBasc))
    {
        var bascBits = new Bitstream();
        UniversalCodec.BinaryAdaptiveSequentialEncode(bascBits, data, UniversalCodec.Elias.EncodeDelta);
        outcomes.Add(new Result("BASC", bascBits.Length, typeof(UniversalCodec)));
    }

    // save stats
    outcomes.ForEach(outcome =>
        StatRecorder.AddStat(statPrefix + "_" + outcome.CompressorName, outcome.CompressedBitLength));

    return outcomes;
}
/// <summary>
/// Given the leaf nodes, create a canonical Huffman compression table.
/// Format is
///   four header values, each Lomont1 encoded:
///     bitsPerSymbol - 1
///     bitsPerCodelengthCount - 1
///     minCodewordLength - 1
///     (maxCodewordLength - minCodewordLength) - 1
///   then, for each codeword length from min to max:
///     the number of codewords of that length (bitsPerCodelengthCount bits)
///     followed by that many symbols (bitsPerSymbol bits each)
/// </summary>
/// <param name="leaves1">Leaf nodes, must not be empty. Symbols are written in list order,
/// so this presumably expects the leaves ordered by codeword length - TODO confirm with caller</param>
/// <returns>Bitstream holding the encoded table</returns>
Bitstream MakeTable(List<Node> leaves1)
{
    Trace.Assert(leaves1.Count > 0);

    // longest and shortest codeword in use
    uint longest = leaves1.Max(leaf => leaf.Codeword.BitLength);
    uint shortest = leaves1.Min(leaf => leaf.Codeword.BitLength);
    WriteLine($"Min, max codeword lengths {shortest} {longest}");

    // countsByLength[slot] = number of codewords of length (shortest + slot)
    var countsByLength = new List<int>();
    for (uint len = shortest; len <= longest; ++len)
    {
        uint wanted = len;
        countsByLength.Add(leaves1.Count(leaf => leaf.Codeword.BitLength == wanted));
    }

    if (Options.HasFlag(OptionFlags.LogCodewordLengths))
    {
        for (int slot = 0; slot < countsByLength.Count; ++slot)
        {
            uint len = shortest + (uint)slot;
            StatRecorder.AddStat($"Huffman_Codeword_{len}", (uint)countsByLength[slot]);
        }
    }

    // every leaf must have been counted exactly once
    Trace.Assert(countsByLength.Sum() == leaves1.Count);

    // bits needed to store the largest symbol
    uint bitsPerSymbol = BitsRequired(leaves1.Max(leaf => leaf.Symbol));

    // bits needed to store the largest per-length count
    // NOTE(review): the earlier comment here argued that codeword length is below the
    // alphabet size and bounded the counts from the tree construction; that reasoning
    // cannot be checked from this method alone
    var bitsPerCodelengthCount = BitsRequired((uint)countsByLength.Max());

    if (Options.HasFlag(OptionFlags.DumpDictionary))
    {
        // write table for debugging
        WriteLine("Make huffman tree:");
        for (int slot = 0; slot < countsByLength.Count; ++slot)
        {
            // declared inside the loop body so the lambda below captures a fresh variable
            uint len = shortest + (uint)slot;
            Write($" {len,3}: {countsByLength[slot],3} -> ");
            foreach (var leaf in leaves1.Where(n => n.Codeword.BitLength == len))
            {
                Write($"x{leaf.Symbol:X2}, ");
            }
            WriteLine();
        }
    }

    // now write the bit sizes of each entry type, then counts of distinct lengths, then the symbols
    var table = new Bitstream();

    // the minimum codeword length and the distance up to the maximum are stored,
    // rather than the maximum itself
    uint lengthSpan = longest - shortest;

    // all header values
    // NOTE(review): these are unsigned subtractions; when all codewords share one length
    // lengthSpan is 0 and lengthSpan - 1 wraps (in an unchecked context) - confirm the
    // decoder expects that
    UniversalCodec.Lomont.EncodeLomont1(table, bitsPerSymbol - 1, 3, 0);          // 1-32, usually 8, subtracting 1 gives 7, fits in 3 bits
    UniversalCodec.Lomont.EncodeLomont1(table, bitsPerCodelengthCount - 1, 3, 0); // usually 4,5,6
    UniversalCodec.Lomont.EncodeLomont1(table, shortest - 1, 2, 0);               // quite often 1,2,3,4, usually small
    UniversalCodec.Lomont.EncodeLomont1(table, lengthSpan - 1, 4, -1);            // 9-12, up to 16,17

    if (Options.HasFlag(OptionFlags.DumpHeader))
    {
        WriteLine("Huffman encode header:");
        WriteLine($" bits per symbol {bitsPerSymbol} bits per code length count {bitsPerCodelengthCount}");
        WriteLine($" min len code {shortest} delta code len {lengthSpan}");
    }

    // write table - one entry for each codeword length from min to max,
    // entry is the count followed by that many symbols taken in list order
    int next = 0;
    for (int slot = 0; slot < countsByLength.Count; ++slot)
    {
        int howMany = countsByLength[slot];
        table.Write((uint)howMany, bitsPerCodelengthCount);

        int stop = next + howMany;
        while (next < stop)
        {
            table.Write(leaves1[next].Symbol, bitsPerSymbol);
            ++next;
        }
    }

    return table;
}
/// <summary>
/// Write the header for the compression algorithm. For this format the whole
/// compressed payload is produced here: the data is split into streams, each
/// stream is compressed with its best codec, and the smaller of the alternative
/// representations is written after a one-value marker saying which was chosen.
/// </summary>
/// <param name="bitstream">Stream that receives the output</param>
/// <param name="data">Data to compress</param>
/// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
{
    // start from empty data streams
    decisions.Clear();
    decisionRuns.Clear();
    literals.Clear();
    distances.Clear();
    lengths.Clear();
    tokens.Clear();

    // fill in all the data streams
    uint actualMinLength, actualMaxDistance;
    ComputeStreams(data, out actualMinLength, out actualMaxDistance);

    // due to the vagaries of this format, we write the entire file in the header call,
    // and unfortunately ignore the encode symbol and footer sections

    // dump info to help analyze
    if (Options.HasFlag(OptionFlags.DumpDebug))
    {
        WriteLine("LZCL compress:");
        WriteLine($" Data length {data.Count} ");
    }

    if (Options.HasFlag(OptionFlags.ShowTallies))
    {
        // some info to help analyze and make decisions
        Write("Length tally: ");
        Tally(lengths);
        WriteLine();

        Write("Distance tally: ");
        Tally(distances);
        WriteLine();
    }

    // compress every stream up front so the sizes can be compared below
    var rawDecisions = GetBestCompressor("decisions", decisions);
    var runDecisions = GetBestCompressor("decision runs", decisionRuns);
    var packedLiterals = GetBestCompressor("literals", literals);
    var packedTokens = GetBestCompressor("tokens", tokens);
    var packedDistances = GetBestCompressor("distances", distances);
    var packedLengths = GetBestCompressor("lengths", lengths);

    // write header values
    Header.WriteUniversalHeader(bitstream, data, headerFlags);

    // save max distance occurring, used to encode tokens, very useful to users to know window needed size
    UniversalCodec.Lomont.EncodeLomont1(bitstream, actualMaxDistance, 10, 0);
    UniversalCodec.Lomont.EncodeLomont1(bitstream, actualMinLength, 2, 0);
    if (Options.HasFlag(OptionFlags.DumpDebug))
    {
        WriteLine($"actual min length {actualMinLength}");
        WriteLine($"Max distance {actualMaxDistance}");
    }

    // decisions: plain stream or run lengths, whichever is smaller (a tie goes to the runs)
    if (rawDecisions.Item2.Length >= runDecisions.Item2.Length)
    {
        // denote choice
        bitstream.Write(1);
        // save initial value
        bitstream.Write(decisions[0]);
        // save item
        WriteItem(bitstream, runDecisions);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Decisions runs smaller than decisions");
        }
        StatRecorder.AddStat($"codec used: decision runs {runDecisions.Item1.Name}", 1);
    }
    else
    {
        // denote choice
        bitstream.Write(0);
        // save item
        WriteItem(bitstream, rawDecisions);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Decisions smaller than decision runs");
        }
        StatRecorder.AddStat($"codec used: decisions {rawDecisions.Item1.Name}", 1);
    }

    // literals
    WriteItem(bitstream, packedLiterals);
    StatRecorder.AddStat($"codec used: literals {packedLiterals.Item1.Name}", 1);

    // tokens or separate distance, length pairs, whichever is smaller (a tie goes to the pairs)
    if (packedDistances.Item2.Length + packedLengths.Item2.Length <= packedTokens.Item2.Length)
    {
        // denote choice
        bitstream.Write(1);
        // save items
        WriteItem(bitstream, packedDistances);
        WriteItem(bitstream, packedLengths);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Distance,length pairs smaller than tokens");
        }
        StatRecorder.AddStat($"codec used: distances {packedDistances.Item1.Name}", 1);
        StatRecorder.AddStat($"codec used: lengths {packedLengths.Item1.Name}", 1);
    }
    else
    {
        // denote choice
        bitstream.Write(0);
        // save item
        WriteItem(bitstream, packedTokens);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Tokens smaller than distance,length pairs");
        }
        StatRecorder.AddStat($"codec used: tokens {packedTokens.Item1.Name}", 1);
    }
}