Header.HeaderFlags internalFlags = Header.HeaderFlags.None; // todo - make None to save bits

/// <summary>
/// Write one compressed sub-stream into the output: a 2-bit codec tag,
/// the bit length of the payload (Lomont1 coded, chunk size 6), then the payload itself.
/// </summary>
/// <param name="bitstream">Output stream receiving tag, length and payload</param>
/// <param name="item">Codec type (Item1) and the bits that codec produced (Item2)</param>
void WriteItem(Bitstream bitstream, Tuple<Type, Bitstream> item)
{
    Type codec = item.Item1;
    Bitstream payload = item.Item2;

    // pick the 2-bit tag for the codec; nothing is written when the type is unknown
    uint codecTag;
    if (codec == typeof(FixedSizeCodec))
    {
        codecTag = 0;
    }
    else if (codec == typeof(ArithmeticCodec))
    {
        codecTag = 1;
    }
    else if (codec == typeof(HuffmanCodec))
    {
        codecTag = 2;
    }
    else if (codec == typeof(GolombCodec))
    {
        codecTag = 3;
    }
    else
    {
        throw new NotImplementedException("Unknown compressor type");
    }
    bitstream.Write(codecTag, 2);

    // payload size in bits, so a reader knows how much belongs to this item
    UniversalCodec.Lomont.EncodeLomont1(bitstream, payload.Length, 6, 0);
    if (Options.HasFlag(OptionFlags.DumpDebug))
    {
        WriteLine($"Compressor type {codec.Name}, length {payload.Length}");
    }

    // finally the compressed bits themselves
    bitstream.WriteStream(payload);
}
/// <summary>
/// Even-Rodeh code for a non-negative integer.
///
/// Values below 4 are written directly in 3 bits with no terminator.
/// Otherwise the value is preceded by its bit length, that length by its own
/// bit length, and so on until a link below 8 is reached; that shortest link is
/// written in 3 bits, the longer links follow in binary, and a single 0 bit
/// closes the code.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value">Value to encode</param>
public static void Encode(Bitstream bitstream, uint value)
{
    if (value < 4)
    {
        bitstream.Write(value, 3);
        return;
    }

    // chain of value, bits(value), bits(bits(value)), ... ending at the first link below 8
    var chain = new Stack<uint>();
    for (uint link = value; ; link = CodecBase.BitsRequired(link))
    {
        chain.Push(link);
        if (link < 8)
        {
            break;
        }
    }

    // a Stack enumerates newest entry first, so the shortest link is emitted first
    foreach (uint entry in chain)
    {
        if (entry < 8)
        {
            bitstream.Write(entry, 3);
        }
        else
        {
            bitstream.Write(entry);
        }
    }

    // terminator
    bitstream.Write(0, 1);
}
/// <summary>
/// Compress a symbol in the compression algorithm.
/// Each call advances the symbol counter and then replays the precomputed
/// decision / literal / (distance, length) streams into the bitstream until the
/// emitted data covers that counter. The symbol argument itself is not read here.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="symbol">Symbol being compressed (unused by this method body)</param>
public override void CompressSymbol(Bitstream bitstream, uint symbol)
{
    encoderState.SymbolCallIndex++;

    // emit items until the output has caught up with the number of symbols seen
    while (encoderState.SymbolCallIndex > encoderState.DatumIndex)
    {
        uint decision = decisions[encoderState.DecisionIndex++];
        bitstream.Write(decision, 1);

        if (decision != 0)
        {
            // (distance, length) pair, packed into a single token
            uint distance = distances[encoderState.TokenIndex];
            uint length = lengths[encoderState.TokenIndex];
            if (Options.HasFlag(OptionFlags.DumpEncode))
            {
                Write($"[{distance},{length}] ");
            }
            uint token = (length - encoderState.ActualMinLength) * (encoderState.ActualMaxDistance + 1) + distance;
            bitstream.Write(token, encoderState.ActualBitsPerToken);
            ++encoderState.TokenIndex;
            // a match covers 'length' input symbols at once
            encoderState.DatumIndex += (int)length;
            continue;
        }

        // literal: one symbol stored as-is
        if (Options.HasFlag(OptionFlags.DumpEncode))
        {
            Write($"[{literals[encoderState.LiteralIndex]}] ");
        }
        bitstream.Write(literals[encoderState.LiteralIndex++], encoderState.ActualBitsPerSymbol);
        ++encoderState.DatumIndex;
    }
}
/// <summary>
/// Flush the encoder: write the two bits that select between the two possible
/// final lowValue/highValue situations, plus any pending scaling bits.
/// </summary>
/// <param name="bitstream">Output stream</param>
void Finish(Bitstream bitstream)
{
    // two possible lowValue and highValue distributions, so two bits are enough to distinguish
    // todo - need enough to decode last item, else bitstream must return 0 when empty
    if (lowValue >= Range25Percent)
    {
        bitstream.Write(1);
        bitstream.Write(0);
        // no need to write more final scaling 0 values since decoder returns all 0s after end of stream
        return;
    }

    bitstream.Write(0);
    bitstream.Write(1);

    // final e3 scaling: scaling + 1 one bits
    for (var pending = scaling + 1; pending > 0; --pending)
    {
        bitstream.Write(1);
    }
}
/// <summary>
/// Encode a 32 bit value into the bitstream using Lomont method 1.
/// The value is broken into 'chunkSize' bit chunks, and a 0 or 1 is written before
/// each chunk, where 1 means not last chunk and 0 means last chunk. Chunks are
/// written least significant first. A 'chunkSize' of 6 is the default and works best for
/// common file sizes.
/// </summary>
/// <remarks>
/// Shift counts on 32 bit operands use only the low five bits of the count, so
/// chunk sizes are meaningful in the range 1..31.
/// </remarks>
/// <param name="bitstream">Output stream</param>
/// <param name="value32">Value to encode</param>
/// <param name="chunkSize">Bits per chunk. A good value is 6</param>
/// <param name="deltaChunk">Amount added to the chunk size after each chunk (clamped so the size stays at least 1). A good starting value is 0</param>
public static void EncodeLomont1(Bitstream bitstream, uint value32, int chunkSize, int deltaChunk)
{
    uint mask = (1U << chunkSize) - 1;

    // The limit must be unsigned: the signed form (1 << chunkSize) is int.MinValue for
    // chunkSize == 31, and a uint compared with a negative int is always greater,
    // which kept this loop from ever terminating.
    while (value32 >= (1U << chunkSize))
    {
        bitstream.Write(1, 1); // another chunk follows
        bitstream.Write(value32 & mask, (uint)chunkSize); // low chunkSize bits
        value32 >>= chunkSize;

        if (deltaChunk != 0)
        {
            // adaptive chunk size, never below one bit
            chunkSize += deltaChunk;
            if (chunkSize <= 0)
            {
                chunkSize = 1;
            }
            mask = (1U << chunkSize) - 1;
        }
    }

    bitstream.Write(0, 1); // last chunk
    bitstream.Write(value32 & mask, (uint)chunkSize);
}
/// <summary>
/// Encode integer N as Sk(L) 0 B(N), where L is the bit length of N.
///
/// For a fixed integer k greater than 1 the prefix Sk is defined recursively
/// (see Recurse): a number that fits in k bits is written in k bits, otherwise
/// the prefix of (Floor[Log_2 n] - k) is written first, followed by n in
/// Floor[Log_2 n] + 1 bits.
/// The 0 written before the final block lets the decoder know this is the last block.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value">Value to encode</param>
/// <param name="k">Width parameter of the code, must be greater than 1</param>
public static void Encode(Bitstream bitstream, uint value, uint k)
{
    // prefix: the recursively coded bit length of the value
    Recurse(bitstream, CodecBase.BitsRequired(value), k);

    // separator announcing the final block
    bitstream.Write(0);

    // final block: the value itself
    bitstream.Write(value);
}
/// <summary>
/// Gamma style code: with L the number of bits needed to store the value,
/// write L-1 zero bits followed by the value in L bits (which necessarily starts with a one).
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value32">Value to encode, must be at least 1</param>
public static void EncodeGamma(Bitstream bitstream, uint value32)
{
    Trace.Assert(value32 >= 1);

    uint bitCount = CodecBase.BitsRequired(value32);

    // unary length prefix
    for (uint zerosLeft = bitCount - 1; zerosLeft > 0; --zerosLeft)
    {
        bitstream.Write(0);
    }

    bitstream.Write(value32, bitCount);
}
/// <summary>
/// Compress a symbol in the compression algorithm.
/// Narrows the [lowValue, highValue] interval to the symbol's share of the counts,
/// then rescales: while the interval lies entirely in one half a bit is emitted
/// (followed by any pending opposite bits), and while it straddles the middle
/// inside the central half the pending counter is increased instead.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="symbol">Symbol to encode</param>
public override void CompressSymbol(Bitstream bitstream, uint symbol)
{
    if (Options.HasFlag(OptionFlags.DumpState))
    {
        Write($"[{symbol:X2},{lowValue:X8},{highValue:X8}] ");
    }

    uint cumulativeLow, cumulativeHigh;
    GetCounts(symbol, out cumulativeLow, out cumulativeHigh);
    Trace.Assert(total < (1 << 29));

    // narrow the interval; the top is open, hence the +1 / -1.
    // highValue must be updated first since it needs the old lowValue.
    uint width = (highValue - lowValue + 1) / total;
    highValue = lowValue + width * cumulativeHigh - 1;
    lowValue += width * cumulativeLow;

    // e1/e2 scaling: the interval sits completely in the lower or the upper half
    while (true)
    {
        if (highValue < Range50Percent)
        {
            bitstream.Write(0);
            lowValue = 2 * lowValue;
            highValue = 2 * highValue + 1;

            // flush pending e3 bits
            while (scaling > 0)
            {
                bitstream.Write(1);
                scaling--;
            }
        }
        else if (lowValue >= Range50Percent)
        {
            bitstream.Write(1);
            lowValue = 2 * (lowValue - Range50Percent);
            highValue = 2 * (highValue - Range50Percent) + 1;

            // flush pending e3 bits
            while (scaling > 0)
            {
                bitstream.Write(0);
                scaling--;
            }
        }
        else
        {
            break;
        }
    }

    // e3 scaling: interval straddles the midpoint but stays within the middle half
    while (lowValue >= Range25Percent && highValue < Range75Percent)
    {
        scaling++;
        lowValue = 2 * (lowValue - Range25Percent);
        highValue = 2 * (highValue - Range25Percent) + 1;
    }
}
/// <summary>
/// Encode a 32 bit value into the bitstream using Lomont method 3:
/// with n the number of bits needed for the value, write n-1 zero bits, a single
/// one bit ending the count, then the value in n-1 bits (its leading 1 is dropped).
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value32">Value to encode, must be greater than 0</param>
public static void EncodeLomont3(Bitstream bitstream, uint value32)
{
    Trace.Assert(value32 > 0);

    // number of bits that remain once the leading 1 is removed
    uint payloadBits = CodecBase.BitsRequired(value32) - 1;

    for (uint zerosLeft = payloadBits; zerosLeft > 0; --zerosLeft)
    {
        bitstream.Write(0, 1);
    }
    bitstream.Write(1, 1); // end of the count

    bitstream.Write(value32, payloadBits);
}
/// <summary>
/// Encode a 32 bit value into the bitstream using Lomont method 2:
/// the value is emitted a byte at a time, least significant byte first, each byte
/// preceded by a flag bit (1 = more bytes follow, 0 = this is the last byte).
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value32">Value to encode; asserted to be non-zero once the leading bytes are written</param>
public static void EncodeLomont2(Bitstream bitstream, uint value32)
{
    const uint byteMask = 255;

    // every byte except the most significant one
    for (; value32 > byteMask; value32 >>= 8)
    {
        bitstream.Write(1, 1);
        bitstream.Write(value32 & byteMask, 8);
    }

    Trace.Assert(value32 > 0);

    // most significant byte, flagged as last
    bitstream.Write(0, 1);
    bitstream.Write(value32 & byteMask, 8);
}
/// <summary>
/// Encode a 32 bit value into the bitstream using Elias Delta coding.
/// For a number X greater than 0:
/// 1. N = number of bits needed to store X
/// 2. L = number of bits needed to store N
/// 3. Write L-1 zeroes
/// 4. Write N in L bits (which starts with a 1)
/// 5. Write X in N-1 bits, i.e. without its leading 1
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value32">Value to encode, must be at least 1</param>
public static void EncodeDelta(Bitstream bitstream, uint value32)
{
    Trace.Assert(value32 >= 1);

    uint valueBits = CodecBase.BitsRequired(value32);
    uint lengthBits = CodecBase.BitsRequired(valueBits);

    // unary prefix announcing how many bits the length field takes
    uint zerosLeft = lengthBits - 1;
    while (zerosLeft > 0)
    {
        bitstream.Write(0, 1);
        --zerosLeft;
    }

    bitstream.Write(valueBits, lengthBits);
    bitstream.Write(value32, valueBits - 1);
}
/// <summary>
/// Golomb code, useful for geometric distributions.
///
/// With integer parameter m, a value N is split into q = Floor(N/m) and r = N % m.
/// The output is q one bits, a single zero bit, then r encoded by
/// Truncated.Encode over the range m.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value">Value to encode</param>
/// <param name="m">Golomb parameter, must be greater than 0</param>
public static void Encode(Bitstream bitstream, uint value, uint m)
{
    Trace.Assert(m > 0);

    uint quotient = value / m;
    uint remainder = value % m;

    // quotient in unary: ones closed by a zero
    for (uint onesLeft = quotient; onesLeft > 0; --onesLeft)
    {
        bitstream.Write(1);
    }
    bitstream.Write(0);

    Truncated.Encode(bitstream, remainder, m);
}
/// <summary>
/// Write the recursive prefix for n: a number below 2^k is written directly in k bits;
/// otherwise the prefix for (FloorLog2(n) - k) is written first, followed by n
/// in FloorLog2(n) + 1 bits.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="n">Number to write</param>
/// <param name="k">Width of the innermost block, must be greater than 1</param>
static void Recurse(Bitstream bitstream, uint n, uint k)
{
    Trace.Assert(k > 1);

    uint directLimit = 1U << (int)k;
    if (n >= directLimit)
    {
        // too large for k bits: describe its top bit position first, then the number
        uint topBit = CodecBase.FloorLog2(n);
        Recurse(bitstream, topBit - k, k);
        bitstream.Write(n, topBit + 1);
        return;
    }

    bitstream.Write(n, k);
}
/// <summary>
/// Exponential Golomb code for x greater than or equal to 0.
/// Order 0: write (x+1) in binary using n bits, preceded by n-1 zero bits.
/// Order k greater than 0: apply the order 0 code to Floor[x/2^k], then
/// write x mod 2^k in k bits.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value">Value to encode</param>
/// <param name="k">Order of the code</param>
public static void EncodeExp(Bitstream bitstream, uint value, uint k)
{
    if (k == 0)
    {
        uint shifted = value + 1;
        uint width = CodecBase.BitsRequired(shifted);
        bitstream.Write(0, width - 1); // zero prefix
        bitstream.Write(shifted, width);
        return;
    }

    // high part with the order 0 code, low k bits verbatim
    EncodeExp(bitstream, value >> (int)k, 0);
    uint lowBits = value & ((1U << (int)k) - 1);
    bitstream.Write(lowBits, k);
}
/// <summary>
/// Useful for encoding a value in [0,N).
/// With k = BitsRequired(N), there are u = 2^k - N unused codewords. Values below u
/// are written in k-1 bits; the remaining values are written as value + u in k bits.
/// NOTE(review): when BitsRequired(n) is 32 the shift count wraps (C# uses only the
/// low five bits of the count) - confirm callers keep n below 2^31.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value">Value to encode, must be less than n</param>
/// <param name="n">Exclusive upper bound of the value range</param>
public static void Encode(Bitstream bitstream, uint value, uint n)
{
    Trace.Assert(value < n);

    uint width = CodecBase.BitsRequired(n);
    uint unusedCodewords = (1U << (int)width) - n;

    bool useShortCode = value < unusedCodewords;
    uint code = useShortCode ? value : value + unusedCodewords;
    uint codeBits = useShortCode ? width - 1 : width;

    bitstream.Write(code, codeBits);
}
/// <summary>
/// Encode a value of at least 1 as a chain of length-prefixed groups:
/// starting from the value, each number other than 1 is recorded and replaced by
/// (its bit length - 1). The recorded numbers are then written from the last
/// recorded to the first, and a single 0 closes the code.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="value32">Value to encode, must be at least 1</param>
public static void EncodeOmega(Bitstream bitstream, uint value32)
{
    Trace.Assert(value32 >= 1);

    var groups = new Stack<uint>();
    for (uint current = value32; current != 1; current = CodecBase.BitsRequired(current) - 1)
    {
        groups.Push(current);
    }

    // a Stack enumerates newest entry first, i.e. the shortest group leads
    foreach (uint entry in groups)
    {
        bitstream.Write(entry);
    }

    bitstream.Write(0);
}
/// <summary>
/// Compress a stream of numbers:
/// Store length+1 using the universal coder (the +1 avoids the 0 case).
/// The first number x0 requires b0 bits: write b0 with the universal coder, then x0.
/// Each subsequent number xi requires bi bits. If bi is at most b(i-1), write a 0 bit
/// and then xi in b(i-1) bits. Otherwise write (bi - b(i-1)) one bits, a 0, and then
/// xi in bi - 1 bits (the leading 1 of xi is implied, xi is greater than 0).
/// TODO - alternatives - allow the bi to change slowly, removes some hiccups for odd data points, set b to avg of some prev values
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="data">Numbers to encode</param>
/// <param name="universalCoder">How to encode/decode a data length and first bitlength items. Elias.EncodeDelta is useful</param>
public static void BinaryAdaptiveSequentialEncode(Bitstream bitstream, Datastream data, Action<Bitstream, uint> universalCoder)
{
    universalCoder(bitstream, (uint)data.Count + 1);
    if (data.Count == 0)
    {
        return;
    }

    // first item: explicit bit length, then the item
    uint first = data[0];
    uint previousBits = CodecBase.BitsRequired(first);
    universalCoder(bitstream, previousBits);
    bitstream.Write(first);

    for (var index = 1; index < data.Count; ++index)
    {
        uint item = data[index];
        uint itemBits = CodecBase.BitsRequired(item);

        if (itemBits > previousBits)
        {
            // needs more bits than the previous item: say how many more, in unary
            Trace.Assert(item > 0);
            for (uint extra = itemBits - previousBits; extra > 0; --extra)
            {
                bitstream.Write(1);
            }
            bitstream.Write(0, 1); // end of the bit count
            bitstream.Write(item, itemBits - 1); // leading '1' stripped
        }
        else
        {
            // the previous width is enough
            bitstream.Write(0);
            bitstream.Write(item, previousBits);
        }

        // the width of this item is the reference for the next one
        previousBits = itemBits;
    }
}
/// <summary>
/// Compress a symbol: look up its leaf and write the leaf's codeword,
/// most significant bit first.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="symbol">Symbol to encode</param>
public override void CompressSymbol(Bitstream bitstream, uint symbol)
{
    var leaf = leaves.Find(candidate => candidate.Symbol == symbol);
    var codeword = leaf.Codeword;

    // walk the bit positions from the top down to 0
    int bitIndex = (int)codeword.BitLength - 1;
    while (bitIndex >= 0)
    {
        bitstream.Write(codeword.GetBit((uint)bitIndex), 1);
        --bitIndex;
    }

    if (Options.HasFlag(OptionFlags.DumpEncoding))
    {
        Write($"{symbol:X2},");
    }
}
/// <summary>
/// Write the header for the compression algorithm.
/// Due to the vagaries of this format the entire file is written in this call;
/// the encode symbol and footer sections are ignored.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="data">Data to compress</param>
/// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
{
    // start from empty working streams
    decisions.Clear();
    decisionRuns.Clear();
    literals.Clear();
    distances.Clear();
    lengths.Clear();
    tokens.Clear();

    // split the input into the separate streams
    uint actualMinLength, actualMaxDistance;
    ComputeStreams(data, out actualMinLength, out actualMaxDistance);

    // diagnostics to help analysis
    if (Options.HasFlag(OptionFlags.DumpDebug))
    {
        WriteLine("LZCL compress:");
        WriteLine($" Data length {data.Count} ");
    }
    if (Options.HasFlag(OptionFlags.ShowTallies))
    {
        Write("Length tally: ");
        Tally(lengths);
        WriteLine();
        Write("Distance tally: ");
        Tally(distances);
        WriteLine();
    }

    // compress every stream up front so the smallest representation can be chosen
    var bestDecisions = GetBestCompressor("decisions", decisions);
    var bestDecisionRuns = GetBestCompressor("decision runs", decisionRuns);
    var bestLiterals = GetBestCompressor("literals", literals);
    var bestTokens = GetBestCompressor("tokens", tokens);
    var bestDistances = GetBestCompressor("distances", distances);
    var bestLengths = GetBestCompressor("lengths", lengths);

    // header values
    Header.WriteUniversalHeader(bitstream, data, headerFlags);

    // max distance occurring is used to encode tokens, and tells users the window size needed
    UniversalCodec.Lomont.EncodeLomont1(bitstream, actualMaxDistance, 10, 0);
    UniversalCodec.Lomont.EncodeLomont1(bitstream, actualMinLength, 2, 0);
    if (Options.HasFlag(OptionFlags.DumpDebug))
    {
        WriteLine($"actual min length {actualMinLength}");
        WriteLine($"Max distance {actualMaxDistance}");
    }

    // decisions: stored directly, or as runs when that is not larger
    bool storeDecisionRuns = bestDecisions.Item2.Length >= bestDecisionRuns.Item2.Length;
    if (storeDecisionRuns)
    {
        bitstream.Write(1); // choice marker
        bitstream.Write(decisions[0]); // initial value the runs start from
        WriteItem(bitstream, bestDecisionRuns);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Decisions runs smaller than decisions");
        }
        StatRecorder.AddStat($"codec used: decision runs {bestDecisionRuns.Item1.Name}", 1);
    }
    else
    {
        bitstream.Write(0); // choice marker
        WriteItem(bitstream, bestDecisions);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Decisions smaller than decision runs");
        }
        StatRecorder.AddStat($"codec used: decisions {bestDecisions.Item1.Name}", 1);
    }

    // literals
    WriteItem(bitstream, bestLiterals);
    StatRecorder.AddStat($"codec used: literals {bestLiterals.Item1.Name}", 1);

    // matches: combined tokens, or separate distance and length streams when not larger
    bool storeSeparatePairs =
        bestTokens.Item2.Length >= bestDistances.Item2.Length + bestLengths.Item2.Length;
    if (storeSeparatePairs)
    {
        bitstream.Write(1); // choice marker
        WriteItem(bitstream, bestDistances);
        WriteItem(bitstream, bestLengths);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Distance,length pairs smaller than tokens");
        }
        StatRecorder.AddStat($"codec used: distances {bestDistances.Item1.Name}", 1);
        StatRecorder.AddStat($"codec used: lengths {bestLengths.Item1.Name}", 1);
    }
    else
    {
        bitstream.Write(0); // choice marker
        WriteItem(bitstream, bestTokens);
        if (Options.HasFlag(OptionFlags.DumpDebug))
        {
            WriteLine("Tokens smaller than distance,length pairs");
        }
        StatRecorder.AddStat($"codec used: tokens {bestTokens.Item1.Name}", 1);
    }
}
/// <summary>
/// Create the frequency table as a bitstream for ease of use/testing.
/// Format:
/// - min symbol index used, max symbol index used, each Lomont1 universal coded
/// - number of bits in the table, Lomont1 universal coded (allows jumping past)
/// - full table: counts BASC encoded, maxIndex - minIndex + 1 entries
/// </summary>
/// <returns>Bitstream holding the encoded table</returns>
Bitstream MakeFrequencyTable()
{
    var bs = new Bitstream();

    // extremes of the counts, reported in the table dump below
    uint maxCount = counts.Max();
    uint minCount = counts.Where(c => c > 0).Min();

    // Findings behind the format:
    // Of Elias, optimized Golomb and BASC, BASC is slightly best for storing counts.
    // A sparse table (counts plus the symbols present) was much bigger in every file
    // tested, so only the full table over the used symbol range is written.

    // first and last symbol with a non-zero count
    uint minSymbolIndex = UInt32.MaxValue;
    uint maxSymbolIndex = 0;
    for (uint symbol = 0; symbol < counts.Length; ++symbol)
    {
        if (counts[symbol] == 0)
        {
            continue;
        }
        if (minSymbolIndex == UInt32.MaxValue)
        {
            minSymbolIndex = symbol;
        }
        maxSymbolIndex = symbol;
    }

    UniversalCodec.Lomont.EncodeLomont1(bs, minSymbolIndex, 6, 0);
    UniversalCodec.Lomont.EncodeLomont1(bs, maxSymbolIndex, 6, 0);

    // BASC encode the counts of the used range into a stream of their own
    var usedCounts = counts
        .Skip((int)minSymbolIndex)
        .Take((int)(maxSymbolIndex - minSymbolIndex + 1))
        .ToArray();
    var fullTableBs = new Bitstream();
    UniversalCodec.BinaryAdaptiveSequentialEncode(
        fullTableBs,
        new Datastream(usedCounts),
        (b, v) => UniversalCodec.Lomont.EncodeLomont1(b, v, 6, 0));

    // table size in bits first, then the table
    UniversalCodec.Lomont.EncodeLomont1(bs, fullTableBs.Length, 6, 0);
    bs.WriteStream(fullTableBs);

    if (Options.HasFlag(OptionFlags.DumpHeader))
    {
        WriteLine($"Arith encode: min symb index {minSymbolIndex} max symb index {maxSymbolIndex} tbl bits {fullTableBs.Length}");
    }

    if (Options.HasFlag(OptionFlags.DumpTable))
    {
        WriteLine($"Arith table bitsize {bs.Length}, min symbol ? max symbol ? min count {minCount} max count {maxCount}");
        for (var i = 0; i < counts.Length; ++i)
        {
            if (counts[i] != 0)
            {
                Write($"[{i},{counts[i]}] ");
            }
        }
        WriteLine();
    }

    return bs;
}
/// <summary>
/// Compress a symbol: the symbol is written unchanged using BitsPerSymbol bits.
/// </summary>
/// <param name="bitstream">Output stream</param>
/// <param name="symbol">Symbol to write</param>
public override void CompressSymbol(Bitstream bitstream, uint symbol)
    => bitstream.Write(symbol, BitsPerSymbol);
/// <summary>
/// Given the leaf nodes, create a canonical Huffman compression table.
/// Format is
///   Lomont1 coded (bits per symbol - 1)
///   Lomont1 coded (bits per codeword-length count - 1)
///   Lomont1 coded (minimum codeword length - 1)
///   Lomont1 coded (max codeword length - min codeword length - 1)
///   then, for each codeword length from min to max, the count of symbols with
///   that length followed by that many symbols.
/// Symbols are taken from the list in order, so the leaves are expected to be
/// ordered by codeword length.
/// </summary>
/// <param name="leaves1">Leaf nodes of the tree, at least one</param>
/// <returns>Bitstream holding the encoded table</returns>
Bitstream MakeTable(List<Node> leaves1)
{
    Trace.Assert(leaves1.Count > 0);

    // range of codeword lengths in use
    uint maxCodewordLength = leaves1.Max(n => n.Codeword.BitLength);
    uint minCodewordLength = leaves1.Min(n => n.Codeword.BitLength);
    WriteLine($"Min, max codeword lengths {minCodewordLength} {maxCodewordLength}");

    // number of leaves for each length; entry 0 belongs to the minimum length
    uint lengthSpan = maxCodewordLength - minCodewordLength + 1;
    var leavesPerLength = new List<int>();
    for (uint offset = 0; offset < lengthSpan; ++offset)
    {
        uint wanted = minCodewordLength + offset;
        leavesPerLength.Add(leaves1.Count(leaf => leaf.Codeword.BitLength == wanted));
    }

    if (Options.HasFlag(OptionFlags.LogCodewordLengths))
    {
        for (uint offset = 0; offset < lengthSpan; ++offset)
        {
            uint codewordLength = minCodewordLength + offset;
            StatRecorder.AddStat($"Huffman_Codeword_{codewordLength}", (uint)leavesPerLength[(int)offset]);
        }
    }

    Trace.Assert(leavesPerLength.Sum() == leaves1.Count);

    // field widths for symbols and for the per-length counts
    uint bitsPerSymbol = BitsRequired(leaves1.Max(n => n.Symbol));
    // codeword length is < alphabet size (proof: look at tree to make codewords)
    // the largest count of codewords of a given length is ceiling (log_2(alphabet size))
    // look at construction tree to see this
    var bitsPerCodelengthCount = BitsRequired((uint)leavesPerLength.Max());

    if (Options.HasFlag(OptionFlags.DumpDictionary))
    {
        // table dump for debugging
        WriteLine("Make huffman tree:");
        for (uint offset = 0; offset < lengthSpan; ++offset)
        {
            uint length = minCodewordLength + offset;
            int leafCount = leavesPerLength[(int)offset];
            Write($" {length,3}: {leafCount,3} -> ");
            foreach (var leaf in leaves1.Where(n => n.Codeword.BitLength == length))
            {
                Write($"x{leaf.Symbol:X2}, ");
            }
            WriteLine();
        }
    }

    // output: the field sizes, then counts of distinct lengths, then the symbols
    var bs = new Bitstream();

    // the minimum codeword length and the delta to the maximum are stored
    uint deltaCodewordLength = maxCodewordLength - minCodewordLength;

    // header values
    // NOTE(review): deltaCodewordLength - 1 wraps to UInt32.MaxValue when every codeword
    // has the same length - presumably the reader adds 1 back with the same wraparound; confirm.
    UniversalCodec.Lomont.EncodeLomont1(bs, bitsPerSymbol - 1, 3, 0);          // 1-32, usually 8, subtracting 1 gives 7, fits in 3 bits
    UniversalCodec.Lomont.EncodeLomont1(bs, bitsPerCodelengthCount - 1, 3, 0); // usually 4,5,6
    UniversalCodec.Lomont.EncodeLomont1(bs, minCodewordLength - 1, 2, 0);      // quite often 1,2,3,4, usually small
    UniversalCodec.Lomont.EncodeLomont1(bs, deltaCodewordLength - 1, 4, -1);   // 9-12, up to 16,17

    if (Options.HasFlag(OptionFlags.DumpHeader))
    {
        WriteLine("Huffman encode header:");
        WriteLine($" bits per symbol {bitsPerSymbol} bits per code length count {bitsPerCodelengthCount}");
        WriteLine($" min len code {minCodewordLength} delta code len {deltaCodewordLength}");
    }

    // table body - one entry per codeword length: the count, then that many symbols
    int nextLeaf = 0;
    foreach (int leafCount in leavesPerLength)
    {
        bs.Write((uint)leafCount, bitsPerCodelengthCount);
        for (int end = nextLeaf + leafCount; nextLeaf < end; ++nextLeaf)
        {
            bs.Write(leaves1[nextLeaf].Symbol, bitsPerSymbol);
        }
    }

    return bs;
}