예제 #1
0
        /// <summary>
        /// Has a <c>CompressionChecker</c> (configured with <c>CompressionOptions</c>) test the data,
        /// picks the result with the fewest compressed bits, and compresses the data with that codec.
        /// Also records statistics on which codec won and how many bits it saved compared with
        /// the largest result and with the runner-up.
        /// </summary>
        /// <param name="label">Name of the stream; used in statistic names and log output.</param>
        /// <param name="data">Values to compress.</param>
        /// <returns>The winning codec type paired with the bitstream it produced.</returns>
        /// <exception cref="InvalidOperationException">The checker returned no results to choose from.</exception>
        /// <exception cref="NotImplementedException">
        /// The smallest result came from a compressor type this method cannot instantiate.
        /// </exception>
        private Tuple <Type, Bitstream> GetBestCompressor(
            string label,
            List <uint> data
            )
        {
            // use this to check each stream
            var checker = new CompressionChecker {
                Options = CompressionOptions
            };
            var stream  = new Datastream(data);
            var results = checker.TestAll(label, stream, internalFlags);

            // without at least one candidate there is nothing to pick; fail with a clear message
            // instead of an index error on results[0]
            if (results.Count == 0)
            {
                throw new InvalidOperationException($"No compression results available for '{label}'");
            }

            // smallest compressed size first
            results.Sort((a, b) => a.CompressedBitLength.CompareTo(b.CompressedBitLength));

            var       best = results[0];
            CodecBase codec;

            if (best.CompressorType == typeof(FixedSizeCodec))
            {
                codec = new FixedSizeCodec();
            }
            else if (best.CompressorType == typeof(ArithmeticCodec))
            {
                codec = new ArithmeticCodec();
            }
            else if (best.CompressorType == typeof(HuffmanCodec))
            {
                codec = new HuffmanCodec();
            }
            else if (best.CompressorType == typeof(GolombCodec))
            {
                // reuse the parameter that was recorded with the winning result
                codec = new GolombCodec {
                    Parameter = best.Parameters[0]
                };
            }
            else
            {
                throw new NotImplementedException("Unknown codec type");
            }

            // the checker only reported sizes, so compress again with the winner to get the bits
            var bitstream = codec.CompressToStream(stream, internalFlags);

            var codecName = codec.GetType().Name;

            StatRecorder.AddStat("codec win: " + label + " " + codecName, 1);
            // after sorting, the last entry is the largest candidate
            StatRecorder.AddStat($"codec win {codecName} saved high ", results.Last().CompressedBitLength - best.CompressedBitLength);
            if (results.Count > 1)
            {
                // savings compared with the runner-up
                StatRecorder.AddStat($"codec win {codecName} saved low  ", results[1].CompressedBitLength - best.CompressedBitLength);
            }

            if (Options.HasFlag(OptionFlags.DumpCompressorSelections))
            {
                WriteLine($"{label} using {codecName}");
            }

            return new Tuple <Type, Bitstream>(codec.GetType(), bitstream);
        }
예제 #2
0
        /// <summary>
        /// Compresses the data with every compressor enabled in <c>Options</c> and reports
        /// how many bits each attempt produced. Each size is also recorded as a statistic.
        /// </summary>
        /// <param name="statPrefix">Prefix for the per-compressor statistic names.</param>
        /// <param name="data">Stream to compress.</param>
        /// <param name="headerFlags">Header flags forwarded to each codec's <c>CompressToStream</c>.</param>
        /// <returns>
        /// One result per compressor tried, in the order tried
        /// (name, compressed bit length, type, optional parameters).
        /// </returns>
        public List <Result> TestAll(string statPrefix, Datastream data, Header.HeaderFlags headerFlags)
        {
            var outcomes = new List <Result>();

            Func <OptionFlags, bool> enabled = flag => Options.HasFlag(flag);

            // compress with a CodecBase codec and record the size under the codec's own type
            Action <string, CodecBase> measureCodec = (name, candidate) =>
            {
                var compressed = candidate.CompressToStream(data, headerFlags);
                var outcome    = new Result(name, compressed.Length, candidate.GetType());
                if (candidate.GetType() == typeof(GolombCodec))
                {
                    // keep the Golomb parameter with the result and show it in the name
                    var golomb = (GolombCodec)candidate;
                    outcome.Parameters.Add(golomb.Parameter);
                    outcome.CompressorName += $"({golomb.Parameter})";
                }
                outcomes.Add(outcome);
            };

            // codec classes
            if (enabled(OptionFlags.UseFixed))
            {
                measureCodec("Fixed size", new FixedSizeCodec());
            }
            if (enabled(OptionFlags.UseArithmetic))
            {
                measureCodec("Arithmetic", new ArithmeticCodec());
            }
            if (enabled(OptionFlags.UseHuffman))
            {
                measureCodec("Huffman", new HuffmanCodec());
            }
            // Golomb is only attempted when every value is below the codec's threshold
            if (enabled(OptionFlags.UseGolomb) && data.Max() < GolombCodec.GolombThreshold)
            {
                measureCodec("Golomb", new GolombCodec());
            }

            // NOTE: an earlier, now disabled, variant searched for the best Golomb parameter with
            // UniversalCodec.Optimize over 1..Math.Min(data.Max(), 256) instead of using GolombCodec.

            // compress with a universal code; every value is shifted up by one first
            // (presumably because these codes cannot represent zero - TODO confirm)
            Action <string, UniversalCodec.UniversalCodeDelegate, Type> measureUniversal = (name, encode, reportedType) =>
            {
                var shifted    = data.Select(v => v + 1).ToList();
                var compressed = UniversalCodec.CompressStream(encode, shifted);
                outcomes.Add(new Result(name, compressed.Length, reportedType));
            };

            // Elias codes - all perform poorly.
            // TODO: the three variants are reported under the same type, so a caller cannot tell
            // them apart from the result type alone.
            if (enabled(OptionFlags.UseEliasDelta))
            {
                measureUniversal("EliasDelta", UniversalCodec.Elias.EncodeDelta, typeof(UniversalCodec.Elias));
            }
            if (enabled(OptionFlags.UseEliasGamma))
            {
                measureUniversal("EliasGamma", UniversalCodec.Elias.EncodeGamma, typeof(UniversalCodec.Elias));
            }
            if (enabled(OptionFlags.UseEliasOmega))
            {
                measureUniversal("EliasOmega", UniversalCodec.Elias.EncodeOmega, typeof(UniversalCodec.Elias));
            }

            // Stout, with a fixed third argument of 3
            if (enabled(OptionFlags.UseStout))
            {
                measureUniversal("Stout", (b, v) => UniversalCodec.Stout.Encode(b, v, 3), typeof(UniversalCodec.Stout));
            }

            // binary adaptive sequential encoding, using Elias delta as the inner code
            if (enabled(OptionFlags.UseBasc))
            {
                var bascBits = new Bitstream();
                UniversalCodec.BinaryAdaptiveSequentialEncode(bascBits, data, UniversalCodec.Elias.EncodeDelta);
                outcomes.Add(new Result("BASC", bascBits.Length, typeof(UniversalCodec)));
            }

            // record every size as a statistic
            outcomes.ForEach(o => StatRecorder.AddStat(statPrefix + "_" + o.CompressorName, o.CompressedBitLength));

            return outcomes;
        }
예제 #3
0
        /// <summary>
        /// Given the leaf nodes, serialize the Huffman codeword table into a new bitstream.
        /// Format is four Lomont1-coded header values:
        ///    bitsPerSymbol - 1
        ///    bitsPerCodelengthCount - 1
        ///    minCodewordLength - 1
        ///    deltaCodewordLength - 1   (delta = max codeword length - min codeword length)
        /// Then, for each codeword length from min to max: the count of symbols with that length
        /// (bitsPerCodelengthCount bits) followed by that many symbols (bitsPerSymbol bits each).
        /// Symbols are taken from the list in order.
        /// NOTE(review): this presumably requires the leaves to be sorted by codeword length - confirm with caller.
        /// </summary>
        /// <param name="leaves1">Leaf nodes carrying a symbol and its codeword; must not be empty.</param>
        /// <returns>The bitstream holding the header values and the table.</returns>
        Bitstream MakeTable(List <Node> leaves1)
        {
            Trace.Assert(leaves1.Count > 0);

            // range of codeword lengths in use
            uint longest  = leaves1.Max(n => n.Codeword.BitLength);
            uint shortest = leaves1.Min(n => n.Codeword.BitLength);

            WriteLine($"Min, max codeword lengths {shortest} {longest}");

            // tally[i] = number of leaves whose codeword is (shortest + i) bits long
            var tally = new int[(int)(longest - shortest) + 1];

            foreach (var leaf in leaves1)
            {
                tally[(int)(leaf.Codeword.BitLength - shortest)]++;
            }

            if (Options.HasFlag(OptionFlags.LogCodewordLengths))
            {
                for (var i = 0; i < tally.Length; ++i)
                {
                    StatRecorder.AddStat($"Huffman_Codeword_{shortest + (uint)i}", (uint)tally[i]);
                }
            }

            Trace.Assert(tally.Sum() == leaves1.Count);

            // field widths: one wide enough for the largest symbol, one for the largest count
            uint bitsPerSymbol          = BitsRequired(leaves1.Max(n => n.Symbol));
            var  bitsPerCodelengthCount = BitsRequired((uint)tally.Max());

            if (Options.HasFlag(OptionFlags.DumpDictionary))
            {
                // write table for debugging
                WriteLine("Make huffman tree:");
                for (var i = 0; i < tally.Length; ++i)
                {
                    uint bitLength = shortest + (uint)i;
                    Write($"  {bitLength,3}: {tally[i],3} -> ");
                    foreach (var leaf in leaves1.Where(n => n.Codeword.BitLength == bitLength))
                    {
                        Write($"x{leaf.Symbol:X2}, ");
                    }
                    WriteLine();
                }
            }

            // store the minimum codeword length and the delta up to the maximum
            uint deltaCodewordLength = longest - shortest;

            var bs = new Bitstream();

            // all header values
            // NOTE(review): the "- 1" operands are unsigned; a zero value would wrap - confirm
            // the decoder expects that.
            UniversalCodec.Lomont.EncodeLomont1(bs, bitsPerSymbol - 1, 3, 0);          // 1-32, usually 8, subtracting 1 gives 7, fits in 3 bits
            UniversalCodec.Lomont.EncodeLomont1(bs, bitsPerCodelengthCount - 1, 3, 0); // usually 4,5,6
            UniversalCodec.Lomont.EncodeLomont1(bs, shortest - 1, 2, 0);               // quite often 1,2,3,4, usually small
            UniversalCodec.Lomont.EncodeLomont1(bs, deltaCodewordLength - 1, 4, -1);   // 9-12, up to 16,17

            if (Options.HasFlag(OptionFlags.DumpHeader))
            {
                WriteLine("Huffman encode header:");
                WriteLine($"   bits per symbol {bitsPerSymbol} bits per code length count {bitsPerCodelengthCount}");
                WriteLine($"   min len code {shortest} delta code len {deltaCodewordLength}");
            }

            // table body - one entry per codeword length in range: the count, then that many symbols
            int next = 0;

            foreach (int count in tally)
            {
                bs.Write((uint)count, bitsPerCodelengthCount);
                for (int end = next + count; next < end; ++next)
                {
                    bs.Write(leaves1[next].Symbol, bitsPerSymbol);
                }
            }

            return bs;
        }
예제 #4
0
        /// <summary>
        /// Write the header for the compression algorithm.
        /// For this format the complete compressed output is produced here: the universal header,
        /// the maximum distance and minimum length seen, and then the compressed sub-streams.
        /// For decisions and for tokens versus distance/length pairs, the smaller alternative
        /// is written after a choice value of 0 or 1.
        /// </summary>
        /// <param name="bitstream">Destination for all output.</param>
        /// <param name="data">Data to compress.</param>
        /// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
        public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            Func <bool> debugging = () => Options.HasFlag(OptionFlags.DumpDebug);

            // start from empty working streams
            decisions.Clear();
            decisionRuns.Clear();
            literals.Clear();
            distances.Clear();
            lengths.Clear();
            tokens.Clear();

            // fill in all the data streams
            uint minLengthSeen, maxDistanceSeen;

            ComputeStreams(data, out minLengthSeen, out maxDistanceSeen);

            // due to the vagaries of this format, the entire file is written in the header call,
            // and the encode symbol and footer sections are ignored

            // dump info to help analyze
            if (debugging())
            {
                WriteLine("LZCL compress:");
                WriteLine($"  Data length {data.Count} ");
            }

            if (Options.HasFlag(OptionFlags.ShowTallies))
            {
                // tallies to help analysis and tuning decisions
                Write("Length tally: ");
                Tally(lengths);
                WriteLine();

                Write("Distance tally: ");
                Tally(distances);
                WriteLine();
            }

            // compress every stream up front so the alternatives can be compared by size
            var decisionChoice     = GetBestCompressor("decisions", decisions);
            var decisionRunsChoice = GetBestCompressor("decision runs", decisionRuns);
            var literalsChoice     = GetBestCompressor("literals", literals);
            var tokensChoice       = GetBestCompressor("tokens", tokens);
            var distancesChoice    = GetBestCompressor("distances", distances);
            var lengthsChoice      = GetBestCompressor("lengths", lengths);

            // write header values
            Header.WriteUniversalHeader(bitstream, data, headerFlags);

            // save max distance occurring, used to encode tokens, very useful to users to know window needed size
            UniversalCodec.Lomont.EncodeLomont1(bitstream, maxDistanceSeen, 10, 0);
            UniversalCodec.Lomont.EncodeLomont1(bitstream, minLengthSeen, 2, 0);

            if (debugging())
            {
                WriteLine($"actual min length {minLengthSeen}");
                WriteLine($"Max distance {maxDistanceSeen}");
            }

            // decisions: raw stream, or initial value plus runs - whichever is smaller
            bool rawDecisionsSmaller = decisionChoice.Item2.Length < decisionRunsChoice.Item2.Length;

            if (rawDecisionsSmaller)
            {
                // choice value 0: raw decisions follow
                bitstream.Write(0);
                WriteItem(bitstream, decisionChoice);
                if (debugging())
                {
                    WriteLine("Decisions smaller than decision runs");
                }
                StatRecorder.AddStat($"codec used: decisions {decisionChoice.Item1.Name}", 1);
            }
            else
            {
                // choice value 1: the first decision value, then the runs
                bitstream.Write(1);
                bitstream.Write(decisions[0]);
                WriteItem(bitstream, decisionRunsChoice);
                if (debugging())
                {
                    WriteLine("Decisions runs smaller than decisions");
                }
                StatRecorder.AddStat($"codec used: decision runs {decisionRunsChoice.Item1.Name}", 1);
            }

            // literals are always written
            WriteItem(bitstream, literalsChoice);
            StatRecorder.AddStat($"codec used: literals {literalsChoice.Item1.Name}", 1);

            // tokens, or separate distance and length streams - whichever is smaller
            var  separateBits   = distancesChoice.Item2.Length + lengthsChoice.Item2.Length;
            bool tokensSmaller  = tokensChoice.Item2.Length < separateBits;

            if (tokensSmaller)
            {
                // choice value 0: tokens follow
                bitstream.Write(0);
                WriteItem(bitstream, tokensChoice);
                if (debugging())
                {
                    WriteLine("Tokens smaller than distance,length pairs");
                }
                StatRecorder.AddStat($"codec used: tokens {tokensChoice.Item1.Name}", 1);
            }
            else
            {
                // choice value 1: distances, then lengths
                bitstream.Write(1);
                WriteItem(bitstream, distancesChoice);
                WriteItem(bitstream, lengthsChoice);
                if (debugging())
                {
                    WriteLine("Distance,length pairs smaller than tokens");
                }
                StatRecorder.AddStat($"codec used: distances {distancesChoice.Item1.Name}", 1);
                StatRecorder.AddStat($"codec used: lengths {lengthsChoice.Item1.Name}", 1);
            }
        }