Пример #1
0
        /// <summary>
        /// Write the universal header and, for non-empty input, the symbol table
        /// built from the canonically relabeled leaves of the code tree.
        /// </summary>
        /// <param name="bitstream">Stream that receives the header and symbol table</param>
        /// <param name="data">Symbols that are about to be compressed</param>
        /// <param name="headerFlags">Flags forwarded to the universal header writer</param>
        public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            Header.WriteUniversalHeader(bitstream, data, headerFlags);

            // empty input: only the universal header is written
            if (data.Count != 0)
            {
                // build the code tree and label every node with its codeword
                var root = MakeTree(data);
                AssignBitStrings(root);

                // collect the leaves (the symbols); the tree itself is not used after this
                leaves.Clear();
                GetLeaves(root);

                // relabel the codewords into canonical order, which the table writer requires
                MakeCanonical(leaves);

                var symbolTable = MakeTable(leaves);
                bitstream.WriteStream(symbolTable);

                // opening bracket of the optional encoding dump
                if (Options.HasFlag(OptionFlags.DumpEncoding))
                {
                    WriteLine("Huff encode: [");
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Binary adaptive sequential coding of a stream of numbers.
        /// Layout written to the bitstream:
        ///  - item count + 1, via the universal coder (the +1 avoids encoding 0)
        ///  - b0, the bit length of the first number x0, via the universal coder, then x0 itself
        ///  - for each later number xi with bit length bi:
        ///      bi leq b(i-1): a 0 bit, then xi in b(i-1) bits
        ///      otherwise: (bi - b(i-1)) 1 bits, a 0 bit, then the low bi - 1 bits of xi
        ///      (the leading 1 of xi is implied since xi > 0 in this case)
        /// TODO - alternatives - let the bit length change slowly (e.g. average of some previous
        /// values) to smooth over odd data points
        /// </summary>
        /// <param name="bitstream">Stream the encoding is appended to</param>
        /// <param name="data">Numbers to encode</param>
        /// <param name="universalCoder">How to encode the data length and the first bit length. Elias.EncodeDelta is useful</param>
        public static void BinaryAdaptiveSequentialEncode(Bitstream bitstream, Datastream data, Action <Bitstream, uint> universalCoder)
        {
            var count = data.Count;

            universalCoder(bitstream, (uint)count + 1);
            if (count == 0)
            {
                return;
            }

            // first item: its bit length, then the value
            uint previousBits = CodecBase.BitsRequired(data[0]);
            universalCoder(bitstream, previousBits);
            bitstream.Write(data[0]);

            for (var index = 1; index < count; ++index)
            {
                uint value       = data[index];
                uint currentBits = CodecBase.BitsRequired(value);

                if (currentBits > previousBits)
                {
                    // needs more bits than the previous item: send the growth in unary
                    Trace.Assert(value > 0);
                    for (var ones = currentBits - previousBits; ones > 0; --ones)
                    {
                        bitstream.Write(1);
                    }
                    bitstream.Write(0, 1);                   // terminates the unary bit count
                    bitstream.Write(value, currentBits - 1); // leading '1' is implied
                }
                else
                {
                    // the previous width is wide enough
                    bitstream.Write(0);
                    bitstream.Write(value, previousBits);
                }

                // the width of this item drives the next one
                previousBits = currentBits;
            }
        }
Пример #3
0
        // end encoding  notes http://www3.sympatico.ca/mt0000/biacode/biacode.html
        // and http://bijective.dogma.net/compres10.htm

        /// <summary>
        /// Write the header for the compression algorithm: gathers the symbol counts,
        /// resets the coder, then writes the universal header followed by the frequency table.
        /// </summary>
        /// <param name="bitstream">Stream that receives the header</param>
        /// <param name="data">Symbols that are about to be compressed</param>
        /// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
        public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            // total frequency is simply the number of symbols
            total = (uint)data.Count;

            // one counter per possible symbol value, tallied over the input
            counts = new uint[data.Max() + 1];
            foreach (var symbol in data)
            {
                counts[symbol]++;
            }
            MakeSums();

            ResetCoder();

            // the total probability is read back from the header, so the symbol count is always stored
            Header.WriteUniversalHeader(bitstream, data, headerFlags | Header.HeaderFlags.SymbolCount);

            // remember this spot: the bit length of what follows is inserted here during the footer
            whereToInsertBitlength = bitstream.Position;

            // frequency table comes right after
            bitstream.WriteStream(MakeFrequencyTable());

            if (Options.HasFlag(OptionFlags.DumpState))
            {
                Write("Enc: ");
            }
        }
Пример #4
0
        /// <summary>
        /// Build the probability tree: every symbol is fed to AddNode, then the two
        /// lowest-frequency nodes are merged under a new parent until one root is left.
        /// </summary>
        /// <param name="data">Symbols to build the tree from; must produce at least one node</param>
        /// <returns>Root node of the tree</returns>
        Node MakeTree(Datastream data)
        {
            var pending = new List <Node>();

            // register every symbol occurrence
            foreach (var item in data)
            {
                AddNode(item, pending);
            }

            while (pending.Count > 1)
            {
                // lowest frequency node; the strict comparison keeps the earliest node on ties
                var left = pending.Aggregate(
                    (lowest, candidate) => candidate.FrequencyCount < lowest.FrequencyCount ? candidate : lowest);
                pending.Remove(left);

                // lowest of what remains, i.e. the second lowest overall
                var right = pending.Aggregate(
                    (lowest, candidate) => candidate.FrequencyCount < lowest.FrequencyCount ? candidate : lowest);
                pending.Remove(right);

                // parent carries the combined frequency and goes back into the pool
                var parent = new Node
                {
                    LeftChild      = left,
                    RightChild     = right,
                    FrequencyCount = left.FrequencyCount + right.FrequencyCount
                };
                pending.Add(parent);
            }

            return pending[0];
        }
Пример #5
0
        /// <summary>
        /// Try the available codecs on the data, compress the data with the one that
        /// produced the fewest bits, and record statistics about how much it saved.
        /// </summary>
        /// <param name="label">Name of the stream, used in statistics and dump output</param>
        /// <param name="data">Values to compress</param>
        /// <returns>The type of the winning codec and the bitstream it produced</returns>
        /// <exception cref="NotImplementedException">The best result came from a codec type this method cannot instantiate</exception>
        private Tuple <Type, Bitstream> GetBestCompressor(
            string label,
            List <uint> data
            )
        {
            // use this to check each stream
            var cc = new CompressionChecker {
                Options = CompressionOptions
            };
            var stream  = new Datastream(data);
            var results = cc.TestAll(label, stream, internalFlags);

            // smallest compressed size first
            results.Sort((a, b) => a.CompressedBitLength.CompareTo(b.CompressedBitLength));

            var       best = results[0];
            CodecBase codec;

            // instantiate the winning codec
            if (best.CompressorType == typeof(FixedSizeCodec))
            {
                codec = new FixedSizeCodec();
            }
            else if (best.CompressorType == typeof(ArithmeticCodec))
            {
                codec = new ArithmeticCodec();
            }
            else if (best.CompressorType == typeof(HuffmanCodec))
            {
                codec = new HuffmanCodec();
            }
            else if (best.CompressorType == typeof(GolombCodec))
            {
                // reuse the parameter recorded for the Golomb result
                codec = new GolombCodec {
                    Parameter = best.Parameters[0]
                };
            }
            else
            {
                throw new NotImplementedException("Unknown codec type");
            }

            var bitstream = codec.CompressToStream(stream, internalFlags);

            var codecName = codec.GetType().Name;

            // stats: which codec won, and the margin over the worst and the runner-up
            StatRecorder.AddStat("codec win: " + label + " " + codecName, 1);
            StatRecorder.AddStat($"codec win {codecName} saved high ", results.Last().CompressedBitLength - best.CompressedBitLength);
            if (results.Count > 1)
            {
                StatRecorder.AddStat($"codec win {codecName} saved low  ", results[1].CompressedBitLength - best.CompressedBitLength);
            }

            if (Options.HasFlag(OptionFlags.DumpCompressorSelections))
            {
                WriteLine($"{label} using {codecName}");
            }

            return(new Tuple <Type, Bitstream>(codec.GetType(), bitstream));
        }
Пример #6
0
        /// <summary>
        /// Write the header for the compression algorithm and set
        /// <c>BitsPerSymbol</c> to the width needed for the largest value in the data.
        /// </summary>
        /// <param name="bitstream">Stream that receives the header</param>
        /// <param name="data">Symbols that are about to be compressed</param>
        /// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
        public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            // the bits-per-symbol field is stored regardless of what the caller asked for
            var flagsWithBitsPerSymbol = headerFlags | Header.HeaderFlags.BitsPerSymbol;
            Header.WriteUniversalHeader(bitstream, data, flagsWithBitsPerSymbol);

            // an empty stream is treated as if its largest value were 0
            BitsPerSymbol = BitsRequired(data.Any() ? data.Max() : 0);
        }
Пример #7
0
        /// <summary>
        /// Replace each entry with the difference from the previous entry.
        /// The first entry is left alone. An empty stream yields an empty result.
        /// </summary>
        /// <param name="data">Values to delta code; not modified</param>
        /// <returns>A new stream holding the deltas</returns>
        public static Datastream DeltaCode(Datastream data)
        {
            var d = new Datastream(data);

            // nothing to difference; indexing element 0 below would throw on an empty stream
            if (data.Count == 0)
            {
                return d;
            }

            for (var i = data.Count - 1; i >= 1; --i)
            {
                // NOTE(review): the byte cast keeps only the low 8 bits of the difference,
                // which looks intended for byte-valued data - confirm for wider symbols
                d[i] = (byte)(data[i] - data[i - 1]);
            }
            d[0] = data[0];
            return d;
        }
Пример #8
0
        /// <summary>
        /// Stream version: read the header, decode as many symbols as the header
        /// reports, then read the footer.
        /// </summary>
        /// <param name="bitstream">Stream to decode from</param>
        /// <param name="headerFlags">Flags passed on to the header reader</param>
        /// <returns>The decoded symbols</returns>
        public Datastream DecompressFromStream(Bitstream bitstream, Header.HeaderFlags headerFlags)
        {
            uint total   = ReadHeader(bitstream, headerFlags);
            var  decoded = new Datastream();

            // count down instead of up; one symbol is decoded per step
            for (uint remaining = total; remaining > 0; --remaining)
            {
                var symbol = DecompressSymbol(bitstream);
                decoded.Add(symbol);
            }

            ReadFooter(bitstream);
            return decoded;
        }
Пример #9
0
        /// <summary>
        /// Process the data to compress, creating various list of values to be compressed later.
        /// Walks the input once; at each position it searches the preceding symbols for the longest
        /// matching run and appends either a (distance, length) pair or a literal to the
        /// <c>decisions</c>, <c>distances</c>, <c>lengths</c> and <c>literals</c> lists.
        /// </summary>
        /// <param name="data">Symbols to scan</param>
        private void ScanData(Datastream data)
        {
            uint index = 0; // index of symbol to encode

            while (index < data.Count)
            {
                // find best run
                uint bestDistance = 0;
                uint bestLength   = 0;
                // walk backwards to end with smallest distance for a given run length
                // distance 0 refers to the symbol immediately before index
                for (int distance = (int)MaximumDistance; distance >= 0; --distance)
                {
                    uint length = 0;
                    // the candidate source position is index - 1 - distance + length
                    // NOTE(review): index - 1 is unsigned arithmetic; for index == 0 it wraps in an
                    // unchecked context and the upper bound test below rejects it - confirm the
                    // project is not compiled with checked arithmetic
                    while (
                        // bound check the values
                        0 <= index - 1 - distance + length &&
                        index - 1 - distance + length < data.Count &&
                        index + length < data.Count &&
                        // check the bytes match
                        data[(int)(index - 1 - distance + length)] == data[(int)(index + length)] &&
                        // don't get longer than this
                        length < MaximumLength
                        )
                    {
                        length++;
                    }
                    // keep best
                    if (length >= bestLength)
                    {
                        // break ties in this direction to have shorter distances
                        bestLength   = length;
                        bestDistance = (uint)distance;
                    }
                }

                // now with best run, decide how to encode next part
                if (bestLength >= MinimumLength)
                {
                    // encode (distance,length) token
                    decisions.Add(1); // select (distance,length)
                    distances.Add(bestDistance);
                    lengths.Add(bestLength);
                    index += bestLength; // skip everything the run covers
                }
                else
                {
                    // encode literal
                    decisions.Add(0);               // select literal
                    literals.Add(data[(int)index]); // save value
                    index++;
                }
            }
        }
Пример #10
0
        /// <summary>
        /// Stream version to keep from 0 padding last byte:
        /// writes the header, every symbol in order, then the footer into a fresh bitstream.
        /// </summary>
        /// <param name="data">Symbols to compress</param>
        /// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
        /// <returns>The bitstream holding header, compressed symbols and footer</returns>
        public Bitstream CompressToStream(Datastream data, Header.HeaderFlags headerFlags)
        {
            var output = new Bitstream();

            WriteHeader(output, data, headerFlags);

            // symbols are encoded one at a time, in input order
            foreach (var value in data)
            {
                CompressSymbol(output, value);
            }

            WriteFooter(output);

            return output;
        }
Пример #11
0
 /// <summary>
 /// Write a universal compression header. Depending on the flags it holds the
 /// symbol count and/or the bits needed per symbol (stored minus one).
 /// </summary>
 /// <param name="bitstream">Stream that receives the header fields</param>
 /// <param name="data">Data the header describes</param>
 /// <param name="headerFlags">Selects which fields are written</param>
 public static void WriteUniversalHeader(Bitstream bitstream, Datastream data, HeaderFlags headerFlags)
 {
     var wantsSymbolCount = (headerFlags & HeaderFlags.SymbolCount) == HeaderFlags.SymbolCount;
     if (wantsSymbolCount)
     {
         UniversalCodec.Lomont.EncodeLomont1(bitstream, (uint)data.Count, 6, 0);
     }

     var wantsBitsPerSymbol = (headerFlags & HeaderFlags.BitsPerSymbol) == HeaderFlags.BitsPerSymbol;
     if (wantsBitsPerSymbol)
     {
         // an empty stream is treated as if its largest value were 0
         var width = CodecBase.BitsRequired(data.Any() ? data.Max() : 0);
         UniversalCodec.Lomont.EncodeLomont1(bitstream, width - 1, 3, 0);
     }
 }
Пример #12
0
        /// <summary>
        /// Compute the header values from the scanned streams (extremes, largest token,
        /// bit widths), store them in <c>encoderState</c> and write the header to the bitstream.
        /// </summary>
        /// <param name="bitstream">Stream that receives the header</param>
        /// <param name="data">Original data; passed to the universal header and used for the dump output</param>
        /// <param name="headerFlags">Flags forwarded to the universal header writer</param>
        private void PackHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            // actual value extremes occurring in data streams
            encoderState.ActualMaxDistance = distances.Any() ? distances.Max() : 0;
            encoderState.ActualMinLength   = lengths.Any() ? lengths.Min() : 0;
            // token (distance,length) is encoded as
            //   (length - actualMinLength) * (actualMaxDistance+1) + distance
            // compute largest occurring token value
            uint actualMaxToken = 0;

            for (var i = 0; i < distances.Count; ++i)
            {
                uint length   = lengths[i];
                uint distance = distances[i];
                uint token    = (length - encoderState.ActualMinLength) * (encoderState.ActualMaxDistance + 1) + distance;
                actualMaxToken = Math.Max(actualMaxToken, token);
            }

            // bit sizes
            // guard the literal maximum like the extremes above: Max() throws on an empty list
            // NOTE(review): assumes BitsRequired(0) is at least 1, as the assert below demands - confirm
            encoderState.ActualBitsPerSymbol = BitsRequired(literals.Any() ? literals.Max() : 0);
            encoderState.ActualBitsPerToken  = BitsRequired(actualMaxToken);

            // checks: the header stores the symbol width minus one
            Trace.Assert(0 < encoderState.ActualBitsPerSymbol);

            // header values
            Header.WriteUniversalHeader(bitstream, data, headerFlags);
            UniversalCodec.Lomont.EncodeLomont1(bitstream, encoderState.ActualBitsPerSymbol - 1, 3, 0); // usually 8, -1 => 7, fits in 3 bits
            UniversalCodec.Lomont.EncodeLomont1(bitstream, encoderState.ActualBitsPerToken - 1, 5, 0);  // around 20 or so, depends on parameters
            UniversalCodec.Lomont.EncodeLomont1(bitstream, encoderState.ActualMinLength, 2, 0);         // usually 2
            UniversalCodec.Lomont.EncodeLomont1(bitstream, actualMaxToken, 25, -10);
            UniversalCodec.Lomont.EncodeLomont1(bitstream, encoderState.ActualMaxDistance, 14, -7);

            if (Options.HasFlag(OptionFlags.DumpHeader))
            {
                WriteLine("LZ77 header:");
                WriteLine($"   datalen {data.Count} bits/symbol {encoderState.ActualBitsPerSymbol}, bits/token {encoderState.ActualBitsPerToken}, minLen {encoderState.ActualMinLength}, max token {actualMaxToken}, max dist {encoderState.ActualMaxDistance}");
            }
        }
Пример #13
0
 /// <summary>
 /// Write the universal header followed by the Golomb parameter.
 /// With the Optimize option set, the parameter is first recomputed from the data.
 /// </summary>
 /// <param name="bitstream">Stream that receives the header</param>
 /// <param name="data">Symbols that are about to be compressed</param>
 /// <param name="headerFlags">Flags forwarded to the universal header writer and the optimizer</param>
 public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
 {
     Header.WriteUniversalHeader(bitstream, data, headerFlags);

     // optionally pick the parameter from the data itself
     var shouldOptimize = Options.HasFlag(OptionFlags.Optimize);
     if (shouldOptimize)
     {
         Parameter = Optimize(data, headerFlags);
     }

     var shouldDump = Options.HasFlag(OptionFlags.DumpDebug);
     if (shouldDump)
     {
         WriteLine($"Golomb encode parameter {Parameter}");
     }

     // the parameter goes into the stream right after the universal header
     UniversalCodec.Lomont.EncodeLomont1(bitstream, Parameter, Lomont1Parameter, 0);
 }
Пример #14
0
        /// <summary>
        /// Write the header for the compression algorithm. The whole input is scanned
        /// here first, because the header is packed from the results of that scan.
        /// </summary>
        /// <param name="bitstream">Stream that receives the header</param>
        /// <param name="data">Symbols that are about to be compressed</param>
        /// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
        public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            // start from empty work lists
            lengths.Clear();
            distances.Clear();
            literals.Clear();
            decisions.Clear();

            // scan the input to refill the lists, then emit the header
            ScanData(data);
            PackHeader(bitstream, data, headerFlags);

            // the encoder state is reset only after the header has been packed
            encoderState.Reset();

            var dumpEncode = Options.HasFlag(OptionFlags.DumpEncode);
            if (dumpEncode)
            {
                Write("LZ77 stream: ");
            }
        }
Пример #15
0
        /// <summary>
        /// Run every compression method enabled in <c>Options</c> over the data and
        /// collect the compressed size each one achieves (bitLength, type, optional parameters).
        /// Every result is also recorded as a statistic.
        /// </summary>
        /// <param name="statPrefix">Prefix for the recorded statistic names</param>
        /// <param name="data">Symbols to compress</param>
        /// <param name="headerFlags">Header flags handed to the CodecBase-derived codecs</param>
        /// <returns>One result per method tried, in the order the methods were tried</returns>
        public List <Result> TestAll(string statPrefix, Datastream data, Header.HeaderFlags headerFlags)
        {
            var results = new List <Result>();

            // compresses with a CodecBase-derived codec and records the outcome
            Action <string, CodecBase, Type> tryCodec = (label, codec, codecType) =>
            {
                var compressed = codec.CompressToStream(data, headerFlags);
                var entry      = new Result(label, compressed.Length, codecType);

                // a codec that is exactly a GolombCodec also reports the parameter it used
                var golomb = codec as GolombCodec;
                if (golomb != null && golomb.GetType() == typeof(GolombCodec))
                {
                    entry.Parameters.Add(golomb.Parameter);
                    entry.CompressorName += $"({golomb.Parameter})";
                }
                results.Add(entry);
            };

            // compresses with a universal code; every value is shifted up by one first
            Action <string, UniversalCodec.UniversalCodeDelegate, Type> tryEncoder = (label, codec, codecType) =>
            {
                var shifted    = data.Select(v => v + 1).ToList();
                var compressed = UniversalCodec.CompressStream(codec, shifted);
                results.Add(new Result(label, compressed.Length, codecType));
            };

            // codec classes
            if (Options.HasFlag(OptionFlags.UseFixed))
            {
                tryCodec("Fixed size", new FixedSizeCodec(), typeof(FixedSizeCodec));
            }

            if (Options.HasFlag(OptionFlags.UseArithmetic))
            {
                tryCodec("Arithmetic", new ArithmeticCodec(), typeof(ArithmeticCodec));
            }

            if (Options.HasFlag(OptionFlags.UseHuffman))
            {
                tryCodec("Huffman", new HuffmanCodec(), typeof(HuffmanCodec));
            }

            // Golomb is only attempted when every value is below the threshold
            // (an earlier, disabled experiment here searched Golomb parameters via UniversalCodec.Optimize)
            if (Options.HasFlag(OptionFlags.UseGolomb) && data.Max() < GolombCodec.GolombThreshold)
            {
                tryCodec("Golomb", new GolombCodec(), typeof(GolombCodec));
            }

            // Elias codes - all perform poorly - todo - need way to pass this back as type?
            if (Options.HasFlag(OptionFlags.UseEliasDelta))
            {
                tryEncoder("EliasDelta", UniversalCodec.Elias.EncodeDelta, typeof(UniversalCodec.Elias));
            }

            if (Options.HasFlag(OptionFlags.UseEliasGamma))
            {
                tryEncoder("EliasGamma", UniversalCodec.Elias.EncodeGamma, typeof(UniversalCodec.Elias));
            }

            if (Options.HasFlag(OptionFlags.UseEliasOmega))
            {
                tryEncoder("EliasOmega", UniversalCodec.Elias.EncodeOmega, typeof(UniversalCodec.Elias));
            }

            // Stout, with its extra argument fixed at 3
            if (Options.HasFlag(OptionFlags.UseStout))
            {
                tryEncoder("Stout", (b, v) => UniversalCodec.Stout.Encode(b, v, 3), typeof(UniversalCodec.Stout));
            }

            // BinaryAdaptiveSequentialEncode, using Elias delta for its universal fields
            if (Options.HasFlag(OptionFlags.UseBasc))
            {
                var bascStream = new Bitstream();
                UniversalCodec.BinaryAdaptiveSequentialEncode(bascStream, data, UniversalCodec.Elias.EncodeDelta);
                results.Add(new Result("BASC", bascStream.Length, typeof(UniversalCodec)));
            }

            // save stats
            foreach (var entry in results)
            {
                var statName = statPrefix + "_" + entry.CompressorName;
                StatRecorder.AddStat(statName, entry.CompressedBitLength);
            }

            return results;
        }
Пример #16
0
 /// <summary>
 /// Write the header for the compression algorithm.
 /// This base implementation writes nothing; it is virtual so that codecs can override it.
 /// </summary>
 /// <param name="bitstream">Stream a header would be written to; unused here</param>
 /// <param name="data">Data about to be compressed; unused here</param>
 /// <param name="headerFlags">Flags telling what to put in the header; unused here</param>
 public virtual void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
 {
 }
Пример #17
0
        /// <summary>
        /// Process the data to compress, creating various list of values to be compressed later.
        /// Fills <c>decisions</c>, <c>literals</c>, <c>distances</c> and <c>lengths</c> from a scan of
        /// the input, then shifts the lengths down by the smallest one, derives <c>tokens</c>
        /// and appends the decision runs to <c>decisionRuns</c>.
        /// </summary>
        /// <param name="data">Symbols to scan; may be empty</param>
        /// <param name="actualMinLength">Smallest run length that occurred, 0 if there were no runs</param>
        /// <param name="actualMaxDistance">Largest distance that occurred, 0 if there were no runs</param>
        private void ComputeStreams(Datastream data, out uint actualMinLength, out uint actualMaxDistance)
        {
            uint index = 0; // index of symbol to encode

            while (index < data.Count)
            {
                // find best run
                uint bestDistance = 0;
                uint bestLength   = 0;
                // walk backwards to end with smallest distance for a given run length
                for (int distance = (int)MaximumDistance; distance >= 0; --distance)
                {
                    uint length = 0;
                    // the candidate source position is index - 1 - distance + length
                    while (
                        // bound check the values
                        0 <= index - 1 - distance + length &&
                        index - 1 - distance + length < data.Count &&
                        index + length < data.Count &&
                        // check the bytes match
                        data[(int)(index - 1 - distance + length)] == data[(int)(index + length)] &&
                        // don't get longer than this
                        length < MaximumLength
                        )
                    {
                        length++;
                    }
                    // keep best
                    if (length >= bestLength)
                    {
                        // break ties in this direction to have shorter distances
                        bestLength   = length;
                        bestDistance = (uint)distance;
                    }
                }

                // now with best run, decide how to encode next part
                if (bestLength >= MinimumLength)
                {
                    // encode (distance,length) token
                    decisions.Add(1); // select (distance,length)
                    distances.Add(bestDistance);
                    lengths.Add(bestLength);
                    index += bestLength;
                }
                else
                {
                    // encode literal
                    decisions.Add(0);                 // select literal
                    literals.Add(data[(int)(index)]); // save value
                    index++;
                }
            }

            // every decision is 0 or 1; All() also holds for an empty list, where Max() would throw
            Trace.Assert(decisions.All(decision => decision < 2));

            // some actually occurring extremes
            actualMaxDistance = distances.Any() ? distances.Max() : 0;
            actualMinLength   = lengths.Any() ? lengths.Min() : 0;

            // shift each length down by the minimum and create its token in case we need it
            for (var i = 0; i < lengths.Count; ++i)
            {
                lengths[i] -= actualMinLength;
                tokens.Add(lengths[i] * (actualMaxDistance + 1) + distances[i]);
            }

            // get runs of each type of decision in case we need them
            decisionRuns.AddRange(GetRuns(decisions));

            if (Options.HasFlag(OptionFlags.ShowTallies))
            {
                Write("Decision runs tally ");
                Tally(decisionRuns);
                WriteLine();
            }
        }
Пример #18
0
        /// <summary>
        /// Create the frequency table as a bitstream for ease of use/testing.
        /// Format:
        ///   - min symbol index used, max symbol index used, Lomont1 universal coded
        ///   - number of bits in the table, Lomont1 universal coded (allows jumping past)
        ///   - full table: counts for maxIndex - minIndex + 1 symbols, BASC encoded
        /// </summary>
        /// <returns>Bitstream holding the encoded frequency table</returns>
        /// <exception cref="InvalidOperationException">No symbol has a nonzero count</exception>
        Bitstream MakeFrequencyTable()
        {
            // Findings from earlier experiments (the disabled code for them was removed):
            // of the three Elias codes, optimized Golomb and BASC, BASC was slightly best for
            // storing counts, and a sparse table (symbols + counts of present symbols only)
            // was much bigger in every file tested. So only the full table type is written.

            var bs = new Bitstream();

            // find the first and last symbol that actually occur
            uint minSymbolIndex = UInt32.MaxValue;
            uint maxSymbolIndex = 0;
            for (var i = 0U; i < counts.Length; ++i)
            {
                if (counts[i] != 0)
                {
                    maxSymbolIndex = i;
                    if (minSymbolIndex == UInt32.MaxValue)
                    {
                        minSymbolIndex = i;
                    }
                }
            }

            // with no nonzero count the index range is meaningless; fail before writing anything
            if (minSymbolIndex == UInt32.MaxValue)
            {
                throw new InvalidOperationException("Cannot build a frequency table: no symbol has a nonzero count");
            }

            UniversalCodec.Lomont.EncodeLomont1(bs, minSymbolIndex, 6, 0);
            UniversalCodec.Lomont.EncodeLomont1(bs, maxSymbolIndex, 6, 0);

            // counts of the symbols in [minSymbolIndex, maxSymbolIndex], zeros included
            var usedCounts = counts
                             .Skip((int)minSymbolIndex)
                             .Take((int)(maxSymbolIndex - minSymbolIndex + 1))
                             .ToArray();

            var fullTableBs = new Bitstream();
            UniversalCodec.BinaryAdaptiveSequentialEncode(
                fullTableBs,
                new Datastream(usedCounts),
                (b, v) => UniversalCodec.Lomont.EncodeLomont1(b, v, 6, 0)
                );

            // the table length goes first so the table can be skipped
            UniversalCodec.Lomont.EncodeLomont1(bs, fullTableBs.Length, 6, 0);
            bs.WriteStream(fullTableBs);

            if (Options.HasFlag(OptionFlags.DumpHeader))
            {
                WriteLine($"Arith encode: min symb index {minSymbolIndex} max symb index {maxSymbolIndex} tbl bits {fullTableBs.Length}");
            }

            if (Options.HasFlag(OptionFlags.DumpTable))
            {
                // count extremes are only needed for this dump, so they are computed here
                uint maxCount = counts.Max();
                uint minCount = counts.Where(c => c > 0).Min();

                WriteLine($"Arith table bitsize {bs.Length}, min symbol ? max symbol ? min count {minCount} max count {maxCount}");
                for (var i = 0; i < counts.Length; ++i)
                {
                    if (counts[i] != 0)
                    {
                        Write($"[{i},{counts[i]}] ");
                    }
                }
                WriteLine();
            }
            return(bs);
        }
Пример #19
0
        /// <summary>
        /// Search for the Golomb parameter that gives the shortest compressed stream for the data.
        /// Starts from a power of two derived from the largest value, halves it while the
        /// compressed length keeps shrinking, then refines the choice over the final range.
        /// </summary>
        /// <param name="data">Values to encode; the largest one must be below 0x80000000.</param>
        /// <param name="headerFlags">Header flags handed to every trial compression.</param>
        /// <returns>The parameter judged best among the candidates examined.</returns>
        uint Optimize(Datastream data, Header.HeaderFlags headerFlags)
        {
            // Golomb bitlength seems to be a convex down function of the parameter:
            // Let non-negative integers a1,a2,a3,..aN, parameter M. b=#bits to encode M, = Floor[log_2 M]+1
            // golomb code then qi=Floor[ai/M],ri=ai-Mqi, unary encode qi in qi+1 bits, encode ri in b or b-1 bits using
            // adaptive code. Then each of these is convex (true?) in M, so sum is convex, so length is convex.
            // Want best parameter between M=1 (total unary) and M=max{ai} = total fixed encoding
            // for large ai, unary uses lots of bits, so start at high end, 2^k=M >= max{ai}, divide by 2 until stops decreasing,
            // then binary search on final range.
            // todo - writeup optimal selection as blog post

            var g = new GolombCodec();

            g.Options &= ~OptionFlags.Optimize; // disable auto optimizer

            // Every trial recompresses the whole stream and the search below asks for the same
            // parameter more than once, so remember the length already measured for each parameter.
            var lengthByParameter = new System.Collections.Generic.Dictionary<uint, uint>();

            // function to compute length given the parameter
            Func<uint, uint> f = m1 =>
            {
                uint known;
                if (lengthByParameter.TryGetValue(m1, out known))
                {
                    return known;
                }
                g.Parameter = m1;
                var bs   = g.CompressToStream(data, headerFlags);
                var len1 = bs.Length;
                lengthByParameter[m1] = len1;
                return(len1);
            };

            var maxValue = data.Max();
            Trace.Assert(maxValue < 0x80000000); // needs to be true to work
            // start parameters
            var m      = 1U << (int)BitsRequired(maxValue);
            var length = f(m);

            // Halve m while the length keeps dropping. The m > 1 test comes first so that m is
            // never halved down to 0, which is not a usable Golomb parameter.
            while (m > 1)
            {
                var previousLength = length;
                m     /= 2;
                length = f(m);
                if (length >= previousLength)
                {
                    break;
                }
            }
            // now best between length and oldLength, binary search
            // todo - search
            Trace.Assert(m > 0);
            var left  = m;
            var right = 2 * m;
            var mid   = 0U;
            var a     = f(left);

            while (left <= right)
            {
                mid = (left + right) / 2;
                var c = f(mid);
                if (c < a)
                {
                    left = mid + 1;
                }
                else
                {
                    right = mid - 1;
                }
            }
            // keep mid - 1 a usable parameter for the neighbour check below
            if (mid == 1)
            {
                mid = 2;
            }
            // check mid, mid+1, mid-1
            uint best;

            if (f(mid) < f(mid + 1))
            {
                if (f(mid - 1) < f(mid))
                {
                    best = mid - 1;
                }
                else
                {
                    best = mid;
                }
            }
            else
            {
                best = mid + 1;
            }
            if (Options.HasFlag(OptionFlags.DumpDebug))
            {
                // mid can be 2 here (it is forced up from 1 above), which makes mid - 2 zero.
                // Zero is not a usable parameter, so print a placeholder instead of compressing with it.
                var twoBelow = mid >= 3 ? f(mid - 2).ToString() : "-";
                WriteLine($"Golomb opt {mid} {twoBelow} {f(mid-1)} {f(mid)} {f(mid+1)} {f(mid+2)} {f(mid+3)}");
            }
            return(best);
        }
Пример #20
0
        /// <summary>
        /// Write the header for the LZCL compression algorithm. Because of how this format is laid
        /// out, the whole compressed output (header values plus every encoded stream) is written
        /// here, and the encode-symbol and footer stages are not used.
        /// </summary>
        /// <param name="bitstream">Destination for the header values and the encoded streams.</param>
        /// <param name="data">Input to compress.</param>
        /// <param name="headerFlags">Flags telling what to put in the header. Useful when embedding in other streams.</param>
        public override void WriteHeader(Bitstream bitstream, Datastream data, Header.HeaderFlags headerFlags)
        {
            // begin with empty working streams
            decisions.Clear();
            decisionRuns.Clear();
            literals.Clear();
            tokens.Clear();
            distances.Clear();
            lengths.Clear();

            // populate the working streams from the input
            uint minLengthSeen;
            uint maxDistanceSeen;
            ComputeStreams(data, out minLengthSeen, out maxDistanceSeen);

            // optional diagnostics
            if (Options.HasFlag(OptionFlags.DumpDebug))
            {
                WriteLine("LZCL compress:");
                WriteLine($"  Data length {data.Count} ");
            }

            if (Options.HasFlag(OptionFlags.ShowTallies))
            {
                // tallies of the match lengths and distances, to help analysis
                Write("Length tally: ");
                Tally(lengths);
                WriteLine();

                Write("Distance tally: ");
                Tally(distances);
                WriteLine();
            }

            // compress each stream up front so the sizes can drive the layout choices below
            var bestDecisions    = GetBestCompressor("decisions", decisions);
            var bestDecisionRuns = GetBestCompressor("decision runs", decisionRuns);
            var bestLiterals     = GetBestCompressor("literals", literals);
            var bestTokens       = GetBestCompressor("tokens", tokens);
            var bestDistances    = GetBestCompressor("distances", distances);
            var bestLengths      = GetBestCompressor("lengths", lengths);

            // header values come first
            Header.WriteUniversalHeader(bitstream, data, headerFlags);

            // largest distance that occurs: used to encode tokens, and tells users the window size needed
            UniversalCodec.Lomont.EncodeLomont1(bitstream, maxDistanceSeen, 10, 0);
            UniversalCodec.Lomont.EncodeLomont1(bitstream, minLengthSeen, 2, 0);

            if (Options.HasFlag(OptionFlags.DumpDebug))
            {
                WriteLine($"actual min length {minLengthSeen}");
                WriteLine($"Max distance {maxDistanceSeen}");
            }

            // decisions: store them directly, or as runs, whichever is not larger
            var storeDecisionRuns = !(bestDecisions.Item2.Length < bestDecisionRuns.Item2.Length);
            if (storeDecisionRuns)
            {
                // selector bit 1 = runs
                bitstream.Write(1);
                // the runs need the first decision value as a starting point
                // NOTE(review): this indexes decisions[0]; confirm decisions cannot be empty here
                bitstream.Write(decisions[0]);
                WriteItem(bitstream, bestDecisionRuns);
                if (Options.HasFlag(OptionFlags.DumpDebug))
                {
                    WriteLine("Decisions runs smaller than decisions");
                }
                StatRecorder.AddStat($"codec used: decision runs {bestDecisionRuns.Item1.Name}", 1);
            }
            else
            {
                // selector bit 0 = plain decisions
                bitstream.Write(0);
                WriteItem(bitstream, bestDecisions);
                if (Options.HasFlag(OptionFlags.DumpDebug))
                {
                    WriteLine("Decisions smaller than decision runs");
                }
                StatRecorder.AddStat($"codec used: decisions {bestDecisions.Item1.Name}", 1);
            }

            // literals are always written
            WriteItem(bitstream, bestLiterals);
            StatRecorder.AddStat($"codec used: literals {bestLiterals.Item1.Name}", 1);

            // matches: combined tokens, or separate distance and length streams
            var storeSeparatePairs =
                !(bestTokens.Item2.Length < bestDistances.Item2.Length + bestLengths.Item2.Length);
            if (storeSeparatePairs)
            {
                // selector bit 1 = separate distance and length streams
                bitstream.Write(1);
                WriteItem(bitstream, bestDistances);
                WriteItem(bitstream, bestLengths);
                if (Options.HasFlag(OptionFlags.DumpDebug))
                {
                    WriteLine("Distance,length pairs smaller than tokens");
                }
                StatRecorder.AddStat($"codec used: distances {bestDistances.Item1.Name}", 1);
                StatRecorder.AddStat($"codec used: lengths {bestLengths.Item1.Name}", 1);
            }
            else
            {
                // selector bit 0 = tokens
                bitstream.Write(0);
                WriteItem(bitstream, bestTokens);
                if (Options.HasFlag(OptionFlags.DumpDebug))
                {
                    WriteLine("Tokens smaller than distance,length pairs");
                }
                StatRecorder.AddStat($"codec used: tokens {bestTokens.Item1.Name}", 1);
            }
        }