Ejemplo n.º 1
0
 /// <summary>
 /// Builds a writer for <paramref name="format"/>: selects the bulk encoder for the
 /// format / bit width pair, asks it how many iterations fit the given
 /// <paramref name="valueCount"/> and <paramref name="mem"/>, and allocates one byte
 /// buffer and one long buffer sized from that iteration count.
 /// </summary>
 /// <param name="format"> the format, stored on the writer and used to pick the encoder </param>
 /// <param name="out"> the destination, forwarded to the base constructor </param>
 /// <param name="valueCount"> the number of values, forwarded to the base constructor </param>
 /// <param name="bitsPerValue"> the number of bits per value </param>
 /// <param name="mem"> passed to the encoder when computing the iteration count </param>
 internal PackedWriter(PackedInts.Format format, DataOutput @out, int valueCount, int bitsPerValue, int mem)
     : base(@out, valueCount, bitsPerValue)
 {
     // Progress bookkeeping starts from scratch.
     Finished = false;
     Written = 0;
     Off = 0;

     Format_Renamed = format;
     Encoder = BulkOperation.Of(format, bitsPerValue);
     Iterations = Encoder.ComputeIterations(valueCount, mem);

     // Both buffers scale with the iteration count chosen by the encoder.
     var blockBytes = Iterations * Encoder.ByteBlockCount();
     var valueSlots = Iterations * Encoder.ByteValueCount();
     NextBlocks = new byte[blockBytes];
     NextValues = new long[valueSlots];
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Compresses <c>bytes[off:off+len]</c> into <paramref name="out"/> using
        /// at most 16KB of memory. <paramref name="ht"/> must not be shared across
        /// threads, but the same instance can safely be reused for later calls.
        /// </summary>
        /// <param name="bytes"> the buffer holding the input </param>
        /// <param name="off"> index of the first input byte </param>
        /// <param name="len"> number of input bytes </param>
        /// <param name="out"> the destination for the compressed stream </param>
        /// <param name="ht"> the hash table; it is reset with <paramref name="len"/> before use </param>
        public static void Compress(byte[] bytes, int off, int len, DataOutput @out, HashTable ht)
        {
            int origin = off;
            int stop = off + len;

            // Literals not yet emitted start at the first input byte; the search for
            // matches starts one byte later.
            int pending = off;
            off++;

            if (len > LAST_LITERALS + MIN_MATCH)
            {
                int limit = stop - LAST_LITERALS;
                int matchLimit = limit - MIN_MATCH;
                ht.Reset(len);
                int hashLog = ht.HashLog;
                PackedInts.Mutable table = ht.hashTable;

                while (off <= limit)
                {
                    // Step forward one byte at a time until the table yields a usable candidate.
                    int candidate;
                    for (; ; ++off)
                    {
                        if (off >= matchLimit)
                        {
                            // No further match is attempted; the rest is emitted as literals.
                            goto flushTail;
                        }

                        int word = ReadInt(bytes, off);
                        int slot = Hash(word, hashLog);
                        candidate = origin + (int)table.Get(slot);
                        Debug.Assert(PackedInts.BitsRequired(off - origin) <= table.BitsPerValue);
                        table.Set(slot, off - origin);

                        bool inRange = off - candidate < MAX_DISTANCE;
                        if (inRange && ReadInt(bytes, candidate) == word)
                        {
                            break;
                        }
                    }

                    // The first MIN_MATCH bytes are already known to be equal; extend from there.
                    int matchLen = MIN_MATCH + CommonBytes(bytes, candidate + MIN_MATCH, off + MIN_MATCH, limit);

                    EncodeSequence(bytes, pending, candidate, off, matchLen, @out);
                    off += matchLen;
                    pending = off;
                }
            flushTail: ;
            }

            // Whatever follows the last emitted sequence goes out as trailing literals.
            int literalLen = stop - pending;
            Debug.Assert(literalLen >= LAST_LITERALS || literalLen == len);
            EncodeLastLiterals(bytes, pending, literalLen, @out);
        }
 /// <summary>
 /// Writes <paramref name="i"/> in variable-length form: 7 payload bits per byte with
 /// the high bit set on every byte except the last. Per the original note this is the
 /// same as <c>DataOutput.WriteVLong</c> but accepts negative values.
 /// </summary>
 /// <remarks>
 /// At most 8 continuation bytes are written; the bits that remain after those
 /// 8 shifts go out as one final byte, so an encoded value is never longer than 9 bytes.
 /// </remarks>
 /// <param name="out"> the destination </param>
 /// <param name="i"> the value to write; may be negative </param>
 internal static void WriteVLong(DataOutput @out, long i)
 {
     // The whole body is unchecked: for negative inputs the (ulong) reinterpretation
     // and the final byte truncation are intentional wrap-arounds, and must not throw
     // when the assembly is compiled with overflow checking enabled.
     unchecked
     {
         for (int continuationBytes = 0; (i & ~0x7FL) != 0L && continuationBytes < 8; continuationBytes++)
         {
             @out.WriteByte((byte)((i & 0x7FL) | 0x80L));
             // Logical (zero-filling) shift, so a negative value eventually drains to zero.
             i = (long)((ulong)i >> 7);
         }
         @out.WriteByte((byte)i);
     }
 }
 /// <summary>
 /// Sole constructor. Validates <paramref name="blockSize"/>, attaches the writer to
 /// <paramref name="out"/> by calling <c>Reset</c>, and allocates a buffer holding
 /// <paramref name="blockSize"/> values.
 /// </summary>
 /// <param name="out"> the output this writer will wrap </param>
 /// <param name="blockSize"> the number of values of a single block, must be a multiple of <c>64</c> </param>
 public AbstractBlockPackedWriter(DataOutput @out, int blockSize)
 {
     // Range-checked against MIN_BLOCK_SIZE / MAX_BLOCK_SIZE before anything is allocated.
     PackedInts.CheckBlockSize(blockSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
     // NOTE(review): Reset runs before Values is assigned; if Reset is virtual, an
     // override must not rely on Values (or on subclass state) here. Confirm.
     Reset(@out);
     Values = new long[blockSize];
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Encodes a final node output value into a <see cref="DataOutput"/>.
 /// By default this simply delegates to <c>Write(T, DataOutput)</c>.
 /// </summary>
 /// <param name="output"> the final output value to encode </param>
 /// <param name="out"> the destination to write to </param>
 public virtual void WriteFinalOutput(T output, DataOutput @out)
 {
     // Default: final outputs use the same encoding as ordinary outputs.
     Write(output, @out);
 }
 /// <summary>
 /// Re-targets this writer at <paramref name="out"/> and clears its progress state
 /// (offset, ordinal and finished flag). The block size remains unchanged.
 /// </summary>
 /// <param name="out"> the new destination; asserted to be non-null in debug builds </param>
 public virtual void Reset(DataOutput @out)
 {
     Debug.Assert(@out != null);

     // Rewind the bookkeeping first, then attach the new destination.
     Finished = false;
     Ord_Renamed = 0L;
     Off = 0;
     this.@out = @out;
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Serializes this mutable into <paramref name="out"/>: a header, followed by every
 /// value in index order. A reader instantiated from the generated data will have
 /// the same number of bits per value.
 /// </summary>
 /// <param name="out"> the destination to write to </param>
 public virtual void Save(DataOutput @out)
 {
     var sink = GetWriterNoHeader(@out, Format, Size(), BitsPerValue, DEFAULT_BUFFER_SIZE);
     sink.WriteHeader();

     // Copy the values across one by one, in index order.
     int index = 0;
     while (index < Size())
     {
         sink.Add(Get(index));
         ++index;
     }

     sink.Finish();
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Encodes an output value into a <see cref="DataOutput"/>.
 /// </summary>
 /// <param name="output"> the value to encode </param>
 /// <param name="out"> the destination to write to </param>
 public abstract void Write(T output, DataOutput @out);
Ejemplo n.º 9
0
 /// <summary>
 /// Writes the metadata of one term to <paramref name="out"/>: the frequency start
 /// as a delta against the previously encoded state, the skip offset when it is set,
 /// and the prox start delta when positions are indexed.
 /// </summary>
 /// <param name="empty"> not read by this implementation </param>
 /// <param name="out"> the destination for the encoded metadata </param>
 /// <param name="fieldInfo"> not read by this implementation </param>
 /// <param name="_state"> the term state to encode; cast to <c>StandardTermState</c> </param>
 /// <param name="absolute"> when true, deltas are taken against the empty state instead of the previous term </param>
 public override void EncodeTerm(long[] empty, DataOutput @out, FieldInfo fieldInfo, BlockTermState _state, bool absolute)
 {
     var current = (StandardTermState)_state;

     // An absolute write restarts the delta chain from the empty state.
     if (absolute)
     {
         LastState = EmptyState;
     }

     var freqDelta = current.FreqStart - LastState.FreqStart;
     @out.WriteVLong(freqDelta);

     // -1 marks "no skip offset"; any offset actually written is asserted positive.
     bool hasSkipOffset = current.SkipOffset != -1;
     if (hasSkipOffset)
     {
         Debug.Assert(current.SkipOffset > 0);
         @out.WriteVLong(current.SkipOffset);
     }

     bool hasPositions = IndexOptions >= FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
     if (hasPositions)
     {
         var proxDelta = current.ProxStart - LastState.ProxStart;
         @out.WriteVLong(proxDelta);
     }

     // The next term is encoded relative to this one.
     LastState = current;
 }
Ejemplo n.º 10
0
        /// <summary>
        /// Creates a packed integer array writer for the given output, value count
        /// and number of bits per value, choosing the format from
        /// <paramref name="acceptableOverheadRatio"/>.
        /// <para>
        /// The resulting stream will be long-aligned. This means that, depending on
        /// the format used under the hood, up to 63 bits will be wasted. An easy way
        /// to make sure that no space is lost is to always use a
        /// <paramref name="valueCount"/> that is a multiple of 64.
        /// </para>
        /// <para>
        /// This method writes metadata to the stream, so the resulting stream is
        /// sufficient to restore a <c>Reader</c> from it; you don't need to track
        /// <paramref name="valueCount"/> or <paramref name="bitsPerValue"/> yourself.
        /// If that is a problem, look at
        /// <see cref="GetWriterNoHeader(DataOutput, Format, int, int, int)"/>.
        /// </para>
        /// <para>
        /// <paramref name="acceptableOverheadRatio"/> controls how readers restored
        /// from this stream trade space for speed, by selecting a faster but
        /// potentially less memory-efficient implementation. A ratio of
        /// <c>PackedInts.COMPACT</c> selects the most memory-efficient implementation,
        /// whereas <c>PackedInts.FASTEST</c> selects the fastest one. If you only
        /// intend to read the stream sequentially later on, you should probably use
        /// <c>PackedInts.COMPACT</c>.
        /// </para>
        /// @lucene.internal
        /// </summary>
        /// <param name="out">          the data output </param>
        /// <param name="valueCount">   the number of values; asserted non-negative in debug builds </param>
        /// <param name="bitsPerValue"> the number of bits per value </param>
        /// <param name="acceptableOverheadRatio"> an acceptable overhead ratio per value </param>
        /// <returns> a Writer whose header has already been written </returns>
        /// <exception cref="IOException"> If there is a low-level I/O error </exception>
        public static Writer GetWriter(DataOutput @out, int valueCount, int bitsPerValue, float acceptableOverheadRatio)
        {
            Debug.Assert(valueCount >= 0);

            // Resolve the concrete format and bit width for the requested overhead budget.
            var choice = FastestFormatAndBits(valueCount, bitsPerValue, acceptableOverheadRatio);

            var result = GetWriterNoHeader(@out, choice.format, valueCount, choice.bitsPerValue, DEFAULT_BUFFER_SIZE);
            result.WriteHeader();
            return result;
        }
Ejemplo n.º 11
0
 /// <summary>
 /// Writes <paramref name="l"/> as a run of <c>0xFF</c> bytes followed by one final
 /// byte smaller than <c>0xFF</c>; the encoded length is the sum of all bytes written.
 /// </summary>
 /// <param name="l"> the length to encode </param>
 /// <param name="out"> the destination </param>
 private static void EncodeLen(int l, DataOutput @out)
 {
     while (l >= 0xFF)
     {
         @out.WriteByte((byte)0xFF);
         l -= 0xFF;
     }
     // The remainder can be anywhere in 0..254. Going through sbyte would overflow
     // for 128..254 under overflow-checked compilation, so truncate straight to byte
     // and mark the truncation as intentional.
     @out.WriteByte(unchecked((byte)l));
 }
Ejemplo n.º 12
0
        /// <summary>
        /// Writes a token byte, the extended literal length when the literal run is
        /// 15 bytes or longer, and then the literal bytes themselves.
        /// </summary>
        /// <param name="bytes"> the buffer holding the literals </param>
        /// <param name="token"> the token; only its low 8 bits are written </param>
        /// <param name="anchor"> index in <paramref name="bytes"/> of the first literal </param>
        /// <param name="literalLen"> the number of literal bytes </param>
        /// <param name="out"> the destination </param>
        private static void EncodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput @out)
        {
            // Tokens with the top bit set (>= 0x80) are ordinary values here; truncate
            // straight to byte so this cannot throw under overflow-checked compilation.
            @out.WriteByte(unchecked((byte)token));

            // Only the part of the length that exceeds the 0x0F threshold is written
            // as extra length bytes.
            if (literalLen >= 0x0F)
            {
                EncodeLen(literalLen - 0x0F, @out);
            }

            // The raw literal bytes follow.
            @out.WriteBytes(bytes, anchor, literalLen);
        }
Ejemplo n.º 13
0
 /// <summary>
 /// Emits the trailing literal run: a token whose high nibble is the literal count
 /// capped at 15 and whose low nibble is zero, followed by the literals.
 /// </summary>
 /// <param name="bytes"> the buffer holding the literals </param>
 /// <param name="anchor"> index in <paramref name="bytes"/> of the first literal </param>
 /// <param name="literalLen"> the number of literal bytes </param>
 /// <param name="out"> the destination </param>
 private static void EncodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput @out)
 {
     int cappedLen = literalLen < 0x0F ? literalLen : 0x0F;
     EncodeLiterals(bytes, cappedLen << 4, anchor, literalLen, @out);
 }
Ejemplo n.º 14
0
        /// <summary>
        /// Compress <c>bytes[off:off+len]</c> into <paramref name="out"/>. Compared to
        /// <see cref="Compress(byte[], int, int, DataOutput, HashTable)"/>, this method
        /// is slower and uses more memory (~ 256KB per thread) but should provide
        /// better compression ratios (especially on large inputs) because it chooses
        /// the best match among up to 256 candidates and then performs trade-offs to
        /// fix overlapping matches. <paramref name="ht"/> shouldn't be shared across threads
        /// but can safely be reused.
        /// </summary>
        /// <param name="src"> the buffer holding the input </param>
        /// <param name="srcOff"> index of the first input byte </param>
        /// <param name="srcLen"> number of input bytes </param>
        /// <param name="out"> the destination for the compressed stream </param>
        /// <param name="ht"> the match finder; it is reset with <paramref name="srcOff"/> before use </param>
        public static void CompressHC(byte[] src, int srcOff, int srcLen, DataOutput @out, HCHashTable ht)
        {
            int srcEnd = srcOff + srcLen;
            // Limit handed to the match finder: LAST_LITERALS bytes before the end of input.
            int matchLimit = srcEnd - LAST_LITERALS;
            // Last offset at which the main loop still looks for a match.
            int mfLimit = matchLimit - MIN_MATCH;

            int sOff = srcOff;
            // anchor is the start of the literals that have not been emitted yet;
            // searching starts one byte after it.
            int anchor = sOff++;

            ht.Reset(srcOff);
            // match1..match3 are up to three candidate matches in ascending start order;
            // match0 keeps a saved copy of a candidate (see the CopyTo calls below).
            Match match0 = new Match();
            Match match1 = new Match();
            Match match2 = new Match();
            Match match3 = new Match();

            while (sOff <= mfLimit)
            {
                if (!ht.InsertAndFindBestMatch(src, sOff, matchLimit, match1))
                {
                    // nothing found at this offset: advance one byte and retry
                    ++sOff;
                    continue;
                }

                // saved, in case we would skip too much
                CopyTo(match1, match0);

                // "search2" loop: look for a second match that improves on match1.
                while (true)
                {
                    Debug.Assert(match1.Start >= anchor);
                    if (match1.End() >= mfLimit || !ht.InsertAndFindWiderMatch(src, match1.End() - 2, match1.Start + 1, matchLimit, match1.Len, match2))
                    {
                        // no better match
                        EncodeSequence(src, anchor, match1.@ref, match1.Start, match1.Len, @out);
                        anchor = sOff = match1.End();
                        // continue the outer (main) loop
                        goto mainContinue;
                    }

                    if (match0.Start < match1.Start)
                    {
                        if (match2.Start < match1.Start + match0.Len) // empirical
                        {
                            // fall back to the saved candidate
                            CopyTo(match0, match1);
                        }
                    }
                    Debug.Assert(match2.Start > match1.Start);

                    if (match2.Start - match1.Start < 3) // First Match too small : removed
                    {
                        CopyTo(match2, match1);
                        // restart the search2 loop with match2 as the new match1
                        goto search2Continue;
                    }

                    // "search3" loop: match1 and match2 are both candidates; look for a third.
                    while (true)
                    {
                        if (match2.Start - match1.Start < OPTIMAL_ML)
                        {
                            // Cap the length match1 would be written with, and move the
                            // start of match2 forward by the amount it overlaps that length.
                            int newMatchLen = match1.Len;
                            if (newMatchLen > OPTIMAL_ML)
                            {
                                newMatchLen = OPTIMAL_ML;
                            }
                            if (match1.Start + newMatchLen > match2.End() - MIN_MATCH)
                            {
                                newMatchLen = match2.Start - match1.Start + match2.Len - MIN_MATCH;
                            }
                            int correction = newMatchLen - (match2.Start - match1.Start);
                            if (correction > 0)
                            {
                                match2.Fix(correction);
                            }
                        }

                        if (match2.Start + match2.Len >= mfLimit || !ht.InsertAndFindWiderMatch(src, match2.End() - 3, match2.Start, matchLimit, match2.Len, match3))
                        {
                            // no better match -> 2 sequences to encode
                            if (match2.Start < match1.End())
                            {
                                // truncate match1 so that it ends where match2 begins
                                match1.Len = match2.Start - match1.Start;
                            }
                            // encode seq 1
                            EncodeSequence(src, anchor, match1.@ref, match1.Start, match1.Len, @out);
                            anchor = sOff = match1.End();
                            // encode seq 2
                            EncodeSequence(src, anchor, match2.@ref, match2.Start, match2.Len, @out);
                            anchor = sOff = match2.End();
                            // continue the outer (main) loop
                            goto mainContinue;
                        }

                        if (match3.Start < match1.End() + 3) // Not enough space for match 2 : remove it
                        {
                            if (match3.Start >= match1.End()) // can write Seq1 immediately ==> Seq2 is removed, so Seq3 becomes Seq1
                            {
                                if (match2.Start < match1.End())
                                {
                                    int correction = match1.End() - match2.Start;
                                    match2.Fix(correction);
                                    if (match2.Len < MIN_MATCH)
                                    {
                                        CopyTo(match3, match2);
                                    }
                                }

                                EncodeSequence(src, anchor, match1.@ref, match1.Start, match1.Len, @out);
                                anchor = sOff = match1.End();

                                // match3 becomes the new match1; match2 is kept as the saved candidate
                                CopyTo(match3, match1);
                                CopyTo(match2, match0);

                                // restart the search2 loop
                                goto search2Continue;
                            }

                            CopyTo(match3, match2);
                            // restart the search3 loop with match3 as the new match2
                            goto search3Continue;
                        }

                        // OK, now we have 3 ascending matches; let's write at least the first one
                        if (match2.Start < match1.End())
                        {
                            if (match2.Start - match1.Start < 0x0F)
                            {
                                if (match1.Len > OPTIMAL_ML)
                                {
                                    match1.Len = OPTIMAL_ML;
                                }
                                if (match1.End() > match2.End() - MIN_MATCH)
                                {
                                    match1.Len = match2.End() - match1.Start - MIN_MATCH;
                                }
                                // move the start of match2 up to the new end of match1
                                int correction = match1.End() - match2.Start;
                                match2.Fix(correction);
                            }
                            else
                            {
                                // truncate match1 so that it ends where match2 begins
                                match1.Len = match2.Start - match1.Start;
                            }
                        }

                        EncodeSequence(src, anchor, match1.@ref, match1.Start, match1.Len, @out);
                        anchor = sOff = match1.End();

                        // shift the window: match2 -> match1, match3 -> match2
                        CopyTo(match2, match1);
                        CopyTo(match3, match2);

                        goto search3Continue;
                    // last statement of the search3 loop body: jumping here starts its next iteration
                    search3Continue: ;
                    }
                // no goto in this method targets search3Break
                search3Break: ;

                // last statement of the search2 loop body: jumping here starts its next iteration
                search2Continue: ;
                }
            // no goto in this method targets search2Break
            search2Break: ;

            // last statement of the main loop body: jumping here re-evaluates the main loop condition
            mainContinue: ;
            }
            // no goto in this method targets mainBreak; it labels the final statement below
            mainBreak:

            // everything after the last emitted sequence is written as trailing literals
            EncodeLastLiterals(src, anchor, srcEnd - anchor, @out);
        }
Ejemplo n.º 15
0
 /// <summary>
 /// Expert: creates a packed integer array writer for the given output, format,
 /// value count and number of bits per value.
 /// <para>
 /// The resulting stream will be long-aligned. This means that, depending on the
 /// format used, up to 63 bits will be wasted. An easy way to make sure that no
 /// space is lost is to always use a <paramref name="valueCount"/> that is a
 /// multiple of 64.
 /// </para>
 /// <para>
 /// This method does not write any metadata to the stream, so it is your
 /// responsibility to store the following somewhere else in order to be able to
 /// recover data from the stream later on:
 /// <list type="bullet">
 ///   <item><description><paramref name="format"/> (using its id),</description></item>
 ///   <item><description><paramref name="valueCount"/>,</description></item>
 ///   <item><description><paramref name="bitsPerValue"/>,</description></item>
 ///   <item><description><c>VERSION_CURRENT</c>.</description></item>
 /// </list>
 /// </para>
 /// <para>
 /// It is possible to start writing values without knowing how many of them you
 /// are actually going to write: just pass <c>-1</c> as <paramref name="valueCount"/>.
 /// For any positive <paramref name="valueCount"/>, on the other hand, the returned
 /// writer makes sure that you don't write more values than expected, and pads the
 /// end of the stream with zeros if fewer than <paramref name="valueCount"/> values
 /// have been written when <c>Writer.Finish()</c> is called.
 /// </para>
 /// <para>
 /// <paramref name="mem"/> controls how much memory can be used to buffer changes
 /// before flushing to disk. High values are likely to improve throughput. If speed
 /// is not that important to you, a value of <c>0</c> uses as little memory as
 /// possible and should already offer reasonable throughput.
 /// </para>
 /// See also <c>PackedInts.GetReaderIteratorNoHeader</c> and
 /// <c>PackedInts.GetReaderNoHeader</c>.
 /// @lucene.internal
 /// </summary>
 /// <param name="out">          the data output </param>
 /// <param name="format">       the format to use to serialize the values </param>
 /// <param name="valueCount">   the number of values </param>
 /// <param name="bitsPerValue"> the number of bits per value </param>
 /// <param name="mem">          how much memory (in bytes) can be used to speed up serialization </param>
 /// <returns> a Writer that has not written any header </returns>
 public static Writer GetWriterNoHeader(DataOutput @out, Format format, int valueCount, int bitsPerValue, int mem)
 {
     // Note the argument order: PackedWriter takes the format first, then the output.
     Writer headerless = new PackedWriter(format, @out, valueCount, bitsPerValue, mem);
     return headerless;
 }
Ejemplo n.º 16
0
        /// <summary>
        /// Copies the remaining bytes to <paramref name="out"/> one slice at a time,
        /// advancing with <c>NextSlice()</c> until the slice that reaches
        /// <c>EndIndex</c> has been written.
        /// </summary>
        /// <param name="out"> the destination </param>
        /// <returns> the total number of bytes written </returns>
        public long WriteTo(DataOutput @out)
        {
            long total = 0;
            bool lastSlice;
            do
            {
                // The slice whose limit lands exactly on EndIndex is the last one.
                lastSlice = Limit + BufferOffset == EndIndex;
                Debug.Assert(!lastSlice || EndIndex - BufferOffset >= Upto);

                var chunk = Limit - Upto;
                @out.WriteBytes(Buffer, Upto, chunk);
                total += chunk;

                if (!lastSlice)
                {
                    NextSlice();
                }
            } while (!lastSlice);

            return total;
        }
Ejemplo n.º 17
0
 /// <summary>
 /// Encodes term metadata as a <c>long[]</c> plus raw bytes.
 /// <paramref name="absolute"/> controls whether the current term is delta-encoded
 /// against the latest term.
 /// Usually the elements of <paramref name="longs"/> are file pointers, so each one
 /// always increases when a new term is consumed. <paramref name="out"/> is used to
 /// write generic bytes, which are not monotonic.
 /// <para>
 /// NOTE: sometimes the <c>long[]</c> may contain "don't care" values that are unused,
 /// e.g. the pointer to the postings list may not be defined for some terms but is
 /// defined for others, if the format inlines some postings data in the term
 /// dictionary. In this case, the postings writer should always use the last value,
 /// so that each element of the metadata <c>long[]</c> remains monotonic.
 /// </para>
 /// </summary>
 /// <param name="longs"> the monotonic metadata values (usually file pointers) </param>
 /// <param name="out"> the destination for the non-monotonic metadata bytes </param>
 /// <param name="fieldInfo"> the field the term belongs to </param>
 /// <param name="state"> the state of the term being encoded </param>
 /// <param name="absolute"> whether the term is delta-encoded against the latest term; see the summary </param>
 public abstract void EncodeTerm(long[] longs, DataOutput @out, FieldInfo fieldInfo, BlockTermState state, bool absolute);
Ejemplo n.º 18
0
        /// <summary>
        /// Writes one sequence: the token and the pending literals, then the match
        /// offset as two bytes (low byte first), then the extended match length when
        /// the match is at least <c>MIN_MATCH + 15</c> bytes long.
        /// </summary>
        /// <param name="bytes"> the buffer holding the input </param>
        /// <param name="anchor"> index of the first literal that has not been emitted yet </param>
        /// <param name="matchRef"> index of the earlier occurrence being referenced </param>
        /// <param name="matchOff"> index at which the match starts </param>
        /// <param name="matchLen"> length of the match; asserted to be at least 4 </param>
        /// <param name="out"> the destination </param>
        private static void EncodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput @out)
        {
            int literalLen = matchOff - anchor;
            // NOTE(review): the literal 4 here and in the token presumably equals MIN_MATCH - confirm.
            Debug.Assert(matchLen >= 4);

            // Token: high nibble = literal count, low nibble = match length - 4, each capped at 15.
            int token = (Math.Min(literalLen, 0x0F) << 4) | Math.Min(matchLen - 4, 0x0F);
            EncodeLiterals(bytes, token, anchor, literalLen, @out);

            // Distance back to the referenced occurrence; it has to fit in 16 bits.
            int matchDec = matchOff - matchRef;
            Debug.Assert(matchDec > 0 && matchDec < 1 << 16);
            // Each write keeps only the low 8 bits of its operand. Truncate straight to
            // byte inside an unchecked block: going through sbyte would overflow for any
            // offset above 127 when the assembly is compiled with overflow checking.
            unchecked
            {
                @out.WriteByte((byte)matchDec);
                @out.WriteByte((byte)((uint)matchDec >> 8));
            }

            // Only the part of the match length that exceeds the token's capacity is
            // written as extra length bytes.
            if (matchLen >= MIN_MATCH + 0x0F)
            {
                EncodeLen(matchLen - 0x0F - MIN_MATCH, @out);
            }
        }
Ejemplo n.º 19
0
 /// <summary>
 /// Stores the destination and the two sizing parameters on the writer.
 /// </summary>
 /// <param name="out"> the destination </param>
 /// <param name="valueCount"> the number of values; asserted to be non-negative or <c>-1</c> </param>
 /// <param name="bitsPerValue"> the number of bits per value; asserted to be at most 64 </param>
 protected internal Writer(DataOutput @out, int valueCount, int bitsPerValue)
 {
     Debug.Assert(bitsPerValue <= 64);
     // -1 is the only negative count accepted.
     Debug.Assert(valueCount >= -1);

     this.bitsPerValue = bitsPerValue;
     this.valueCount = valueCount;
     this.@out = @out;
 }
Ejemplo n.º 20
0
 /// <summary>
 /// Pass-through implementation: writes <c>bytes[off:off+len]</c> to
 /// <paramref name="out"/> unchanged.
 /// </summary>
 /// <param name="bytes"> the buffer holding the input </param>
 /// <param name="off"> index of the first byte to write </param>
 /// <param name="len"> number of bytes to write </param>
 /// <param name="out"> the destination </param>
 public override void Compress(byte[] bytes, int off, int len, DataOutput @out)
 {
     @out.WriteBytes(bytes, off, len);
 }
Ejemplo n.º 21
0
 /// <summary>
 /// Encodes term metadata as a <c>long[]</c> plus raw bytes.
 /// <paramref name="absolute"/> controls whether the current term is delta-encoded
 /// against the latest term.
 /// Usually the elements of <paramref name="longs"/> are file pointers, so each one
 /// always increases when a new term is consumed. <paramref name="out"/> is used to
 /// write generic bytes, which are not monotonic.
 /// <para>
 /// NOTE: sometimes the <c>long[]</c> may contain "don't care" values that are unused,
 /// e.g. the pointer to the postings list may not be defined for some terms but is
 /// defined for others, if the format inlines some postings data in the term
 /// dictionary. In this case, the postings writer should always use the last value,
 /// so that each element of the metadata <c>long[]</c> remains monotonic.
 /// </para>
 /// </summary>
 /// <param name="longs"> the monotonic metadata values (usually file pointers) </param>
 /// <param name="out"> the destination for the non-monotonic metadata bytes </param>
 /// <param name="fieldInfo"> the field the term belongs to </param>
 /// <param name="state"> the state of the term being encoded </param>
 /// <param name="absolute"> whether the term is delta-encoded against the latest term; see the summary </param>
 public abstract void EncodeTerm(long[] longs, DataOutput @out, FieldInfo fieldInfo, BlockTermState state, bool absolute);
Ejemplo n.º 22
0
 /// <summary>
 /// Compresses bytes into <paramref name="out"/>. It is the responsibility of the
 /// compressor to add all necessary information so that a <c>Decompressor</c>
 /// will know when to stop decompressing bytes from the stream.
 /// </summary>
 /// <param name="bytes"> the buffer holding the input </param>
 /// <param name="off"> index of the first byte to compress </param>
 /// <param name="len"> number of bytes to compress </param>
 /// <param name="out"> the destination for the compressed data </param>
 // NOTE(review): this declaration takes sbyte[], while other Compress methods take
 // byte[] - confirm which element type overriding implementations are expected to use.
 public abstract void Compress(sbyte[] bytes, int off, int len, DataOutput @out);
Ejemplo n.º 23
0
 /// <summary>
 /// Creates a new instance that wraps <paramref name="out"/>, starting with
 /// <c>Current</c> at 0 and <c>RemainingBits</c> at 8.
 /// </summary>
 /// <param name="out"> the output to wrap </param>
 public PackedDataOutput(DataOutput @out)
 {
     // Initial bit-level state first, then the wrapped destination.
     RemainingBits = 8;
     Current = 0;
     this.@out = @out;
 }