Пример #1
0
        public override void Fill(int fromIndex, int toIndex, long val)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fromIndex >= 0);
                Debugging.Assert(fromIndex <= toIndex);
                Debugging.Assert(PackedInt32s.BitsRequired(val) <= m_bitsPerValue);
            }

            int valuesPerBlock = 64 / m_bitsPerValue;

            if (toIndex - fromIndex <= valuesPerBlock << 1)
            {
                // there needs to be at least one full block to set for the block
                // approach to be worth trying
                base.Fill(fromIndex, toIndex, val);
                return;
            }

            // set values naively until the next block start
            int fromOffsetInBlock = fromIndex % valuesPerBlock;

            if (fromOffsetInBlock != 0)
            {
                for (int i = fromOffsetInBlock; i < valuesPerBlock; ++i)
                {
                    Set(fromIndex++, val);
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(fromIndex % valuesPerBlock == 0);
                }
            }

            // bulk set of the inner blocks
            int fromBlock = fromIndex / valuesPerBlock;
            int toBlock   = toIndex / valuesPerBlock;

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fromBlock * valuesPerBlock == fromIndex);
            }

            long blockValue = 0L;

            for (int i = 0; i < valuesPerBlock; ++i)
            {
                blockValue |= (val << (i * m_bitsPerValue));
            }
            Arrays.Fill(blocks, fromBlock, toBlock, blockValue);

            // fill the gap
            for (int i = valuesPerBlock * toBlock; i < toIndex; ++i)
            {
                Set(i, val);
            }
        }
Пример #2
0
            internal void Reset(int len)
            {
                int bitsPerOffset    = PackedInt32s.BitsRequired(len - LAST_LITERALS);
                int bitsPerOffsetLog = 32 - (bitsPerOffset - 1).LeadingZeroCount();

                hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog;
                if (hashTable is null || hashTable.Count < 1 << hashLog || hashTable.BitsPerValue < bitsPerOffset)
                {
                    hashTable = PackedInt32s.GetMutable(1 << hashLog, bitsPerOffset, PackedInt32s.DEFAULT);
                }
Пример #3
0
 public BinaryDocValuesFieldUpdates(string field, int maxDoc)
     : base(field, DocValuesFieldUpdatesType.BINARY)
 {
     docsWithField = new FixedBitSet(64);
     docs          = new PagedMutable(1, 1024, PackedInt32s.BitsRequired(maxDoc - 1), PackedInt32s.COMPACT);
     offsets       = new PagedGrowableWriter(1, 1024, 1, PackedInt32s.FAST);
     lengths       = new PagedGrowableWriter(1, 1024, 1, PackedInt32s.FAST);
     values        = new BytesRef(16); // start small
     size          = 0;
 }
Пример #4
0
        /// <summary>
        /// Compress <c>bytes[off:off+len]</c> into <paramref name="out"/> using
        /// at most 16KB of memory. <paramref name="ht"/> shouldn't be shared across threads
        /// but can safely be reused.
        /// </summary>
        public static void Compress(byte[] bytes, int off, int len, DataOutput @out, HashTable ht)
        {
            int @base = off;
            int end   = off + len;

            int anchor = off++;

            if (len > LAST_LITERALS + MIN_MATCH)
            {
                int limit      = end - LAST_LITERALS;
                int matchLimit = limit - MIN_MATCH;
                ht.Reset(len);
                int hashLog = ht.hashLog;
                PackedInt32s.Mutable hashTable = ht.hashTable;

                while (off <= limit)
                {
                    // find a match
                    int @ref;
                    while (true)
                    {
                        if (off >= matchLimit)
                        {
                            goto mainBreak;
                        }
                        int v = ReadInt32(bytes, off);
                        int h = Hash(v, hashLog);
                        @ref = @base + (int)hashTable.Get(h);
                        Debug.Assert(PackedInt32s.BitsRequired(off - @base) <= hashTable.BitsPerValue);
                        hashTable.Set(h, off - @base);
                        if (off - @ref < MAX_DISTANCE && ReadInt32(bytes, @ref) == v)
                        {
                            break;
                        }
                        ++off;
                    }

                    // compute match length
                    int matchLen = MIN_MATCH + CommonBytes(bytes, @ref + MIN_MATCH, off + MIN_MATCH, limit);

                    EncodeSequence(bytes, anchor, @ref, off, matchLen, @out);
                    off   += matchLen;
                    anchor = off;
                    //mainContinue: ; // LUCENENET NOTE: Not Referenced
                }
                mainBreak :;
            }

            // last literals
            int literalLen = end - anchor;

            Debug.Assert(literalLen >= LAST_LITERALS || literalLen == len);
            EncodeLastLiterals(bytes, anchor, end - anchor, @out);
        }
Пример #5
0
        /// <summary>
        /// Compute the number of bits required to serialize any of the longs in
        /// <paramref name="data"/>.
        /// </summary>
        private static int BitsRequired(int[] data)
        {
            long or = 0;

            for (int i = 0; i < Lucene41PostingsFormat.BLOCK_SIZE; ++i)
            {
                Debug.Assert(data[i] >= 0);
                or |= (uint)data[i];
            }
            return(PackedInt32s.BitsRequired(or));
        }
Пример #6
0
        private void AddVarSortedBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values, IEnumerable <long?> docToOrd)
        {
            field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_SORTED.ToString());

            CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);

            CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_VAR_SORTED_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_VAR_SORTED_VERSION_CURRENT);

            /* values */

            long startPos = data.GetFilePointer();

            int valueCount = 0;

            foreach (BytesRef v in values)
            {
                data.WriteBytes(v.Bytes, v.Offset, v.Length);
                valueCount++;
            }

            /* addresses */

            long maxAddress = data.GetFilePointer() - startPos;

            index.WriteInt64(maxAddress);

            Debug.Assert(valueCount != int.MaxValue); // unsupported by the 4.0 impl

            PackedInt32s.Writer w = PackedInt32s.GetWriter(index, valueCount + 1, PackedInt32s.BitsRequired(maxAddress), PackedInt32s.DEFAULT);
            long currentPosition  = 0;

            foreach (BytesRef v in values)
            {
                w.Add(currentPosition);
                currentPosition += v.Length;
            }
            // write sentinel
            Debug.Assert(currentPosition == maxAddress);
            w.Add(currentPosition);
            w.Finish();

            /* ordinals */

            int maxDoc = state.SegmentInfo.DocCount;

            Debug.Assert(valueCount > 0);
            PackedInt32s.Writer ords = PackedInt32s.GetWriter(index, maxDoc, PackedInt32s.BitsRequired(valueCount - 1), PackedInt32s.DEFAULT);
            foreach (long n in docToOrd)
            {
                ords.Add((long)n);
            }
            ords.Finish();
        }
Пример #7
0
        public override void AddNumericField(FieldInfo field, IEnumerable <long?> values)
        {
            // examine the values to determine best type to use
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;

            foreach (long?n in values)
            {
                long v = n.GetValueOrDefault();
                minValue = Math.Min(minValue, v);
                maxValue = Math.Max(maxValue, v);
            }

            string      fileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name + "_" + Convert.ToString(field.Number, CultureInfo.InvariantCulture), segmentSuffix, "dat");
            IndexOutput data     = dir.CreateOutput(fileName, state.Context);
            bool        success  = false;

            try
            {
                if (minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue && PackedInt32s.BitsRequired(maxValue - minValue) > 4)
                {
                    // fits in a byte[], would be more than 4bpv, just write byte[]
                    AddBytesField(field, data, values);
                }
                else if (minValue >= short.MinValue && maxValue <= short.MaxValue && PackedInt32s.BitsRequired(maxValue - minValue) > 8)
                {
                    // fits in a short[], would be more than 8bpv, just write short[]
                    AddShortsField(field, data, values);
                }
                else if (minValue >= int.MinValue && maxValue <= int.MaxValue && PackedInt32s.BitsRequired(maxValue - minValue) > 16)
                {
                    // fits in a int[], would be more than 16bpv, just write int[]
                    AddIntsField(field, data, values);
                }
                else
                {
                    AddVarIntsField(field, data, values, minValue, maxValue);
                }
                success = true;
            }
            finally
            {
                if (success)
                {
                    IOUtils.Dispose(data);
                }
                else
                {
                    IOUtils.DisposeWhileHandlingException(data);
                }
            }
        }
Пример #8
0
        // NOTE: 4.0 file format docs are crazy/wrong here...
        private void AddVarStraightBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values)
        {
            field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_STRAIGHT.ToString());

            CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT);

            CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_VAR_STRAIGHT_VERSION_CURRENT);

            /* values */

            long startPos = data.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream

            foreach (BytesRef v in values)
            {
                if (v != null)
                {
                    data.WriteBytes(v.Bytes, v.Offset, v.Length);
                }
            }

            /* addresses */

            long maxAddress = data.Position - startPos; // LUCENENET specific: Renamed from getFilePointer() to match FileStream

            index.WriteVInt64(maxAddress);

            int maxDoc = state.SegmentInfo.DocCount;

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(maxDoc != int.MaxValue);                           // unsupported by the 4.0 impl
            }
            PackedInt32s.Writer w = PackedInt32s.GetWriter(index, maxDoc + 1, PackedInt32s.BitsRequired(maxAddress), PackedInt32s.DEFAULT);
            long currentPosition  = 0;

            foreach (BytesRef v in values)
            {
                w.Add(currentPosition);
                if (v != null)
                {
                    currentPosition += v.Length;
                }
            }
            // write sentinel
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(currentPosition == maxAddress);
            }
            w.Add(currentPosition);
            w.Finish();
        }
Пример #9
0
        protected override void Flush()
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(m_off > 0);
            }
            long min = long.MaxValue, max = long.MinValue;

            for (int i = 0; i < m_off; ++i)
            {
                min = Math.Min(m_values[i], min);
                max = Math.Max(m_values[i], max);
            }

            long delta        = max - min;
            int  bitsRequired = delta < 0 ? 64 : delta == 0L ? 0 : PackedInt32s.BitsRequired(delta);

            if (bitsRequired == 64)
            {
                // no need to delta-encode
                min = 0L;
            }
            else if (min > 0L)
            {
                // make min as small as possible so that writeVLong requires fewer bytes
                min = Math.Max(0L, max - PackedInt32s.MaxValue(bitsRequired));
            }

            int token = (bitsRequired << BPV_SHIFT) | (min == 0 ? MIN_VALUE_EQUALS_0 : 0);

            m_out.WriteByte((byte)(sbyte)token);

            if (min != 0)
            {
                WriteVInt64(m_out, ZigZagEncode(min) - 1);
            }

            if (bitsRequired > 0)
            {
                if (min != 0)
                {
                    for (int i = 0; i < m_off; ++i)
                    {
                        m_values[i] -= min;
                    }
                }
                WriteValues(bitsRequired);
            }

            m_off = 0;
        }
Пример #10
0
        private void AddFixedDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values, int length)
        {
            field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_FIXED_DEREF.ToString());

            CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_VERSION_CURRENT);

            CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_FIXED_DEREF_VERSION_CURRENT);

            // deduplicate
            JCG.SortedSet <BytesRef> dictionary = new JCG.SortedSet <BytesRef>();
            foreach (BytesRef v in values)
            {
                dictionary.Add(v == null ? new BytesRef() : BytesRef.DeepCopyOf(v));
            }

            /* values */
            data.WriteInt32(length);
            foreach (BytesRef v in dictionary)
            {
                data.WriteBytes(v.Bytes, v.Offset, v.Length);
            }

            /* ordinals */
            int valueCount = dictionary.Count;

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(valueCount > 0);
            }
            index.WriteInt32(valueCount);
            int maxDoc = state.SegmentInfo.DocCount;

            PackedInt32s.Writer w = PackedInt32s.GetWriter(index, maxDoc, PackedInt32s.BitsRequired(valueCount - 1), PackedInt32s.DEFAULT);

            BytesRef brefDummy;

            foreach (BytesRef v in values)
            {
                brefDummy = v;

                if (v == null)
                {
                    brefDummy = new BytesRef();
                }
                //int ord = dictionary.HeadSet(brefDummy).Size();
                int ord = dictionary.Count(@ref => @ref.CompareTo(brefDummy) < 0);
                w.Add(ord);
            }
            w.Finish();
        }
Пример #11
0
        private void Rehash()
        {
            PagedGrowableWriter oldTable = table;

            table = new PagedGrowableWriter(2 * oldTable.Count, 1 << 30, PackedInt32s.BitsRequired(count), PackedInt32s.COMPACT);
            mask  = table.Count - 1;
            for (long idx = 0; idx < oldTable.Count; idx++)
            {
                long address = oldTable.Get(idx);
                if (address != 0)
                {
                    AddNew(address);
                }
            }
        }
Пример #12
0
        private void EnsureCapacity(long value)
        {
            if ((value & currentMask) == value)
            {
                return;
            }
            int bitsRequired = value < 0 ? 64 : PackedInt32s.BitsRequired(value);

            Debug.Assert(bitsRequired > current.BitsPerValue);
            int valueCount = Count;

            PackedInt32s.Mutable next = PackedInt32s.GetMutable(valueCount, bitsRequired, acceptableOverheadRatio);
            PackedInt32s.Copy(current, 0, next, 0, valueCount, PackedInt32s.DEFAULT_BUFFER_SIZE);
            current     = next;
            currentMask = Mask(current.BitsPerValue);
        }
Пример #13
0
 /// <summary>
 /// NOTE: This was saveInts() in Lucene.
 /// </summary>
 private static void SaveInt32s(int[] values, int length, DataOutput @out)
 {
     if (Debugging.AssertsEnabled)
     {
         Debugging.Assert(length > 0);
     }
     if (length == 1)
     {
         @out.WriteVInt32(values[0]);
     }
     else
     {
         bool allEqual = true;
         for (int i = 1; i < length; ++i)
         {
             if (values[i] != values[0])
             {
                 allEqual = false;
                 break;
             }
         }
         if (allEqual)
         {
             @out.WriteVInt32(0);
             @out.WriteVInt32(values[0]);
         }
         else
         {
             long max = 0;
             for (int i = 0; i < length; ++i)
             {
                 max |= (uint)values[i];
             }
             int bitsRequired = PackedInt32s.BitsRequired(max);
             @out.WriteVInt32(bitsRequired);
             PackedInt32s.Writer w = PackedInt32s.GetWriterNoHeader(@out, PackedInt32s.Format.PACKED, length, bitsRequired, 1);
             for (int i = 0; i < length; ++i)
             {
                 w.Add(values[i]);
             }
             w.Finish();
         }
     }
 }
Пример #14
0
        private void AddVarDerefBytesField(FieldInfo field, IndexOutput data, IndexOutput index, IEnumerable <BytesRef> values)
        {
            field.PutAttribute(legacyKey, LegacyDocValuesType.BYTES_VAR_DEREF.ToString());

            CodecUtil.WriteHeader(data, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_DAT, Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);

            CodecUtil.WriteHeader(index, Lucene40DocValuesFormat.BYTES_VAR_DEREF_CODEC_NAME_IDX, Lucene40DocValuesFormat.BYTES_VAR_DEREF_VERSION_CURRENT);

            // deduplicate
            SortedSet <BytesRef> dictionary = new SortedSet <BytesRef>();

            foreach (BytesRef v in values)
            {
                dictionary.Add(v == null ? new BytesRef() : BytesRef.DeepCopyOf(v));
            }

            /* values */
            long startPosition  = data.GetFilePointer();
            long currentAddress = 0;
            Dictionary <BytesRef, long> valueToAddress = new Dictionary <BytesRef, long>();

            foreach (BytesRef v in dictionary)
            {
                currentAddress    = data.GetFilePointer() - startPosition;
                valueToAddress[v] = currentAddress;
                WriteVShort(data, v.Length);
                data.WriteBytes(v.Bytes, v.Offset, v.Length);
            }

            /* ordinals */
            long totalBytes = data.GetFilePointer() - startPosition;

            index.WriteInt64(totalBytes);
            int maxDoc = state.SegmentInfo.DocCount;

            PackedInt32s.Writer w = PackedInt32s.GetWriter(index, maxDoc, PackedInt32s.BitsRequired(currentAddress), PackedInt32s.DEFAULT);

            foreach (BytesRef v in values)
            {
                w.Add(valueToAddress[v == null ? new BytesRef() : v]);
            }
            w.Finish();
        }
        internal override void PackPendingValues()
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(pendingOff > 0);
            }
            minValues[valuesOff] = pending[0];
            averages[valuesOff]  = pendingOff == 1 ? 0 : (float)(pending[pendingOff - 1] - pending[0]) / (pendingOff - 1);

            for (int i = 0; i < pendingOff; ++i)
            {
                // LUCENENET NOTE: IMPORTANT: The cast to float is critical here for it to work in x86
                pending[i] = ZigZagEncode(pending[i] - minValues[valuesOff] - (long)(float)(averages[valuesOff] * (long)i));
            }
            long maxDelta = 0;

            for (int i = 0; i < pendingOff; ++i)
            {
                if (pending[i] < 0)
                {
                    maxDelta = -1;
                    break;
                }
                else
                {
                    maxDelta = Math.Max(maxDelta, pending[i]);
                }
            }
            if (maxDelta == 0)
            {
                values[valuesOff] = new PackedInt32s.NullReader(pendingOff);
            }
            else
            {
                int bitsRequired             = maxDelta < 0 ? 64 : PackedInt32s.BitsRequired(maxDelta);
                PackedInt32s.Mutable mutable = PackedInt32s.GetMutable(pendingOff, bitsRequired, acceptableOverheadRatio);
                for (int i = 0; i < pendingOff;)
                {
                    i += mutable.Set(i, pending, i, pendingOff - i);
                }
                values[valuesOff] = mutable;
            }
        }
Пример #16
0
        /// <summary>
        /// Returns a sorted array containing unique field numbers. </summary>
        private int[] FlushFieldNums()
        {
            JCG.SortedSet <int> fieldNums = new JCG.SortedSet <int>();
            foreach (DocData dd in pendingDocs)
            {
                foreach (FieldData fd in dd.fields)
                {
                    fieldNums.Add(fd.fieldNum);
                }
            }

            int numDistinctFields = fieldNums.Count;

            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(numDistinctFields > 0);
            }
            int bitsRequired = PackedInt32s.BitsRequired(fieldNums.Max);
            int token        = (Math.Min(numDistinctFields - 1, 0x07) << 5) | bitsRequired;

            vectorsStream.WriteByte((byte)token);
            if (numDistinctFields - 1 >= 0x07)
            {
                vectorsStream.WriteVInt32(numDistinctFields - 1 - 0x07);
            }
            PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, fieldNums.Count, bitsRequired, 1);
            foreach (int fieldNum in fieldNums)
            {
                writer.Add(fieldNum);
            }
            writer.Finish();

            int[] fns = new int[fieldNums.Count];
            int   i   = 0;

            foreach (int key in fieldNums)
            {
                fns[i++] = key;
            }
            return(fns);
        }
        internal override void PackPendingValues()
        {
            // compute max delta
            long minValue = pending[0];
            long maxValue = pending[0];

            for (int i = 1; i < pendingOff; ++i)
            {
                minValue = Math.Min(minValue, pending[i]);
                maxValue = Math.Max(maxValue, pending[i]);
            }

            // build a new packed reader
            int bitsRequired = minValue < 0 ? 64 : PackedInt32s.BitsRequired(maxValue);

            PackedInt32s.Mutable mutable = PackedInt32s.GetMutable(pendingOff, bitsRequired, acceptableOverheadRatio);
            for (int i = 0; i < pendingOff;)
            {
                i += mutable.Set(i, pending, i, pendingOff - i);
            }
            values[valuesOff] = mutable;
        }
Пример #18
0
        public override void Encode(int[] values, int valuesOffset, byte[] blocks, int blocksOffset, int iterations)
        {
            int nextBlock = 0;
            int bitsLeft  = 8;

            for (int i = 0; i < byteValueCount * iterations; ++i)
            {
                int v = values[valuesOffset++];
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(PackedInt32s.BitsRequired(v & 0xFFFFFFFFL) <= bitsPerValue);
                }
                if (bitsPerValue < bitsLeft)
                {
                    // just buffer
                    nextBlock |= v << (bitsLeft - bitsPerValue);
                    bitsLeft  -= bitsPerValue;
                }
                else
                {
                    // flush as many blocks as possible
                    int bits = bitsPerValue - bitsLeft;
                    blocks[blocksOffset++] = (byte)(nextBlock | (v.TripleShift(bits)));
                    while (bits >= 8)
                    {
                        bits -= 8;
                        blocks[blocksOffset++] = (byte)(v.TripleShift(bits));
                    }
                    // then buffer
                    bitsLeft  = 8 - bits;
                    nextBlock = (v & ((1 << bits) - 1)) << bitsLeft;
                }
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(bitsLeft == 8);
            }
        }
Пример #19
0
        internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage)
        {
            Meta.WriteVInt32(field.Number);
            Meta.WriteByte((byte)Lucene42DocValuesProducer.NUMBER);
            Meta.WriteInt64(Data.GetFilePointer());
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd      = 0;
            // TODO: more efficient?
            HashSet <long> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new HashSet <long>();

                long count = 0;
                foreach (long?nv in values)
                {
                    // TODO: support this as MemoryDVFormat (and be smart about missing maybe)
                    long v = nv.GetValueOrDefault();

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        } // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    if (uniqueValues != null)
                    {
                        if (uniqueValues.Add(v))
                        {
                            if (uniqueValues.Count > 256)
                            {
                                uniqueValues = null;
                            }
                        }
                    }

                    ++count;
                }
                Debug.Assert(count == MaxDoc);
            }

            if (uniqueValues != null)
            {
                // small number of unique values
                int           bitsPerValue  = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(MaxDoc, bitsPerValue, AcceptableOverheadRatio);
                if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    Meta.WriteByte((byte)Lucene42DocValuesProducer.UNCOMPRESSED); // uncompressed
                    foreach (long?nv in values)
                    {
                        Data.WriteByte((byte)nv.GetValueOrDefault());
                    }
                }
                else
                {
                    Meta.WriteByte((byte)Lucene42DocValuesProducer.TABLE_COMPRESSED); // table-compressed
                    long[] decode = uniqueValues.ToArray(/*new long?[uniqueValues.Count]*/);
                    var    encode = new Dictionary <long, int>();
                    Data.WriteVInt32(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        Data.WriteInt64(decode[i]);
                        encode[decode[i]] = i;
                    }

                    Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                    Data.WriteVInt32(formatAndBits.Format.Id);
                    Data.WriteVInt32(formatAndBits.BitsPerValue);

                    PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(Data, formatAndBits.Format, MaxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
                    foreach (long?nv in values)
                    {
                        writer.Add(encode[nv.GetValueOrDefault()]);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                Meta.WriteByte((byte)Lucene42DocValuesProducer.GCD_COMPRESSED);
                Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                Data.WriteInt64(minValue);
                Data.WriteInt64(gcd);
                Data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
                foreach (long?nv in values)
                {
                    writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                Meta.WriteByte((byte)Lucene42DocValuesProducer.DELTA_COMPRESSED); // delta-compressed

                Meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                Data.WriteVInt32(Lucene42DocValuesProducer.BLOCK_SIZE);

                BlockPackedWriter writer = new BlockPackedWriter(Data, Lucene42DocValuesProducer.BLOCK_SIZE);
                foreach (long?nv in values)
                {
                    writer.Add(nv.GetValueOrDefault());
                }
                writer.Finish();
            }
        }
        private void WriteBlock()
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(blockChunks > 0);
            }
            fieldsIndexOut.WriteVInt32(blockChunks);

            // The trick here is that we only store the difference from the average start
            // pointer or doc base, this helps save bits per value.
            // And in order to prevent a few chunks that would be far from the average to
            // raise the number of bits per value for all of them, we only encode blocks
            // of 1024 chunks at once
            // See LUCENE-4512

            // doc bases
            int avgChunkDocs;

            if (blockChunks == 1)
            {
                avgChunkDocs = 0;
            }
            else
            {
                avgChunkDocs = (int)Math.Round((float)(blockDocs - docBaseDeltas[blockChunks - 1]) / (blockChunks - 1));
            }
            fieldsIndexOut.WriteVInt32(totalDocs - blockDocs); // docBase
            fieldsIndexOut.WriteVInt32(avgChunkDocs);
            int  docBase  = 0;
            long maxDelta = 0;

            for (int i = 0; i < blockChunks; ++i)
            {
                int delta = docBase - avgChunkDocs * i;
                maxDelta |= MoveSignToLowOrderBit(delta);
                docBase  += docBaseDeltas[i];
            }

            int bitsPerDocBase = PackedInt32s.BitsRequired(maxDelta);

            fieldsIndexOut.WriteVInt32(bitsPerDocBase);
            PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(fieldsIndexOut, PackedInt32s.Format.PACKED, blockChunks, bitsPerDocBase, 1);
            docBase = 0;
            for (int i = 0; i < blockChunks; ++i)
            {
                long delta = docBase - avgChunkDocs * i;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(PackedInt32s.BitsRequired(MoveSignToLowOrderBit(delta)) <= writer.BitsPerValue);
                }
                writer.Add(MoveSignToLowOrderBit(delta));
                docBase += docBaseDeltas[i];
            }
            writer.Finish();

            // start pointers
            fieldsIndexOut.WriteVInt64(firstStartPointer);
            long avgChunkSize;

            if (blockChunks == 1)
            {
                avgChunkSize = 0;
            }
            else
            {
                avgChunkSize = (maxStartPointer - firstStartPointer) / (blockChunks - 1);
            }
            fieldsIndexOut.WriteVInt64(avgChunkSize);
            long startPointer = 0;

            maxDelta = 0;
            for (int i = 0; i < blockChunks; ++i)
            {
                startPointer += startPointerDeltas[i];
                long delta = startPointer - avgChunkSize * i;
                maxDelta |= MoveSignToLowOrderBit(delta);
            }

            int bitsPerStartPointer = PackedInt32s.BitsRequired(maxDelta);

            fieldsIndexOut.WriteVInt32(bitsPerStartPointer);
            writer       = PackedInt32s.GetWriterNoHeader(fieldsIndexOut, PackedInt32s.Format.PACKED, blockChunks, bitsPerStartPointer, 1);
            startPointer = 0;
            for (int i = 0; i < blockChunks; ++i)
            {
                startPointer += startPointerDeltas[i];
                long delta = startPointer - avgChunkSize * i;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(PackedInt32s.BitsRequired(MoveSignToLowOrderBit(delta)) <= writer.BitsPerValue);
                }
                writer.Add(MoveSignToLowOrderBit(delta));
            }
            writer.Finish();
        }
Пример #21
0
        [ExceptionToNetNumericConvention] // LUCENENET: Private API, keeping as-is
        private void AddVarIntsField(FieldInfo field, IndexOutput output, IEnumerable <long?> values, long minValue, long maxValue)
        {
            field.PutAttribute(legacyKey, LegacyDocValuesType.VAR_INTS.ToString());

            CodecUtil.WriteHeader(output, Lucene40DocValuesFormat.VAR_INTS_CODEC_NAME, Lucene40DocValuesFormat.VAR_INTS_VERSION_CURRENT);

            long delta = maxValue - minValue;

            if (delta < 0)
            {
                // writes longs
                output.WriteByte((byte)Lucene40DocValuesFormat.VAR_INTS_FIXED_64);
                foreach (long?n in values)
                {
                    output.WriteInt64(n.GetValueOrDefault());
                }
            }
            else
            {
                // writes packed ints
                output.WriteByte((byte)Lucene40DocValuesFormat.VAR_INTS_PACKED);
                output.WriteInt64(minValue);
                output.WriteInt64(0 - minValue); // default value (representation of 0)
                PackedInt32s.Writer writer = PackedInt32s.GetWriter(output, state.SegmentInfo.DocCount, PackedInt32s.BitsRequired(delta), PackedInt32s.DEFAULT);
                foreach (long?n in values)
                {
                    writer.Add(n.GetValueOrDefault() - minValue);
                }
                writer.Finish();
            }
        }
        public override Fields Get(int doc)
        {
            EnsureOpen();

            // seek to the right place
            {
                long startPointer = indexReader.GetStartPointer(doc);
                vectorsStream.Seek(startPointer);
            }

            // decode
            // - docBase: first doc ID of the chunk
            // - chunkDocs: number of docs of the chunk
            int docBase   = vectorsStream.ReadVInt32();
            int chunkDocs = vectorsStream.ReadVInt32();

            if (doc < docBase || doc >= docBase + chunkDocs || docBase + chunkDocs > numDocs)
            {
                throw new CorruptIndexException("docBase=" + docBase + ",chunkDocs=" + chunkDocs + ",doc=" + doc + " (resource=" + vectorsStream + ")");
            }

            int skip;        // number of fields to skip
            int numFields;   // number of fields of the document we're looking for
            int totalFields; // total number of fields of the chunk (sum for all docs)

            if (chunkDocs == 1)
            {
                skip      = 0;
                numFields = totalFields = vectorsStream.ReadVInt32();
            }
            else
            {
                reader.Reset(vectorsStream, chunkDocs);
                int sum = 0;
                for (int i = docBase; i < doc; ++i)
                {
                    sum += (int)reader.Next();
                }
                skip      = sum;
                numFields = (int)reader.Next();
                sum      += numFields;
                for (int i = doc + 1; i < docBase + chunkDocs; ++i)
                {
                    sum += (int)reader.Next();
                }
                totalFields = sum;
            }

            if (numFields == 0)
            {
                // no vectors
                return(null);
            }

            // read field numbers that have term vectors
            int[] fieldNums;
            {
                int token = vectorsStream.ReadByte() & 0xFF;
                Debug.Assert(token != 0); // means no term vectors, cannot happen since we checked for numFields == 0
                int bitsPerFieldNum     = token & 0x1F;
                int totalDistinctFields = (int)((uint)token >> 5);
                if (totalDistinctFields == 0x07)
                {
                    totalDistinctFields += vectorsStream.ReadVInt32();
                }
                ++totalDistinctFields;
                PackedInt32s.IReaderIterator it = PackedInt32s.GetReaderIteratorNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalDistinctFields, bitsPerFieldNum, 1);
                fieldNums = new int[totalDistinctFields];
                for (int i = 0; i < totalDistinctFields; ++i)
                {
                    fieldNums[i] = (int)it.Next();
                }
            }

            // read field numbers and flags
            int[] fieldNumOffs = new int[numFields];
            PackedInt32s.Reader flags;
            {
                int bitsPerOff = PackedInt32s.BitsRequired(fieldNums.Length - 1);
                PackedInt32s.Reader allFieldNumOffs = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalFields, bitsPerOff);
                switch (vectorsStream.ReadVInt32())
                {
                case 0:
                    PackedInt32s.Reader  fieldFlags = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, fieldNums.Length, CompressingTermVectorsWriter.FLAGS_BITS);
                    PackedInt32s.Mutable f          = PackedInt32s.GetMutable(totalFields, CompressingTermVectorsWriter.FLAGS_BITS, PackedInt32s.COMPACT);
                    for (int i = 0; i < totalFields; ++i)
                    {
                        int fieldNumOff = (int)allFieldNumOffs.Get(i);
                        Debug.Assert(fieldNumOff >= 0 && fieldNumOff < fieldNums.Length);
                        int fgs = (int)fieldFlags.Get(fieldNumOff);
                        f.Set(i, fgs);
                    }
                    flags = f;
                    break;

                case 1:
                    flags = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalFields, CompressingTermVectorsWriter.FLAGS_BITS);
                    break;

                default:
                    throw new Exception();
                }
                for (int i = 0; i < numFields; ++i)
                {
                    fieldNumOffs[i] = (int)allFieldNumOffs.Get(skip + i);
                }
            }

            // number of terms per field for all fields
            PackedInt32s.Reader numTerms;
            int totalTerms;
            {
                int bitsRequired = vectorsStream.ReadVInt32();
                numTerms = PackedInt32s.GetReaderNoHeader(vectorsStream, PackedInt32s.Format.PACKED, packedIntsVersion, totalFields, bitsRequired);
                int sum = 0;
                for (int i = 0; i < totalFields; ++i)
                {
                    sum += (int)numTerms.Get(i);
                }
                totalTerms = sum;
            }

            // term lengths
            int docOff = 0, docLen = 0, totalLen;

            int[]   fieldLengths  = new int[numFields];
            int[][] prefixLengths = new int[numFields][];
            int[][] suffixLengths = new int[numFields][];
            {
                reader.Reset(vectorsStream, totalTerms);
                // skip
                int toSkip = 0;
                for (int i = 0; i < skip; ++i)
                {
                    toSkip += (int)numTerms.Get(i);
                }
                reader.Skip(toSkip);
                // read prefix lengths
                for (int i = 0; i < numFields; ++i)
                {
                    int   termCount          = (int)numTerms.Get(skip + i);
                    int[] fieldPrefixLengths = new int[termCount];
                    prefixLengths[i] = fieldPrefixLengths;
                    for (int j = 0; j < termCount;)
                    {
                        Int64sRef next = reader.Next(termCount - j);
                        for (int k = 0; k < next.Length; ++k)
                        {
                            fieldPrefixLengths[j++] = (int)next.Int64s[next.Offset + k];
                        }
                    }
                }
                reader.Skip(totalTerms - reader.Ord);

                reader.Reset(vectorsStream, totalTerms);
                // skip
                toSkip = 0;
                for (int i = 0; i < skip; ++i)
                {
                    for (int j = 0; j < numTerms.Get(i); ++j)
                    {
                        docOff += (int)reader.Next();
                    }
                }
                for (int i = 0; i < numFields; ++i)
                {
                    int   termCount          = (int)numTerms.Get(skip + i);
                    int[] fieldSuffixLengths = new int[termCount];
                    suffixLengths[i] = fieldSuffixLengths;
                    for (int j = 0; j < termCount;)
                    {
                        Int64sRef next = reader.Next(termCount - j);
                        for (int k = 0; k < next.Length; ++k)
                        {
                            fieldSuffixLengths[j++] = (int)next.Int64s[next.Offset + k];
                        }
                    }
                    fieldLengths[i] = Sum(suffixLengths[i]);
                    docLen         += fieldLengths[i];
                }
                totalLen = docOff + docLen;
                for (int i = skip + numFields; i < totalFields; ++i)
                {
                    for (int j = 0; j < numTerms.Get(i); ++j)
                    {
                        totalLen += (int)reader.Next();
                    }
                }
            }

            // term freqs
            int[] termFreqs = new int[totalTerms];
            {
                reader.Reset(vectorsStream, totalTerms);
                for (int i = 0; i < totalTerms;)
                {
                    Int64sRef next = reader.Next(totalTerms - i);
                    for (int k = 0; k < next.Length; ++k)
                    {
                        termFreqs[i++] = 1 + (int)next.Int64s[next.Offset + k];
                    }
                }
            }

            // total number of positions, offsets and payloads
            int totalPositions = 0, totalOffsets = 0, totalPayloads = 0;

            for (int i = 0, termIndex = 0; i < totalFields; ++i)
            {
                int f         = (int)flags.Get(i);
                int termCount = (int)numTerms.Get(i);
                for (int j = 0; j < termCount; ++j)
                {
                    int freq = termFreqs[termIndex++];
                    if ((f & CompressingTermVectorsWriter.POSITIONS) != 0)
                    {
                        totalPositions += freq;
                    }
                    if ((f & CompressingTermVectorsWriter.OFFSETS) != 0)
                    {
                        totalOffsets += freq;
                    }
                    if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0)
                    {
                        totalPayloads += freq;
                    }
                }
                Debug.Assert(i != totalFields - 1 || termIndex == totalTerms, termIndex + " " + totalTerms);
            }

            int[][] positionIndex = PositionIndex(skip, numFields, numTerms, termFreqs);
            int[][] positions, startOffsets, lengths;
            if (totalPositions > 0)
            {
                positions = ReadPositions(skip, numFields, flags, numTerms, termFreqs, CompressingTermVectorsWriter.POSITIONS, totalPositions, positionIndex);
            }
            else
            {
                positions = new int[numFields][];
            }

            if (totalOffsets > 0)
            {
                // average number of chars per term
                float[] charsPerTerm = new float[fieldNums.Length];
                for (int i = 0; i < charsPerTerm.Length; ++i)
                {
                    charsPerTerm[i] = J2N.BitConversion.Int32BitsToSingle(vectorsStream.ReadInt32());
                }
                startOffsets = ReadPositions(skip, numFields, flags, numTerms, termFreqs, CompressingTermVectorsWriter.OFFSETS, totalOffsets, positionIndex);
                lengths      = ReadPositions(skip, numFields, flags, numTerms, termFreqs, CompressingTermVectorsWriter.OFFSETS, totalOffsets, positionIndex);

                for (int i = 0; i < numFields; ++i)
                {
                    int[] fStartOffsets = startOffsets[i];
                    int[] fPositions    = positions[i];
                    // patch offsets from positions
                    if (fStartOffsets != null && fPositions != null)
                    {
                        float fieldCharsPerTerm = charsPerTerm[fieldNumOffs[i]];
                        for (int j = 0; j < startOffsets[i].Length; ++j)
                        {
                            fStartOffsets[j] += (int)(fieldCharsPerTerm * fPositions[j]);
                        }
                    }
                    if (fStartOffsets != null)
                    {
                        int[] fPrefixLengths = prefixLengths[i];
                        int[] fSuffixLengths = suffixLengths[i];
                        int[] fLengths       = lengths[i];
                        for (int j = 0, end = (int)numTerms.Get(skip + i); j < end; ++j)
                        {
                            // delta-decode start offsets and  patch lengths using term lengths
                            int termLength = fPrefixLengths[j] + fSuffixLengths[j];
                            lengths[i][positionIndex[i][j]] += termLength;
                            for (int k = positionIndex[i][j] + 1; k < positionIndex[i][j + 1]; ++k)
                            {
                                fStartOffsets[k] += fStartOffsets[k - 1];
                                fLengths[k]      += termLength;
                            }
                        }
                    }
                }
            }
            else
            {
                startOffsets = lengths = new int[numFields][];
            }
            if (totalPositions > 0)
            {
                // delta-decode positions
                for (int i = 0; i < numFields; ++i)
                {
                    int[] fPositions     = positions[i];
                    int[] fpositionIndex = positionIndex[i];
                    if (fPositions != null)
                    {
                        for (int j = 0, end = (int)numTerms.Get(skip + i); j < end; ++j)
                        {
                            // delta-decode start offsets
                            for (int k = fpositionIndex[j] + 1; k < fpositionIndex[j + 1]; ++k)
                            {
                                fPositions[k] += fPositions[k - 1];
                            }
                        }
                    }
                }
            }

            // payload lengths
            int[][] payloadIndex       = new int[numFields][];
            int     totalPayloadLength = 0;
            int     payloadOff         = 0;
            int     payloadLen         = 0;

            if (totalPayloads > 0)
            {
                reader.Reset(vectorsStream, totalPayloads);
                // skip
                int termIndex = 0;
                for (int i = 0; i < skip; ++i)
                {
                    int f         = (int)flags.Get(i);
                    int termCount = (int)numTerms.Get(i);
                    if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0)
                    {
                        for (int j = 0; j < termCount; ++j)
                        {
                            int freq = termFreqs[termIndex + j];
                            for (int k = 0; k < freq; ++k)
                            {
                                int l = (int)reader.Next();
                                payloadOff += l;
                            }
                        }
                    }
                    termIndex += termCount;
                }
                totalPayloadLength = payloadOff;
                // read doc payload lengths
                for (int i = 0; i < numFields; ++i)
                {
                    int f         = (int)flags.Get(skip + i);
                    int termCount = (int)numTerms.Get(skip + i);
                    if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0)
                    {
                        int totalFreq = positionIndex[i][termCount];
                        payloadIndex[i] = new int[totalFreq + 1];
                        int posIdx = 0;
                        payloadIndex[i][posIdx] = payloadLen;
                        for (int j = 0; j < termCount; ++j)
                        {
                            int freq = termFreqs[termIndex + j];
                            for (int k = 0; k < freq; ++k)
                            {
                                int payloadLength = (int)reader.Next();
                                payloadLen += payloadLength;
                                payloadIndex[i][posIdx + 1] = payloadLen;
                                ++posIdx;
                            }
                        }
                        Debug.Assert(posIdx == totalFreq);
                    }
                    termIndex += termCount;
                }
                totalPayloadLength += payloadLen;
                for (int i = skip + numFields; i < totalFields; ++i)
                {
                    int f         = (int)flags.Get(i);
                    int termCount = (int)numTerms.Get(i);
                    if ((f & CompressingTermVectorsWriter.PAYLOADS) != 0)
                    {
                        for (int j = 0; j < termCount; ++j)
                        {
                            int freq = termFreqs[termIndex + j];
                            for (int k = 0; k < freq; ++k)
                            {
                                totalPayloadLength += (int)reader.Next();
                            }
                        }
                    }
                    termIndex += termCount;
                }
                Debug.Assert(termIndex == totalTerms, termIndex + " " + totalTerms);
            }

            // decompress data
            BytesRef suffixBytes = new BytesRef();

            decompressor.Decompress(vectorsStream, totalLen + totalPayloadLength, docOff + payloadOff, docLen + payloadLen, suffixBytes);
            suffixBytes.Length = docLen;
            BytesRef payloadBytes = new BytesRef(suffixBytes.Bytes, suffixBytes.Offset + docLen, payloadLen);

            int[] FieldFlags = new int[numFields];
            for (int i = 0; i < numFields; ++i)
            {
                FieldFlags[i] = (int)flags.Get(skip + i);
            }

            int[] fieldNumTerms = new int[numFields];
            for (int i = 0; i < numFields; ++i)
            {
                fieldNumTerms[i] = (int)numTerms.Get(skip + i);
            }

            int[][] fieldTermFreqs = new int[numFields][];
            {
                int termIdx = 0;
                for (int i = 0; i < skip; ++i)
                {
                    termIdx += (int)numTerms.Get(i);
                }
                for (int i = 0; i < numFields; ++i)
                {
                    int termCount = (int)numTerms.Get(skip + i);
                    fieldTermFreqs[i] = new int[termCount];
                    for (int j = 0; j < termCount; ++j)
                    {
                        fieldTermFreqs[i][j] = termFreqs[termIdx++];
                    }
                }
            }

            Debug.Assert(Sum(fieldLengths) == docLen, Sum(fieldLengths) + " != " + docLen);

            return(new TVFields(this, fieldNums, FieldFlags, fieldNumOffs, fieldNumTerms, fieldLengths, prefixLengths, suffixLengths, fieldTermFreqs, positionIndex, positions, startOffsets, lengths, payloadBytes, payloadIndex, suffixBytes));
        }
Пример #23
0
        internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage)
        {
            long count    = 0;
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd      = 0;
            bool missing  = false;

            // TODO: more efficient?
            JCG.HashSet <long> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new JCG.HashSet <long>();

                foreach (long?nv in values)
                {
                    long v;
                    if (nv == null)
                    {
                        v       = 0;
                        missing = true;
                    }
                    else
                    {
                        v = nv.Value;
                    }

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        } // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    if (uniqueValues != null)
                    {
                        if (uniqueValues.Add(v))
                        {
                            if (uniqueValues.Count > 256)
                            {
                                uniqueValues = null;
                            }
                        }
                    }

                    ++count;
                }
            }
            else
            {
                foreach (var nv in values)
                {
                    ++count;
                }
            }

            long delta = maxValue - minValue;

            int format;

            if (uniqueValues != null && (delta < 0L || PackedInt32s.BitsRequired(uniqueValues.Count - 1) < PackedInt32s.BitsRequired(delta)) && count <= int.MaxValue)
            {
                format = TABLE_COMPRESSED;
            }
            else if (gcd != 0 && gcd != 1)
            {
                format = GCD_COMPRESSED;
            }
            else
            {
                format = DELTA_COMPRESSED;
            }
            meta.WriteVInt32(field.Number);
            meta.WriteByte((byte)Lucene45DocValuesFormat.NUMERIC);
            meta.WriteVInt32(format);
            if (missing)
            {
                meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
                WriteMissingBitset(values);
            }
            else
            {
                meta.WriteInt64(-1L);
            }
            meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
            meta.WriteInt64(data.Position); // LUCENENET specific: Renamed from getFilePointer() to match FileStream
            meta.WriteVInt64(count);
            meta.WriteVInt32(BLOCK_SIZE);

            switch (format)
            {
            case GCD_COMPRESSED:
                meta.WriteInt64(minValue);
                meta.WriteInt64(gcd);
                BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (long?nv in values)
                {
                    quotientWriter.Add((nv.GetValueOrDefault() - minValue) / gcd);
                }
                quotientWriter.Finish();
                break;

            case DELTA_COMPRESSED:
                BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (long?nv in values)
                {
                    writer.Add(nv.GetValueOrDefault());
                }
                writer.Finish();
                break;

            case TABLE_COMPRESSED:
                // LUCENENET NOTE: diming an array and then using .CopyTo() for better efficiency than LINQ .ToArray()
                long[] decode = new long[uniqueValues.Count];
                uniqueValues.CopyTo(decode, 0);
                Dictionary <long, int> encode = new Dictionary <long, int>();
                meta.WriteVInt32(decode.Length);
                for (int i = 0; i < decode.Length; i++)
                {
                    meta.WriteInt64(decode[i]);
                    encode[decode[i]] = i;
                }
                int bitsRequired = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
                PackedInt32s.Writer ordsWriter = PackedInt32s.GetWriterNoHeader(data, PackedInt32s.Format.PACKED, (int)count, bitsRequired, PackedInt32s.DEFAULT_BUFFER_SIZE);
                foreach (long?nv in values)
                {
                    ordsWriter.Add(encode[nv.GetValueOrDefault()]);
                }
                ordsWriter.Finish();
                break;

            default:
                throw AssertionError.Create();
            }
        }
Пример #24
0
            public override void Finish(long termsFilePointer)
            {
                // write primary terms dict offsets
                packedIndexStart = outerInstance.m_output.GetFilePointer();

                PackedInt32s.Writer w = PackedInt32s.GetWriter(outerInstance.m_output, numIndexTerms, PackedInt32s.BitsRequired(termsFilePointer), PackedInt32s.DEFAULT);

                // relative to our indexStart
                long upto = 0;

                for (int i = 0; i < numIndexTerms; i++)
                {
                    upto += termsPointerDeltas[i];
                    w.Add(upto);
                }
                w.Finish();

                packedOffsetsStart = outerInstance.m_output.GetFilePointer();

                // write offsets into the byte[] terms
                w    = PackedInt32s.GetWriter(outerInstance.m_output, 1 + numIndexTerms, PackedInt32s.BitsRequired(totTermLength), PackedInt32s.DEFAULT);
                upto = 0;
                for (int i = 0; i < numIndexTerms; i++)
                {
                    w.Add(upto);
                    upto += termLengths[i];
                }
                w.Add(upto);
                w.Finish();

                // our referrer holds onto us, while other fields are
                // being written, so don't tie up this RAM:
                termLengths        = null;
                termsPointerDeltas = null;
            }
Пример #25
0
        internal virtual void AddNumericField(FieldInfo field, IEnumerable <long?> values, bool optimizeStorage)
        {
            meta.WriteVInt32(field.Number);
            meta.WriteByte(MemoryDocValuesProducer.NUMBER);
            meta.WriteInt64(data.GetFilePointer());
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd      = 0;
            bool missing  = false;
            // TODO: more efficient?
            ISet <long?> uniqueValues = null;

            if (optimizeStorage)
            {
                uniqueValues = new JCG.HashSet <long?>();

                long count = 0;
                foreach (var nv in values)
                {
                    long v;
                    if (nv == null)
                    {
                        v       = 0;
                        missing = true;
                    }
                    else
                    {
                        v = nv.Value;
                    }

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        } // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    if (uniqueValues != null)
                    {
                        if (uniqueValues.Add(v))
                        {
                            if (uniqueValues.Count > 256)
                            {
                                uniqueValues = null;
                            }
                        }
                    }

                    ++count;
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(count == maxDoc);
                }
            }

            if (missing)
            {
                long start = data.GetFilePointer();
                WriteMissingBitset(values);
                meta.WriteInt64(start);
                meta.WriteInt64(data.GetFilePointer() - start);
            }
            else
            {
                meta.WriteInt64(-1L);
            }

            if (uniqueValues != null)
            {
                // small number of unique values

                int           bitsPerValue  = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue,
                                                                                acceptableOverheadRatio);
                if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed
                    foreach (var nv in values)
                    {
                        data.WriteByte((byte)nv.GetValueOrDefault());
                    }
                }
                else
                {
                    meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed
                    long?[] decode = new long?[uniqueValues.Count];
                    uniqueValues.CopyTo(decode, 0);

                    var encode = new Dictionary <long?, int?>();
                    data.WriteVInt32(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        data.WriteInt64(decode[i].Value);
                        encode[decode[i]] = i;
                    }

                    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                    data.WriteVInt32(formatAndBits.Format.Id);
                    data.WriteVInt32(formatAndBits.BitsPerValue);

                    PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc,
                                                                                formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
                    foreach (var nv in values)
                    {
                        var v = encode[nv.GetValueOrDefault()];

                        writer.Add((long)v);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED);
                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteInt64(minValue);
                data.WriteInt64(gcd);
                data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);

                var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
                foreach (var nv in values)
                {
                    writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed

                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteVInt32(MemoryDocValuesProducer.BLOCK_SIZE);

                var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
                foreach (var nv in values)
                {
                    writer.Add(nv.GetValueOrDefault());
                }
                writer.Finish();
            }
        }
Пример #26
0
        public override void AddNumericField(FieldInfo field, IEnumerable <long?> values)
        {
            meta.WriteVInt32(field.Number);
            meta.WriteByte((byte)NUMBER);
            meta.WriteInt64(data.GetFilePointer());
            long minValue = long.MaxValue;
            long maxValue = long.MinValue;
            long gcd      = 0;
            // TODO: more efficient?
            HashSet <long> uniqueValues = null;

            if (true)
            {
                uniqueValues = new HashSet <long>();

                long count = 0;
                foreach (long?nv in values)
                {
                    Debug.Assert(nv != null);
                    long v = nv.Value;

                    if (gcd != 1)
                    {
                        if (v < long.MinValue / 2 || v > long.MaxValue / 2)
                        {
                            // in that case v - minValue might overflow and make the GCD computation return
                            // wrong results. Since these extreme values are unlikely, we just discard
                            // GCD computation for them
                            gcd = 1;
                        } // minValue needs to be set first
                        else if (count != 0)
                        {
                            gcd = MathUtil.Gcd(gcd, v - minValue);
                        }
                    }

                    minValue = Math.Min(minValue, v);
                    maxValue = Math.Max(maxValue, v);

                    if (uniqueValues != null)
                    {
                        if (uniqueValues.Add(v))
                        {
                            if (uniqueValues.Count > 256)
                            {
                                uniqueValues = null;
                            }
                        }
                    }

                    ++count;
                }
                Debug.Assert(count == maxDoc);
            }

            if (uniqueValues != null)
            {
                // small number of unique values
                int           bitsPerValue  = PackedInt32s.BitsRequired(uniqueValues.Count - 1);
                FormatAndBits formatAndBits = PackedInt32s.FastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
                if (formatAndBits.BitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
                {
                    meta.WriteByte((byte)UNCOMPRESSED); // uncompressed
                    foreach (long?nv in values)
                    {
                        data.WriteByte((byte)nv.GetValueOrDefault());
                    }
                }
                else
                {
                    meta.WriteByte((byte)TABLE_COMPRESSED); // table-compressed
                    var decode = uniqueValues.ToArray();
                    var encode = new Dictionary <long, int>();
                    data.WriteVInt32(decode.Length);
                    for (int i = 0; i < decode.Length; i++)
                    {
                        data.WriteInt64(decode[i]);
                        encode[decode[i]] = i;
                    }

                    meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                    data.WriteVInt32(formatAndBits.Format.Id);
                    data.WriteVInt32(formatAndBits.BitsPerValue);

                    PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(data, formatAndBits.Format, maxDoc, formatAndBits.BitsPerValue, PackedInt32s.DEFAULT_BUFFER_SIZE);
                    foreach (long?nv in values)
                    {
                        writer.Add(encode[nv.GetValueOrDefault()]);
                    }
                    writer.Finish();
                }
            }
            else if (gcd != 0 && gcd != 1)
            {
                meta.WriteByte((byte)GCD_COMPRESSED);
                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteInt64(minValue);
                data.WriteInt64(gcd);
                data.WriteVInt32(BLOCK_SIZE);

                var writer = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (long?nv in values)
                {
                    writer.Add((nv.GetValueOrDefault() - minValue) / gcd);
                }
                writer.Finish();
            }
            else
            {
                meta.WriteByte((byte)DELTA_COMPRESSED); // delta-compressed

                meta.WriteVInt32(PackedInt32s.VERSION_CURRENT);
                data.WriteVInt32(BLOCK_SIZE);

                var writer = new BlockPackedWriter(data, BLOCK_SIZE);
                foreach (long?nv in values)
                {
                    writer.Add(nv.GetValueOrDefault());
                }
                writer.Finish();
            }
        }
Пример #27
0
 private void FlushFields(int totalFields, int[] fieldNums)
 {
     PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, totalFields, PackedInt32s.BitsRequired(fieldNums.Length - 1), 1);
     foreach (DocData dd in pendingDocs)
     {
         foreach (FieldData fd in dd.fields)
         {
             int fieldNumIndex = Array.BinarySearch(fieldNums, fd.fieldNum);
             if (Debugging.AssertsEnabled)
             {
                 Debugging.Assert(fieldNumIndex >= 0);
             }
             writer.Add(fieldNumIndex);
         }
     }
     writer.Finish();
 }
Пример #28
0
        public override void Fill(int fromIndex, int toIndex, long val)
        {
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(PackedInt32s.BitsRequired(val) <= BitsPerValue);
                Debugging.Assert(fromIndex <= toIndex);
            }

            // minimum number of values that use an exact number of full blocks
            int nAlignedValues = 64 / Gcd(64, m_bitsPerValue);
            int span           = toIndex - fromIndex;

            if (span <= 3 * nAlignedValues)
            {
                // there needs be at least 2 * nAlignedValues aligned values for the
                // block approach to be worth trying
                base.Fill(fromIndex, toIndex, val);
                return;
            }

            // fill the first values naively until the next block start
            int fromIndexModNAlignedValues = fromIndex % nAlignedValues;

            if (fromIndexModNAlignedValues != 0)
            {
                for (int i = fromIndexModNAlignedValues; i < nAlignedValues; ++i)
                {
                    Set(fromIndex++, val);
                }
            }
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(fromIndex % nAlignedValues == 0);
            }

            // compute the long[] blocks for nAlignedValues consecutive values and
            // use them to set as many values as possible without applying any mask
            // or shift
            int nAlignedBlocks = (nAlignedValues * m_bitsPerValue) >> 6;

            long[] nAlignedValuesBlocks;
            {
                Packed64 values = new Packed64(nAlignedValues, m_bitsPerValue);
                for (int i = 0; i < nAlignedValues; ++i)
                {
                    values.Set(i, val);
                }
                nAlignedValuesBlocks = values.blocks;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(nAlignedBlocks <= nAlignedValuesBlocks.Length);
                }
            }
            int startBlock = (int)(((long)fromIndex * m_bitsPerValue).TripleShift(6));
            int endBlock   = (int)(((long)toIndex * m_bitsPerValue).TripleShift(6));

            for (int block = startBlock; block < endBlock; ++block)
            {
                long blockValue = nAlignedValuesBlocks[block % nAlignedBlocks];
                blocks[block] = blockValue;
            }

            // fill the gap
            for (int i = (int)(((long)endBlock << 6) / m_bitsPerValue); i < toIndex; ++i)
            {
                Set(i, val);
            }
        }