Esempio n. 1
0
        /// <summary>
        /// Loads the segment information at segment load time.
        /// </summary>
        /// <param name="indexEnum">
        ///          the term enum. </param>
        /// <param name="indexDivisor">
        ///          the index divisor. </param>
        /// <param name="tiiFileLength">
        ///          the size of the tii file, used to approximate the size of the
        ///          buffer. </param>
        /// <param name="totalIndexInterval">
        ///          the total index interval. </param>
        public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
        {
            this.TotalIndexInterval = totalIndexInterval;
            IndexSize = 1 + ((int)indexEnum.Size - 1) / indexDivisor;
            SkipInterval = indexEnum.SkipInterval;
            // this is only an inital size, it will be GCed once the build is complete
            long initialSize = (long)(tiiFileLength * 1.5) / indexDivisor;
            PagedBytes dataPagedBytes = new PagedBytes(EstimatePageBits(initialSize));
            PagedBytesDataOutput dataOutput = dataPagedBytes.DataOutput;

            int bitEstimate = 1 + MathUtil.Log(tiiFileLength, 2);
            GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, IndexSize, PackedInts.DEFAULT);

            string currentField = null;
            IList<string> fieldStrs = new List<string>();
            int fieldCounter = -1;
            for (int i = 0; indexEnum.Next(); i++)
            {
                Term term = indexEnum.Term();
                if (currentField == null || !currentField.Equals(term.Field()))
                {
                    currentField = term.Field();
                    fieldStrs.Add(currentField);
                    fieldCounter++;
                }
                TermInfo termInfo = indexEnum.TermInfo();
                indexToTerms.Set(i, dataOutput.Position);
                dataOutput.WriteVInt(fieldCounter);
                dataOutput.WriteString(term.Text());
                dataOutput.WriteVInt(termInfo.DocFreq);
                if (termInfo.DocFreq >= SkipInterval)
                {
                    dataOutput.WriteVInt(termInfo.SkipOffset);
                }
                dataOutput.WriteVLong(termInfo.FreqPointer);
                dataOutput.WriteVLong(termInfo.ProxPointer);
                dataOutput.WriteVLong(indexEnum.IndexPointer);
                for (int j = 1; j < indexDivisor; j++)
                {
                    if (!indexEnum.Next())
                    {
                        break;
                    }
                }
            }

            Fields = new Term[fieldStrs.Count];
            for (int i = 0; i < Fields.Length; i++)
            {
                Fields[i] = new Term(fieldStrs[i]);
            }

            dataPagedBytes.Freeze(true);
            DataInput = dataPagedBytes.DataInput;
            IndexToDataOffset = indexToTerms.Mutable;

            RamBytesUsed_Renamed = Fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term))) + dataPagedBytes.RamBytesUsed() + IndexToDataOffset.RamBytesUsed();
        }
Esempio n. 2
0
        public virtual void Seek(SegmentTermEnum segmentTermEnum)
        {
            TermInfo ti;
            Term term;

            // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
            if (segmentTermEnum.FieldInfos == FieldInfos) // optimized case
            {
                term = segmentTermEnum.Term();
                ti = segmentTermEnum.TermInfo();
            } // punt case
            else
            {
                term = segmentTermEnum.Term();
                ti = Tis.Get(term);
            }

            Seek(ti, term);
        }
 private Term FindTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index)
 {
     int termPosition = index * TermIndexInterval * IndexDivisor;
     for (int i = 0; i < termPosition; i++)
     {
         // TODO: this test just uses random terms, so this is always possible
         AssumeTrue("ran out of terms", termEnum.Next());
     }
     Term term = termEnum.Term();
     // An indexed term is only written when the term after
     // it exists, so, if the number of terms is 0 mod
     // termIndexInterval, the last index term will not be
     // written; so we require a term after this term
     // as well:
     AssumeTrue("ran out of terms", termEnum.Next());
     return term;
 }
Esempio n. 4
0
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term.Offset == 0);
                }

                // The 3 bytes starting at downTo make up 1
                // unicode character:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
                }

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field != internedFieldName)
                {
                    return(false);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(b2.Offset == 0);
                }

                bool matches;

                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length         = savLength;
                term.Bytes[pos]     = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return(matches);
            }
Esempio n. 5
0
        internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
        {
            if (Size_Renamed == 0)
            {
                return(null);
            }

            // optimize sequential access: first try scanning cached enum w/o seeking
            if (enumerator.Term() != null && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0) || CompareAsUTF16(term, enumerator.Term()) >= 0)) // term is at or past current
            {
                int enumOffset = (int)(enumerator.Position / TotalIndexInterval) + 1;
                if (IndexLength == enumOffset || Index.CompareTo(term, enumOffset) < 0) // but before end of block
                {
                    // no need to seek

                    TermInfo ti;
                    int      numScans = enumerator.ScanTo(term);
                    if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
                    {
                        ti = enumerator.TermInfo_Renamed;
                        if (numScans > 1)
                        {
                            // we only  want to put this TermInfo into the cache if
                            // scanEnum skipped more than one dictionary entry.
                            // this prevents RangeQueries or WildcardQueries to
                            // wipe out the cache when they iterate over a large numbers
                            // of terms in order
                            if (tiOrd == null)
                            {
                                if (useCache)
                                {
                                    TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti, enumerator.Position));
                                }
                            }
                            else
                            {
                                Debug.Assert(SameTermInfo(ti, tiOrd, enumerator));
                                Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                            }
                        }
                    }
                    else
                    {
                        ti = null;
                    }

                    return(ti);
                }
            }

            // random-access: must seek
            int indexPos;

            if (tiOrd != null)
            {
                indexPos = (int)(tiOrd.TermOrd / TotalIndexInterval);
            }
            else
            {
                // Must do binary search:
                indexPos = Index.GetIndexOffset(term);
            }

            Index.SeekEnum(enumerator, indexPos);
            enumerator.ScanTo(term);
            TermInfo ti_;

            if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                ti_ = enumerator.TermInfo_Renamed;
                if (tiOrd == null)
                {
                    if (useCache)
                    {
                        TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti_, enumerator.Position));
                    }
                }
                else
                {
                    Debug.Assert(SameTermInfo(ti_, tiOrd, enumerator));
                    Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                }
            }
            else
            {
                ti_ = null;
            }
            return(ti_);
        }
Esempio n. 6
0
 public void CacheCurrentTerm(SegmentTermEnum enumerator)
 {
     TermsCache.Put(new CloneableTerm(enumerator.Term()), new TermInfoAndOrd(enumerator.TermInfo_Renamed, enumerator.Position));
 }
Esempio n. 7
0
        internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
        {
            if (Size_Renamed == 0)
            {
                return null;
            }

            // optimize sequential access: first try scanning cached enum w/o seeking
            if (enumerator.Term() != null && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0) || CompareAsUTF16(term, enumerator.Term()) >= 0)) // term is at or past current
            {
                int enumOffset = (int)(enumerator.Position / TotalIndexInterval) + 1;
                if (IndexLength == enumOffset || Index.CompareTo(term, enumOffset) < 0) // but before end of block
                {
                    // no need to seek

                    TermInfo ti;
                    int numScans = enumerator.ScanTo(term);
                    if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
                    {
                        ti = enumerator.TermInfo_Renamed;
                        if (numScans > 1)
                        {
                            // we only  want to put this TermInfo into the cache if
                            // scanEnum skipped more than one dictionary entry.
                            // this prevents RangeQueries or WildcardQueries to
                            // wipe out the cache when they iterate over a large numbers
                            // of terms in order
                            if (tiOrd == null)
                            {
                                if (useCache)
                                {
                                    TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti, enumerator.Position));
                                }
                            }
                            else
                            {
                                Debug.Assert(SameTermInfo(ti, tiOrd, enumerator));
                                Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                            }
                        }
                    }
                    else
                    {
                        ti = null;
                    }

                    return ti;
                }
            }

            // random-access: must seek
            int indexPos;
            if (tiOrd != null)
            {
                indexPos = (int)(tiOrd.TermOrd / TotalIndexInterval);
            }
            else
            {
                // Must do binary search:
                indexPos = Index.GetIndexOffset(term);
            }

            Index.SeekEnum(enumerator, indexPos);
            enumerator.ScanTo(term);
            TermInfo ti_;

            if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                ti_ = enumerator.TermInfo_Renamed;
                if (tiOrd == null)
                {
                    if (useCache)
                    {
                        TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti_, enumerator.Position));
                    }
                }
                else
                {
                    Debug.Assert(SameTermInfo(ti_, tiOrd, enumerator));
                    Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                }
            }
            else
            {
                ti_ = null;
            }
            return ti_;
        }
Esempio n. 8
0
 public void CacheCurrentTerm(SegmentTermEnum enumerator)
 {
     TermsCache.Put(new CloneableTerm(enumerator.Term()), new TermInfoAndOrd(enumerator.TermInfo_Renamed, enumerator.Position));
 }
Esempio n. 9
0
            // Swap in S, in place of E:
            internal virtual bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                Debug.Assert(term.Offset == 0);

                // The 3 bytes starting at downTo make up 1
                // unicode character:
                Debug.Assert(IsHighBMPChar(term.Bytes, pos));

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                Scratch[0] = (sbyte)term.Bytes[pos];
                Scratch[1] = (sbyte)term.Bytes[pos + 1];
                Scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos] = unchecked((byte)0xf0);
                term.Bytes[pos + 1] = unchecked((byte)0x90);
                term.Bytes[pos + 2] = unchecked((byte)0x80);
                term.Bytes[pos + 3] = unchecked((byte)0x80);
                term.Length = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                OuterInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field() != InternedFieldName)
                {
                    return false;
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes();
                Debug.Assert(b2.Offset == 0);

                bool matches;
                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length = savLength;
                term.Bytes[pos] = (byte)Scratch[0];
                term.Bytes[pos + 1] = (byte)Scratch[1];
                term.Bytes[pos + 2] = (byte)Scratch[2];

                return matches;
            }
Esempio n. 10
0
            // Seek type 2 "continue" (back to the start of the
            // surrogates): scan the stripped suffix from the
            // prior term, backwards. If there was an E in that
            // part, then we try to seek back to S.  If that
            // seek finds a matching term, we go there.
            private bool DoContinue()
            {
                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("  try cont");
                }

                int downTo = prevTerm.Length - 1;

                bool didSeek = false;

                int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

                while (downTo > limit)
                {
                    if (IsHighBMPChar(prevTerm.Bytes, downTo))
                    {
                        if (DEBUG_SURROGATES)
                        {
                            Console.WriteLine("    found E pos=" + downTo + " vs len=" + prevTerm.Length);
                        }

                        if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
                        {
                            // TODO: more efficient seek?
                            outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                            //newSuffixStart = downTo+4;
                            newSuffixStart = downTo;
                            scratchTerm.CopyBytes(termEnum.Term().Bytes);
                            didSeek = true;
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      seek!");
                            }
                            break;
                        }
                        else
                        {
                            if (DEBUG_SURROGATES)
                            {
                                Console.WriteLine("      no seek");
                            }
                        }
                    }

                    // Shorten prevTerm in place so that we don't redo
                    // this loop if we come back here:
                    if ((prevTerm.Bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.Bytes[downTo] & 0x80) == 0)
                    {
                        prevTerm.Length = downTo;
                    }

                    downTo--;
                }

                return(didSeek);
            }