예제 #1
0
        internal virtual void SeekEnum(SegmentTermEnum enumerator, int indexOffset)
        {
            PagedBytesDataInput input = (PagedBytesDataInput)dataInput.Clone();

            input.SetPosition(indexToDataOffset.Get(indexOffset));

            // read the term
            int  fieldId = input.ReadVInt32();
            Term field   = fields[fieldId];
            Term term    = new Term(field.Field, input.ReadString());

            // read the terminfo
            var termInfo = new TermInfo();

            termInfo.DocFreq = input.ReadVInt32();
            if (termInfo.DocFreq >= skipInterval)
            {
                termInfo.SkipOffset = input.ReadVInt32();
            }
            else
            {
                termInfo.SkipOffset = 0;
            }
            termInfo.FreqPointer = input.ReadVInt64();
            termInfo.ProxPointer = input.ReadVInt64();

            long pointer = input.ReadVInt64();

            // perform the seek
            enumerator.Seek(pointer, ((long)indexOffset * totalIndexInterval) - 1, term, termInfo);
        }
예제 #2
0
            internal virtual void Reset(FieldInfo fieldInfo)
            {
                //System.out.println("pff.reset te=" + termEnum);
                this.fieldInfo = fieldInfo;

                internedFieldName = fieldInfo.Name.Intern();

                Term term = new Term(internedFieldName);

                if (termEnum == null)
                {
                    termEnum     = outerInstance.TermsDict.Terms(term);
                    seekTermEnum = outerInstance.TermsDict.Terms(term);
                    //System.out.println("  term=" + termEnum.term());
                }
                else
                {
                    outerInstance.TermsDict.SeekEnum(termEnum, term, true);
                }
                skipNext = true;

                unicodeSortOrder = outerInstance.SortTermsByUnicode;

                Term t = termEnum.Term();

                if (t != null && t.Field == internedFieldName)
                {
                    newSuffixStart  = 0;
                    prevTerm.Length = 0;
                    SurrogateDance();
                }
            }
예제 #3
0
 public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
 {
     pos.LiveDocs = liveDocs;
     pos.Seek(termEnum);
     docID = -1;
     return(this);
 }
예제 #4
0
        /// <summary>
        /// Returns the position of a <see cref="Term"/> in the set or -1. </summary>
        internal long GetPosition(Term term)
        {
            if (size == 0)
            {
                return(-1);
            }

            EnsureIndexIsRead();
            int indexOffset = index.GetIndexOffset(term);

            SegmentTermEnum enumerator = GetThreadResources().termEnum;

            index.SeekEnum(enumerator, indexOffset);

            while (CompareAsUTF16(term, enumerator.Term()) > 0 && enumerator.Next())
            {
            }

            if (CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                return(enumerator.position);
            }
            else
            {
                return(-1);
            }
        }
예제 #5
0
        /// <summary>
        /// Loads the segment information at segment load time.
        /// </summary>
        /// <param name="indexEnum">
        ///          The term enum. </param>
        /// <param name="indexDivisor">
        ///          The index divisor. </param>
        /// <param name="tiiFileLength">
        ///          The size of the tii file, used to approximate the size of the
        ///          buffer. </param>
        /// <param name="totalIndexInterval">
        ///          The total index interval. </param>
        public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
        {
            this.totalIndexInterval = totalIndexInterval;
            indexSize    = 1 + ((int)indexEnum.size - 1) / indexDivisor;
            skipInterval = indexEnum.skipInterval;
            // this is only an inital size, it will be GCed once the build is complete
            long                 initialSize    = (long)(tiiFileLength * 1.5) / indexDivisor;
            PagedBytes           dataPagedBytes = new PagedBytes(EstimatePageBits(initialSize));
            PagedBytesDataOutput dataOutput     = dataPagedBytes.GetDataOutput();

            int            bitEstimate  = 1 + MathUtil.Log(tiiFileLength, 2);
            GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInt32s.DEFAULT);

            string         currentField = null;
            IList <string> fieldStrs    = new List <string>();
            int            fieldCounter = -1;

            for (int i = 0; indexEnum.Next(); i++)
            {
                Term term = indexEnum.Term();
                if (currentField == null || !currentField.Equals(term.Field, StringComparison.Ordinal))
                {
                    currentField = term.Field;
                    fieldStrs.Add(currentField);
                    fieldCounter++;
                }
                TermInfo termInfo = indexEnum.TermInfo();
                indexToTerms.Set(i, dataOutput.GetPosition());
                dataOutput.WriteVInt32(fieldCounter);
                dataOutput.WriteString(term.Text());
                dataOutput.WriteVInt32(termInfo.DocFreq);
                if (termInfo.DocFreq >= skipInterval)
                {
                    dataOutput.WriteVInt32(termInfo.SkipOffset);
                }
                dataOutput.WriteVInt64(termInfo.FreqPointer);
                dataOutput.WriteVInt64(termInfo.ProxPointer);
                dataOutput.WriteVInt64(indexEnum.indexPointer);
                for (int j = 1; j < indexDivisor; j++)
                {
                    if (!indexEnum.Next())
                    {
                        break;
                    }
                }
            }

            fields = new Term[fieldStrs.Count];
            for (int i = 0; i < fields.Length; i++)
            {
                fields[i] = new Term(fieldStrs[i]);
            }

            dataPagedBytes.Freeze(true);
            dataInput         = dataPagedBytes.GetDataInput();
            indexToDataOffset = indexToTerms.Mutable;

            ramBytesUsed = fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term))) + dataPagedBytes.RamBytesUsed() + indexToDataOffset.RamBytesUsed();
        }
예제 #6
0
 public PreDocsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
 {
     docs.LiveDocs = liveDocs;
     docs.Seek(termEnum);
     docs.freq = 1;
     docID     = -1;
     return(this);
 }
예제 #7
0
        internal TermInfosReader(Directory dir, string seg, FieldInfos fis, IOContext context, int indexDivisor)
        {
            bool success = false;

            if (indexDivisor < 1 && indexDivisor != -1)
            {
                throw new ArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
            }

            try
            {
                directory  = dir;
                segment    = seg;
                fieldInfos = fis;

                origEnum = new SegmentTermEnum(directory.OpenInput(IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), context), fieldInfos, false);
                size     = origEnum.size;

                if (indexDivisor != -1)
                {
                    // Load terms index
                    totalIndexInterval = origEnum.indexInterval * indexDivisor;

                    string          indexFileName = IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);
                    SegmentTermEnum indexEnum     = new SegmentTermEnum(directory.OpenInput(indexFileName, context), fieldInfos, true);

                    try
                    {
                        index       = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.FileLength(indexFileName), totalIndexInterval);
                        indexLength = index.Length;
                    }
                    finally
                    {
                        indexEnum.Dispose();
                    }
                }
                else
                {
                    // Do not load terms index:
                    totalIndexInterval = -1;
                    index       = null;
                    indexLength = -1;
                }
                success = true;
            }
            finally
            {
                // With lock-less commits, it's entirely possible (and
                // fine) to hit a FileNotFound exception above. In
                // this case, we want to explicitly close any subset
                // of things that were opened so that we don't have to
                // wait for a GC to do so.
                if (!success)
                {
                    Dispose();
                }
            }
        }
예제 #8
0
 internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, bool useCache)
 {
     if (useCache)
     {
         return(SeekEnum(enumerator, term, termsCache.Get(new CloneableTerm(DeepCopyOf(term))), useCache));
     }
     else
     {
         return(SeekEnum(enumerator, term, null, useCache));
     }
 }
예제 #9
0
        public object Clone()
        {
            // LUCENENET: MemberwiseClone() doesn't throw in .NET
            SegmentTermEnum clone = (SegmentTermEnum)base.MemberwiseClone();

            clone.input    = (IndexInput)input.Clone();
            clone.termInfo = new TermInfo(termInfo);

            clone.termBuffer = (TermBuffer)termBuffer.Clone();
            clone.prevBuffer = (TermBuffer)prevBuffer.Clone();
            clone.scanBuffer = new TermBuffer();

            return(clone);
        }
예제 #10
0
        public virtual void Seek(SegmentTermEnum segmentTermEnum)
        {
            TermInfo ti;
            Term     term;

            // use comparison of fieldinfos to verify that termEnum belongs to the same segment as this SegmentTermDocs
            if (segmentTermEnum.fieldInfos == fieldInfos) // optimized case
            {
                term = segmentTermEnum.Term();
                ti   = segmentTermEnum.TermInfo();
            } // punt case
            else
            {
                term = segmentTermEnum.Term();
                ti   = tis.Get(term);
            }

            Seek(ti, term);
        }
예제 #11
0
 // called only from asserts
 private static bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) // LUCENENET: CA1822: Mark members as static
 {
     if (ti1.DocFreq != ti2.DocFreq)
     {
         return(false);
     }
     if (ti1.FreqPointer != ti2.FreqPointer)
     {
         return(false);
     }
     if (ti1.ProxPointer != ti2.ProxPointer)
     {
         return(false);
     }
     // skipOffset is only valid when docFreq >= skipInterval:
     if (ti1.DocFreq >= enumerator.skipInterval && ti1.SkipOffset != ti2.SkipOffset)
     {
         return(false);
     }
     return(true);
 }
예제 #12
0
 // called only from asserts
 private bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator)
 {
     if (ti1.DocFreq != ti2.DocFreq)
     {
         return(false);
     }
     if (ti1.FreqPointer != ti2.FreqPointer)
     {
         return(false);
     }
     if (ti1.ProxPointer != ti2.ProxPointer)
     {
         return(false);
     }
     // skipOffset is only valid when docFreq >= skipInterval:
     if (ti1.DocFreq >= enumerator.skipInterval && ti1.SkipOffset != ti2.SkipOffset)
     {
         return(false);
     }
     return(true);
 }
예제 #13
0
        public object Clone()
        {
            SegmentTermEnum clone = null;

            try
            {
                clone = (SegmentTermEnum)base.MemberwiseClone();
            }
#pragma warning disable 168
            catch (InvalidOperationException e)
#pragma warning restore 168
            {
            }

            clone.input    = (IndexInput)input.Clone();
            clone.termInfo = new TermInfo(termInfo);

            clone.termBuffer = (TermBuffer)termBuffer.Clone();
            clone.prevBuffer = (TermBuffer)prevBuffer.Clone();
            clone.scanBuffer = new TermBuffer();

            return(clone);
        }
예제 #14
0
        internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
        {
            if (size == 0)
            {
                return(null);
            }

            // optimize sequential access: first try scanning cached enum w/o seeking
            if (enumerator.Term() != null && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0) || CompareAsUTF16(term, enumerator.Term()) >= 0)) // term is at or past current
            {
                int enumOffset = (int)(enumerator.position / totalIndexInterval) + 1;
                if (indexLength == enumOffset || index.CompareTo(term, enumOffset) < 0) // but before end of block
                {
                    // no need to seek

                    TermInfo ti;
                    int      numScans = enumerator.ScanTo(term);
                    if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
                    {
                        ti = enumerator.termInfo;
                        if (numScans > 1)
                        {
                            // we only  want to put this TermInfo into the cache if
                            // scanEnum skipped more than one dictionary entry.
                            // this prevents RangeQueries or WildcardQueries to
                            // wipe out the cache when they iterate over a large numbers
                            // of terms in order
                            if (tiOrd == null)
                            {
                                if (useCache)
                                {
                                    termsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti, enumerator.position));
                                }
                            }
                            else if (Debugging.AssertsEnabled)
                            {
                                Debugging.Assert(SameTermInfo(ti, tiOrd, enumerator));
                                Debugging.Assert((int)enumerator.position == tiOrd.termOrd);
                            }
                        }
                    }
                    else
                    {
                        ti = null;
                    }

                    return(ti);
                }
            }

            // random-access: must seek
            int indexPos;

            if (tiOrd != null)
            {
                indexPos = (int)(tiOrd.termOrd / totalIndexInterval);
            }
            else
            {
                // Must do binary search:
                indexPos = index.GetIndexOffset(term);
            }

            index.SeekEnum(enumerator, indexPos);
            enumerator.ScanTo(term);
            TermInfo ti_;

            if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                ti_ = enumerator.termInfo;
                if (tiOrd == null)
                {
                    if (useCache)
                    {
                        termsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti_, enumerator.position));
                    }
                }
                else if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(SameTermInfo(ti_, tiOrd, enumerator));
                    Debugging.Assert(enumerator.position == tiOrd.termOrd);
                }
            }
            else
            {
                ti_ = null;
            }
            return(ti_);
        }
예제 #15
0
 public void CacheCurrentTerm(SegmentTermEnum enumerator)
 {
     termsCache.Put(new CloneableTerm(enumerator.Term()), new TermInfoAndOrd(enumerator.termInfo, enumerator.position));
 }
예제 #16
0
            // Swap in S, in place of E:
            private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
            {
                int savLength = term.Length;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term.Offset == 0);
                }

                // The 3 bytes starting at downTo make up 1
                // unicode character:
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
                }

                // NOTE: we cannot make this assert, because
                // AutomatonQuery legitimately sends us malformed UTF8
                // (eg the UTF8 bytes with just 0xee)
                // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

                // Save the bytes && length, since we need to
                // restore this if seek "back" finds no matching
                // terms
                if (term.Bytes.Length < 4 + pos)
                {
                    term.Grow(4 + pos);
                }

                scratch[0] = (sbyte)term.Bytes[pos];
                scratch[1] = (sbyte)term.Bytes[pos + 1];
                scratch[2] = (sbyte)term.Bytes[pos + 2];

                term.Bytes[pos]     = 0xf0;
                term.Bytes[pos + 1] = 0x90;
                term.Bytes[pos + 2] = 0x80;
                term.Bytes[pos + 3] = 0x80;
                term.Length         = 4 + pos;

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
                }

                // Seek "back":
                outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

                // Test if the term we seek'd to in fact found a
                // surrogate pair at the same position as the E:
                Term t2 = te.Term();

                // Cannot be null (or move to next field) because at
                // "worst" it'd seek to the same term we are on now,
                // unless we are being called from seek
                if (t2 == null || t2.Field != internedFieldName)
                {
                    return(false);
                }

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
                }

                // Now test if prefix is identical and we found
                // a non-BMP char at the same position:
                BytesRef b2 = t2.Bytes;

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(b2.Offset == 0);
                }

                bool matches;

                if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
                {
                    matches = true;
                    for (int i = 0; i < pos; i++)
                    {
                        if (term.Bytes[i] != b2.Bytes[i])
                        {
                            matches = false;
                            break;
                        }
                    }
                }
                else
                {
                    matches = false;
                }

                // Restore term:
                term.Length         = savLength;
                term.Bytes[pos]     = (byte)scratch[0];
                term.Bytes[pos + 1] = (byte)scratch[1];
                term.Bytes[pos + 2] = (byte)scratch[2];

                return(matches);
            }