/// <summary>
/// Positions this enum at the first term of <paramref name="fieldInfo"/>'s field,
/// reusing the cached <c>termEnum</c>/<c>seekTermEnum</c> when they already exist.
/// </summary>
internal virtual void Reset(FieldInfo fieldInfo)
{
    //System.out.println("pff.reset te=" + termEnum);
    this.fieldInfo = fieldInfo;
    // Intern the field name so the field check below can use reference equality.
    internedFieldName = fieldInfo.Name.Intern();
    Term term = new Term(internedFieldName);
    if (termEnum == null)
    {
        // First use: open two enums positioned at the start of this field.
        termEnum = outerInstance.TermsDict.Terms(term);
        seekTermEnum = outerInstance.TermsDict.Terms(term);
        //System.out.println("  term=" + termEnum.term());
    }
    else
    {
        // Reuse the existing enum; seek it back to the field's first term.
        outerInstance.TermsDict.SeekEnum(termEnum, term, true);
    }
    skipNext = true;

    unicodeSortOrder = outerInstance.SortTermsByUnicode;

    Term t = termEnum.Term();
    // Reference comparison is safe here: both strings are interned.
    if (t != null && t.Field == internedFieldName)
    {
        newSuffixStart = 0;
        prevTerm.Length = 0;
        // Reconcile UTF-16 vs UTF-8 surrogate sort-order differences of the pre-4.0 format.
        SurrogateDance();
    }
}
/// <summary>
/// Seeks <paramref name="enumerator"/> to the term stored at index entry
/// <paramref name="indexOffset"/>, decoding the term and its <see cref="TermInfo"/>
/// from the paged in-memory index buffer.
/// </summary>
internal virtual void SeekEnum(SegmentTermEnum enumerator, int indexOffset)
{
    var reader = (PagedBytesDataInput)dataInput.Clone();
    reader.SetPosition(indexToDataOffset.Get(indexOffset));

    // Decode the indexed term: field id first, then the term text.
    Term fieldTerm = fields[reader.ReadVInt32()];
    Term term = new Term(fieldTerm.Field, reader.ReadString());

    // Decode the matching TermInfo. SkipOffset is only serialized when
    // docFreq reaches the skip interval.
    var termInfo = new TermInfo();
    termInfo.DocFreq = reader.ReadVInt32();
    termInfo.SkipOffset = termInfo.DocFreq >= skipInterval ? reader.ReadVInt32() : 0;
    termInfo.FreqPointer = reader.ReadVInt64();
    termInfo.ProxPointer = reader.ReadVInt64();
    long pointer = reader.ReadVInt64();

    // Perform the actual seek on the segment enum.
    enumerator.Seek(pointer, ((long)indexOffset * totalIndexInterval) - 1, term, termInfo);
}
/// <summary>
/// Positions this enum at the first term of <paramref name="fieldInfo"/>'s field,
/// reusing the cached <c>TermEnum</c>/<c>SeekTermEnum</c> when they already exist.
/// </summary>
internal virtual void Reset(FieldInfo fieldInfo)
{
    //System.out.println("pff.reset te=" + termEnum);
    this.fieldInfo = fieldInfo;
    // Intern the field name so the field check below can use reference equality.
    InternedFieldName = String.Intern(fieldInfo.Name);
    Term term = new Term(InternedFieldName);
    if (TermEnum == null)
    {
        // First use: open two enums positioned at the start of this field.
        TermEnum = OuterInstance.TermsDict.Terms(term);
        SeekTermEnum = OuterInstance.TermsDict.Terms(term);
        //System.out.println("  term=" + termEnum.term());
    }
    else
    {
        // Reuse the existing enum; seek it back to the field's first term.
        OuterInstance.TermsDict.SeekEnum(TermEnum, term, true);
    }
    SkipNext = true;

    UnicodeSortOrder = OuterInstance.SortTermsByUnicode();

    Term t = TermEnum.Term();
    // Reference comparison is safe here: both strings are interned.
    if (t != null && t.Field() == InternedFieldName)
    {
        NewSuffixStart = 0;
        PrevTerm.Length = 0;
        // Reconcile UTF-16 vs UTF-8 surrogate sort-order differences of the pre-4.0 format.
        SurrogateDance();
    }
}
/// <summary>
/// Re-points this enum's positions reader at the term currently held by
/// <paramref name="termEnum"/> and resets the iteration state.
/// </summary>
public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
{
    pos.LiveDocs = liveDocs;
    pos.Seek(termEnum);
    // Clear the current doc so the next advance starts from the beginning.
    docID = -1;
    return this;
}
/// <summary>
/// Re-points this enum's positions reader at the term currently held by
/// <paramref name="termEnum"/> and resets the iteration state.
/// </summary>
public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, Bits liveDocs)
{
    Pos.LiveDocs = liveDocs;
    Pos.Seek(termEnum);
    // Clear the current doc so the next advance starts from the beginning.
    DocID_Renamed = -1;
    return this;
}
/// <summary>
/// Returns the position of a <see cref="Term"/> in the set or -1.
/// </summary>
internal long GetPosition(Term term)
{
    if (size == 0)
    {
        return -1;
    }

    EnsureIndexIsRead();

    // Jump to the index block that could contain the term...
    int blockOffset = index.GetIndexOffset(term);
    SegmentTermEnum enumerator = GetThreadResources().termEnum;
    index.SeekEnum(enumerator, blockOffset);

    // ...then scan forward until we reach or pass it.
    while (CompareAsUTF16(term, enumerator.Term()) > 0 && enumerator.Next())
    {
        // intentionally empty: Next() advances the enum
    }

    return CompareAsUTF16(term, enumerator.Term()) == 0 ? enumerator.position : -1;
}
/// <summary>
/// Loads the segment information at segment load time, sampling every
/// <paramref name="indexDivisor"/>-th indexed term into a compact in-memory
/// paged-bytes buffer.
/// </summary>
/// <param name="indexEnum">
///          The term enum. </param>
/// <param name="indexDivisor">
///          The index divisor. </param>
/// <param name="tiiFileLength">
///          The size of the tii file, used to approximate the size of the
///          buffer. </param>
/// <param name="totalIndexInterval">
///          The total index interval. </param>
public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
{
    this.totalIndexInterval = totalIndexInterval;
    indexSize = 1 + ((int)indexEnum.size - 1) / indexDivisor;
    skipInterval = indexEnum.skipInterval;
    // this is only an inital size, it will be GCed once the build is complete
    long initialSize = (long)(tiiFileLength * 1.5) / indexDivisor;
    PagedBytes dataPagedBytes = new PagedBytes(EstimatePageBits(initialSize));
    PagedBytesDataOutput dataOutput = dataPagedBytes.GetDataOutput();

    int bitEstimate = 1 + MathUtil.Log(tiiFileLength, 2);
    GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInt32s.DEFAULT);

    string currentField = null;
    IList<string> fieldStrs = new List<string>();
    int fieldCounter = -1;
    // Walk every indexDivisor-th indexed term, serializing field id, term text
    // and TermInfo into the paged buffer, and recording each entry's byte offset.
    for (int i = 0; indexEnum.Next(); i++)
    {
        Term term = indexEnum.Term();
        if (currentField == null || !currentField.Equals(term.Field, StringComparison.Ordinal))
        {
            // New field encountered: remember its name and advance the field id.
            currentField = term.Field;
            fieldStrs.Add(currentField);
            fieldCounter++;
        }
        TermInfo termInfo = indexEnum.TermInfo();
        indexToTerms.Set(i, dataOutput.GetPosition());
        dataOutput.WriteVInt32(fieldCounter);
        dataOutput.WriteString(term.Text());
        dataOutput.WriteVInt32(termInfo.DocFreq);
        // SkipOffset is only serialized when docFreq reaches the skip interval
        // (mirrors the read side in SeekEnum).
        if (termInfo.DocFreq >= skipInterval)
        {
            dataOutput.WriteVInt32(termInfo.SkipOffset);
        }
        dataOutput.WriteVInt64(termInfo.FreqPointer);
        dataOutput.WriteVInt64(termInfo.ProxPointer);
        dataOutput.WriteVInt64(indexEnum.indexPointer);
        // Skip ahead (indexDivisor - 1) entries to honor the divisor.
        for (int j = 1; j < indexDivisor; j++)
        {
            if (!indexEnum.Next())
            {
                break;
            }
        }
    }

    fields = new Term[fieldStrs.Count];
    for (int i = 0; i < fields.Length; i++)
    {
        fields[i] = new Term(fieldStrs[i]);
    }

    dataPagedBytes.Freeze(true);
    dataInput = dataPagedBytes.GetDataInput();
    indexToDataOffset = indexToTerms.Mutable;

    // Estimate RAM: the Term[] (refs + shallow Term instances) plus the frozen
    // paged bytes and the packed offsets.
    ramBytesUsed = fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term))) + dataPagedBytes.RamBytesUsed() + indexToDataOffset.RamBytesUsed();
}
/// <summary>
/// Re-points this enum's docs reader at the term currently held by
/// <paramref name="termEnum"/> and resets the iteration state.
/// </summary>
public PreDocsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
{
    docs.LiveDocs = liveDocs;
    docs.Seek(termEnum);
    // Omit-TF path: every doc reports a frequency of 1.
    docs.freq = 1;
    docID = -1;
    return this;
}
/// <summary>
/// Re-points this enum's docs reader at the term currently held by
/// <paramref name="termEnum"/> and resets the iteration state.
/// </summary>
public PreDocsEnum Reset(SegmentTermEnum termEnum, Bits liveDocs)
{
    Docs.LiveDocs = liveDocs;
    Docs.Seek(termEnum);
    // Omit-TF path: every doc reports a frequency of 1.
    Docs.Freq_Renamed = 1;
    DocID_Renamed = -1;
    return this;
}
/// <summary>
/// Loads the segment information at segment load time, sampling every
/// <paramref name="indexDivisor"/>-th indexed term into a compact in-memory
/// paged-bytes buffer.
/// </summary>
/// <param name="indexEnum">
///          the term enum. </param>
/// <param name="indexDivisor">
///          the index divisor. </param>
/// <param name="tiiFileLength">
///          the size of the tii file, used to approximate the size of the
///          buffer. </param>
/// <param name="totalIndexInterval">
///          the total index interval. </param>
public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
{
    this.TotalIndexInterval = totalIndexInterval;
    IndexSize = 1 + ((int)indexEnum.Size - 1) / indexDivisor;
    SkipInterval = indexEnum.SkipInterval;
    // this is only an inital size, it will be GCed once the build is complete
    long initialSize = (long)(tiiFileLength * 1.5) / indexDivisor;
    PagedBytes dataPagedBytes = new PagedBytes(EstimatePageBits(initialSize));
    PagedBytesDataOutput dataOutput = dataPagedBytes.DataOutput;

    int bitEstimate = 1 + MathUtil.Log(tiiFileLength, 2);
    GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, IndexSize, PackedInts.DEFAULT);

    string currentField = null;
    IList<string> fieldStrs = new List<string>();
    int fieldCounter = -1;
    // Walk every indexDivisor-th indexed term, serializing field id, term text
    // and TermInfo into the paged buffer, and recording each entry's byte offset.
    for (int i = 0; indexEnum.Next(); i++)
    {
        Term term = indexEnum.Term();
        if (currentField == null || !currentField.Equals(term.Field()))
        {
            // New field encountered: remember its name and advance the field id.
            currentField = term.Field();
            fieldStrs.Add(currentField);
            fieldCounter++;
        }
        TermInfo termInfo = indexEnum.TermInfo();
        indexToTerms.Set(i, dataOutput.Position);
        dataOutput.WriteVInt(fieldCounter);
        dataOutput.WriteString(term.Text());
        dataOutput.WriteVInt(termInfo.DocFreq);
        // SkipOffset is only serialized when docFreq reaches the skip interval
        // (mirrors the read side in SeekEnum).
        if (termInfo.DocFreq >= SkipInterval)
        {
            dataOutput.WriteVInt(termInfo.SkipOffset);
        }
        dataOutput.WriteVLong(termInfo.FreqPointer);
        dataOutput.WriteVLong(termInfo.ProxPointer);
        dataOutput.WriteVLong(indexEnum.IndexPointer);
        // Skip ahead (indexDivisor - 1) entries to honor the divisor.
        for (int j = 1; j < indexDivisor; j++)
        {
            if (!indexEnum.Next())
            {
                break;
            }
        }
    }

    Fields = new Term[fieldStrs.Count];
    for (int i = 0; i < Fields.Length; i++)
    {
        Fields[i] = new Term(fieldStrs[i]);
    }

    dataPagedBytes.Freeze(true);
    DataInput = dataPagedBytes.DataInput;
    IndexToDataOffset = indexToTerms.Mutable;

    // Estimate RAM: the Term[] (refs + shallow Term instances) plus the frozen
    // paged bytes and the packed offsets.
    RamBytesUsed_Renamed = Fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term))) + dataPagedBytes.RamBytesUsed() + IndexToDataOffset.RamBytesUsed();
}
/// <summary>
/// Opens the pre-4.0 term dictionary (.tis) for segment <paramref name="seg"/>
/// and, unless <paramref name="indexDivisor"/> is -1, builds the in-memory
/// terms index from the .tii file.
/// </summary>
internal TermInfosReader(Directory dir, string seg, FieldInfos fis, IOContext context, int indexDivisor)
{
    bool success = false;

    if (indexDivisor < 1 && indexDivisor != -1)
    {
        throw new System.ArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try
    {
        directory = dir;
        segment = seg;
        fieldInfos = fis;

        origEnum = new SegmentTermEnum(directory.OpenInput(IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), context), fieldInfos, false);
        size = origEnum.size;

        if (indexDivisor != -1)
        {
            // Load terms index
            totalIndexInterval = origEnum.indexInterval * indexDivisor;

            string indexFileName = IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);

            SegmentTermEnum indexEnum = new SegmentTermEnum(directory.OpenInput(indexFileName, context), fieldInfos, true);

            try
            {
                index = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.FileLength(indexFileName), totalIndexInterval);
                indexLength = index.Length;
            }
            finally
            {
                // The index enum is fully consumed by the index build; release it either way.
                indexEnum.Dispose();
            }
        }
        else
        {
            // Do not load terms index:
            totalIndexInterval = -1;
            index = null;
            indexLength = -1;
        }
        success = true;
    }
    finally
    {
        // With lock-less commits, it's entirely possible (and
        // fine) to hit a FileNotFound exception above. In
        // this case, we want to explicitly close any subset
        // of things that were opened so that we don't have to
        // wait for a GC to do so.
        if (!success)
        {
            Dispose();
        }
    }
}
/// <summary>
/// Opens the pre-4.0 term dictionary (.tis) for segment <paramref name="seg"/>
/// and, unless <paramref name="indexDivisor"/> is -1, builds the in-memory
/// terms index from the .tii file.
/// </summary>
internal TermInfosReader(Directory dir, string seg, FieldInfos fis, IOContext context, int indexDivisor)
{
    bool success = false;

    if (indexDivisor < 1 && indexDivisor != -1)
    {
        throw new System.ArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try
    {
        Directory = dir;
        Segment = seg;
        FieldInfos = fis;

        OrigEnum = new SegmentTermEnum(Directory.OpenInput(IndexFileNames.SegmentFileName(Segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), context), FieldInfos, false);
        Size_Renamed = OrigEnum.Size;

        if (indexDivisor != -1)
        {
            // Load terms index
            TotalIndexInterval = OrigEnum.IndexInterval * indexDivisor;

            string indexFileName = IndexFileNames.SegmentFileName(Segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);

            SegmentTermEnum indexEnum = new SegmentTermEnum(Directory.OpenInput(indexFileName, context), FieldInfos, true);

            try
            {
                Index = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.FileLength(indexFileName), TotalIndexInterval);
                IndexLength = Index.Length();
            }
            finally
            {
                // The index enum is fully consumed by the index build; release it either way.
                indexEnum.Dispose();
            }
        }
        else
        {
            // Do not load terms index:
            TotalIndexInterval = -1;
            Index = null;
            IndexLength = -1;
        }
        success = true;
    }
    finally
    {
        // With lock-less commits, it's entirely possible (and
        // fine) to hit a FileNotFound exception above. In
        // this case, we want to explicitly close any subset
        // of things that were opened so that we don't have to
        // wait for a GC to do so.
        if (!success)
        {
            Dispose();
        }
    }
}
/// <summary>
/// Verifies that seeking the enum to an index entry lands exactly on the
/// term that was sampled into that index slot.
/// </summary>
public virtual void TestSeekEnum()
{
    const int indexPosition = 3;

    // Work on a clone so the shared TermEnum fixture is left untouched.
    SegmentTermEnum enumerator = (SegmentTermEnum)TermEnum.Clone();
    Term expected = FindTermThatWouldBeAtIndex(enumerator, indexPosition);

    Index.SeekEnum(enumerator, indexPosition);
    Assert.AreEqual(expected, enumerator.Term());

    enumerator.Dispose();
}
/// <summary>
/// Seeks <paramref name="enumerator"/> to <paramref name="term"/>, consulting
/// the terms cache only when <paramref name="useCache"/> is true.
/// </summary>
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, bool useCache)
{
    // Look up a cached TermInfo/ordinal only when caching is requested.
    TermInfoAndOrd cached = useCache
        ? termsCache.Get(new CloneableTerm(DeepCopyOf(term)))
        : null;
    return SeekEnum(enumerator, term, cached, useCache);
}
/// <summary>
/// Releases the class-level fixtures: disposes the open enum/reader/directory,
/// then nulls the static references so they can be garbage-collected.
/// </summary>
public static void AfterClass()
{
    // Release file handles first...
    TermEnum.Dispose();
    Reader.Dispose();
    Directory.Dispose();
    // ...then drop the static references.
    TermEnum = null;
    Reader = null;
    Directory = null;
    Index = null;
    SampleTerms = null;
}
/// <summary>
/// Releases the class-level fixtures: disposes the open enum/reader/directory,
/// nulls the static references, then defers to the base teardown.
/// </summary>
public override void AfterClass()
{
    // Release file handles first...
    TermEnum.Dispose();
    Reader.Dispose();
    Directory.Dispose();
    // ...then drop the static references.
    TermEnum = null;
    Reader = null;
    Directory = null;
    Index = null;
    SampleTerms = null;
    base.AfterClass();
}
/// <summary>
/// Releases the class-level fixtures: disposes the open enum/reader/directory,
/// nulls the static references, then defers to the base teardown.
/// </summary>
public override void AfterClass()
{
    // Release file handles first...
    termEnum.Dispose();
    reader.Dispose();
    directory.Dispose();
    // ...then drop the static references.
    termEnum = null;
    reader = null;
    directory = null;
    index = null;
    sampleTerms = null;
    base.AfterClass();
}
/// <summary>
/// Returns a deep-enough copy of this enum: the shared state is shallow-copied
/// and the mutable readers/buffers are cloned so the copy can advance independently.
/// </summary>
public object Clone()
{
    // LUCENENET: MemberwiseClone() doesn't throw in .NET
    var copy = (SegmentTermEnum)base.MemberwiseClone();

    copy.input = (IndexInput)input.Clone();
    copy.termInfo = new TermInfo(termInfo);
    copy.termBuffer = (TermBuffer)termBuffer.Clone();
    copy.prevBuffer = (TermBuffer)prevBuffer.Clone();
    copy.scanBuffer = new TermBuffer();
    return copy;
}
/// <summary>
/// Builds a small pre-4.0-format index, opens its term dictionary and terms
/// index directly, and caches the fixtures shared by every test in the class.
/// </summary>
public override void BeforeClass()
{
    base.BeforeClass();

    // NOTE: turn off compound file, this test will open some index files directly.
    OldFormatImpersonationIsActive = true;
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random, MockTokenizer.KEYWORD, false)).SetUseCompoundFile(false);

    TermIndexInterval = config.TermIndexInterval;
    IndexDivisor = TestUtil.NextInt32(Random, 1, 10);
    NUMBER_OF_DOCUMENTS = AtLeast(100);
    NUMBER_OF_FIELDS = AtLeast(Math.Max(10, 3 * TermIndexInterval * IndexDivisor / NUMBER_OF_DOCUMENTS));

    Directory = NewDirectory();

    config.SetCodec(new PreFlexRWCodec());
    LogMergePolicy mp = NewLogMergePolicy();
    // NOTE: turn off compound file, this test will open some index files directly.
    mp.NoCFSRatio = 0.0;
    config.SetMergePolicy(mp);

    Populate(Directory, config);

    // Resolve the single segment's name so its files can be opened directly.
    DirectoryReader r0 = IndexReader.Open(Directory);
    SegmentReader r = LuceneTestCase.GetOnlySegmentReader(r0);
    string segment = r.SegmentName;
    r.Dispose();

    FieldInfosReader infosReader = (new PreFlexRWCodec()).FieldInfosFormat.FieldInfosReader;
    FieldInfos fieldInfos = infosReader.Read(Directory, segment, "", IOContext.READ_ONCE);
    string segmentFileName = IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);
    long tiiFileLength = Directory.FileLength(segmentFileName);

    // Open the raw .tii (terms index) and .tis (term dictionary) files.
    IndexInput input = Directory.OpenInput(segmentFileName, NewIOContext(Random));
    TermEnum = new SegmentTermEnum(Directory.OpenInput(IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), NewIOContext(Random)), fieldInfos, false);
    int totalIndexInterval = TermEnum.indexInterval * IndexDivisor;

    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
    Index = new TermInfosReaderIndex(indexEnum, IndexDivisor, tiiFileLength, totalIndexInterval);
    indexEnum.Dispose();
    input.Dispose();

    Reader = IndexReader.Open(Directory);
    SampleTerms = Sample(Random, Reader, 1000);
}
/// <summary>
/// Advances <paramref name="termEnum"/> to the term that the terms index would
/// have sampled at slot <paramref name="index"/> and returns it.
/// </summary>
private Term FindTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index)
{
    int remaining = index * TermIndexInterval * IndexDivisor;
    while (remaining-- > 0)
    {
        // TODO: this test just uses random terms, so this is always possible
        AssumeTrue("ran out of terms", termEnum.Next());
    }

    Term term = termEnum.Term();

    // An indexed term is only written when the term after
    // it exists, so, if the number of terms is 0 mod
    // termIndexInterval, the last index term will not be
    // written; so we require a term after this term
    // as well:
    AssumeTrue("ran out of terms", termEnum.Next());

    return term;
}
/// <summary>
/// Seeks this doc reader to the term currently held by
/// <paramref name="segmentTermEnum"/>.
/// </summary>
public virtual void Seek(SegmentTermEnum segmentTermEnum)
{
    Term term = segmentTermEnum.Term();

    // Comparing FieldInfos verifies the enum belongs to the same segment as
    // this SegmentTermDocs; only then can its TermInfo be reused directly.
    TermInfo ti = segmentTermEnum.FieldInfos == FieldInfos
        ? segmentTermEnum.TermInfo() // optimized case: reuse the enum's info
        : Tis.Get(term);             // punt case: look the term up again

    Seek(ti, term);
}
/// <summary>
/// Returns a deep-enough copy of this enum: the shared state is shallow-copied
/// and the mutable readers/buffers are cloned so the copy can advance independently.
/// </summary>
public object Clone()
{
    // LUCENENET: base.MemberwiseClone() never throws in .NET, so the vestigial
    // try/catch ported from Java's CloneNotSupportedException is removed. The
    // old empty catch silently swallowed the exception and — had it ever fired —
    // would have left 'clone' null, causing a NullReferenceException below.
    SegmentTermEnum clone = (SegmentTermEnum)base.MemberwiseClone();

    clone.Input = (IndexInput)Input.Clone();
    clone.TermInfo_Renamed = new TermInfo(TermInfo_Renamed);
    clone.TermBuffer = (TermBuffer)TermBuffer.Clone();
    clone.PrevBuffer = (TermBuffer)PrevBuffer.Clone();
    clone.ScanBuffer = new TermBuffer();
    return clone;
}
// called only from asserts
/// <summary>
/// Returns true when the two <see cref="TermInfo"/>s describe the same term entry.
/// </summary>
private bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator)
{
    // skipOffset is only valid when docFreq >= skipInterval, so it is only
    // compared in that case (last clause).
    return ti1.DocFreq == ti2.DocFreq
        && ti1.FreqPointer == ti2.FreqPointer
        && ti1.ProxPointer == ti2.ProxPointer
        && (ti1.DocFreq < enumerator.skipInterval || ti1.SkipOffset == ti2.SkipOffset);
}
// called only from asserts
/// <summary>
/// Returns true when the two <see cref="TermInfo"/>s describe the same term entry.
/// </summary>
private static bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) // LUCENENET: CA1822: Mark members as static
{
    // skipOffset is only valid when docFreq >= skipInterval, so it is only
    // compared in that case (last clause).
    return ti1.DocFreq == ti2.DocFreq
        && ti1.FreqPointer == ti2.FreqPointer
        && ti1.ProxPointer == ti2.ProxPointer
        && (ti1.DocFreq < enumerator.skipInterval || ti1.SkipOffset == ti2.SkipOffset);
}
/// <summary>
/// Returns a deep-enough copy of this enum: the shared state is shallow-copied
/// and the mutable readers/buffers are cloned so the copy can advance independently.
/// </summary>
public object Clone()
{
    // LUCENENET: base.MemberwiseClone() never throws in .NET, so the vestigial
    // try/catch ported from Java's CloneNotSupportedException (along with its
    // unused-variable warning pragma) is removed. The old empty catch silently
    // swallowed the exception and — had it ever fired — would have left 'clone'
    // null, causing a NullReferenceException below.
    SegmentTermEnum clone = (SegmentTermEnum)base.MemberwiseClone();

    clone.input = (IndexInput)input.Clone();
    clone.termInfo = new TermInfo(termInfo);
    clone.termBuffer = (TermBuffer)termBuffer.Clone();
    clone.prevBuffer = (TermBuffer)prevBuffer.Clone();
    clone.scanBuffer = new TermBuffer();
    return clone;
}
/// <summary>
/// Snapshots the enumerator's current term, its <c>TermInfo</c> and its ordinal
/// into the terms cache.
/// </summary>
public void CacheCurrentTerm(SegmentTermEnum enumerator)
{
    var key = new CloneableTerm(enumerator.Term());
    var value = new TermInfoAndOrd(enumerator.termInfo, enumerator.position);
    termsCache.Put(key, value);
}
/// <summary>
/// Builds a small pre-4.0-format index, opens its term dictionary and terms
/// index directly, and caches the fixtures shared by every test in the class.
/// </summary>
public void BeforeClass()
{
    // NOTE: turn off compound file, this test will open some index files directly.
    OLD_FORMAT_IMPERSONATION_IS_ACTIVE = true;
    IndexWriterConfig config = NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random(), MockTokenizer.KEYWORD, false)).SetUseCompoundFile(false);

    TermIndexInterval = config.TermIndexInterval;
    IndexDivisor = TestUtil.NextInt(Random(), 1, 10);
    NUMBER_OF_DOCUMENTS = AtLeast(100);
    NUMBER_OF_FIELDS = AtLeast(Math.Max(10, 3 * TermIndexInterval * IndexDivisor / NUMBER_OF_DOCUMENTS));

    Directory = NewDirectory();

    config.SetCodec(new PreFlexRWCodec(OLD_FORMAT_IMPERSONATION_IS_ACTIVE));
    LogMergePolicy mp = NewLogMergePolicy();
    // NOTE: turn off compound file, this test will open some index files directly.
    mp.NoCFSRatio = 0.0;
    config.SetMergePolicy(mp);

    Populate(Directory, config);

    // Resolve the single segment's name so its files can be opened directly.
    DirectoryReader r0 = IndexReader.Open(Directory);
    SegmentReader r = LuceneTestCase.GetOnlySegmentReader(r0);
    string segment = r.SegmentName;
    r.Dispose();

    FieldInfosReader infosReader = (new PreFlexRWCodec(OLD_FORMAT_IMPERSONATION_IS_ACTIVE)).FieldInfosFormat().FieldInfosReader;
    FieldInfos fieldInfos = infosReader.Read(Directory, segment, "", IOContext.READONCE);
    string segmentFileName = IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);
    long tiiFileLength = Directory.FileLength(segmentFileName);

    // Open the raw .tii (terms index) and .tis (term dictionary) files.
    IndexInput input = Directory.OpenInput(segmentFileName, NewIOContext(Random()));
    TermEnum = new SegmentTermEnum(Directory.OpenInput(IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), NewIOContext(Random())), fieldInfos, false);
    int totalIndexInterval = TermEnum.IndexInterval * IndexDivisor;

    SegmentTermEnum indexEnum = new SegmentTermEnum(input, fieldInfos, true);
    Index = new TermInfosReaderIndex(indexEnum, IndexDivisor, tiiFileLength, totalIndexInterval);
    indexEnum.Dispose();
    input.Dispose();

    Reader = IndexReader.Open(Directory);
    SampleTerms = Sample(Random(), Reader, 1000);
}
/// <summary>
/// Advances <paramref name="termEnum"/> to the term that the terms index would
/// have sampled at slot <paramref name="index"/> and returns it.
/// </summary>
private Term FindTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index)
{
    int remaining = index * TermIndexInterval * IndexDivisor;
    while (remaining-- > 0)
    {
        // TODO: this test just uses random terms, so this is always possible
        AssumeTrue("ran out of terms", termEnum.Next());
    }

    Term term = termEnum.Term();

    // An indexed term is only written when the term after
    // it exists, so, if the number of terms is 0 mod
    // termIndexInterval, the last index term will not be
    // written; so we require a term after this term
    // as well:
    AssumeTrue("ran out of terms", termEnum.Next());

    return term;
}
/// <summary>
/// Re-points this enum's docs reader at the term currently held by
/// <paramref name="termEnum"/> and resets the iteration state.
/// </summary>
public PreDocsEnum Reset(SegmentTermEnum termEnum, Bits liveDocs)
{
    Docs.LiveDocs = liveDocs;
    Docs.Seek(termEnum);
    // Omit-TF path: every doc reports a frequency of 1.
    Docs.Freq_Renamed = 1;
    DocID_Renamed = -1;
    return this;
}
/// <summary>
/// Snapshots the enumerator's current term, its <c>TermInfo</c> and its ordinal
/// into the terms cache.
/// </summary>
public void CacheCurrentTerm(SegmentTermEnum enumerator)
{
    var key = new CloneableTerm(enumerator.Term());
    var value = new TermInfoAndOrd(enumerator.TermInfo_Renamed, enumerator.Position);
    TermsCache.Put(key, value);
}
/// <summary>
/// Seeks <paramref name="enumerator"/> to <paramref name="term"/> and returns its
/// <see cref="TermInfo"/>, or null if the term does not exist. First tries a cheap
/// forward scan from the enumerator's current position; otherwise falls back to a
/// terms-index seek (by cached ordinal or binary search) followed by a scan.
/// </summary>
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
{
    if (size == 0)
    {
        return (-1) == -1 ? (TermInfo)null : null; // NOTE(review): original was `return (null);` — kept semantics
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    if (enumerator.Term() != null && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0) || CompareAsUTF16(term, enumerator.Term()) >= 0)) // term is at or past current
    {
        // Index block the enumerator is currently inside of.
        int enumOffset = (int)(enumerator.position / totalIndexInterval) + 1;
        if (indexLength == enumOffset || index.CompareTo(term, enumOffset) < 0) // but before end of block
        {
            // no need to seek
            TermInfo ti;
            int numScans = enumerator.ScanTo(term);
            if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                ti = enumerator.termInfo;
                if (numScans > 1)
                {
                    // we only want to put this TermInfo into the cache if
                    // scanEnum skipped more than one dictionary entry.
                    // this prevents RangeQueries or WildcardQueries to
                    // wipe out the cache when they iterate over a large numbers
                    // of terms in order
                    if (tiOrd == null)
                    {
                        if (useCache)
                        {
                            termsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti, enumerator.position));
                        }
                    }
                    else
                    {
                        // Cached entry must agree with what the scan found.
                        Debug.Assert(SameTermInfo(ti, tiOrd, enumerator));
                        Debug.Assert(enumerator.position == tiOrd.termOrd);
                    }
                }
            }
            else
            {
                // Term not found by the forward scan.
                ti = null;
            }
            return (ti);
        }
    }

    // random-access: must seek
    int indexPos;
    if (tiOrd != null)
    {
        // A cached ordinal tells us exactly which index block holds the term.
        indexPos = (int)(tiOrd.termOrd / totalIndexInterval);
    }
    else
    {
        // Must do binary search:
        indexPos = index.GetIndexOffset(term);
    }

    index.SeekEnum(enumerator, indexPos);
    enumerator.ScanTo(term);
    TermInfo ti_;

    if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
    {
        ti_ = enumerator.termInfo;
        if (tiOrd == null)
        {
            if (useCache)
            {
                termsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti_, enumerator.position));
            }
        }
        else
        {
            // Cached entry must agree with what the seek+scan found.
            Debug.Assert(SameTermInfo(ti_, tiOrd, enumerator));
            Debug.Assert(enumerator.position == tiOrd.termOrd);
        }
    }
    else
    {
        // Term does not exist in this segment.
        ti_ = null;
    }
    return (ti_);
}
// Swap in S, in place of E:
/// <summary>
/// Temporarily replaces the 3-byte high-BMP UTF-8 character at <paramref name="pos"/>
/// with the 4-byte encoding of U+10000, seeks the enum there, and reports whether a
/// term with an identical prefix and a non-BMP character at that position exists.
/// The term bytes are restored before returning.
/// </summary>
internal virtual bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int savLength = term.Length;

    Debug.Assert(term.Offset == 0);

    // The 3 bytes starting at downTo make up 1
    // unicode character:
    Debug.Assert(IsHighBMPChar(term.Bytes, pos));

    // NOTE: we cannot make this assert, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee)
    // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

    // Save the bytes && length, since we need to
    // restore this if seek "back" finds no matching
    // terms
    if (term.Bytes.Length < 4 + pos)
    {
        term.Grow(4 + pos);
    }

    Scratch[0] = (sbyte)term.Bytes[pos];
    Scratch[1] = (sbyte)term.Bytes[pos + 1];
    Scratch[2] = (sbyte)term.Bytes[pos + 2];

    // Overwrite with the UTF-8 bytes of U+10000 (first non-BMP code point).
    term.Bytes[pos] = unchecked((byte)0xf0);
    term.Bytes[pos + 1] = unchecked((byte)0x90);
    term.Bytes[pos + 2] = unchecked((byte)0x80);
    term.Bytes[pos + 3] = unchecked((byte)0x80);
    term.Length = 4 + pos;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    OuterInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    Term t2 = te.Term();

    // Cannot be null (or move to next field) because at
    // "worst" it'd seek to the same term we are on now,
    // unless we are being called from seek
    if (t2 == null || t2.Field() != InternedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text()));
    }

    // Now test if prefix is identical and we found
    // a non-BMP char at the same position:
    BytesRef b2 = t2.Bytes();
    Debug.Assert(b2.Offset == 0);

    bool matches;
    if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
    {
        matches = true;
        // Byte-wise prefix comparison up to (but excluding) pos.
        for (int i = 0; i < pos; i++)
        {
            if (term.Bytes[i] != b2.Bytes[i])
            {
                matches = false;
                break;
            }
        }
    }
    else
    {
        matches = false;
    }

    // Restore term:
    term.Length = savLength;
    term.Bytes[pos] = (byte)Scratch[0];
    term.Bytes[pos + 1] = (byte)Scratch[1];
    term.Bytes[pos + 2] = (byte)Scratch[2];

    return matches;
}
// called only from asserts
/// <summary>
/// Returns true when the two <see cref="TermInfo"/>s describe the same term entry.
/// </summary>
private bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator)
{
    // skipOffset is only valid when docFreq >= skipInterval, so it is only
    // compared in that case (last clause).
    return ti1.DocFreq == ti2.DocFreq
        && ti1.FreqPointer == ti2.FreqPointer
        && ti1.ProxPointer == ti2.ProxPointer
        && (ti1.DocFreq < enumerator.SkipInterval || ti1.SkipOffset == ti2.SkipOffset);
}
/// <summary>
/// Seeks <paramref name="enumerator"/> to the term stored at index entry
/// <paramref name="indexOffset"/>, decoding the term and its <see cref="TermInfo"/>
/// from the paged in-memory index buffer.
/// </summary>
public virtual void SeekEnum(SegmentTermEnum enumerator, int indexOffset)
{
    var reader = (PagedBytesDataInput)DataInput.Clone();
    reader.Position = IndexToDataOffset.Get(indexOffset);

    // Decode the indexed term: field id first, then the term text.
    Term fieldTerm = Fields[reader.ReadVInt()];
    Term term = new Term(fieldTerm.Field(), reader.ReadString());

    // Decode the matching TermInfo. SkipOffset is only serialized when
    // docFreq reaches the skip interval.
    TermInfo termInfo = new TermInfo();
    termInfo.DocFreq = reader.ReadVInt();
    termInfo.SkipOffset = termInfo.DocFreq >= SkipInterval ? reader.ReadVInt() : 0;
    termInfo.FreqPointer = reader.ReadVLong();
    termInfo.ProxPointer = reader.ReadVLong();
    long pointer = reader.ReadVLong();

    // Perform the actual seek on the segment enum.
    enumerator.Seek(pointer, ((long)indexOffset * TotalIndexInterval) - 1, term, termInfo);
}
/// <summary>
/// Seeks <paramref name="enumerator"/> to <paramref name="term"/>, consulting
/// the terms cache only when <paramref name="useCache"/> is true.
/// </summary>
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, bool useCache)
{
    // Look up a cached TermInfo/ordinal only when caching is requested.
    TermInfoAndOrd cached = useCache
        ? TermsCache.Get(new CloneableTerm(DeepCopyOf(term)))
        : null;
    return SeekEnum(enumerator, term, cached, useCache);
}
/// <summary>
/// Re-points this enum's positions reader at the term currently held by
/// <paramref name="termEnum"/> and resets the iteration state.
/// </summary>
public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, Bits liveDocs)
{
    Pos.LiveDocs = liveDocs;
    Pos.Seek(termEnum);
    // Clear the current doc so the next advance starts from the beginning.
    DocID_Renamed = -1;
    return this;
}
// Swap in S, in place of E:
/// <summary>
/// Temporarily replaces the 3-byte high-BMP UTF-8 character at <paramref name="pos"/>
/// with the 4-byte encoding of U+10000, seeks the enum there, and reports whether a
/// term with an identical prefix and a non-BMP character at that position exists.
/// The term bytes are restored before returning.
/// </summary>
private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int savLength = term.Length;

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(term.Offset == 0);
    }

    // The 3 bytes starting at downTo make up 1
    // unicode character:
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
    }

    // NOTE: we cannot make this assert, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee)
    // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

    // Save the bytes && length, since we need to
    // restore this if seek "back" finds no matching
    // terms
    if (term.Bytes.Length < 4 + pos)
    {
        term.Grow(4 + pos);
    }

    scratch[0] = (sbyte)term.Bytes[pos];
    scratch[1] = (sbyte)term.Bytes[pos + 1];
    scratch[2] = (sbyte)term.Bytes[pos + 2];

    // Overwrite with the UTF-8 bytes of U+10000 (first non-BMP code point).
    term.Bytes[pos] = 0xf0;
    term.Bytes[pos + 1] = 0x90;
    term.Bytes[pos + 2] = 0x80;
    term.Bytes[pos + 3] = 0x80;
    term.Length = 4 + pos;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    Term t2 = te.Term();

    // Cannot be null (or move to next field) because at
    // "worst" it'd seek to the same term we are on now,
    // unless we are being called from seek
    if (t2 is null || t2.Field != internedFieldName)
    {
        return (false);
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(t2.Text));
    }

    // Now test if prefix is identical and we found
    // a non-BMP char at the same position:
    BytesRef b2 = t2.Bytes;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(b2.Offset == 0);
    }

    bool matches;
    if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
    {
        matches = true;
        // Byte-wise prefix comparison up to (but excluding) pos.
        for (int i = 0; i < pos; i++)
        {
            if (term.Bytes[i] != b2.Bytes[i])
            {
                matches = false;
                break;
            }
        }
    }
    else
    {
        matches = false;
    }

    // Restore term:
    term.Length = savLength;
    term.Bytes[pos] = (byte)scratch[0];
    term.Bytes[pos + 1] = (byte)scratch[1];
    term.Bytes[pos + 2] = (byte)scratch[2];

    return (matches);
}
/// <summary>
/// Seeks <paramref name="enumerator"/> to <paramref name="term"/> and returns its
/// <see cref="TermInfo"/>, or null if the term does not exist. First tries a cheap
/// forward scan from the enumerator's current position; otherwise falls back to a
/// terms-index seek (by cached ordinal or binary search) followed by a scan.
/// </summary>
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
{
    if (Size_Renamed == 0)
    {
        return null;
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    if (enumerator.Term() != null && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0) || CompareAsUTF16(term, enumerator.Term()) >= 0)) // term is at or past current
    {
        // Index block the enumerator is currently inside of.
        int enumOffset = (int)(enumerator.Position / TotalIndexInterval) + 1;
        if (IndexLength == enumOffset || Index.CompareTo(term, enumOffset) < 0) // but before end of block
        {
            // no need to seek
            TermInfo ti;
            int numScans = enumerator.ScanTo(term);
            if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                ti = enumerator.TermInfo_Renamed;
                if (numScans > 1)
                {
                    // we only want to put this TermInfo into the cache if
                    // scanEnum skipped more than one dictionary entry.
                    // this prevents RangeQueries or WildcardQueries to
                    // wipe out the cache when they iterate over a large numbers
                    // of terms in order
                    if (tiOrd == null)
                    {
                        if (useCache)
                        {
                            TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti, enumerator.Position));
                        }
                    }
                    else
                    {
                        // Cached entry must agree with what the scan found.
                        Debug.Assert(SameTermInfo(ti, tiOrd, enumerator));
                        Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                    }
                }
            }
            else
            {
                // Term not found by the forward scan.
                ti = null;
            }
            return ti;
        }
    }

    // random-access: must seek
    int indexPos;
    if (tiOrd != null)
    {
        // A cached ordinal tells us exactly which index block holds the term.
        indexPos = (int)(tiOrd.TermOrd / TotalIndexInterval);
    }
    else
    {
        // Must do binary search:
        indexPos = Index.GetIndexOffset(term);
    }

    Index.SeekEnum(enumerator, indexPos);
    enumerator.ScanTo(term);
    TermInfo ti_;

    if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
    {
        ti_ = enumerator.TermInfo_Renamed;
        if (tiOrd == null)
        {
            if (useCache)
            {
                TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti_, enumerator.Position));
            }
        }
        else
        {
            // Cached entry must agree with what the seek+scan found.
            Debug.Assert(SameTermInfo(ti_, tiOrd, enumerator));
            Debug.Assert(enumerator.Position == tiOrd.TermOrd);
        }
    }
    else
    {
        // Term does not exist in this segment.
        ti_ = null;
    }
    return ti_;
}