/// <summary>
/// Positions <paramref name="enumerator"/> at the index entry identified by
/// <paramref name="indexOffset"/> by decoding the packed term + TermInfo
/// stored in the paged-bytes index data.
/// </summary>
internal virtual void SeekEnum(SegmentTermEnum enumerator, int indexOffset)
{
    // Work on a clone so concurrent callers don't fight over the shared input's position.
    var reader = (PagedBytesDataInput)dataInput.Clone();
    reader.SetPosition(indexToDataOffset.Get(indexOffset));

    // Decode the term: a field ordinal followed by the term's text.
    Term fieldTerm = fields[reader.ReadVInt32()];
    var term = new Term(fieldTerm.Field, reader.ReadString());

    // Decode the TermInfo. NOTE: the read order must mirror the writer:
    // docFreq, optional skipOffset (only present when docFreq >= skipInterval),
    // freqPointer, proxPointer.
    var info = new TermInfo();
    info.DocFreq = reader.ReadVInt32();
    info.SkipOffset = info.DocFreq >= skipInterval ? reader.ReadVInt32() : 0;
    info.FreqPointer = reader.ReadVInt64();
    info.ProxPointer = reader.ReadVInt64();

    // The trailing vlong is the raw position in the main terms dictionary.
    long indexPointer = reader.ReadVInt64();

    // perform the seek
    enumerator.Seek(indexPointer, ((long)indexOffset * totalIndexInterval) - 1, term, info);
}
// Re-points this enumerator at the start of the given field's terms,
// lazily creating the underlying SegmentTermEnums on first use.
internal virtual void Reset(FieldInfo fieldInfo)
{
    //System.out.println("pff.reset te=" + termEnum);
    this.fieldInfo = fieldInfo;
    // Interned so the field name can be compared by reference below.
    internedFieldName = fieldInfo.Name.Intern();
    Term term = new Term(internedFieldName);
    if (termEnum == null)
    {
        // First use: create the primary enum plus the scratch enum used
        // for surrogate-reordering seeks.
        termEnum = outerInstance.TermsDict.Terms(term);
        seekTermEnum = outerInstance.TermsDict.Terms(term);
        //System.out.println("  term=" + termEnum.term());
    }
    else
    {
        outerInstance.TermsDict.SeekEnum(termEnum, term, true);
    }
    // The next Next() call must not advance past the term we just seeked to.
    skipNext = true;

    unicodeSortOrder = outerInstance.SortTermsByUnicode;

    Term t = termEnum.Term();
    // Reference equality is safe: both strings are interned.
    if (t != null && t.Field == internedFieldName)
    {
        newSuffixStart = 0;
        prevTerm.Length = 0;
        // Re-order from legacy UTF-16 sort order into UTF-8 order if required.
        SurrogateDance();
    }
}
/// <summary>
/// Re-seeds this positions enum from <paramref name="termEnum"/>'s current
/// term, applying the given live-docs filter, and rewinds the doc cursor.
/// </summary>
public DocsAndPositionsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
{
    docID = -1;
    pos.LiveDocs = liveDocs;
    pos.Seek(termEnum);
    return this;
}
/// <summary>
/// Returns the position of a <see cref="Term"/> in the set or -1. </summary>
internal long GetPosition(Term term)
{
    // Empty dictionary: nothing can match.
    if (size == 0)
    {
        return -1;
    }

    EnsureIndexIsRead();
    SegmentTermEnum enumerator = GetThreadResources().termEnum;
    // Jump to the index block that may contain the term, then scan forward.
    index.SeekEnum(enumerator, index.GetIndexOffset(term));

    while (CompareAsUTF16(term, enumerator.Term()) > 0 && enumerator.Next())
    {
        // advancing until we reach or pass the requested term
    }

    // Exact hit yields its ordinal position; otherwise the term is absent.
    return CompareAsUTF16(term, enumerator.Term()) == 0 ? enumerator.position : -1;
}
/// <summary>
/// Loads the segment information at segment load time.
/// </summary>
/// <param name="indexEnum">
/// The term enum. </param>
/// <param name="indexDivisor">
/// The index divisor. </param>
/// <param name="tiiFileLength">
/// The size of the tii file, used to approximate the size of the
/// buffer. </param>
/// <param name="totalIndexInterval">
/// The total index interval. </param>
public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
{
    this.totalIndexInterval = totalIndexInterval;
    // Number of entries kept after applying the divisor (ceiling division).
    indexSize = 1 + ((int)indexEnum.size - 1) / indexDivisor;
    skipInterval = indexEnum.skipInterval;
    // this is only an initial size, it will be GCed once the build is complete
    long initialSize = (long)(tiiFileLength * 1.5) / indexDivisor;
    PagedBytes dataPagedBytes = new PagedBytes(EstimatePageBits(initialSize));
    PagedBytesDataOutput dataOutput = dataPagedBytes.GetDataOutput();

    int bitEstimate = 1 + MathUtil.Log(tiiFileLength, 2);
    GrowableWriter indexToTerms = new GrowableWriter(bitEstimate, indexSize, PackedInt32s.DEFAULT);

    string currentField = null;
    IList<string> fieldStrs = new List<string>();
    int fieldCounter = -1;
    for (int i = 0; indexEnum.Next(); i++)
    {
        Term term = indexEnum.Term();
        // New field encountered: record its name once and bump the ordinal
        // that entries below reference.
        if (currentField == null || !currentField.Equals(term.Field, StringComparison.Ordinal))
        {
            currentField = term.Field;
            fieldStrs.Add(currentField);
            fieldCounter++;
        }
        TermInfo termInfo = indexEnum.TermInfo();
        // Remember where this entry begins in the packed byte stream.
        indexToTerms.Set(i, dataOutput.GetPosition());
        // Entry layout (must match SeekEnum's reads): field ordinal, term text,
        // docFreq, optional skipOffset, freqPointer, proxPointer, indexPointer.
        dataOutput.WriteVInt32(fieldCounter);
        dataOutput.WriteString(term.Text());
        dataOutput.WriteVInt32(termInfo.DocFreq);
        if (termInfo.DocFreq >= skipInterval)
        {
            dataOutput.WriteVInt32(termInfo.SkipOffset);
        }
        dataOutput.WriteVInt64(termInfo.FreqPointer);
        dataOutput.WriteVInt64(termInfo.ProxPointer);
        dataOutput.WriteVInt64(indexEnum.indexPointer);
        // Skip (indexDivisor - 1) terms so only every Nth term is indexed.
        for (int j = 1; j < indexDivisor; j++)
        {
            if (!indexEnum.Next())
            {
                break;
            }
        }
    }

    fields = new Term[fieldStrs.Count];
    for (int i = 0; i < fields.Length; i++)
    {
        fields[i] = new Term(fieldStrs[i]);
    }

    dataPagedBytes.Freeze(true);
    dataInput = dataPagedBytes.GetDataInput();
    indexToDataOffset = indexToTerms.Mutable;

    // Accounted RAM: fields array + frozen packed bytes + offsets writer.
    ramBytesUsed = fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term))) + dataPagedBytes.RamBytesUsed() + indexToDataOffset.RamBytesUsed();
}
/// <summary>
/// Re-seeds this docs enum from <paramref name="termEnum"/>'s current term,
/// applying the given live-docs filter, and rewinds the doc cursor.
/// </summary>
public PreDocsEnum Reset(SegmentTermEnum termEnum, IBits liveDocs)
{
    docID = -1;
    docs.LiveDocs = liveDocs;
    docs.Seek(termEnum);
    // Docs-only enum: every document reports a frequency of 1.
    docs.freq = 1;
    return this;
}
// Opens the segment's terms dictionary (.tis) and, unless indexDivisor == -1,
// loads the terms index (.tii) used to accelerate seeks.
internal TermInfosReader(Directory dir, string seg, FieldInfos fis, IOContext context, int indexDivisor)
{
    bool success = false;

    if (indexDivisor < 1 && indexDivisor != -1)
    {
        throw new ArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try
    {
        directory = dir;
        segment = seg;
        fieldInfos = fis;

        origEnum = new SegmentTermEnum(directory.OpenInput(IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_EXTENSION), context), fieldInfos, false);
        size = origEnum.size;

        if (indexDivisor != -1)
        {
            // Load terms index
            totalIndexInterval = origEnum.indexInterval * indexDivisor;

            string indexFileName = IndexFileNames.SegmentFileName(segment, "", Lucene3xPostingsFormat.TERMS_INDEX_EXTENSION);
            SegmentTermEnum indexEnum = new SegmentTermEnum(directory.OpenInput(indexFileName, context), fieldInfos, true);
            try
            {
                index = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.FileLength(indexFileName), totalIndexInterval);
                indexLength = index.Length;
            }
            finally
            {
                // The index enum is fully consumed building the in-memory
                // index above, so it can be closed immediately.
                indexEnum.Dispose();
            }
        }
        else
        {
            // Do not load terms index:
            totalIndexInterval = -1;
            index = null;
            indexLength = -1;
        }
        success = true;
    }
    finally
    {
        // With lock-less commits, it's entirely possible (and
        // fine) to hit a FileNotFound exception above. In
        // this case, we want to explicitly close any subset
        // of things that were opened so that we don't have to
        // wait for a GC to do so.
        if (!success)
        {
            Dispose();
        }
    }
}
// Seeks the enumerator to the given term, first consulting the terms cache
// for a previously-resolved (TermInfo, ordinal) pair when caching is enabled.
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, bool useCache)
{
    // Only probe the cache (which requires a defensive deep copy of the
    // term as the key) when the caller asked for cached lookups.
    TermInfoAndOrd cached = useCache
        ? termsCache.Get(new CloneableTerm(DeepCopyOf(term)))
        : null;
    return SeekEnum(enumerator, term, cached, useCache);
}
/// <summary>
/// Creates an independent copy of this enum: the shallow copy is patched so
/// that the input stream and all term buffers are themselves cloned, letting
/// the copy advance without disturbing this instance.
/// </summary>
public object Clone()
{
    // LUCENENET: MemberwiseClone() doesn't throw in .NET
    var copy = (SegmentTermEnum)base.MemberwiseClone();

    // Replace every mutable component with its own deep copy.
    copy.input = (IndexInput)input.Clone();
    copy.termInfo = new TermInfo(termInfo);
    copy.termBuffer = (TermBuffer)termBuffer.Clone();
    copy.prevBuffer = (TermBuffer)prevBuffer.Clone();
    copy.scanBuffer = new TermBuffer();

    return copy;
}
/// <summary>
/// Seeks this postings reader to the enumerator's current term.
/// </summary>
public virtual void Seek(SegmentTermEnum segmentTermEnum)
{
    Term term = segmentTermEnum.Term();

    // use comparison of fieldinfos to verify that termEnum belongs to the
    // same segment as this SegmentTermDocs: if it does, its cached TermInfo
    // is valid here (optimized case); otherwise punt and look the term up
    // in our own terms dictionary.
    TermInfo ti = segmentTermEnum.fieldInfos == fieldInfos
        ? segmentTermEnum.TermInfo()
        : tis.Get(term);

    Seek(ti, term);
}
// called only from asserts: verifies two TermInfos describe the same term.
private static bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) // LUCENENET: CA1822: Mark members as static
{
    if (ti1.DocFreq != ti2.DocFreq
        || ti1.FreqPointer != ti2.FreqPointer
        || ti1.ProxPointer != ti2.ProxPointer)
    {
        return false;
    }

    // skipOffset is only valid when docFreq >= skipInterval:
    bool skipOffsetValid = ti1.DocFreq >= enumerator.skipInterval;
    return !skipOffsetValid || ti1.SkipOffset == ti2.SkipOffset;
}
// called only from asserts: verifies two TermInfos describe the same term.
// LUCENENET: CA1822 - marked static because the method reads no instance
// state; this also matches the annotated static overload elsewhere in the
// codebase. (Private method; in-file callers invoke it without `this.`.)
private static bool SameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator)
{
    if (ti1.DocFreq != ti2.DocFreq)
    {
        return false;
    }
    if (ti1.FreqPointer != ti2.FreqPointer)
    {
        return false;
    }
    if (ti1.ProxPointer != ti2.ProxPointer)
    {
        return false;
    }
    // skipOffset is only valid when docFreq >= skipInterval:
    if (ti1.DocFreq >= enumerator.skipInterval && ti1.SkipOffset != ti2.SkipOffset)
    {
        return false;
    }
    return true;
}
/// <summary>
/// Creates an independent copy of this enum: the shallow copy is patched so
/// that the input stream and all term buffers are themselves cloned.
/// </summary>
public object Clone()
{
    // LUCENENET: MemberwiseClone() doesn't throw in .NET, so the old
    // try/catch is gone. It silently swallowed InvalidOperationException
    // into an empty catch and would then have dereferenced a null clone.
    SegmentTermEnum clone = (SegmentTermEnum)base.MemberwiseClone();

    // Deep-copy the mutable pieces so the clone can advance independently.
    clone.input = (IndexInput)input.Clone();
    clone.termInfo = new TermInfo(termInfo);
    clone.termBuffer = (TermBuffer)termBuffer.Clone();
    clone.prevBuffer = (TermBuffer)prevBuffer.Clone();
    clone.scanBuffer = new TermBuffer();

    return clone;
}
// Seeks the enumerator to the given term, returning its TermInfo or null if
// the term does not exist. Tries a cheap forward scan from the enum's current
// position before falling back to an index seek; successful lookups may be
// added to the terms cache.
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
{
    if (size == 0)
    {
        return null;
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    if (enumerator.Term() != null && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0) || CompareAsUTF16(term, enumerator.Term()) >= 0)) // term is at or past current
    {
        int enumOffset = (int)(enumerator.position / totalIndexInterval) + 1;
        if (indexLength == enumOffset || index.CompareTo(term, enumOffset) < 0) // but before end of block
        {
            // no need to seek
            TermInfo ti;
            int numScans = enumerator.ScanTo(term);
            if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
            {
                ti = enumerator.termInfo;
                if (numScans > 1)
                {
                    // we only want to put this TermInfo into the cache if
                    // scanEnum skipped more than one dictionary entry.
                    // this prevents RangeQueries or WildcardQueries to
                    // wipe out the cache when they iterate over a large numbers
                    // of terms in order
                    if (tiOrd == null)
                    {
                        if (useCache)
                        {
                            termsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti, enumerator.position));
                        }
                    }
                    else if (Debugging.AssertsEnabled)
                    {
                        // Caller supplied a cached hint; verify it agrees with what we found.
                        Debugging.Assert(SameTermInfo(ti, tiOrd, enumerator));
                        Debugging.Assert((int)enumerator.position == tiOrd.termOrd);
                    }
                }
            }
            else
            {
                // Scan stopped past the target: term is not in the dictionary.
                ti = null;
            }
            return ti;
        }
    }

    // random-access: must seek
    int indexPos;
    if (tiOrd != null)
    {
        // The cached ordinal pins the index block directly - no search needed.
        indexPos = (int)(tiOrd.termOrd / totalIndexInterval);
    }
    else
    {
        // Must do binary search:
        indexPos = index.GetIndexOffset(term);
    }

    index.SeekEnum(enumerator, indexPos);
    enumerator.ScanTo(term);
    TermInfo ti_;

    if (enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0)
    {
        ti_ = enumerator.termInfo;
        if (tiOrd == null)
        {
            if (useCache)
            {
                termsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(ti_, enumerator.position));
            }
        }
        else if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(SameTermInfo(ti_, tiOrd, enumerator));
            Debugging.Assert(enumerator.position == tiOrd.termOrd);
        }
    }
    else
    {
        ti_ = null;
    }
    return ti_;
}
/// <summary>
/// Stores the enumerator's current term and its (TermInfo, ordinal) pair in
/// the terms cache so a later seek for the same term can skip the scan.
/// </summary>
public void CacheCurrentTerm(SegmentTermEnum enumerator)
{
    var key = new CloneableTerm(enumerator.Term());
    var value = new TermInfoAndOrd(enumerator.termInfo, enumerator.position);
    termsCache.Put(key, value);
}
// Swap in S, in place of E:
// Temporarily rewrites the 3-byte high-BMP char at `pos` as the smallest
// non-BMP UTF-8 sequence, seeks, and reports whether a term with the same
// prefix and a non-BMP char at that position exists. The term bytes are
// restored before returning.
private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int savLength = term.Length;

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(term.Offset == 0);
    }

    // The 3 bytes starting at downTo make up 1
    // unicode character:
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
    }

    // NOTE: we cannot make this assert, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee)
    // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

    // Save the bytes && length, since we need to
    // restore this if seek "back" finds no matching
    // terms
    if (term.Bytes.Length < 4 + pos)
    {
        term.Grow(4 + pos);
    }

    scratch[0] = (sbyte)term.Bytes[pos];
    scratch[1] = (sbyte)term.Bytes[pos + 1];
    scratch[2] = (sbyte)term.Bytes[pos + 2];

    // Overwrite with F0 90 80 80 = U+10000, the smallest non-BMP code point,
    // so the seek lands at the first term with a surrogate pair at `pos`.
    term.Bytes[pos] = 0xf0;
    term.Bytes[pos + 1] = 0x90;
    term.Bytes[pos + 2] = 0x80;
    term.Bytes[pos + 3] = 0x80;
    term.Length = 4 + pos;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a
    // surrogate pair at the same position as the E:
    Term t2 = te.Term();

    // Cannot be null (or move to next field) because at
    // "worst" it'd seek to the same term we are on now,
    // unless we are being called from seek
    if (t2 == null || t2.Field != internedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("      got term=" + UnicodeUtil.ToHexString(t2.Text()));
    }

    // Now test if prefix is identical and we found
    // a non-BMP char at the same position:
    BytesRef b2 = t2.Bytes;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(b2.Offset == 0);
    }

    bool matches;
    if (b2.Length >= term.Length && IsNonBMPChar(b2.Bytes, pos))
    {
        matches = true;
        // Bytes before `pos` must match exactly for this to be the same prefix.
        for (int i = 0; i < pos; i++)
        {
            if (term.Bytes[i] != b2.Bytes[i])
            {
                matches = false;
                break;
            }
        }
    }
    else
    {
        matches = false;
    }

    // Restore term:
    term.Length = savLength;
    term.Bytes[pos] = (byte)scratch[0];
    term.Bytes[pos + 1] = (byte)scratch[1];
    term.Bytes[pos + 2] = (byte)scratch[2];

    return matches;
}