/// <summary>
/// Loads the segment information at segment load time.
/// <para/>
/// Walks <paramref name="indexEnum"/> and serializes every
/// <paramref name="indexDivisor"/>-th entry (field ordinal, term text, doc freq,
/// optional skip offset, freq/prox pointers and index pointer) into a paged byte
/// buffer, recording the start offset of each serialized entry.
/// </summary>
/// <param name="indexEnum">
///          the term enum. </param>
/// <param name="indexDivisor">
///          the index divisor. </param>
/// <param name="tiiFileLength">
///          the size of the tii file, used to approximate the size of the
///          buffer. </param>
/// <param name="totalIndexInterval">
///          the total index interval. </param>
public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
{
    this.TotalIndexInterval = totalIndexInterval;
    IndexSize = 1 + ((int)indexEnum.Size - 1) / indexDivisor;
    SkipInterval = indexEnum.SkipInterval;

    // This is only an initial size; the build-time buffer can be collected
    // once construction is complete.
    long estimatedBytes = (long)(tiiFileLength * 1.5) / indexDivisor;
    PagedBytes pagedData = new PagedBytes(EstimatePageBits(estimatedBytes));
    PagedBytesDataOutput output = pagedData.DataOutput;

    int bitsPerOffset = 1 + MathUtil.Log(tiiFileLength, 2);
    GrowableWriter entryOffsets = new GrowableWriter(bitsPerOffset, IndexSize, PackedInts.DEFAULT);

    string previousField = null;
    IList<string> fieldNames = new List<string>();
    int fieldOrdinal = -1;

    int slot = 0;
    while (indexEnum.Next())
    {
        Term term = indexEnum.Term();

        // A change of field starts a new field ordinal.
        if (!(previousField != null && previousField.Equals(term.Field())))
        {
            previousField = term.Field();
            fieldNames.Add(previousField);
            fieldOrdinal++;
        }

        TermInfo info = indexEnum.TermInfo();

        // Remember where this entry starts, then serialize it.
        entryOffsets.Set(slot, output.Position);
        output.WriteVInt(fieldOrdinal);
        output.WriteString(term.Text());
        output.WriteVInt(info.DocFreq);
        if (info.DocFreq >= SkipInterval)
        {
            // The skip offset is only written when doc freq reaches the skip interval.
            output.WriteVInt(info.SkipOffset);
        }
        output.WriteVLong(info.FreqPointer);
        output.WriteVLong(info.ProxPointer);
        output.WriteVLong(indexEnum.IndexPointer);

        // Step over the entries that the divisor tells us to drop; stop early
        // if the enum runs out.
        int consumed = 1;
        while (consumed < indexDivisor && indexEnum.Next())
        {
            consumed++;
        }

        slot++;
    }

    // One Term per distinct field, in first-seen order.
    Fields = new Term[fieldNames.Count];
    int nextField = 0;
    foreach (string fieldName in fieldNames)
    {
        Fields[nextField++] = new Term(fieldName);
    }

    pagedData.Freeze(true);
    DataInput = pagedData.DataInput;
    IndexToDataOffset = entryOffsets.Mutable;

    RamBytesUsed_Renamed =
        Fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term)))
        + pagedData.RamBytesUsed()
        + IndexToDataOffset.RamBytesUsed();
}
/// <summary>
/// Seeks to the term that <paramref name="segmentTermEnum"/> is currently positioned on.
/// </summary>
/// <param name="segmentTermEnum">enumerator supplying the term (and, when possible, its term info).</param>
public virtual void Seek(SegmentTermEnum segmentTermEnum)
{
    // Comparing FieldInfos is how we verify that the enum belongs to the
    // same segment as this instance.
    bool sameSegment = segmentTermEnum.FieldInfos == FieldInfos;

    Term term = segmentTermEnum.Term();
    TermInfo ti = sameSegment
        ? segmentTermEnum.TermInfo() // optimized case: reuse the enum's own term info
        : Tis.Get(term);             // punt case: look the term up

    Seek(ti, term);
}
/// <summary>
/// Advances <paramref name="termEnum"/> by <c>index * TermIndexInterval * IndexDivisor</c>
/// terms and returns the term it then stands on. The enum is advanced once more
/// before returning; the test is skipped via <c>AssumeTrue</c> if the enum runs out.
/// </summary>
private Term FindTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index)
{
    int stepsNeeded = index * TermIndexInterval * IndexDivisor;
    for (int remaining = stepsNeeded; remaining > 0; remaining--)
    {
        // TODO: this test just uses random terms, so this is always possible
        AssumeTrue("ran out of terms", termEnum.Next());
    }

    Term found = termEnum.Term();

    // An indexed term is only written when the term after it exists. So if
    // the number of terms is 0 mod termIndexInterval, the last index term
    // will not be written; hence we require a term after this one as well.
    AssumeTrue("ran out of terms", termEnum.Next());

    return found;
}
// Swap in S, in place of E:
//
// Temporarily overwrites the bytes at `pos` in `term` with F0 90 80 80 (the
// UTF-8 encoding of U+10000), seeks `te` to that probe term, and reports
// whether the term landed on shares the first `pos` bytes and passes
// IsNonBMPChar at `pos`.
private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int originalLength = term.Length;

    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(term.Offset == 0);

        // The 3 bytes starting at pos make up 1 unicode character:
        Debugging.Assert(IsHighBMPChar(term.Bytes, pos));
    }

    // NOTE: we cannot assert term.Length >= pos + 3 here, because
    // AutomatonQuery legitimately sends us malformed UTF8
    // (eg the UTF8 bytes with just 0xee).

    // Make room for the 4-byte probe sequence.
    int probeLength = 4 + pos;
    if (term.Bytes.Length < probeLength)
    {
        term.Grow(probeLength);
    }

    // Save the bytes (and, above, the length): they must be put back if the
    // seek "back" finds no matching terms.
    for (int k = 0; k < 3; k++)
    {
        scratch[k] = (sbyte)term.Bytes[pos + k];
    }

    term.Bytes[pos] = 0xf0;
    term.Bytes[pos + 1] = 0x90;
    term.Bytes[pos + 2] = 0x80;
    term.Bytes[pos + 3] = 0x80;
    term.Length = probeLength;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    outerInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a surrogate pair at the
    // same position as the E:
    Term landed = te.Term();

    // Cannot be null (or move to next field) because at "worst" it'd seek to
    // the same term we are on now, unless we are being called from seek.
    // NOTE(review): this early exit returns without restoring `term` --
    // confirm that every caller tolerates the rewritten bytes/length.
    if (landed == null || landed.Field != internedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(landed.Text));
    }

    // Now test if the prefix is identical and we found a non-BMP char at the
    // same position:
    BytesRef landedBytes = landed.Bytes;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(landedBytes.Offset == 0);
    }

    bool matches = landedBytes.Length >= term.Length && IsNonBMPChar(landedBytes.Bytes, pos);
    for (int i = 0; matches && i < pos; i++)
    {
        matches = term.Bytes[i] == landedBytes.Bytes[i];
    }

    // Restore term:
    term.Length = originalLength;
    for (int k = 0; k < 3; k++)
    {
        term.Bytes[pos + k] = (byte)scratch[k];
    }

    return matches;
}
/// <summary>
/// Positions <paramref name="enumerator"/> on <paramref name="term"/> and returns its
/// <see cref="TermInfo"/>, or <c>null</c> when the dictionary is empty or the term is not found.
/// </summary>
/// <param name="enumerator">the enum to scan/seek.</param>
/// <param name="term">the term to look up.</param>
/// <param name="tiOrd">when non-null, its <c>TermOrd</c> picks the index position
/// directly (no binary search) and the result is only asserted against it, not cached.</param>
/// <param name="useCache">whether a hit may be stored in <c>TermsCache</c> (only when
/// <paramref name="tiOrd"/> is null).</param>
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
{
    if (Size_Renamed == 0)
    {
        return null;
    }

    // Optimize sequential access: first try scanning the cached enum w/o seeking.
    bool atOrPastCurrent =
        enumerator.Term() != null
        && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0)
            || CompareAsUTF16(term, enumerator.Term()) >= 0);

    if (atOrPastCurrent)
    {
        int nextBlock = (int)(enumerator.Position / TotalIndexInterval) + 1;

        // ... but before the end of the current block?
        bool beforeEndOfBlock = IndexLength == nextBlock || Index.CompareTo(term, nextBlock) < 0;
        if (beforeEndOfBlock)
        {
            // No need to seek.
            int scanned = enumerator.ScanTo(term);
            if (enumerator.Term() == null || CompareAsUTF16(term, enumerator.Term()) != 0)
            {
                return null;
            }

            TermInfo scannedInfo = enumerator.TermInfo_Renamed;

            // Only cache when the scan skipped more than one dictionary
            // entry. This keeps RangeQueries or WildcardQueries from wiping
            // out the cache while they iterate over many terms in order.
            if (scanned > 1)
            {
                if (tiOrd != null)
                {
                    Debug.Assert(SameTermInfo(scannedInfo, tiOrd, enumerator));
                    Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                }
                else if (useCache)
                {
                    TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(scannedInfo, enumerator.Position));
                }
            }

            return scannedInfo;
        }
    }

    // Random access: must seek. With a known ord the index position is
    // computed directly; otherwise a binary search is required.
    int indexPos = tiOrd != null
        ? (int)(tiOrd.TermOrd / TotalIndexInterval)
        : Index.GetIndexOffset(term);

    Index.SeekEnum(enumerator, indexPos);
    enumerator.ScanTo(term);

    if (enumerator.Term() == null || CompareAsUTF16(term, enumerator.Term()) != 0)
    {
        return null;
    }

    TermInfo soughtInfo = enumerator.TermInfo_Renamed;
    if (tiOrd != null)
    {
        Debug.Assert(SameTermInfo(soughtInfo, tiOrd, enumerator));
        Debug.Assert(enumerator.Position == tiOrd.TermOrd);
    }
    else if (useCache)
    {
        TermsCache.Put(new CloneableTerm(DeepCopyOf(term)), new TermInfoAndOrd(soughtInfo, enumerator.Position));
    }

    return soughtInfo;
}
/// <summary>
/// Puts the term <paramref name="enumerator"/> is currently on into <c>TermsCache</c>,
/// paired with the enumerator's current term info and position.
/// </summary>
/// <param name="enumerator">the enum whose current entry is cached.</param>
public void CacheCurrentTerm(SegmentTermEnum enumerator)
{
    var cacheKey = new CloneableTerm(enumerator.Term());
    var cacheValue = new TermInfoAndOrd(enumerator.TermInfo_Renamed, enumerator.Position);
    TermsCache.Put(cacheKey, cacheValue);
}
/// <summary>
/// Looks up <paramref name="term"/> using <paramref name="enumerator"/>: first by scanning
/// forward from the enum's current position when the term lies in the current index
/// block, otherwise by seeking through the term index. Returns the matching
/// <see cref="TermInfo"/>, or <c>null</c> if there are no terms or no exact match.
/// </summary>
/// <param name="enumerator">the enum to reposition.</param>
/// <param name="term">the term being sought.</param>
/// <param name="tiOrd">optional known term info/ord; when supplied, its <c>TermOrd</c>
/// determines the index position and the result is asserted against it.</param>
/// <param name="useCache">if true (and <paramref name="tiOrd"/> is null), a hit is put
/// into <c>TermsCache</c>.</param>
internal TermInfo SeekEnum(SegmentTermEnum enumerator, Term term, TermInfoAndOrd tiOrd, bool useCache)
{
    if (Size_Renamed == 0)
    {
        return null;
    }

    // Optimize sequential access: first try scanning the cached enum w/o seeking.
    // The outer test asks: is the term at or past the enum's current position?
    if (enumerator.Term() != null
        && ((enumerator.Prev() != null && CompareAsUTF16(term, enumerator.Prev()) > 0)
            || CompareAsUTF16(term, enumerator.Term()) >= 0))
    {
        int followingBlock = 1 + (int)(enumerator.Position / TotalIndexInterval);

        // The inner test asks: is it also before the end of the block?
        if (IndexLength == followingBlock || Index.CompareTo(term, followingBlock) < 0)
        {
            // No need to seek.
            int scanCount = enumerator.ScanTo(term);
            bool exactHit = enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0;
            if (!exactHit)
            {
                return null;
            }

            TermInfo result = enumerator.TermInfo_Renamed;

            // We only want to put this TermInfo into the cache if the scan
            // skipped more than one dictionary entry; this prevents
            // RangeQueries or WildcardQueries from wiping out the cache when
            // they iterate over a large number of terms in order.
            if (scanCount > 1)
            {
                if (tiOrd == null)
                {
                    if (useCache)
                    {
                        var key = new CloneableTerm(DeepCopyOf(term));
                        TermsCache.Put(key, new TermInfoAndOrd(result, enumerator.Position));
                    }
                }
                else
                {
                    Debug.Assert(SameTermInfo(result, tiOrd, enumerator));
                    Debug.Assert(enumerator.Position == tiOrd.TermOrd);
                }
            }

            return result;
        }
    }

    // Random access: must seek.
    int indexPos;
    if (tiOrd == null)
    {
        // Must do binary search:
        indexPos = Index.GetIndexOffset(term);
    }
    else
    {
        indexPos = (int)(tiOrd.TermOrd / TotalIndexInterval);
    }

    Index.SeekEnum(enumerator, indexPos);
    enumerator.ScanTo(term);

    bool found = enumerator.Term() != null && CompareAsUTF16(term, enumerator.Term()) == 0;
    if (!found)
    {
        return null;
    }

    TermInfo seekResult = enumerator.TermInfo_Renamed;
    if (tiOrd == null)
    {
        if (useCache)
        {
            var key = new CloneableTerm(DeepCopyOf(term));
            TermsCache.Put(key, new TermInfoAndOrd(seekResult, enumerator.Position));
        }
    }
    else
    {
        Debug.Assert(SameTermInfo(seekResult, tiOrd, enumerator));
        Debug.Assert(enumerator.Position == tiOrd.TermOrd);
    }

    return seekResult;
}
// Swap in S, in place of E:
//
// Replaces the bytes at `pos` in `term` with F0 90 80 80 (UTF-8 for U+10000),
// seeks `te` to that probe, and returns true when the term found has the same
// first `pos` bytes and IsNonBMPChar holds at `pos`.
internal virtual bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
{
    int lengthBefore = term.Length;

    Debug.Assert(term.Offset == 0);

    // The 3 bytes starting at pos make up 1 unicode character:
    Debug.Assert(IsHighBMPChar(term.Bytes, pos));

    // NOTE: we cannot assert term.Length >= pos + 3, because AutomatonQuery
    // legitimately sends us malformed UTF8 (eg the UTF8 bytes with just 0xee).

    // Ensure there is room for the 4-byte replacement.
    int neededLength = 4 + pos;
    if (term.Bytes.Length < neededLength)
    {
        term.Grow(neededLength);
    }

    // Save the bytes (the length was saved above): both must be restored if
    // the seek "back" finds no matching terms.
    Scratch[0] = (sbyte)term.Bytes[pos];
    Scratch[1] = (sbyte)term.Bytes[pos + 1];
    Scratch[2] = (sbyte)term.Bytes[pos + 2];

    // Write the probe sequence.
    int writeAt = pos;
    term.Bytes[writeAt++] = (byte)0xf0;
    term.Bytes[writeAt++] = (byte)0x90;
    term.Bytes[writeAt++] = (byte)0x80;
    term.Bytes[writeAt] = (byte)0x80;
    term.Length = neededLength;

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
    }

    // Seek "back":
    OuterInstance.TermsDict.SeekEnum(te, new Term(fieldInfo.Name, term), true);

    // Test if the term we seek'd to in fact found a surrogate pair at the
    // same position as the E:
    Term candidate = te.Term();

    // Cannot be null (or move to next field) because at "worst" it'd seek to
    // the same term we are on now, unless we are being called from seek.
    // NOTE(review): `term` is not restored on this early exit -- confirm
    // callers do not rely on its previous contents afterwards.
    if (candidate == null || candidate.Field() != InternedFieldName)
    {
        return false;
    }

    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" got term=" + UnicodeUtil.ToHexString(candidate.Text()));
    }

    // Now test if the prefix is identical and we found a non-BMP char at the
    // same position:
    BytesRef candidateBytes = candidate.Bytes();
    Debug.Assert(candidateBytes.Offset == 0);

    bool matches = false;
    if (candidateBytes.Length >= term.Length && IsNonBMPChar(candidateBytes.Bytes, pos))
    {
        int same = 0;
        while (same < pos && term.Bytes[same] == candidateBytes.Bytes[same])
        {
            same++;
        }
        matches = same >= pos;
    }

    // Restore term:
    term.Length = lengthBefore;
    term.Bytes[pos] = (byte)Scratch[0];
    term.Bytes[pos + 1] = (byte)Scratch[1];
    term.Bytes[pos + 2] = (byte)Scratch[2];

    return matches;
}
// Seek type 2 "continue" (back to the start of the surrogates): scan the
// stripped suffix from the prior term, backwards. If there was an E in that
// part, then we try to seek back to S. If that seek finds a matching term,
// we go there.
//
// Returns true when such a seek happened, false otherwise.
private bool DoContinue()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine(" try cont");
    }

    int limit = Math.Min(newSuffixStart, scratchTerm.Length - 1);

    for (int downTo = prevTerm.Length - 1; downTo > limit; downTo--)
    {
        if (IsHighBMPChar(prevTerm.Bytes, downTo))
        {
            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" found E pos=" + downTo + " vs len=" + prevTerm.Length);
            }

            if (SeekToNonBMP(seekTermEnum, prevTerm, downTo))
            {
                // TODO: more efficient seek?
                outerInstance.TermsDict.SeekEnum(termEnum, seekTermEnum.Term(), true);
                newSuffixStart = downTo;
                scratchTerm.CopyBytes(termEnum.Term().Bytes);

                if (DEBUG_SURROGATES)
                {
                    Console.WriteLine(" seek!");
                }

                return true;
            }

            if (DEBUG_SURROGATES)
            {
                Console.WriteLine(" no seek");
            }
        }

        // Shorten prevTerm in place so that we don't redo this loop if we
        // come back here. Only done when the byte at downTo has its top two
        // bits set or its top bit clear.
        int current = prevTerm.Bytes[downTo];
        bool topTwoBitsSet = (current & 0xc0) == 0xc0;
        if (topTwoBitsSet || (current & 0x80) == 0)
        {
            prevTerm.Length = downTo;
        }
    }

    return false;
}