/// <summary>
/// Builds the in-memory term index for a segment by draining the supplied term enumerator.
/// </summary>
/// <param name="indexEnum">
/// Enumerator over the index terms; it is advanced until it reports no further terms. </param>
/// <param name="indexDivisor">
/// Sub-sampling factor: one term is recorded, then up to <c>indexDivisor - 1</c> terms are skipped. </param>
/// <param name="tiiFileLength">
/// Size of the tii file; used to estimate the initial buffer size and the
/// bit width of the stored offsets. </param>
/// <param name="totalIndexInterval">
/// The total index interval, stored on this instance. </param>
public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
{
    this.totalIndexInterval = totalIndexInterval;
    indexSize = 1 + ((int)indexEnum.size - 1) / indexDivisor;
    skipInterval = indexEnum.skipInterval;

    // This is only an initial size; it will be GCed once the build is complete.
    long estimatedBytes = (long)(tiiFileLength * 1.5) / indexDivisor;
    PagedBytes pagedData = new PagedBytes(EstimatePageBits(estimatedBytes));
    PagedBytesDataOutput writer = pagedData.GetDataOutput();

    int offsetBits = 1 + MathUtil.Log(tiiFileLength, 2);
    GrowableWriter offsets = new GrowableWriter(offsetBits, indexSize, PackedInt32s.DEFAULT);

    var fieldNames = new List<string>();
    string activeField = null;
    int fieldOrdinal = -1;
    int slot = 0;

    while (indexEnum.Next())
    {
        Term indexedTerm = indexEnum.Term();

        // A change of field name opens a new entry in the field table.
        bool sameField = activeField != null
            && activeField.Equals(indexedTerm.Field, StringComparison.Ordinal);
        if (!sameField)
        {
            activeField = indexedTerm.Field;
            fieldNames.Add(activeField);
            fieldOrdinal++;
        }

        TermInfo info = indexEnum.TermInfo();

        // Record where this entry starts, then serialize the entry itself.
        offsets.Set(slot, writer.GetPosition());
        writer.WriteVInt32(fieldOrdinal);
        writer.WriteString(indexedTerm.Text());
        writer.WriteVInt32(info.DocFreq);
        if (skipInterval <= info.DocFreq)
        {
            // The skip offset is only written when docFreq reaches the skip interval.
            writer.WriteVInt32(info.SkipOffset);
        }
        writer.WriteVInt64(info.FreqPointer);
        writer.WriteVInt64(info.ProxPointer);
        writer.WriteVInt64(indexEnum.indexPointer);

        // Step over the terms that the divisor tells us not to record.
        int consumed = 1;
        while (consumed < indexDivisor && indexEnum.Next())
        {
            consumed++;
        }

        slot++;
    }

    // Turn the collected field names into field-only Term instances.
    int fieldCount = fieldNames.Count;
    fields = new Term[fieldCount];
    for (int f = 0; f < fieldCount; f++)
    {
        fields[f] = new Term(fieldNames[f]);
    }

    pagedData.Freeze(true);
    dataInput = pagedData.GetDataInput();
    indexToDataOffset = offsets.Mutable;

    ramBytesUsed =
        fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term)))
        + pagedData.RamBytesUsed()
        + indexToDataOffset.RamBytesUsed();
}
/// <summary>
/// Looks up <paramref name="term"/> and returns the enumerator position at which it was
/// found, or -1 when the set is empty or the term is not present.
/// </summary>
/// <param name="term"> The <see cref="Term"/> to locate. </param>
/// <returns> The position of the matching term, or -1. </returns>
internal long GetPosition(Term term)
{
    if (size == 0)
    {
        return -1;
    }

    EnsureIndexIsRead();

    // Jump to the index entry for the term, then scan forward from there.
    int startOffset = index.GetIndexOffset(term);
    SegmentTermEnum scanner = GetThreadResources().termEnum;
    index.SeekEnum(scanner, startOffset);

    while (CompareAsUTF16(term, scanner.Term()) > 0)
    {
        if (!scanner.Next())
        {
            break;
        }
    }

    return CompareAsUTF16(term, scanner.Term()) == 0 ? scanner.position : -1;
}
/// <summary>
/// Builds the in-memory term index for a segment by draining the supplied term enumerator.
/// </summary>
/// <param name="indexEnum">
/// Enumerator over the index terms; it is advanced until it reports no further terms. </param>
/// <param name="indexDivisor">
/// Sub-sampling factor: one term is recorded, then up to <c>indexDivisor - 1</c> terms are skipped. </param>
/// <param name="tiiFileLength">
/// Size of the tii file; used to estimate the initial buffer size and the
/// bit width of the stored offsets. </param>
/// <param name="totalIndexInterval">
/// The total index interval, stored on this instance. </param>
public TermInfosReaderIndex(SegmentTermEnum indexEnum, int indexDivisor, long tiiFileLength, int totalIndexInterval)
{
    this.TotalIndexInterval = totalIndexInterval;
    IndexSize = 1 + ((int)indexEnum.Size - 1) / indexDivisor;
    SkipInterval = indexEnum.SkipInterval;

    // This is only an initial size; it will be GCed once the build is complete.
    long estimatedBytes = (long)(tiiFileLength * 1.5) / indexDivisor;
    PagedBytes pagedData = new PagedBytes(EstimatePageBits(estimatedBytes));
    PagedBytesDataOutput writer = pagedData.DataOutput;

    int offsetBits = 1 + MathUtil.Log(tiiFileLength, 2);
    GrowableWriter offsets = new GrowableWriter(offsetBits, IndexSize, PackedInts.DEFAULT);

    var fieldNames = new List<string>();
    string activeField = null;
    int fieldOrdinal = -1;
    int slot = 0;

    while (indexEnum.Next())
    {
        Term indexedTerm = indexEnum.Term();

        // A change of field name opens a new entry in the field table.
        bool sameField = activeField != null && activeField.Equals(indexedTerm.Field());
        if (!sameField)
        {
            activeField = indexedTerm.Field();
            fieldNames.Add(activeField);
            fieldOrdinal++;
        }

        TermInfo info = indexEnum.TermInfo();

        // Record where this entry starts, then serialize the entry itself.
        offsets.Set(slot, writer.Position);
        writer.WriteVInt(fieldOrdinal);
        writer.WriteString(indexedTerm.Text());
        writer.WriteVInt(info.DocFreq);
        if (SkipInterval <= info.DocFreq)
        {
            // The skip offset is only written when docFreq reaches the skip interval.
            writer.WriteVInt(info.SkipOffset);
        }
        writer.WriteVLong(info.FreqPointer);
        writer.WriteVLong(info.ProxPointer);
        writer.WriteVLong(indexEnum.IndexPointer);

        // Step over the terms that the divisor tells us not to record.
        int consumed = 1;
        while (consumed < indexDivisor && indexEnum.Next())
        {
            consumed++;
        }

        slot++;
    }

    // Turn the collected field names into field-only Term instances.
    int fieldCount = fieldNames.Count;
    Fields = new Term[fieldCount];
    for (int f = 0; f < fieldCount; f++)
    {
        Fields[f] = new Term(fieldNames[f]);
    }

    pagedData.Freeze(true);
    DataInput = pagedData.DataInput;
    IndexToDataOffset = offsets.Mutable;

    RamBytesUsed_Renamed =
        Fields.Length * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.ShallowSizeOfInstance(typeof(Term)))
        + pagedData.RamBytesUsed()
        + IndexToDataOffset.RamBytesUsed();
}
/// <summary>
/// Advances <paramref name="termEnum"/> by <c>index * TermIndexInterval * IndexDivisor</c>
/// terms and returns the term it then rests on. The surrounding test is skipped via
/// <c>AssumeTrue</c> if the enumerator runs out of terms.
/// </summary>
private Term FindTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index)
{
    int remaining = index * TermIndexInterval * IndexDivisor;
    while (remaining > 0)
    {
        // TODO: this test just uses random terms, so this is always possible
        AssumeTrue("ran out of terms", termEnum.Next());
        remaining--;
    }

    Term found = termEnum.Term();

    // An indexed term is only written when the term following it exists. If the
    // number of terms is 0 mod termIndexInterval, the last index term is never
    // written, so a further term is required after the one we return.
    AssumeTrue("ran out of terms", termEnum.Next());

    return found;
}
/// <summary>
/// Moves to the next term of the current field and returns its bytes.
/// </summary>
/// <returns>
/// The bytes of the term the underlying enumerator ends up on, or <c>null</c> when that
/// enumerator has no term or its term belongs to a different field.
/// </returns>
public override BytesRef Next()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.next()");
    }

    if (skipNext)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" skipNext=true");
        }
        skipNext = false;

        // PreFlex codec interns field names:
        if (termEnum.Term() == null || termEnum.Term().Field != internedFieldName)
        {
            return null;
        }

        current = termEnum.Term().Bytes;
        return current;
    }

    // TODO: can we use STE's prevBuffer here?
    prevTerm.CopyBytes(termEnum.Term().Bytes);

    bool advancedInField = termEnum.Next() && termEnum.Term().Field == internedFieldName;
    if (advancedInField)
    {
        newSuffixStart = termEnum.newSuffixStart;
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" newSuffixStart=" + newSuffixStart);
        }
    }
    else
    {
        // This field is exhausted, but SurrogateDance still has to get a chance
        // to seek back.
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" force cont");
        }
        newSuffixStart = 0;
    }

    SurrogateDance();

    Term t = termEnum.Term();
    if (t != null && t.Field == internedFieldName)
    {
        current = t.Bytes;
        return current;
    }

    // PreFlex codec interns field names; verify:
    Debug.Assert(t == null || !t.Field.Equals(internedFieldName, StringComparison.Ordinal));

    // Only the "advanced within the field" path clears the cached term; the
    // exhausted-field path leaves it untouched.
    if (advancedInField)
    {
        current = null;
    }
    return null;
}
/// <summary>
/// Steps <paramref name="termEnum"/> forward <c>index * TermIndexInterval * IndexDivisor</c>
/// times and returns the term reached. <c>AssumeTrue</c> is invoked on every advance, so the
/// surrounding test is skipped when there are not enough terms.
/// </summary>
private Term FindTermThatWouldBeAtIndex(SegmentTermEnum termEnum, int index)
{
    int stepsNeeded = index * TermIndexInterval * IndexDivisor;
    int stepsTaken = 0;
    while (stepsTaken < stepsNeeded)
    {
        // TODO: this test just uses random terms, so this is always possible
        AssumeTrue("ran out of terms", termEnum.Next());
        stepsTaken++;
    }

    Term candidate = termEnum.Term();

    // An index term only gets written if a term exists after it; when the term
    // count is 0 mod termIndexInterval the final index term is not written.
    // Hence one more term must follow the candidate.
    AssumeTrue("ran out of terms", termEnum.Next());

    return candidate;
}
/// <summary>
/// Moves to the next term of the current field and returns its bytes.
/// </summary>
/// <returns>
/// The bytes of the term the underlying enumerator ends up on, or <c>null</c> when that
/// enumerator has no term or its term belongs to a different field.
/// </returns>
public override BytesRef Next()
{
    if (DEBUG_SURROGATES)
    {
        Console.WriteLine("TE.next()");
    }

    if (SkipNext)
    {
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" skipNext=true");
        }
        SkipNext = false;

        // PreFlex codec interns field names:
        if (TermEnum.Term() == null || TermEnum.Term().Field() != InternedFieldName)
        {
            return null;
        }

        Current = TermEnum.Term().Bytes();
        return Current;
    }

    // TODO: can we use STE's prevBuffer here?
    PrevTerm.CopyBytes(TermEnum.Term().Bytes());

    bool advancedInField = TermEnum.Next() && TermEnum.Term().Field() == InternedFieldName;
    if (advancedInField)
    {
        NewSuffixStart = TermEnum.NewSuffixStart;
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" newSuffixStart=" + NewSuffixStart);
        }
    }
    else
    {
        // This field is exhausted, but SurrogateDance still has to get a chance
        // to seek back.
        if (DEBUG_SURROGATES)
        {
            Console.WriteLine(" force cont");
        }
        NewSuffixStart = 0;
    }

    SurrogateDance();

    Term t = TermEnum.Term();
    if (t != null && t.Field() == InternedFieldName)
    {
        Current = t.Bytes();
        return Current;
    }

    // PreFlex codec interns field names; verify:
    Debug.Assert(t == null || !t.Field().Equals(InternedFieldName));

    // Only the "advanced within the field" path clears the cached term; the
    // exhausted-field path leaves it untouched.
    if (advancedInField)
    {
        Current = null;
    }
    return null;
}