/// <summary>
/// Positions this enum on the first term that is greater than or equal to
/// <paramref name="target"/>.
/// </summary>
/// <param name="target">The term to seek to.</param>
/// <returns>
/// <see cref="SeekStatus.FOUND"/> when the enum ends up on a term equal to
/// <paramref name="target"/>; <see cref="SeekStatus.NOT_FOUND"/> when it ends up
/// on a term that sorts after <paramref name="target"/>; <see cref="SeekStatus.END"/>
/// when no further block could be loaded while scanning.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <c>indexEnum</c> is <c>null</c> (the terms index was not loaded).
/// </exception>
/// <remarks>
/// TODO: we may want an alternate mode here which is
/// "if you are about to return NOT_FOUND I won't use
/// the terms data from that"; eg FuzzyTermsEnum will
/// (usually) just immediately call seek again if we
/// return NOT_FOUND so it's a waste for us to fill in
/// the term that was actually NOT_FOUND
/// </remarks>
public override SeekStatus SeekCeil(BytesRef target)
{
    if (indexEnum == null)
    {
        throw new InvalidOperationException("terms index was not loaded");
    }

    bool doSeek = true;

    // See if we can avoid seeking, because target term
    // is after current term but before next index term:
    if (indexIsCurrent)
    {
        int cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(term, target);

        if (cmp == 0)
        {
            // Already at the requested term
            return SeekStatus.FOUND;
        }

        if (cmp < 0)
        {
            // Target term is after current term. Fetch the next index term
            // once and remember it until the next real seek resets the flag.
            if (!didIndexNext)
            {
                nextIndexTerm = indexEnum.Next() == -1 ? null : indexEnum.Term;
                didIndexNext = true;
            }

            if (nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, nextIndexTerm) < 0)
            {
                // Optimization: requested term is within the
                // same term block we are now in; skip seeking
                // (but do scanning):
                doSeek = false;
            }
        }
    }

    if (doSeek)
    {
        // Ask terms index to find biggest indexed term (=
        // first term in a block) that's <= our text:
        input.Seek(indexEnum.Seek(target));
        bool result = NextBlock();

        // Block must exist since, at least, the indexed term
        // is in the block:
        Debug.Assert(result);

        indexIsCurrent = true;
        didIndexNext = false;
        blocksSinceSeek = 0;

        if (doOrd)
        {
            state.Ord = indexEnum.Ord - 1;
        }

        term.CopyBytes(indexEnum.Term);
    }
    else if (state.TermBlockOrd == blockTermCount && !NextBlock())
    {
        // Current block is exhausted and there is no following block.
        indexIsCurrent = false;
        return SeekStatus.END;
    }

    seekPending = false;

    int common = 0;

    // Scan within block. We could do this by calling
    // _next() and testing the resulting term, but this
    // is wasteful. Instead, we first confirm the
    // target matches the common prefix of this block,
    // and then we scan the term bytes directly from the
    // termSuffixesReader's byte[], saving a copy into
    // the BytesRef term per term. Only when we return
    // do we then copy the bytes into the term.
    while (true)
    {
        // First, see if target term matches common prefix
        // in this block:
        if (common < termBlockPrefix)
        {
            int cmp = (term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
            if (cmp < 0)
            {
                // TODO: maybe we should store common prefix
                // in block header? (instead of relying on
                // last term of previous block)

                // Target's prefix is after the common block
                // prefix, so term cannot be in this block
                // but it could be in next block. We
                // must scan to end-of-block to set common
                // prefix for next block:
                if (state.TermBlockOrd < blockTermCount)
                {
                    while (state.TermBlockOrd < blockTermCount - 1)
                    {
                        state.TermBlockOrd++;
                        state.Ord++;
                        termSuffixesReader.SkipBytes(termSuffixesReader.ReadVInt32());
                    }
                    FillTermFromSuffix(termSuffixesReader.ReadVInt32());
                }
                state.Ord++;

                if (!NextBlock())
                {
                    indexIsCurrent = false;
                    return SeekStatus.END;
                }
                common = 0;
            }
            else if (cmp > 0)
            {
                // Target's prefix is before the common prefix
                // of this block, so we position to start of
                // block and return NOT_FOUND:
                Debug.Assert(state.TermBlockOrd == 0);

                FillTermFromSuffix(termSuffixesReader.ReadVInt32());
                return SeekStatus.NOT_FOUND;
            }
            else
            {
                common++;
            }

            continue;
        }

        // Test every term in this block
        while (true)
        {
            state.TermBlockOrd++;
            state.Ord++;

            int suffix = termSuffixesReader.ReadVInt32();

            // We know the prefix matches, so just compare the new suffix:
            int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.Position;

            bool next = false;
            int limit = target.Offset + (termLen < target.Length ? termLen : target.Length);
            int targetPos = target.Offset + termBlockPrefix;
            while (targetPos < limit)
            {
                int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                if (cmp < 0)
                {
                    // Current term is still before the target;
                    // keep scanning
                    next = true;
                    break;
                }

                if (cmp > 0)
                {
                    // Done! Current term is after target. Stop
                    // here, fill in real term, return NOT_FOUND.
                    FillTermFromSuffix(suffix);
                    return SeekStatus.NOT_FOUND;
                }
            }

            if (!next && target.Length <= termLen)
            {
                // All compared bytes were equal and the current term is at
                // least as long as the target: exact match when the lengths
                // agree, otherwise the current term sorts after the target.
                FillTermFromSuffix(suffix);
                return target.Length == termLen ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
            }

            if (state.TermBlockOrd == blockTermCount)
            {
                // Must pre-fill term for next block's common prefix
                FillTermFromSuffix(suffix);
                break;
            }

            termSuffixesReader.SkipBytes(suffix);
        }

        // The purpose of the terms dict index is to seek
        // the enum to the closest index term before the
        // term we are looking for. So, we should never
        // cross another index term (besides the first
        // one) while we are scanning:
        Debug.Assert(indexIsCurrent);

        if (!NextBlock())
        {
            indexIsCurrent = false;
            return SeekStatus.END;
        }
        common = 0;
    }
}

/// <summary>
/// Sets <c>term.Length</c> to <c>termBlockPrefix + suffix</c>, grows the term's
/// byte array when it is too small, and reads the next <paramref name="suffix"/>
/// bytes from <c>termSuffixesReader</c> into the term at offset <c>termBlockPrefix</c>.
/// The bytes before that offset are left as they are.
/// </summary>
/// <param name="suffix">Number of suffix bytes to read for the term.</param>
private void FillTermFromSuffix(int suffix)
{
    term.Length = termBlockPrefix + suffix;
    if (term.Bytes.Length < term.Length)
    {
        term.Grow(term.Length);
    }
    termSuffixesReader.ReadBytes(term.Bytes, termBlockPrefix, suffix);
}
/// <summary>
/// Positions this enum on the first term that is greater than or equal to
/// <paramref name="target"/>.
/// </summary>
/// <param name="target">The term to seek to.</param>
/// <returns>
/// <see cref="SeekStatus.FOUND"/> when the enum ends up on a term equal to
/// <paramref name="target"/>; <see cref="SeekStatus.NOT_FOUND"/> when it ends up
/// on a term that sorts after <paramref name="target"/>; <see cref="SeekStatus.END"/>
/// when no further block could be loaded while scanning.
/// </returns>
/// <exception cref="InvalidOperationException">
/// <c>_indexEnum</c> is <c>null</c> (the terms index was not loaded).
/// </exception>
/// <remarks>
/// TODO: we may want an alternate mode here which is
/// "if you are about to return NOT_FOUND I won't use
/// the terms data from that"; eg FuzzyTermsEnum will
/// (usually) just immediately call seek again if we
/// return NOT_FOUND so it's a waste for us to fill in
/// the term that was actually NOT_FOUND
/// </remarks>
public override SeekStatus SeekCeil(BytesRef target)
{
    if (_indexEnum == null)
    {
        throw new InvalidOperationException("terms index was not loaded");
    }

    var doSeek = true;

    // See if we can avoid seeking, because target term
    // is after current term but before next index term:
    if (_indexIsCurrent)
    {
        var cmp = BytesRef.UTF8SortedAsUnicodeComparer.Compare(_term, target);

        if (cmp == 0)
        {
            // Already at the requested term
            return SeekStatus.FOUND;
        }

        if (cmp < 0)
        {
            // Target term is after current term. Fetch the next index term
            // once and remember it until the next real seek resets the flag.
            if (!_didIndexNext)
            {
                _nextIndexTerm = _indexEnum.Next == -1 ? null : _indexEnum.Term;
                _didIndexNext = true;
            }

            if (_nextIndexTerm == null || BytesRef.UTF8SortedAsUnicodeComparer.Compare(target, _nextIndexTerm) < 0)
            {
                // Optimization: requested term is within the
                // same term block we are now in; skip seeking
                // (but do scanning):
                doSeek = false;
            }
        }
    }

    if (doSeek)
    {
        // Ask terms index to find biggest indexed term (=
        // first term in a block) that's <= our text:
        _input.Seek(_indexEnum.Seek(target));
        var result = NextBlock();

        // Block must exist since, at least, the indexed term
        // is in the block:
        Debug.Assert(result);

        _indexIsCurrent = true;
        _didIndexNext = false;
        _blocksSinceSeek = 0;

        if (_doOrd)
        {
            _state.Ord = _indexEnum.Ord - 1;
        }

        _term.CopyBytes(_indexEnum.Term);
    }
    else if (_state.TermBlockOrd == _blockTermCount && !NextBlock())
    {
        // Current block is exhausted and there is no following block.
        _indexIsCurrent = false;
        return SeekStatus.END;
    }

    _seekPending = false;

    var common = 0;

    // Scan within block. We could do this by calling
    // _next() and testing the resulting term, but this
    // is wasteful. Instead, we first confirm the
    // target matches the common prefix of this block,
    // and then we scan the term bytes directly from the
    // termSuffixesReader's byte[], saving a copy into
    // the BytesRef term per term. Only when we return
    // do we then copy the bytes into the term.
    while (true)
    {
        // First, see if target term matches common prefix
        // in this block:
        if (common < _termBlockPrefix)
        {
            var cmp = (_term.Bytes[common] & 0xFF) - (target.Bytes[target.Offset + common] & 0xFF);
            if (cmp < 0)
            {
                // TODO: maybe we should store common prefix
                // in block header? (instead of relying on
                // last term of previous block)

                // Target's prefix is after the common block
                // prefix, so term cannot be in this block
                // but it could be in next block. We
                // must scan to end-of-block to set common
                // prefix for next block:
                if (_state.TermBlockOrd < _blockTermCount)
                {
                    while (_state.TermBlockOrd < _blockTermCount - 1)
                    {
                        _state.TermBlockOrd++;
                        _state.Ord++;
                        _termSuffixesReader.SkipBytes(_termSuffixesReader.ReadVInt());
                    }
                    FillTermFromSuffix(_termSuffixesReader.ReadVInt());
                }
                _state.Ord++;

                if (!NextBlock())
                {
                    _indexIsCurrent = false;
                    return SeekStatus.END;
                }
                common = 0;
            }
            else if (cmp > 0)
            {
                // Target's prefix is before the common prefix
                // of this block, so we position to start of
                // block and return NOT_FOUND:
                Debug.Assert(_state.TermBlockOrd == 0);

                FillTermFromSuffix(_termSuffixesReader.ReadVInt());
                return SeekStatus.NOT_FOUND;
            }
            else
            {
                common++;
            }

            continue;
        }

        // Test every term in this block
        while (true)
        {
            _state.TermBlockOrd++;
            _state.Ord++;

            var suffix = _termSuffixesReader.ReadVInt();

            // We know the prefix matches, so just compare the new suffix:
            var termLen = _termBlockPrefix + suffix;
            var bytePos = _termSuffixesReader.Position;

            var next = false;
            var limit = target.Offset + (termLen < target.Length ? termLen : target.Length);
            var targetPos = target.Offset + _termBlockPrefix;
            while (targetPos < limit)
            {
                var cmp = (_termSuffixes[bytePos++] & 0xFF) - (target.Bytes[targetPos++] & 0xFF);
                if (cmp < 0)
                {
                    // Current term is still before the target;
                    // keep scanning
                    next = true;
                    break;
                }

                if (cmp > 0)
                {
                    // Done! Current term is after target. Stop
                    // here, fill in real term, return NOT_FOUND.
                    FillTermFromSuffix(suffix);
                    return SeekStatus.NOT_FOUND;
                }

                // cmp == 0: bytes match so far, compare the next byte.
            }

            if (!next && target.Length <= termLen)
            {
                // All compared bytes were equal and the current term is at
                // least as long as the target: exact match when the lengths
                // agree, otherwise the current term sorts after the target.
                FillTermFromSuffix(suffix);
                return target.Length == termLen ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
            }

            if (_state.TermBlockOrd == _blockTermCount)
            {
                // Must pre-fill term for next block's common prefix
                FillTermFromSuffix(suffix);
                break;
            }

            _termSuffixesReader.SkipBytes(suffix);
        }

        // The purpose of the terms dict index is to seek
        // the enum to the closest index term before the
        // term we are looking for. So, we should never
        // cross another index term (besides the first
        // one) while we are scanning:
        Debug.Assert(_indexIsCurrent);

        if (!NextBlock())
        {
            _indexIsCurrent = false;
            return SeekStatus.END;
        }
        common = 0;
    }
}

/// <summary>
/// Sets <c>_term.Length</c> to <c>_termBlockPrefix + suffix</c>, grows the term's
/// byte array when it is too small, and reads the next <paramref name="suffix"/>
/// bytes from <c>_termSuffixesReader</c> into the term at offset <c>_termBlockPrefix</c>.
/// The bytes before that offset are left as they are.
/// </summary>
/// <param name="suffix">Number of suffix bytes to read for the term.</param>
private void FillTermFromSuffix(int suffix)
{
    _term.Length = _termBlockPrefix + suffix;
    if (_term.Bytes.Length < _term.Length)
    {
        _term.Grow(_term.Length);
    }
    _termSuffixesReader.ReadBytes(_term.Bytes, _termBlockPrefix, suffix);
}