/// <summary>
/// Tries every allowed split position of <paramref name="term"/>, queues each
/// two-word split whose halves both meet <paramref name="useMinSuggestionFrequency"/>,
/// and recurses on the right half while the break count stays within <c>maxChanges</c>.
/// </summary>
/// <param name="term">The term whose text is split.</param>
/// <param name="ir">Reader handed to <c>GenerateSuggestWord</c> for each half.</param>
/// <param name="numberBreaks">Break count of the current recursion level.</param>
/// <param name="maxSuggestions">Upper bound on the size of <paramref name="suggestions"/>.</param>
/// <param name="useMinSuggestionFrequency">Minimum <c>Freq</c> a half must have to be accepted.</param>
/// <param name="prefix">Words already split off by the enclosing recursion levels.</param>
/// <param name="suggestions">Queue that receives accepted suggestions; trimmed by dequeuing when it grows past <paramref name="maxSuggestions"/>.</param>
/// <param name="totalEvaluations">Running evaluation count on entry; compared against <c>maxEvaluations</c>.</param>
/// <param name="sortMethod">Passed through unchanged to nested calls.</param>
/// <returns>
/// The number of split positions evaluated directly by this invocation. Evaluations made by
/// nested calls are added to the local running total but are not part of the return value.
/// </returns>
private int GenerateBreakUpSuggestions(Term term, IndexReader ir, int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency, SuggestWord[] prefix, JCG.PriorityQueue<SuggestWordArrayWrapper> suggestions, int totalEvaluations, BreakSuggestionSortMethod sortMethod)
{
    string text = term.Text;
    int codePointLength = text.CodePointCount(0, text.Length);

    // A configured minimum below one is treated as one.
    int minPartLength = minBreakWordLength < 1 ? 1 : minBreakWordLength;

    // Too short to yield two parts of the minimum length.
    if (codePointLength < (minPartLength * 2))
    {
        return 0;
    }

    int evaluatedHere = 0;
    int lastSplit = codePointLength - minPartLength;
    for (int split = minPartLength; split <= lastSplit; split++)
    {
        // Translate the code point position into a UTF-16 index before slicing.
        int charIndex = text.OffsetByCodePoints(0, split);
        string head = text.Substring(0, charIndex);
        string tail = text.Substring(charIndex);

        SuggestWord leftWord = GenerateSuggestWord(ir, term.Field, head);
        if (leftWord.Freq >= useMinSuggestionFrequency)
        {
            SuggestWord rightWord = GenerateSuggestWord(ir, term.Field, tail);
            if (rightWord.Freq >= useMinSuggestionFrequency)
            {
                suggestions.Enqueue(new SuggestWordArrayWrapper(NewSuggestion(prefix, leftWord, rightWord)));
                if (suggestions.Count > maxSuggestions)
                {
                    suggestions.Dequeue();
                }
            }

            // Keep breaking up the right half while more changes are allowed.
            int deeperBreaks = numberBreaks + 1;
            if (deeperBreaks <= maxChanges)
            {
                Term remainder = new Term(term.Field, rightWord.String);
                int nestedEvaluations = GenerateBreakUpSuggestions(remainder, ir, deeperBreaks, maxSuggestions, useMinSuggestionFrequency, NewPrefix(prefix, leftWord), suggestions, totalEvaluations, sortMethod);
                totalEvaluations += nestedEvaluations;
            }
        }

        evaluatedHere++;
        totalEvaluations++;
        if (totalEvaluations >= maxEvaluations)
        {
            break;
        }
    }

    return evaluatedHere;
}
/// <summary>
/// Generate suggestions by breaking the passed-in term into multiple words.
/// The scores returned are equal to the number of word breaks needed so a
/// lower score is generally preferred over a higher score.
/// </summary>
/// <param name="term"> the term to break up </param>
/// <param name="maxSuggestions"> maximum number of suggestions to return; values below 1 yield an empty result </param>
/// <param name="ir"> reader used to look up document frequencies </param>
/// <param name="suggestMode">
/// - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
/// <param name="sortMethod">
/// - default = <see cref="BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY"/> </param>
/// <returns> one or more arrays of words formed by breaking up the original term </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual SuggestWord[][] SuggestWordBreaks(Term term, int maxSuggestions, IndexReader ir, SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY)
{
    if (maxSuggestions < 1)
    {
        return Arrays.Empty<SuggestWord[]>();
    }

    // The queue starts with room for at most ten entries.
    int initialCapacity = maxSuggestions <= 10 ? maxSuggestions : 10;

    IComparer<SuggestWordArrayWrapper> ordering;
    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY)
    {
        ordering = new LengthThenMaxFreqComparer();
    }
    else
    {
        ordering = new LengthThenSumFreqComparer();
    }

    JCG.PriorityQueue<SuggestWordArrayWrapper> suggestions = new JCG.PriorityQueue<SuggestWordArrayWrapper>(initialCapacity, ordering);

    // A term that is already indexed gets no suggestions in SUGGEST_WHEN_NOT_IN_INDEX mode.
    int originalFrequency = ir.DocFreq(term);
    if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && originalFrequency > 0)
    {
        return Arrays.Empty<SuggestWord[]>();
    }

    // In SUGGEST_MORE_POPULAR mode each part must be at least as frequent as the original term
    // (and at least 1 when the original term is absent).
    int frequencyFloor = minSuggestionFrequency;
    if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
    {
        frequencyFloor = originalFrequency == 0 ? 1 : originalFrequency;
    }

    GenerateBreakUpSuggestions(term, ir, 1, maxSuggestions, frequencyFloor, Arrays.Empty<SuggestWord>(), suggestions, 0, sortMethod);

    // Drain the queue into the array from the last slot to the first.
    int slot = suggestions.Count;
    SuggestWord[][] result = new SuggestWord[slot][];
    while (--slot >= 0)
    {
        result[slot] = suggestions.Dequeue().SuggestWords;
    }

    return result;
}
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
/// <summary>
/// Builds scored passages for document <paramref name="doc"/> by walking the offsets of every
/// term that occurs in it, in queue order, and grouping matches into the spans delimited by
/// <paramref name="bi"/>.
/// </summary>
/// <param name="field">Field name; used to obtain the scorer and in error messages.</param>
/// <param name="terms">Terms to highlight; a <c>null</c> entry means the matched term is read from the position payload.</param>
/// <param name="contentLength">Offset limit; hits that start before it and end after it are skipped.</param>
/// <param name="bi">Break iterator used to find the passage containing each hit.</param>
/// <param name="doc">Document id the postings are advanced to.</param>
/// <param name="termsEnum">Used to seek to a term the first time its postings are needed.</param>
/// <param name="postings">Per-term postings cache; <c>null</c> slots are filled in (or set to <c>EMPTY</c>) by this method.</param>
/// <param name="n">Maximum number of passages kept in the passage queue.</param>
/// <returns>The retained passages, sorted by ascending start offset.</returns>
/// <exception cref="ArgumentException">If offsets are not available for a term of the field.</exception>
private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
{
    PassageScorer scorer = GetScorer(field);
    if (scorer is null)
    {
        // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
        throw IllegalStateException.Create("PassageScorer cannot be null");
    }

    JCG.PriorityQueue<OffsetsEnum> offsetsQueue = new JCG.PriorityQueue<OffsetsEnum>();
    float[] weights = new float[terms.Length];

    // initialize postings
    for (int termIndex = 0; termIndex < terms.Length; termIndex++)
    {
        DocsAndPositionsEnum termPostings = postings[termIndex];
        if (termPostings == EMPTY)
        {
            continue;
        }

        int postingsDoc;
        if (termPostings is null)
        {
            postings[termIndex] = EMPTY; // initially
            if (!termsEnum.SeekExact(terms[termIndex]))
            {
                continue; // term not found
            }

            termPostings = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
            postings[termIndex] = termPostings;
            if (termPostings is null)
            {
                // no positions available
                throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
            }

            postingsDoc = termPostings.Advance(doc);
        }
        else
        {
            postingsDoc = termPostings.DocID;
            if (postingsDoc < doc)
            {
                postingsDoc = termPostings.Advance(doc);
            }
        }

        if (postingsDoc == doc)
        {
            weights[termIndex] = scorer.Weight(contentLength, termPostings.Freq);
            termPostings.NextPosition();
            offsetsQueue.Add(new OffsetsEnum(termPostings, termIndex));
        }
    }

    offsetsQueue.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

    // Orders passages by score, breaking ties by start offset.
    Comparer<Passage> byScoreThenStart = Comparer<Passage>.Create((left, right) =>
    {
        if (left.score < right.score)
        {
            return -1;
        }
        if (left.score > right.score)
        {
            return 1;
        }
        return left.startOffset - right.startOffset;
    });
    JCG.PriorityQueue<Passage> passageQueue = new JCG.PriorityQueue<Passage>(n, byScoreThenStart);
    Passage current = new Passage();

    while (offsetsQueue.TryDequeue(out OffsetsEnum off))
    {
        DocsAndPositionsEnum dp = off.dp;
        int start = dp.StartOffset;
        if (start == -1)
        {
            throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = dp.EndOffset;

        // LUCENE-5166: this hit would span the content limit... however more valid
        // hits may exist (they are sorted by start). so we pretend like we never
        // saw this term, it won't cause a passage to be added to passageQueue or anything.
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(EMPTY.StartOffset == int.MaxValue);
        }
        if (start < contentLength && end > contentLength)
        {
            continue;
        }

        if (start >= current.endOffset)
        {
            if (current.startOffset >= 0)
            {
                // finalize current
                current.score *= scorer.Norm(current.startOffset);

                // new sentence: first add 'current' to queue
                bool cannotCompete = passageQueue.Count == n && current.score < passageQueue.Peek().score;
                if (cannotCompete)
                {
                    current.Reset(); // can't compete, just reset it
                }
                else
                {
                    passageQueue.Enqueue(current);
                    if (passageQueue.Count > n)
                    {
                        // recycle the passage that was pushed out of the queue
                        current = passageQueue.Dequeue();
                        current.Reset();
                    }
                    else
                    {
                        current = new Passage();
                    }
                }
            }

            // if we exceed limit, we are done
            if (start >= contentLength)
            {
                Passage[] passages = passageQueue.ToArray();
                foreach (Passage passage in passages)
                {
                    passage.Sort();
                }

                // sort in ascending order
                Comparer<Passage> byStart = Comparer<Passage>.Create((left, right) => left.startOffset - right.startOffset);
                ArrayUtil.TimSort(passages, byStart);
                return passages;
            }

            // advance breakiterator
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(BreakIterator.Done < 0);
            }
            current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
            current.endOffset = Math.Min(bi.Next(), contentLength);
        }

        // Consume every position of this term that falls inside the current passage.
        int tf = 0;
        while (true)
        {
            tf++;
            BytesRef term = terms[off.id];
            if (term is null)
            {
                // multitermquery match, pull from payload
                term = off.dp.GetPayload();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term != null);
                }
            }
            current.AddMatch(start, end, term);

            if (off.pos == dp.Freq)
            {
                break; // removed from pq
            }

            off.pos++;
            dp.NextPosition();
            start = dp.StartOffset;
            end = dp.EndOffset;

            if (start >= current.endOffset || end > contentLength)
            {
                // next position belongs to a later passage (or crosses the limit): revisit it later
                offsetsQueue.Enqueue(off);
                break;
            }
        }

        current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(false);
    }
    return null;
}
/// <summary>
/// Offers the current term to the bounded score-term queue: uncompetitive terms are ignored,
/// a term that was already visited only has the current term state registered on its entry,
/// and a new term is added (dropping one queue entry if the queue grows past <c>maxSize</c>).
/// </summary>
/// <param name="bytes">The term bytes being collected; copied before being stored.</param>
/// <returns>Always <c>true</c>.</returns>
public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;

    // make sure within a single seg we always collect
    // terms in order
    Debug.Assert(CompareToLastTerm(bytes));

    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (stQueue.Count == maxSize)
    {
        ScoreTerm head = stQueue.Peek();
        if (boost < head.Boost || (boost == head.Boost && termComp.Compare(bytes, head.Bytes) > 0))
        {
            return true;
        }
    }

    TermState state = termsEnum.GetTermState();
    Debug.Assert(state != null);

    if (visitedTerms.TryGetValue(bytes, out ScoreTerm known))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        Debug.Assert(known.Boost == boost, "boost should be equal in all segment TermsEnums");
        known.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        return true;
    }

    // add new entry in PQ, we must clone the term, else it may get overwritten!
    st.Bytes.CopyBytes(bytes);
    st.Boost = boost;
    visitedTerms[st.Bytes] = st;
    Debug.Assert(st.TermState.DocFreq == 0);
    st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
    stQueue.Add(st);

    // possibly drop entries from queue
    if (stQueue.Count > maxSize)
    {
        // reuse the dropped entry as the scratch term for the next collect
        st = stQueue.Dequeue();
        visitedTerms.Remove(st.Bytes);
        st.TermState.Clear(); // reset the termstate!
    }
    else
    {
        st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
    }
    Debug.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");

    // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
    if (stQueue.Count == maxSize)
    {
        ScoreTerm head = stQueue.Peek();
        maxBoostAtt.MaxNonCompetitiveBoost = head.Boost;
        maxBoostAtt.CompetitiveTerm = head.Bytes;
    }

    return true;
}
/// <summary>
/// <para>
/// Generate suggestions by combining one or more of the passed-in terms into
/// single words. The returned <see cref="CombineSuggestion"/> contains both a
/// <see cref="SuggestWord"/> and also an array detailing which passed-in terms were
/// involved in creating this combination. The scores returned are equal to the
/// number of word combinations needed, also one less than the length of the
/// array <see cref="CombineSuggestion.OriginalTermIndexes"/>. Generally, a
/// suggestion with a lower score is preferred over a higher score.
/// </para>
/// <para>
/// To prevent two adjacent terms from being combined (for instance, if one is
/// mandatory and the other is prohibited), separate the two terms with
/// <see cref="WordBreakSpellChecker.SEPARATOR_TERM"/>
/// </para>
/// <para>
/// When suggestMode equals <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/>, each
/// suggestion will include at least one term not in the index.
/// </para>
/// <para>
/// When suggestMode equals <see cref="SuggestMode.SUGGEST_MORE_POPULAR"/>, each
/// suggestion will have the same, or better frequency than the most-popular
/// included term.
/// </para>
/// <para>
/// Evaluation stops altogether once the number of evaluated combinations reaches
/// <c>maxEvaluations</c>; at least one combination is evaluated when any candidate exists.
/// </para>
/// </summary>
/// <param name="terms"> the terms to combine, in order </param>
/// <param name="maxSuggestions"> maximum number of suggestions to return; values below 1 yield an empty result </param>
/// <param name="ir"> reader used to look up document frequencies </param>
/// <param name="suggestMode">
/// - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
/// <returns> an array of words generated by combining original terms </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual CombineSuggestion[] SuggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir, SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
{
    if (maxSuggestions < 1)
    {
        return Arrays.Empty<CombineSuggestion>();
    }

    // Original frequencies are only needed by the modes that compare against them.
    int[] origFreqs = null;
    if (suggestMode != SuggestMode.SUGGEST_ALWAYS)
    {
        origFreqs = new int[terms.Length];
        for (int i = 0; i < terms.Length; i++)
        {
            origFreqs[i] = ir.DocFreq(terms[i]);
        }
    }

    int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
    IComparer<CombineSuggestionWrapper> queueComparer = new CombinationsThenFreqComparer();
    JCG.PriorityQueue<CombineSuggestionWrapper> suggestions = new JCG.PriorityQueue<CombineSuggestionWrapper>(queueInitialCapacity, queueComparer);

    int thisTimeEvaluations = 0;
    // Set once the evaluation budget is used up so that the outer loop stops as well;
    // a bare 'break' in the inner loop would only end the current starting term.
    bool evaluationLimitReached = false;
    for (int i = 0; i < terms.Length - 1 && !evaluationLimitReached; i++)
    {
        if (terms[i].Equals(SEPARATOR_TERM))
        {
            continue;
        }

        string leftTermText = terms[i].Text;
        int leftTermLength = leftTermText.CodePointCount(0, leftTermText.Length);
        if (leftTermLength > maxCombineWordLength)
        {
            continue;
        }

        int maxFreq = 0;
        int minFreq = int.MaxValue;
        if (origFreqs != null)
        {
            maxFreq = origFreqs[i];
            minFreq = origFreqs[i];
        }

        string combinedTermText = leftTermText;
        int combinedLength = leftTermLength;
        for (int j = i + 1; j < terms.Length && j - i <= maxChanges; j++)
        {
            // A separator forbids combining across it.
            if (terms[j].Equals(SEPARATOR_TERM))
            {
                break;
            }

            string rightTermText = terms[j].Text;
            int rightTermLength = rightTermText.CodePointCount(0, rightTermText.Length);
            combinedTermText += rightTermText;
            combinedLength += rightTermLength;
            if (combinedLength > maxCombineWordLength)
            {
                break;
            }

            if (origFreqs != null)
            {
                maxFreq = Math.Max(maxFreq, origFreqs[j]);
                minFreq = Math.Min(minFreq, origFreqs[j]);
            }

            Term combinedTerm = new Term(terms[0].Field, combinedTermText);
            int combinedTermFreq = ir.DocFreq(combinedTerm);

            bool popularEnough = suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq;
            bool hasMissingTerm = suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0;
            if (popularEnough && hasMissingTerm && combinedTermFreq >= minSuggestionFrequency)
            {
                // Indexes i..j of the original terms that were merged.
                int[] origIndexes = new int[j - i + 1];
                for (int k = 0; k < origIndexes.Length; k++)
                {
                    origIndexes[k] = i + k;
                }

                SuggestWord word = new SuggestWord();
                word.Freq = combinedTermFreq;
                word.Score = origIndexes.Length - 1;
                word.String = combinedTerm.Text;

                CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(new CombineSuggestion(word, origIndexes), (origIndexes.Length - 1));
                suggestions.Enqueue(suggestion);
                if (suggestions.Count > maxSuggestions)
                {
                    suggestions.TryDequeue(out CombineSuggestionWrapper _);
                }
            }

            thisTimeEvaluations++;
            if (thisTimeEvaluations >= maxEvaluations)
            {
                evaluationLimitReached = true;
                break;
            }
        }
    }

    // Drain the queue into the array from the last slot to the first.
    CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.Count];
    for (int i = suggestions.Count - 1; i >= 0; i--)
    {
        combineSuggestions[i] = suggestions.Dequeue().CombineSuggestion;
    }

    return combineSuggestions;
}
/// <summary>
/// Offers the current term to the bounded score-term queue: uncompetitive terms are ignored,
/// a term that was already visited only has the current term state registered on its entry,
/// and a new term is added (dropping one queue entry if the queue grows past <c>maxSize</c>).
/// </summary>
/// <param name="bytes">The term bytes being collected; copied before being stored.</param>
/// <returns>Always <c>true</c>.</returns>
public override bool Collect(BytesRef bytes)
{
    float boost = boostAtt.Boost;

    // make sure within a single seg we always collect
    // terms in order
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(CompareToLastTerm(bytes));
    }

    //System.out.println("TTR.collect term=" + bytes.utf8ToString() + " boost=" + boost + " ord=" + readerContext.ord);
    // ignore uncompetitive hits
    if (stQueue.Count == maxSize)
    {
        ScoreTerm head = stQueue.Peek();

        // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled
        int boostBits = NumericUtils.SingleToSortableInt32(boost);
        int headBits = NumericUtils.SingleToSortableInt32(head.Boost);
        if (boostBits < headBits || (boostBits == headBits && termComp.Compare(bytes, head.Bytes) > 0))
        {
            return true;
        }
    }

    TermState state = termsEnum.GetTermState();
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(state != null);
    }

    if (visitedTerms.TryGetValue(bytes, out ScoreTerm known))
    {
        // if the term is already in the PQ, only update docFreq of term in PQ
        // LUCENENET specific - compare bits rather than using equality operators to prevent these comparisons from failing in x86 in .NET Framework with optimizations enabled
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(NumericUtils.SingleToSortableInt32(known.Boost) == NumericUtils.SingleToSortableInt32(boost), "boost should be equal in all segment TermsEnums");
        }
        known.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
        return true;
    }

    // add new entry in PQ, we must clone the term, else it may get overwritten!
    st.Bytes.CopyBytes(bytes);
    st.Boost = boost;
    visitedTerms[st.Bytes] = st;
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(st.TermState.DocFreq == 0);
    }
    st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
    stQueue.Add(st);

    // possibly drop entries from queue
    if (stQueue.Count > maxSize)
    {
        // reuse the dropped entry as the scratch term for the next collect
        st = stQueue.Dequeue();
        visitedTerms.Remove(st.Bytes);
        st.TermState.Clear(); // reset the termstate!
    }
    else
    {
        st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
    }
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");
    }

    // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
    if (stQueue.Count == maxSize)
    {
        ScoreTerm head = stQueue.Peek();
        maxBoostAtt.MaxNonCompetitiveBoost = head.Boost;
        maxBoostAtt.CompetitiveTerm = head.Bytes;
    }

    return true;
}