/// <summary>
/// Tries every permitted split position of <paramref name="term"/>, queues each
/// split whose two halves are both frequent enough, and recurses into the right
/// half while the number of breaks stays within <c>maxChanges</c>.
/// </summary>
/// <param name="term">The term whose text is being split.</param>
/// <param name="ir">Reader handed to <c>GenerateSuggestWord</c> for frequency lookups.</param>
/// <param name="numberBreaks">Number of breaks already made by enclosing recursion levels.</param>
/// <param name="maxSuggestions">Upper bound on the size of <paramref name="suggestions"/>; one entry is dequeued whenever it is exceeded.</param>
/// <param name="useMinSuggestionFrequency">Minimum <c>Freq</c> a half must have to be accepted.</param>
/// <param name="prefix">Words passed to <c>NewSuggestion</c>/<c>NewPrefix</c> together with the new left word (supplied by the enclosing recursion level).</param>
/// <param name="suggestions">Queue receiving the accepted break-ups.</param>
/// <param name="totalEvaluations">Running evaluation count used for the <c>maxEvaluations</c> cut-off.</param>
/// <param name="sortMethod">Forwarded unchanged to recursive calls; not otherwise read here.</param>
/// <returns>
/// The number of split positions evaluated by this invocation itself
/// (evaluations performed by recursive calls are not included).
/// </returns>
private int GenerateBreakUpSuggestions(Term term, IndexReader ir, int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency, SuggestWord[] prefix, JCG.PriorityQueue<SuggestWordArrayWrapper> suggestions, int totalEvaluations, BreakSuggestionSortMethod sortMethod)
{
    string text = term.Text;
    int codePointLength = text.CodePointCount(0, text.Length);

    // A piece can never be shorter than one code point, whatever the configured minimum says.
    int minPieceLength = minBreakWordLength < 1 ? 1 : minBreakWordLength;

    // Too short to yield two pieces of the minimum length.
    if (codePointLength < (minPieceLength * 2))
    {
        return 0;
    }

    int evaluatedHere = 0;
    int lastSplit = codePointLength - minPieceLength;
    for (int split = minPieceLength; split <= lastSplit; split++)
    {
        // Translate the code point position into a UTF-16 index before slicing.
        int charIndex = text.OffsetByCodePoints(0, split);
        string leftText = text.Substring(0, charIndex);
        string rightText = text.Substring(charIndex);

        SuggestWord left = GenerateSuggestWord(ir, term.Field, leftText);
        if (left.Freq >= useMinSuggestionFrequency)
        {
            // The right half is only looked up once the left half has qualified.
            SuggestWord right = GenerateSuggestWord(ir, term.Field, rightText);
            if (right.Freq >= useMinSuggestionFrequency)
            {
                suggestions.Enqueue(new SuggestWordArrayWrapper(NewSuggestion(prefix, left, right)));
                if (suggestions.Count > maxSuggestions)
                {
                    suggestions.Dequeue();
                }
            }

            // Try to break the right half further, with the left word added to the prefix.
            int deeperBreaks = numberBreaks + 1;
            if (deeperBreaks <= maxChanges)
            {
                int childEvaluations = GenerateBreakUpSuggestions(new Term(term.Field, right.String), ir, deeperBreaks, maxSuggestions, useMinSuggestionFrequency, NewPrefix(prefix, left), suggestions, totalEvaluations, sortMethod);
                totalEvaluations += childEvaluations;
            }
        }

        evaluatedHere++;
        totalEvaluations++;
        if (totalEvaluations >= maxEvaluations)
        {
            break;
        }
    }

    return evaluatedHere;
}
// Algorithm: treat sentence snippets as miniature documents.
// The postings (with offsets) of every query term are merged through a priority queue,
// and each hit is mapped onto the sentence that contains it via BreakIterator.Preceding(offset).
// Each sentence is scored as norm(sentenceStartOffset) * sum(weight * tf(freq)).
//
// field:         name of the field being highlighted (used for the scorer lookup and in error messages)
// terms:         query terms; a null entry means the matched term is read from the payload instead
// contentLength: hits starting at or beyond this offset end the scan
// bi:            break iterator that delimits the passages
// doc:           document id the postings are advanced to
// termsEnum:     used to seek terms whose postings slot is still null
// postings:      per-term postings cache; slots are filled in (or set to EMPTY) by this method
// n:             maximum number of passages kept
// Returns the retained passages ordered by ascending start offset.
private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc, TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
{
    PassageScorer scorer = GetScorer(field);
    if (scorer is null)
    {
        // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
        throw IllegalStateException.Create("PassageScorer cannot be null");
    }

    JCG.PriorityQueue<OffsetsEnum> offsetsQueue = new JCG.PriorityQueue<OffsetsEnum>();
    float[] termWeights = new float[terms.Length];

    // Position every term's postings on 'doc' and queue those that actually occur in it.
    for (int termIndex = 0; termIndex < terms.Length; termIndex++)
    {
        DocsAndPositionsEnum termPostings = postings[termIndex];
        if (termPostings == EMPTY)
        {
            continue; // slot was already marked EMPTY by an earlier call
        }

        int postingsDoc;
        if (termPostings is null)
        {
            // First time this slot is used: mark it EMPTY until proven otherwise.
            postings[termIndex] = EMPTY;
            if (!termsEnum.SeekExact(terms[termIndex]))
            {
                continue; // term not found
            }

            termPostings = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
            postings[termIndex] = termPostings;
            if (termPostings is null)
            {
                // no positions available
                throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
            }

            postingsDoc = termPostings.Advance(doc);
        }
        else
        {
            // Reused enum: only advance when it is still behind the requested document.
            postingsDoc = termPostings.DocID;
            if (postingsDoc < doc)
            {
                postingsDoc = termPostings.Advance(doc);
            }
        }

        if (postingsDoc != doc)
        {
            continue; // term does not occur in this document
        }

        termWeights[termIndex] = scorer.Weight(contentLength, termPostings.Freq);
        termPostings.NextPosition();
        offsetsQueue.Add(new OffsetsEnum(termPostings, termIndex));
    }

    // A sentinel for termination: the main loop returns when it is reached.
    offsetsQueue.Add(new OffsetsEnum(EMPTY, int.MaxValue));

    // Orders passages by score, ties broken by start offset.
    Comparer<Passage> byScoreThenStart = Comparer<Passage>.Create((left, right) =>
    {
        if (left.score < right.score)
        {
            return -1;
        }
        if (left.score > right.score)
        {
            return 1;
        }
        return left.startOffset - right.startOffset;
    });
    JCG.PriorityQueue<Passage> passageQueue = new JCG.PriorityQueue<Passage>(n, byScoreThenStart);

    Passage current = new Passage();
    while (offsetsQueue.TryDequeue(out OffsetsEnum head))
    {
        DocsAndPositionsEnum dp = head.dp;
        int start = dp.StartOffset;
        if (start == -1)
        {
            throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = dp.EndOffset;

        // LUCENE-5166: this hit would span the content limit... however more valid
        // hits may exist (they are sorted by start). So we pretend we never saw this
        // term; it won't cause a passage to be added to passageQueue or anything.
        if (Debugging.AssertsEnabled)
        {
            Debugging.Assert(EMPTY.StartOffset == int.MaxValue);
        }
        if (start < contentLength && end > contentLength)
        {
            continue;
        }

        if (start >= current.endOffset)
        {
            // The hit lies beyond the current passage: close it and open a new one.
            if (current.startOffset >= 0)
            {
                // finalize current
                current.score *= scorer.Norm(current.startOffset);

                bool cannotCompete = passageQueue.Count == n && current.score < passageQueue.Peek().score;
                if (cannotCompete)
                {
                    current.Reset(); // just recycle the instance
                }
                else
                {
                    passageQueue.Enqueue(current);
                    if (passageQueue.Count > n)
                    {
                        // Over capacity: take one back out of the queue and recycle it.
                        current = passageQueue.Dequeue();
                        current.Reset();
                    }
                    else
                    {
                        current = new Passage();
                    }
                }
            }

            // If we exceed the limit, we are done.
            if (start >= contentLength)
            {
                Passage[] passages = passageQueue.ToArray();
                for (int p = 0; p < passages.Length; p++)
                {
                    passages[p].Sort();
                }

                // sort in ascending order of start offset
                Comparer<Passage> byStartOffset = Comparer<Passage>.Create((left, right) => left.startOffset - right.startOffset);
                ArrayUtil.TimSort(passages, byStartOffset);
                return passages;
            }

            // advance breakiterator
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(BreakIterator.Done < 0);
            }
            current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
            current.endOffset = Math.Min(bi.Next(), contentLength);
        }

        // Consume every occurrence of this term that falls inside the current passage.
        int tf = 0;
        bool consuming = true;
        while (consuming)
        {
            tf++;
            BytesRef term = terms[head.id];
            if (term is null)
            {
                // multitermquery match, pull from payload
                term = head.dp.GetPayload();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(term != null);
                }
            }
            current.AddMatch(start, end, term);

            if (head.pos == dp.Freq)
            {
                // Postings exhausted for this document: the enum is not re-queued.
                consuming = false;
            }
            else
            {
                head.pos++;
                dp.NextPosition();
                start = dp.StartOffset;
                end = dp.EndOffset;

                if (start >= current.endOffset || end > contentLength)
                {
                    // Next occurrence is outside the current passage: revisit it later.
                    offsetsQueue.Enqueue(head);
                    consuming = false;
                }
            }
        }

        current.score += termWeights[head.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    if (Debugging.AssertsEnabled)
    {
        Debugging.Assert(false);
    }
    return null;
}
/// <summary>
/// <para>
/// Generate suggestions by combining one or more of the passed-in terms into
/// single words. The returned <see cref="CombineSuggestion"/> contains both a
/// <see cref="SuggestWord"/> and also an array detailing which passed-in terms were
/// involved in creating this combination. The scores returned are equal to the
/// number of word combinations needed, also one less than the length of the
/// array <see cref="CombineSuggestion.OriginalTermIndexes"/>. Generally, a
/// suggestion with a lower score is preferred over a higher score.
/// </para>
/// <para>
/// To prevent two adjacent terms from being combined (for instance, if one is
/// mandatory and the other is prohibited), separate the two terms with
/// <see cref="WordBreakSpellChecker.SEPARATOR_TERM"/>
/// </para>
/// <para>
/// When suggestMode equals <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/>, each
/// suggestion will include at least one term not in the index.
/// </para>
/// <para>
/// When suggestMode equals <see cref="SuggestMode.SUGGEST_MORE_POPULAR"/>, each
/// suggestion will have the same, or better frequency than the most-popular
/// included term.
/// </para>
/// </summary>
/// <remarks>
/// At most <c>maxEvaluations</c> combinations are looked up in the index (at least
/// one is always evaluated when any candidate exists); once that budget is used up
/// the search stops and whatever has been collected so far is returned.
/// </remarks>
/// <param name="terms"> the original terms, in order, optionally interleaved with separator terms </param>
/// <param name="maxSuggestions"> maximum number of suggestions to return; values below 1 yield an empty array </param>
/// <param name="ir"> reader used to look up document frequencies </param>
/// <param name="suggestMode"> controls which combinations qualify, as described above </param>
/// <returns> an array of words generated by combining original terms </returns>
/// <exception cref="IOException"> If there is a low-level I/O error. </exception>
public virtual CombineSuggestion[] SuggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir, SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
{
    if (maxSuggestions < 1)
    {
        return Arrays.Empty<CombineSuggestion>();
    }

    // Frequencies of the original terms are only needed when the mode compares against them.
    int[] origFreqs = null;
    if (suggestMode != SuggestMode.SUGGEST_ALWAYS)
    {
        origFreqs = new int[terms.Length];
        for (int i = 0; i < terms.Length; i++)
        {
            origFreqs[i] = ir.DocFreq(terms[i]);
        }
    }

    int queueInitialCapacity = Math.Min(maxSuggestions, 10);
    IComparer<CombineSuggestionWrapper> queueComparer = new CombinationsThenFreqComparer();
    JCG.PriorityQueue<CombineSuggestionWrapper> suggestions = new JCG.PriorityQueue<CombineSuggestionWrapper>(queueInitialCapacity, queueComparer);

    int thisTimeEvaluations = 0;
    // Set once the evaluation budget is used up so that the OUTER loop stops as well;
    // breaking out of the inner loop alone would let the next start term keep evaluating.
    bool evaluationBudgetSpent = false;

    for (int i = 0; i < terms.Length - 1 && !evaluationBudgetSpent; i++)
    {
        if (terms[i].Equals(SEPARATOR_TERM))
        {
            continue;
        }

        string leftTermText = terms[i].Text;
        int leftTermLength = leftTermText.CodePointCount(0, leftTermText.Length);
        if (leftTermLength > maxCombineWordLength)
        {
            continue;
        }

        int maxFreq = 0;
        int minFreq = int.MaxValue;
        if (origFreqs != null)
        {
            maxFreq = origFreqs[i];
            minFreq = origFreqs[i];
        }

        string combinedTermText = leftTermText;
        int combinedLength = leftTermLength;

        // Extend the combination one term at a time, up to maxChanges joins.
        for (int j = i + 1; j < terms.Length && j - i <= maxChanges; j++)
        {
            if (terms[j].Equals(SEPARATOR_TERM))
            {
                break; // never combine across a separator
            }

            string rightTermText = terms[j].Text;
            int rightTermLength = rightTermText.CodePointCount(0, rightTermText.Length);
            combinedTermText += rightTermText;
            combinedLength += rightTermLength;
            if (combinedLength > maxCombineWordLength)
            {
                break; // adding further terms can only make it longer
            }

            if (origFreqs != null)
            {
                maxFreq = Math.Max(maxFreq, origFreqs[j]);
                minFreq = Math.Min(minFreq, origFreqs[j]);
            }

            // Use the field of the first combined term: terms[i] is known not to be a
            // separator here, whereas terms[0] may itself be a SEPARATOR_TERM whose
            // field need not be the one being searched.
            Term combinedTerm = new Term(terms[i].Field, combinedTermText);
            int combinedTermFreq = ir.DocFreq(combinedTerm);

            bool popularEnough = suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq;
            bool hasTermNotInIndex = suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0;
            if (popularEnough && hasTermNotInIndex && combinedTermFreq >= minSuggestionFrequency)
            {
                // Indexes i..j of the original terms that were merged.
                int[] origIndexes = new int[j - i + 1];
                for (int k = 0; k < origIndexes.Length; k++)
                {
                    origIndexes[k] = i + k;
                }

                int combinations = origIndexes.Length - 1;
                SuggestWord word = new SuggestWord
                {
                    Freq = combinedTermFreq,
                    Score = combinations,
                    String = combinedTerm.Text
                };

                suggestions.Enqueue(new CombineSuggestionWrapper(new CombineSuggestion(word, origIndexes), combinations));
                if (suggestions.Count > maxSuggestions)
                {
                    // Keep the queue bounded by dropping its current head.
                    suggestions.TryDequeue(out CombineSuggestionWrapper _);
                }
            }

            thisTimeEvaluations++;
            // '>=' rather than '==' so the cap also holds for non-positive limits,
            // consistent with the check used when generating break-up suggestions.
            if (thisTimeEvaluations >= maxEvaluations)
            {
                evaluationBudgetSpent = true;
                break;
            }
        }
    }

    // Fill the result from the last index to the first, in dequeue order, so the
    // entry the queue releases last ends up at index 0.
    CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.Count];
    for (int i = suggestions.Count - 1; i >= 0; i--)
    {
        combineSuggestions[i] = suggestions.Dequeue().CombineSuggestion;
    }
    return combineSuggestions;
}