Esempio n. 1
0
        /// <summary>
        /// Tries every permitted split point of <paramref name="term"/> and offers each
        /// split whose two pieces are frequent enough to <paramref name="suggestions"/>.
        /// When the left piece qualifies and another break is still allowed, the right
        /// piece is split further by a recursive call.
        /// </summary>
        /// <param name="term">Term whose text is being split.</param>
        /// <param name="ir">Reader passed to <c>GenerateSuggestWord</c> for every piece.</param>
        /// <param name="numberBreaks">Break count of this level; a recursive call is made only while <c>numberBreaks + 1</c> does not exceed <c>maxChanges</c>.</param>
        /// <param name="maxSuggestions">Maximum number of entries kept in <paramref name="suggestions"/>.</param>
        /// <param name="useMinSuggestionFrequency">Minimum <c>Freq</c> a piece needs in order to be used.</param>
        /// <param name="prefix">Words already split off by the enclosing levels.</param>
        /// <param name="suggestions">Queue that receives the candidate word arrays.</param>
        /// <param name="totalEvaluations">Evaluations counted so far; the loop stops once this reaches <c>maxEvaluations</c>.</param>
        /// <param name="sortMethod">Forwarded unchanged to recursive calls.</param>
        /// <returns>
        /// The number of split points evaluated by this invocation's own loop
        /// (iterations performed inside nested calls are not included).
        /// </returns>
        private int GenerateBreakUpSuggestions(Term term, IndexReader ir,
                                               int numberBreaks, int maxSuggestions, int useMinSuggestionFrequency,
                                               SuggestWord[] prefix, JCG.PriorityQueue <SuggestWordArrayWrapper> suggestions,
                                               int totalEvaluations, BreakSuggestionSortMethod sortMethod)
        {
            string text = term.Text;
            int codePoints = text.CodePointCount(0, text.Length);

            // Each side of a split must hold at least one code point.
            int minPiece = minBreakWordLength < 1 ? 1 : minBreakWordLength;

            if (codePoints < (minPiece * 2))
            {
                return 0;
            }

            int lastSplit = codePoints - minPiece;
            int evaluatedHere = 0;

            for (int split = minPiece; split <= lastSplit; split++)
            {
                // Convert the code point position into a char index so surrogate pairs stay intact.
                int charIndex = text.OffsetByCodePoints(0, split);
                SuggestWord head = GenerateSuggestWord(ir, term.Field, text.Substring(0, charIndex));

                if (head.Freq >= useMinSuggestionFrequency)
                {
                    SuggestWord tail = GenerateSuggestWord(ir, term.Field, text.Substring(charIndex));

                    if (tail.Freq >= useMinSuggestionFrequency)
                    {
                        suggestions.Enqueue(new SuggestWordArrayWrapper(NewSuggestion(prefix, head, tail)));

                        // Keep the queue bounded by dropping its head entry.
                        if (suggestions.Count > maxSuggestions)
                        {
                            suggestions.Dequeue();
                        }
                    }

                    int deeperBreaks = numberBreaks + 1;
                    if (deeperBreaks <= maxChanges)
                    {
                        // Break the right-hand piece further, with the left-hand piece appended to the prefix.
                        totalEvaluations += GenerateBreakUpSuggestions(new Term(term.Field, tail.String),
                                                                       ir, deeperBreaks, maxSuggestions,
                                                                       useMinSuggestionFrequency, NewPrefix(prefix, head),
                                                                       suggestions, totalEvaluations, sortMethod);
                    }
                }

                evaluatedHere++;
                totalEvaluations++;
                if (totalEvaluations >= maxEvaluations)
                {
                    break;
                }
            }

            return evaluatedHere;
        }
Esempio n. 2
0
        /// <summary>
        /// Produces suggestions by splitting the supplied term into several words.
        /// Each suggestion's score equals the number of word breaks it required, so
        /// lower scores are generally the better candidates.
        /// </summary>
        /// <param name="term"> the term to break up </param>
        /// <param name="maxSuggestions"> maximum number of suggestions to return; a value below 1 yields an empty result </param>
        /// <param name="ir"> reader used to look up the document frequency of <paramref name="term"/> and of the candidate words </param>
        /// <param name="suggestMode">
        ///          - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
        /// <param name="sortMethod">
        ///          - default = <see cref="BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY"/> </param>
        /// <returns> zero or more arrays of words formed by breaking up the original term </returns>
        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
        public virtual SuggestWord[][] SuggestWordBreaks(Term term, int maxSuggestions, IndexReader ir,
                                                         SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX,
                                                         BreakSuggestionSortMethod sortMethod = BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY)
        {
            if (maxSuggestions < 1)
            {
                return Arrays.Empty<SuggestWord[]>();
            }

            // Pick the queue ordering that matches the requested sort method.
            IComparer<SuggestWordArrayWrapper> ordering;
            if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY)
            {
                ordering = new LengthThenMaxFreqComparer();
            }
            else
            {
                ordering = new LengthThenSumFreqComparer();
            }

            // The initial capacity is capped at 10 regardless of how many suggestions were requested.
            int initialCapacity = maxSuggestions < 10 ? maxSuggestions : 10;
            JCG.PriorityQueue<SuggestWordArrayWrapper> candidates =
                new JCG.PriorityQueue<SuggestWordArrayWrapper>(initialCapacity, ordering);

            int termDocFreq = ir.DocFreq(term);
            if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && termDocFreq > 0)
            {
                // The term already occurs in the index, so this mode asks for nothing.
                return Arrays.Empty<SuggestWord[]>();
            }

            int requiredFreq = minSuggestionFrequency;
            if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR)
            {
                // Pieces must be at least as frequent as the original term (and occur at least once).
                requiredFreq = termDocFreq == 0 ? 1 : termDocFreq;
            }

            GenerateBreakUpSuggestions(term, ir, 1, maxSuggestions, requiredFreq, Arrays.Empty<SuggestWord>(), candidates, 0, sortMethod);

            // Entries are dequeued into the array from the last slot down to the first.
            SuggestWord[][] result = new SuggestWord[candidates.Count][];
            int slot = result.Length;
            while (slot > 0)
            {
                slot--;
                result[slot] = candidates.Dequeue().SuggestWords;
            }

            return result;
        }
Esempio n. 3
0
        // Scores the passages of a single document and returns the retained ones.
        //
        // Algorithm: treat sentence snippets as miniature documents. Term offsets taken from
        // the postings are matched to snippets via BreakIterator.Preceding(offset), and each
        // snippet is scored as norm(snippetStartOffset) * sum(weight * tf(freq)).
        //
        // field         - field name; used to obtain the PassageScorer and in error messages
        // terms         - terms to highlight; a null entry means the term text is pulled from the posting's payload
        // contentLength - offset limit: a hit starting at or beyond it ends the scan, a hit spanning it is skipped
        // bi            - break iterator queried for the passage boundaries around each hit
        // doc           - id of the document being highlighted
        // termsEnum     - used to seek to a term and open its postings when postings[i] is still null
        // postings      - per-term postings enums; entries are assigned here (EMPTY when the term is not found)
        // n             - initial capacity of the passage queue and the maximum number of passages kept
        //
        // Returns the retained passages sorted by ascending startOffset.
        // Throws ArgumentException when offsets are not available for the field, and the
        // exception built by IllegalStateException.Create when GetScorer returns null.
        private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc,
                                       TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
        {
            PassageScorer scorer = GetScorer(field);

            if (scorer is null)
            {
                // LUCENENET: Changed from NullPointerException to InvalidOperationException (which isn't caught anywhere outside of tests)
                throw IllegalStateException.Create("PassageScorer cannot be null");
            }
            // Queue of per-term offset cursors; the loop below consumes them one hit at a time.
            JCG.PriorityQueue <OffsetsEnum> pq = new JCG.PriorityQueue <OffsetsEnum>();
            // weights[i] is only filled in for terms that actually occur in 'doc'.
            float[] weights = new float[terms.Length];
            // initialize postings
            for (int i = 0; i < terms.Length; i++)
            {
                DocsAndPositionsEnum de = postings[i];
                int pDoc;
                if (de == EMPTY)
                {
                    // already marked as "no postings" for this term
                    continue;
                }
                else if (de is null)
                {
                    // first use of this slot: open the postings for the term
                    postings[i] = EMPTY; // initially
                    if (!termsEnum.SeekExact(terms[i]))
                    {
                        continue; // term not found
                    }
                    de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
                    if (de is null)
                    {
                        // no positions available
                        throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                    }
                    pDoc = de.Advance(doc);
                }
                else
                {
                    // enum already open: only advance it when it is still before 'doc'
                    pDoc = de.DocID;
                    if (pDoc < doc)
                    {
                        pDoc = de.Advance(doc);
                    }
                }

                if (doc == pDoc)
                {
                    // the term occurs in this document: record its weight and queue its first position
                    weights[i] = scorer.Weight(contentLength, de.Freq);
                    de.NextPosition();
                    pq.Add(new OffsetsEnum(de, i));
                }
            }

            pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

            // Passages are ordered by score, ties broken by startOffset.
            // NOTE(review): the "can't compete" check below treats Peek() as the weakest passage,
            // which presumes the queue yields its smallest element first - confirm against JCG.PriorityQueue.
            JCG.PriorityQueue <Passage> passageQueue = new JCG.PriorityQueue <Passage>(n, Comparer <Passage> .Create((left, right) =>
            {
                if (left.score < right.score)
                {
                    return(-1);
                }
                else if (left.score > right.score)
                {
                    return(1);
                }
                else
                {
                    return(left.startOffset - right.startOffset);
                }
            }));
            // Passage currently being accumulated; it is recycled via Reset() where possible.
            Passage current = new Passage();

            while (pq.TryDequeue(out OffsetsEnum off))
            {
                DocsAndPositionsEnum dp = off.dp;
                int start = dp.StartOffset;
                if (start == -1)
                {
                    throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                }
                int end = dp.EndOffset;
                // LUCENE-5166: this hit would span the content limit... however more valid
                // hits may exist (they are sorted by start). so we pretend like we never
                // saw this term, it won't cause a passage to be added to passageQueue or anything.
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(EMPTY.StartOffset == int.MaxValue);
                }
                if (start < contentLength && end > contentLength)
                {
                    continue;
                }
                if (start >= current.endOffset)
                {
                    // the hit lies past the current passage, so the current passage is complete
                    if (current.startOffset >= 0)
                    {
                        // finalize current
                        current.score *= scorer.Norm(current.startOffset);
                        // new sentence: first add 'current' to queue
                        if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                        {
                            current.Reset(); // can't compete, just reset it
                        }
                        else
                        {
                            passageQueue.Enqueue(current);
                            if (passageQueue.Count > n)
                            {
                                // over capacity: take one passage back out and reuse its instance
                                current = passageQueue.Dequeue();
                                current.Reset();
                            }
                            else
                            {
                                current = new Passage();
                            }
                        }
                    }
                    // if we exceed limit, we are done
                    // (the sentinel's start offset is int.MaxValue, so it lands here as well)
                    if (start >= contentLength)
                    {
                        Passage[] passages = passageQueue.ToArray();
                        foreach (Passage p in passages)
                        {
                            p.Sort();
                        }
                        // sort in ascending order
                        ArrayUtil.TimSort(passages, Comparer <Passage> .Create((left, right) => left.startOffset - right.startOffset));
                        return(passages);
                    }
                    // advance breakiterator
                    // (BreakIterator.Done is negative, so the Math.Max below clamps it to 0)
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(BreakIterator.Done < 0);
                    }
                    current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
                    current.endOffset   = Math.Min(bi.Next(), contentLength);
                }
                // Consume consecutive hits of this term that fall into the current passage,
                // counting them in tf.
                int tf = 0;
                while (true)
                {
                    tf++;
                    BytesRef term = terms[off.id];
                    if (term is null)
                    {
                        // multitermquery match, pull from payload
                        term = off.dp.GetPayload();
                        if (Debugging.AssertsEnabled)
                        {
                            Debugging.Assert(term != null);
                        }
                    }
                    current.AddMatch(start, end, term);
                    if (off.pos == dp.Freq)
                    {
                        // position counter has reached the term's frequency: this cursor is not re-queued
                        break; // removed from pq
                    }
                    else
                    {
                        off.pos++;
                        dp.NextPosition();
                        start = dp.StartOffset;
                        end   = dp.EndOffset;
                    }
                    if (start >= current.endOffset || end > contentLength)
                    {
                        // next hit is outside the current passage (or past the limit): put the cursor back for later
                        pq.Enqueue(off);
                        break;
                    }
                }
                current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
            }

            // Dead code but compiler disagrees:
            // the sentinel queued above makes the loop return before the queue runs empty.
            if (Debugging.AssertsEnabled)
            {
                Debugging.Assert(false);
            }
            return(null);
        }
Esempio n. 4
0
            /// <summary>
            /// Offers the term currently positioned in <c>termsEnum</c> to the bounded queue
            /// of best-scoring terms. A term that is already tracked only has the new term
            /// state registered; a new term is copied, queued, and the queue's head entry is
            /// dropped again when the queue grows beyond <c>maxSize</c>.
            /// </summary>
            /// <param name="bytes">Bytes of the term being collected; they are copied before being stored.</param>
            /// <returns>Always <c>true</c>.</returns>
            public override bool Collect(BytesRef bytes)
            {
                float boost = boostAtt.Boost;

                // Boosts are compared through their sortable-int bit patterns rather than with
                // the float operators: float comparisons can fail on x86 in .NET Framework with
                // optimizations enabled.
                int boostBits = Lucene.Net.Util.NumericUtils.SingleToSortableInt32(boost);

                // make sure within a single seg we always collect
                // terms in order
                Debug.Assert(CompareToLastTerm(bytes));

                // ignore uncompetitive hits
                if (stQueue.Count == maxSize)
                {
                    ScoreTerm t = stQueue.Peek();
                    int headBits = Lucene.Net.Util.NumericUtils.SingleToSortableInt32(t.Boost);
                    if (boostBits < headBits)
                    {
                        return(true);
                    }
                    if (boostBits == headBits && termComp.Compare(bytes, t.Bytes) > 0)
                    {
                        return(true);
                    }
                }
                TermState state = termsEnum.GetTermState();

                Debug.Assert(state != null);
                if (visitedTerms.TryGetValue(bytes, out ScoreTerm t2))
                {
                    // if the term is already in the PQ, only update docFreq of term in PQ
                    Debug.Assert(Lucene.Net.Util.NumericUtils.SingleToSortableInt32(t2.Boost) == boostBits, "boost should be equal in all segment TermsEnums");
                    t2.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                }
                else
                {
                    // add new entry in PQ, we must clone the term, else it may get overwritten!
                    st.Bytes.CopyBytes(bytes);
                    st.Boost = boost;
                    visitedTerms[st.Bytes] = st;
                    Debug.Assert(st.TermState.DocFreq == 0);
                    st.TermState.Register(state, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    stQueue.Add(st);
                    // possibly drop entries from queue
                    if (stQueue.Count > maxSize)
                    {
                        // recycle the evicted entry as the scratch instance for the next new term
                        st = stQueue.Dequeue();
                        visitedTerms.Remove(st.Bytes);
                        st.TermState.Clear(); // reset the termstate!
                    }
                    else
                    {
                        st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
                    }
                    Debug.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");
                    // set maxBoostAtt with values to help FuzzyTermsEnum to optimize
                    if (stQueue.Count == maxSize)
                    {
                        t2 = stQueue.Peek();
                        maxBoostAtt.MaxNonCompetitiveBoost = t2.Boost;
                        maxBoostAtt.CompetitiveTerm        = t2.Bytes;
                    }
                }

                return(true);
            }
Esempio n. 5
0
        /// <summary>
        /// <para>
        /// Generate suggestions by combining one or more of the passed-in terms into
        /// single words. The returned <see cref="CombineSuggestion"/> contains both a
        /// <see cref="SuggestWord"/> and also an array detailing which passed-in terms were
        /// involved in creating this combination. The scores returned are equal to the
        /// number of word combinations needed, also one less than the length of the
        /// array <see cref="CombineSuggestion.OriginalTermIndexes"/>. Generally, a
        /// suggestion with a lower score is preferred over a higher score.
        /// </para>
        /// <para>
        /// To prevent two adjacent terms from being combined (for instance, if one is
        /// mandatory and the other is prohibited), separate the two terms with
        /// <see cref="WordBreakSpellChecker.SEPARATOR_TERM"/>
        /// </para>
        /// <para>
        /// When suggestMode equals <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/>, each
        /// suggestion will include at least one term not in the index.
        /// </para>
        /// <para>
        /// When suggestMode equals <see cref="SuggestMode.SUGGEST_MORE_POPULAR"/>, each
        /// suggestion will have the same, or better frequency than the most-popular
        /// included term.
        /// </para>
        /// <para>
        /// At most <c>maxEvaluations</c> combinations are looked up per call; once that
        /// many have been evaluated, no further combinations are tried.
        /// </para>
        /// </summary>
        /// <param name="terms"> the terms to combine; combined words are looked up in the field of the first term </param>
        /// <param name="maxSuggestions"> maximum number of suggestions to return; a value below 1 yields an empty result </param>
        /// <param name="ir"> reader used to look up document frequencies </param>
        /// <param name="suggestMode">
        ///          - default = <see cref="SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX"/> </param>
        /// <returns> an array of words generated by combining original terms </returns>
        /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
        public virtual CombineSuggestion[] SuggestWordCombinations(Term[] terms, int maxSuggestions,
                                                                   IndexReader ir, SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX)
        {
            if (maxSuggestions < 1)
            {
                return(Arrays.Empty <CombineSuggestion>());
            }

            // Original frequencies are only needed by the modes that compare against them.
            int[] origFreqs = null;
            if (suggestMode != SuggestMode.SUGGEST_ALWAYS)
            {
                origFreqs = new int[terms.Length];
                for (int i = 0; i < terms.Length; i++)
                {
                    origFreqs[i] = ir.DocFreq(terms[i]);
                }
            }

            int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
            IComparer <CombineSuggestionWrapper> queueComparer = new CombinationsThenFreqComparer();

            JCG.PriorityQueue <CombineSuggestionWrapper> suggestions = new JCG.PriorityQueue <CombineSuggestionWrapper>(queueInitialCapacity, queueComparer);

            int  thisTimeEvaluations    = 0;
            bool evaluationLimitReached = false;

            // The flag stops the outer loop too: breaking out of the inner loop alone would let
            // the next start term keep evaluating past maxEvaluations.
            for (int i = 0; i < terms.Length - 1 && !evaluationLimitReached; i++)
            {
                if (terms[i].Equals(SEPARATOR_TERM))
                {
                    continue;
                }
                string leftTermText   = terms[i].Text;
                int    leftTermLength = leftTermText.CodePointCount(0, leftTermText.Length);
                if (leftTermLength > maxCombineWordLength)
                {
                    continue;
                }
                int maxFreq = 0;
                int minFreq = int.MaxValue;
                if (origFreqs != null)
                {
                    maxFreq = origFreqs[i];
                    minFreq = origFreqs[i];
                }
                string combinedTermText = leftTermText;
                int    combinedLength   = leftTermLength;
                for (int j = i + 1; j < terms.Length && j - i <= maxChanges; j++)
                {
                    if (terms[j].Equals(SEPARATOR_TERM))
                    {
                        break;
                    }
                    string rightTermText   = terms[j].Text;
                    int    rightTermLength = rightTermText.CodePointCount(0, rightTermText.Length);
                    combinedTermText += rightTermText;
                    combinedLength   += rightTermLength;
                    if (combinedLength > maxCombineWordLength)
                    {
                        break;
                    }

                    if (origFreqs != null)
                    {
                        maxFreq = Math.Max(maxFreq, origFreqs[j]);
                        minFreq = Math.Min(minFreq, origFreqs[j]);
                    }

                    Term combinedTerm     = new Term(terms[0].Field, combinedTermText);
                    int  combinedTermFreq = ir.DocFreq(combinedTerm);

                    // SUGGEST_MORE_POPULAR: the combination must be at least as frequent as its most frequent part.
                    bool popularEnough = suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq;
                    // SUGGEST_WHEN_NOT_IN_INDEX: at least one of the parts must be absent from the index.
                    bool hasMissingPart = suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0;

                    if (popularEnough && hasMissingPart && combinedTermFreq >= minSuggestionFrequency)
                    {
                        // Indexes i..j of the original terms that make up this combination.
                        int[] origIndexes = new int[j - i + 1];
                        for (int k = 0; k < origIndexes.Length; k++)
                        {
                            origIndexes[k] = i + k;
                        }
                        SuggestWord word = new SuggestWord();
                        word.Freq   = combinedTermFreq;
                        word.Score  = origIndexes.Length - 1;
                        word.String = combinedTerm.Text;
                        CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(new CombineSuggestion(word, origIndexes), (origIndexes.Length - 1));
                        suggestions.Enqueue(suggestion);
                        if (suggestions.Count > maxSuggestions)
                        {
                            suggestions.TryDequeue(out CombineSuggestionWrapper _);
                        }
                    }

                    thisTimeEvaluations++;
                    // '>=' rather than '==' so that a non-positive limit also terminates.
                    if (thisTimeEvaluations >= maxEvaluations)
                    {
                        evaluationLimitReached = true;
                        break;
                    }
                }
            }

            // Entries are dequeued into the array from the last slot down to the first.
            CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.Count];
            for (int i = suggestions.Count - 1; i >= 0; i--)
            {
                combineSuggestions[i] = suggestions.Dequeue().CombineSuggestion;
            }
            return(combineSuggestions);
        }
Esempio n. 6
0
            /// <summary>
            /// Considers the term currently positioned in <c>termsEnum</c> for the bounded
            /// queue of best-scoring terms. A term that is already tracked only gets the new
            /// term state registered; a new term is copied, queued, and the queue's head
            /// entry is removed again when the queue grows beyond <c>maxSize</c>.
            /// </summary>
            /// <param name="bytes">Bytes of the term being collected; they are copied before being stored.</param>
            /// <returns>Always <c>true</c>.</returns>
            public override bool Collect(BytesRef bytes)
            {
                float boost = boostAtt.Boost;

                // LUCENENET specific - boosts are compared through their sortable bit patterns rather than
                // with the float operators, to prevent these comparisons from failing in x86 in .NET
                // Framework with optimizations enabled
                int boostBits = NumericUtils.SingleToSortableInt32(boost);

                // within a single segment the terms must always arrive in order
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(CompareToLastTerm(bytes));
                }

                // skip hits that cannot compete with the entry at the head of a full queue
                if (stQueue.Count == maxSize)
                {
                    ScoreTerm head = stQueue.Peek();
                    int headBits = NumericUtils.SingleToSortableInt32(head.Boost);
                    bool lowerBoost = boostBits < headBits;
                    if (lowerBoost || (boostBits == headBits && termComp.Compare(bytes, head.Bytes) > 0))
                    {
                        return true;
                    }
                }

                TermState termState = termsEnum.GetTermState();
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(termState != null);
                }

                if (visitedTerms.TryGetValue(bytes, out ScoreTerm known))
                {
                    // the term is already queued: only register the additional term state
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(NumericUtils.SingleToSortableInt32(known.Boost) == boostBits, "boost should be equal in all segment TermsEnums");
                    }
                    known.TermState.Register(termState, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                    return true;
                }

                // new term: copy the bytes into the scratch entry, since the caller's instance may be overwritten
                st.Bytes.CopyBytes(bytes);
                st.Boost = boost;
                visitedTerms[st.Bytes] = st;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(st.TermState.DocFreq == 0);
                }
                st.TermState.Register(termState, m_readerContext.Ord, termsEnum.DocFreq, termsEnum.TotalTermFreq);
                stQueue.Add(st);

                if (stQueue.Count <= maxSize)
                {
                    // still room in the queue: allocate a fresh scratch entry for the next new term
                    st = new ScoreTerm(termComp, new TermContext(m_topReaderContext));
                }
                else
                {
                    // queue overflowed: remove its head entry and recycle it as the scratch entry
                    st = stQueue.Dequeue();
                    visitedTerms.Remove(st.Bytes);
                    st.TermState.Clear(); // reset the termstate!
                }

                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(stQueue.Count <= maxSize, "the PQ size must be limited to maxSize");
                }

                // publish the head entry's boost and term through maxBoostAtt to help FuzzyTermsEnum optimize
                if (stQueue.Count == maxSize)
                {
                    ScoreTerm floor = stQueue.Peek();
                    maxBoostAtt.MaxNonCompetitiveBoost = floor.Boost;
                    maxBoostAtt.CompetitiveTerm = floor.Bytes;
                }

                return true;
            }