Example #1
0
        /// <summary>
        /// Provide spelling corrections based on several parameters.
        /// </summary>
        /// <param name="term"> The term to suggest spelling corrections for </param>
        /// <param name="numSug"> The maximum number of spelling corrections </param>
        /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
        /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
        /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
        /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
        /// <param name="spare"> a chars scratch </param>
        /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
        /// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
        protected internal virtual IEnumerable <ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir,
                                                                          int docfreq, int editDistance, float accuracy, CharsRef spare)
        {
            var atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            Terms terms = MultiFields.GetTerms(ir, term.Field);

            if (terms == null)
            {
                return(new List <ScoreTerm>());
            }
            FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

            var stQueue = new Support.PriorityQueue <ScoreTerm>();

            BytesRef        queryTerm = new BytesRef(term.Text());
            BytesRef        candidateTerm;
            ScoreTerm       st       = new ScoreTerm();
            IBoostAttribute boostAtt = e.Attributes.AddAttribute <IBoostAttribute>();

            while ((candidateTerm = e.Next()) != null)
            {
                float boost = boostAtt.Boost;
                // ignore uncompetitive hits
                if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
                {
                    continue;
                }

                // ignore exact match of the same term
                if (queryTerm.BytesEquals(candidateTerm))
                {
                    continue;
                }

                int df = e.DocFreq;

                // check docFreq if required
                if (df <= docfreq)
                {
                    continue;
                }

                float  score;
                string termAsString;
                if (distance == INTERNAL_LEVENSHTEIN)
                {
                    // delay creating strings until the end
                    termAsString = null;
                    // undo FuzzyTermsEnum's scale factor for a real scaled lev score
                    score = boost / e.ScaleFactor + e.MinSimilarity;
                }
                else
                {
                    UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
                    termAsString = spare.ToString();
                    score        = distance.GetDistance(term.Text(), termAsString);
                }

                if (score < accuracy)
                {
                    continue;
                }

                // add new entry in PQ
                st.Term         = BytesRef.DeepCopyOf(candidateTerm);
                st.Boost        = boost;
                st.Docfreq      = df;
                st.TermAsString = termAsString;
                st.Score        = score;
                stQueue.Offer(st);
                // possibly drop entries from queue
                st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
                maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
            }

            return(stQueue);
        }
Example #2
0
        // algorithm: treat sentence snippets as miniature documents
        // we can intersect these with the postings lists via BreakIterator.preceding(offset),s
        // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
        private Passage[] HighlightDoc(string field, BytesRef[] terms, int contentLength, BreakIterator bi, int doc,
                                       TermsEnum termsEnum, DocsAndPositionsEnum[] postings, int n)
        {
            PassageScorer scorer = GetScorer(field);

            if (scorer == null)
            {
                throw new NullReferenceException("PassageScorer cannot be null");
            }
            Support.PriorityQueue <OffsetsEnum> pq = new Support.PriorityQueue <OffsetsEnum>();
            float[] weights = new float[terms.Length];
            // initialize postings
            for (int i = 0; i < terms.Length; i++)
            {
                DocsAndPositionsEnum de = postings[i];
                int pDoc;
                if (de == EMPTY)
                {
                    continue;
                }
                else if (de == null)
                {
                    postings[i] = EMPTY; // initially
                    if (!termsEnum.SeekExact(terms[i]))
                    {
                        continue; // term not found
                    }
                    de = postings[i] = termsEnum.DocsAndPositions(null, null, DocsAndPositionsFlags.OFFSETS);
                    if (de == null)
                    {
                        // no positions available
                        throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                    }
                    pDoc = de.Advance(doc);
                }
                else
                {
                    pDoc = de.DocID;
                    if (pDoc < doc)
                    {
                        pDoc = de.Advance(doc);
                    }
                }

                if (doc == pDoc)
                {
                    weights[i] = scorer.Weight(contentLength, de.Freq);
                    de.NextPosition();
                    pq.Add(new OffsetsEnum(de, i));
                }
            }

            pq.Add(new OffsetsEnum(EMPTY, int.MaxValue)); // a sentinel for termination

            Support.PriorityQueue <Passage> passageQueue = new Support.PriorityQueue <Passage>(n, new HighlightDocComparerAnonymousHelper1());
            Passage current = new Passage();

            OffsetsEnum off;

            while ((off = pq.Poll()) != null)
            {
                DocsAndPositionsEnum dp = off.dp;
                int start = dp.StartOffset;
                if (start == -1)
                {
                    throw new ArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
                }
                int end = dp.EndOffset;
                // LUCENE-5166: this hit would span the content limit... however more valid
                // hits may exist (they are sorted by start). so we pretend like we never
                // saw this term, it won't cause a passage to be added to passageQueue or anything.
                Debug.Assert(EMPTY.StartOffset == int.MaxValue);
                if (start < contentLength && end > contentLength)
                {
                    continue;
                }
                if (start >= current.endOffset)
                {
                    if (current.startOffset >= 0)
                    {
                        // finalize current
                        current.score *= scorer.Norm(current.startOffset);
                        // new sentence: first add 'current' to queue
                        if (passageQueue.Count == n && current.score < passageQueue.Peek().score)
                        {
                            current.Reset(); // can't compete, just reset it
                        }
                        else
                        {
                            passageQueue.Offer(current);
                            if (passageQueue.Count > n)
                            {
                                current = passageQueue.Poll();
                                current.Reset();
                            }
                            else
                            {
                                current = new Passage();
                            }
                        }
                    }
                    // if we exceed limit, we are done
                    if (start >= contentLength)
                    {
                        Passage[] passages = passageQueue.ToArray();
                        foreach (Passage p in passages)
                        {
                            p.Sort();
                        }
                        // sort in ascending order
                        ArrayUtil.TimSort(passages, new HighlightDocComparerAnonymousHelper2());
                        return(passages);
                    }
                    // advance breakiterator
                    Debug.Assert(BreakIterator.Done < 0);
                    current.startOffset = Math.Max(bi.Preceding(start + 1), 0);
                    current.endOffset   = Math.Min(bi.Next(), contentLength);
                }
                int tf = 0;
                while (true)
                {
                    tf++;
                    BytesRef term = terms[off.id];
                    if (term == null)
                    {
                        // multitermquery match, pull from payload
                        term = off.dp.GetPayload();
                        Debug.Assert(term != null);
                    }
                    current.AddMatch(start, end, term);
                    if (off.pos == dp.Freq)
                    {
                        break; // removed from pq
                    }
                    else
                    {
                        off.pos++;
                        dp.NextPosition();
                        start = dp.StartOffset;
                        end   = dp.EndOffset;
                    }
                    if (start >= current.endOffset || end > contentLength)
                    {
                        pq.Offer(off);
                        break;
                    }
                }
                current.score += weights[off.id] * scorer.Tf(tf, current.endOffset - current.startOffset);
            }

            // Dead code but compiler disagrees:
            Debug.Assert(false);
            return(null);
        }