コード例 #1
0
        /// <summary>
        /// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
        /// length <paramref name="prefixLength"/> with <paramref name="term"/> and which have a fuzzy similarity &gt;
        /// <paramref name="minSimilarity"/>.
        /// <para/>
        /// After calling the constructor the enumeration is already pointing to the first
        /// valid term if such a term exists.
        /// </summary>
        /// <param name="terms"> Delivers terms. </param>
        /// <param name="atts"> <see cref="AttributeSource"/> created by the rewrite method of <see cref="MultiTermQuery"/>
        /// thats contains information about competitive boosts during rewrite. It is also used
        /// to cache DFAs between segment transitions. </param>
        /// <param name="term"> Pattern term. </param>
        /// <param name="minSimilarity"> Minimum required similarity for terms from the reader. Pass an integer value
        ///        representing edit distance. Passing a fraction is deprecated. </param>
        /// <param name="prefixLength"> Length of required common prefix. Default value is 0. </param>
        /// <param name="transpositions"> Transpositions </param>
        /// <exception cref="IOException"> if there is a low-level IO error </exception>
        public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength, bool transpositions)
        {
            boostAtt = Attributes.AddAttribute <IBoostAttribute>();
            if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
            {
                throw new ArgumentException("fractional edit distances are not allowed");
            }
            if (minSimilarity < 0.0f)
            {
                throw new ArgumentOutOfRangeException(nameof(minSimilarity), "minimumSimilarity cannot be less than 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }
            if (prefixLength < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(prefixLength), "prefixLength cannot be less than 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
            }
            this.m_terms = terms;
            this.term    = term;

            // convert the string into a utf32 int[] representation for fast comparisons
            string utf16 = term.Text;

            this.m_termText = new int[utf16.CodePointCount(0, utf16.Length)];
            for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
            {
                m_termText[j++] = cp = utf16.CodePointAt(i);
            }
            this.m_termLength = m_termText.Length;
            this.dfaAtt       = atts.AddAttribute <ILevenshteinAutomataAttribute>();

            //The prefix could be longer than the word.
            //It's kind of silly though.  It means we must match the entire word.
            this.m_realPrefixLength = prefixLength > m_termLength ? m_termLength : prefixLength;
            // if minSimilarity >= 1, we treat it as number of edits
            if (minSimilarity >= 1f)
            {
                this.m_minSimilarity = 0; // just driven by number of edits
                m_maxEdits           = (int)minSimilarity;
                m_raw = true;
            }
            else
            {
                this.m_minSimilarity = minSimilarity;
                // calculate the maximum k edits for this similarity
                m_maxEdits = InitialMaxDistance(this.m_minSimilarity, m_termLength);
                m_raw      = false;
            }
            if (transpositions && m_maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
            {
                throw UnsupportedOperationException.Create("with transpositions enabled, distances > " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
            }
            this.transpositions = transpositions;
            this.m_scaleFactor  = 1.0f / (1.0f - this.m_minSimilarity);

            this.maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            bottom           = maxBoostAtt.MaxNonCompetitiveBoost;
            bottomTerm       = maxBoostAtt.CompetitiveTerm;
            BottomChanged(null, true);
        }
コード例 #2
0
            public AutomatonFuzzyTermsEnum(FuzzyTermsEnum outerInstance, TermsEnum tenum, CompiledAutomaton[] compiled)
                : base(tenum, false)
            {
                this.outerInstance = outerInstance;

                boostAtt      = Attributes.AddAttribute <IBoostAttribute>();
                this.matchers = new ByteRunAutomaton[compiled.Length];
                for (int i = 0; i < compiled.Length; i++)
                {
                    this.matchers[i] = compiled[i].RunAutomaton;
                }
                termRef = new BytesRef(outerInstance.term.Text);
            }
コード例 #3
0
ファイル: TopTermsRewrite.cs プロジェクト: segovia/lucenenet
            public override void SetNextEnum(TermsEnum termsEnum)
            {
                this.termsEnum = termsEnum;
                this.termComp  = termsEnum.Comparer;

                Debug.Assert(CompareToLastTerm(null));

                // lazy init the initial ScoreTerm because comparer is not known on ctor:
                if (st == null)
                {
                    st = new ScoreTerm(this.termComp, new TermContext(m_topReaderContext));
                }
                boostAtt = termsEnum.Attributes.AddAttribute <IBoostAttribute>();
            }
コード例 #4
0
            /// <summary>
            /// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
            /// length <c>prefixLength</c> with <c>term</c> and which have a fuzzy similarity &gt;
            /// <c>minSimilarity</c>.
            /// <para/>
            /// After calling the constructor the enumeration is already pointing to the first
            /// valid term if such a term exists.
            /// </summary>
            /// <exception cref="IOException">If there is a low-level I/O error.</exception>
            public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
                : base(outerInstance.m_terms.GetEnumerator())
            {
                this.outerInstance = outerInstance;
                this.boostAtt      = Attributes.AddAttribute <IBoostAttribute>();

                this.text = new int[outerInstance.m_termLength - outerInstance.m_realPrefixLength];
                System.Array.Copy(outerInstance.m_termText, outerInstance.m_realPrefixLength, text, 0, text.Length);
                string prefix = UnicodeUtil.NewString(outerInstance.m_termText, 0, outerInstance.m_realPrefixLength);

                prefixBytesRef = new BytesRef(prefix);
                this.d         = new int[this.text.Length + 1];
                this.p         = new int[this.text.Length + 1];


                SetInitialSeekTerm(prefixBytesRef);
            }
コード例 #5
0
        private void AddTerms(IndexReader reader, FieldVals f)
        {
            if (f.queryString is null)
            {
                return;
            }
            Terms terms = MultiFields.GetTerms(reader, f.fieldName);

            if (terms is null)
            {
                return;
            }
            TokenStream ts = analyzer.GetTokenStream(f.fieldName, f.queryString);

            try
            {
                ICharTermAttribute termAtt = ts.AddAttribute <ICharTermAttribute>();

                int           corpusNumDocs  = reader.NumDocs;
                ISet <string> processedTerms = new JCG.HashSet <string>();
                ts.Reset();
                while (ts.IncrementToken())
                {
                    string term = termAtt.ToString();
                    if (!processedTerms.Contains(term))
                    {
                        processedTerms.Add(term);
                        ScoreTermQueue  variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
                        float           minScore  = 0;
                        Term            startTerm = new Term(f.fieldName, term);
                        AttributeSource atts      = new AttributeSource();
                        IMaxNonCompetitiveBoostAttribute maxBoostAtt =
                            atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
#pragma warning disable 612, 618
                        SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
#pragma warning restore 612, 618
                        //store the df so all variants use same idf
                        int             df                   = reader.DocFreq(startTerm);
                        int             numVariants          = 0;
                        int             totalVariantDocFreqs = 0;
                        BytesRef        possibleMatch;
                        IBoostAttribute boostAtt =
                            fe.Attributes.AddAttribute <IBoostAttribute>();
                        while (fe.MoveNext())
                        {
                            possibleMatch = fe.Term;
                            numVariants++;
                            totalVariantDocFreqs += fe.DocFreq;
                            float score = boostAtt.Boost;
                            if (variantsQ.Count < MAX_VARIANTS_PER_TERM || score > minScore)
                            {
                                ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
                                variantsQ.InsertWithOverflow(st);
                                minScore = variantsQ.Top.Score; // maintain minScore
                            }
                            maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Count >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
                        }

                        if (numVariants > 0)
                        {
                            int avgDf = totalVariantDocFreqs / numVariants;
                            if (df == 0)    //no direct match we can use as df for all variants
                            {
                                df = avgDf; //use avg df of all variants
                            }

                            // take the top variants (scored by edit distance) and reset the score
                            // to include an IDF factor then add to the global queue for ranking
                            // overall top query terms
                            int size = variantsQ.Count;
                            for (int i = 0; i < size; i++)
                            {
                                ScoreTerm st = variantsQ.Pop();
                                st.Score = (st.Score * st.Score) * sim.Idf(df, corpusNumDocs);
                                q.InsertWithOverflow(st);
                            }
                        }
                    }
                }
                ts.End();
            }
            finally
            {
                IOUtils.DisposeWhileHandlingException(ts);
            }
        }
コード例 #6
0
        /// <summary>
        /// Provide spelling corrections based on several parameters.
        /// </summary>
        /// <param name="term"> The term to suggest spelling corrections for </param>
        /// <param name="numSug"> The maximum number of spelling corrections </param>
        /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
        /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
        /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
        /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
        /// <param name="spare"> a chars scratch </param>
        /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
        /// <exception cref="System.IO.IOException"> If I/O related errors occur </exception>
        protected internal virtual IEnumerable <ScoreTerm> SuggestSimilar(Term term, int numSug, IndexReader ir,
                                                                          int docfreq, int editDistance, float accuracy, CharsRef spare)
        {
            var atts = new AttributeSource();
            IMaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            Terms terms = MultiFields.GetTerms(ir, term.Field);

            if (terms == null)
            {
                return(new List <ScoreTerm>());
            }
            FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);

            var stQueue = new Support.PriorityQueue <ScoreTerm>();

            BytesRef        queryTerm = new BytesRef(term.Text());
            BytesRef        candidateTerm;
            ScoreTerm       st       = new ScoreTerm();
            IBoostAttribute boostAtt = e.Attributes.AddAttribute <IBoostAttribute>();

            while ((candidateTerm = e.Next()) != null)
            {
                float boost = boostAtt.Boost;
                // ignore uncompetitive hits
                if (stQueue.Count >= numSug && boost <= stQueue.Peek().Boost)
                {
                    continue;
                }

                // ignore exact match of the same term
                if (queryTerm.BytesEquals(candidateTerm))
                {
                    continue;
                }

                int df = e.DocFreq;

                // check docFreq if required
                if (df <= docfreq)
                {
                    continue;
                }

                float  score;
                string termAsString;
                if (distance == INTERNAL_LEVENSHTEIN)
                {
                    // delay creating strings until the end
                    termAsString = null;
                    // undo FuzzyTermsEnum's scale factor for a real scaled lev score
                    score = boost / e.ScaleFactor + e.MinSimilarity;
                }
                else
                {
                    UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
                    termAsString = spare.ToString();
                    score        = distance.GetDistance(term.Text(), termAsString);
                }

                if (score < accuracy)
                {
                    continue;
                }

                // add new entry in PQ
                st.Term         = BytesRef.DeepCopyOf(candidateTerm);
                st.Boost        = boost;
                st.Docfreq      = df;
                st.TermAsString = termAsString;
                st.Score        = score;
                stQueue.Offer(st);
                // possibly drop entries from queue
                st = (stQueue.Count > numSug) ? stQueue.Poll() : new ScoreTerm();
                maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Count >= numSug) ? stQueue.Peek().Boost : float.NegativeInfinity;
            }

            return(stQueue);
        }
コード例 #7
0
 /// <summary>
 /// Swap in a new actual enum to proxy to </summary>
 protected virtual void SetEnum(TermsEnum actualEnum)
 {
     this.actualEnum     = actualEnum;
     this.actualBoostAtt = actualEnum.Attributes.AddAttribute <IBoostAttribute>();
 }
コード例 #8
0
 public override void SetNextEnum(TermsEnum termsEnum)
 {
     this.termsEnum = termsEnum;
     this.boostAtt  = termsEnum.Attributes.AddAttribute <IBoostAttribute>();
 }
コード例 #9
0
 public TermRangeTermsEnumAnonymousInnerClassHelper(MultiTermQueryAnonymousInnerClassHelper outerInstance, TermsEnum iterator, BytesRef bref1, BytesRef bref2)
     : base(iterator, bref1, bref2, true, true)
 {
     this.OuterInstance = outerInstance;
     boostAtt = Attributes().AddAttribute<IBoostAttribute>();
 }