Example #1
0
        /// <summary>
        /// Constructor for enumeration of all terms from specified <c>reader</c> which share a prefix of
        /// length <paramref name="prefixLength"/> with <paramref name="term"/> and which have a fuzzy similarity &gt;
        /// <paramref name="minSimilarity"/>.
        /// <para/>
        /// After calling the constructor the enumeration is already pointing to the first
        /// valid term if such a term exists.
        /// </summary>
        /// <param name="terms"> Delivers terms. </param>
        /// <param name="atts"> <see cref="AttributeSource"/> created by the rewrite method of <see cref="MultiTermQuery"/>
        /// thats contains information about competitive boosts during rewrite. It is also used
        /// to cache DFAs between segment transitions. </param>
        /// <param name="term"> Pattern term. </param>
        /// <param name="minSimilarity"> Minimum required similarity for terms from the reader. Pass an integer value
        ///        representing edit distance. Passing a fraction is deprecated. </param>
        /// <param name="prefixLength"> Length of required common prefix. Default value is 0. </param>
        /// <param name="transpositions"> Transpositions </param>
        /// <exception cref="IOException"> if there is a low-level IO error </exception>
        public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength, bool transpositions)
        {
            InitializeInstanceFields();
            if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
            {
                throw new ArgumentException("fractional edit distances are not allowed");
            }
            if (minSimilarity < 0.0f)
            {
                throw new ArgumentException("minimumSimilarity cannot be less than 0");
            }
            if (prefixLength < 0)
            {
                throw new ArgumentException("prefixLength cannot be less than 0");
            }
            this.m_terms = terms;
            this.term    = term;

            // convert the string into a utf32 int[] representation for fast comparisons
            string utf16 = term.Text();

            this.m_termText = new int[utf16.CodePointCount(0, utf16.Length)];
            for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
            {
                m_termText[j++] = cp = utf16.CodePointAt(i);
            }
            this.m_termLength = m_termText.Length;
            this.dfaAtt       = atts.AddAttribute <ILevenshteinAutomataAttribute>();

            //The prefix could be longer than the word.
            //It's kind of silly though.  It means we must match the entire word.
            this.m_realPrefixLength = prefixLength > m_termLength ? m_termLength : prefixLength;
            // if minSimilarity >= 1, we treat it as number of edits
            if (minSimilarity >= 1f)
            {
                this.m_minSimilarity = 0; // just driven by number of edits
                m_maxEdits           = (int)minSimilarity;
                m_raw = true;
            }
            else
            {
                this.m_minSimilarity = minSimilarity;
                // calculate the maximum k edits for this similarity
                m_maxEdits = InitialMaxDistance(this.m_minSimilarity, m_termLength);
                m_raw      = false;
            }
            if (transpositions && m_maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
            {
                throw new NotSupportedException("with transpositions enabled, distances > " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
            }
            this.transpositions = transpositions;
            this.m_scaleFactor  = 1.0f / (1.0f - this.m_minSimilarity);

            this.maxBoostAtt = atts.AddAttribute <IMaxNonCompetitiveBoostAttribute>();
            bottom           = maxBoostAtt.MaxNonCompetitiveBoost;
            bottomTerm       = maxBoostAtt.CompetitiveTerm;
            BottomChanged(null, true);
        }
Example #2
0
        /// <summary>
        /// Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
        /// length <code>prefixLength</code> with <code>term</code> and which have a fuzzy similarity &gt;
        /// <code>minSimilarity</code>.
        /// <p>
        /// After calling the constructor the enumeration is already pointing to the first
        /// valid term if such a term exists.
        /// </summary>
        /// <param name="terms"> Delivers terms. </param>
        /// <param name="atts"> <seealso cref="AttributeSource"/> created by the rewrite method of <seealso cref="MultiTermQuery"/>
        /// thats contains information about competitive boosts during rewrite. It is also used
        /// to cache DFAs between segment transitions. </param>
        /// <param name="term"> Pattern term. </param>
        /// <param name="minSimilarity"> Minimum required similarity for terms from the reader. Pass an integer value
        ///        representing edit distance. Passing a fraction is deprecated. </param>
        /// <param name="prefixLength"> Length of required common prefix. Default value is 0. </param>
        /// <exception cref="IOException"> if there is a low-level IO error </exception>
        public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, float minSimilarity, int prefixLength, bool transpositions)
        {
            if (!InstanceFieldsInitialized)
            {
                InitializeInstanceFields();
                InstanceFieldsInitialized = true;
            }
            if (minSimilarity >= 1.0f && minSimilarity != (int)minSimilarity)
            {
                throw new System.ArgumentException("fractional edit distances are not allowed");
            }
            if (minSimilarity < 0.0f)
            {
                throw new System.ArgumentException("minimumSimilarity cannot be less than 0");
            }
            if (prefixLength < 0)
            {
                throw new System.ArgumentException("prefixLength cannot be less than 0");
            }
            this.Terms = terms;
            this.Term_Renamed = term;

            // convert the string into a utf32 int[] representation for fast comparisons
            string utf16 = term.Text();
            //LUCENE TO-DO
            //this.TermText = new int[utf16.codePointCount(0, utf16.Length)];
            this.TermText = new int[utf16.Length];
            for (int cp, i = 0, j = 0; i < utf16.Length; i += Character.CharCount(cp))
            {
                TermText[j++] = cp = Character.CodePointAt(utf16, i);
            }
            this.TermLength = TermText.Length;
            this.DfaAtt = atts.AddAttribute<ILevenshteinAutomataAttribute>();

            //The prefix could be longer than the word.
            //It's kind of silly though.  It means we must match the entire word.
            this.RealPrefixLength = prefixLength > TermLength ? TermLength : prefixLength;
            // if minSimilarity >= 1, we treat it as number of edits
            if (minSimilarity >= 1f)
            {
                this.MinSimilarity_Renamed = 0; // just driven by number of edits
                MaxEdits = (int)minSimilarity;
                Raw = true;
            }
            else
            {
                this.MinSimilarity_Renamed = minSimilarity;
                // calculate the maximum k edits for this similarity
                MaxEdits = InitialMaxDistance(this.MinSimilarity_Renamed, TermLength);
                Raw = false;
            }
            if (transpositions && MaxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
            {
                throw new System.NotSupportedException("with transpositions enabled, distances > " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " are not supported ");
            }
            this.Transpositions = transpositions;
            this.Scale_factor = 1.0f / (1.0f - this.MinSimilarity_Renamed);

            this.MaxBoostAtt = atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
            Bottom = MaxBoostAtt.MaxNonCompetitiveBoost;
            BottomTerm = MaxBoostAtt.CompetitiveTerm;
            BottomChanged(null, true);
        }