예제 #1
0
        /// <summary>
        /// Performs a fuzzy (in-order subsequence) match of <paramref name="pattern"/> against
        /// <paramref name="str_origin"/> and scores the best alignment found.
        /// </summary>
        /// <param name="pattern">Pattern to look for. It is compared character-by-character against the
        /// lower-cased source string, so it presumably should already be lower-case.</param>
        /// <param name="str_origin">String to search. Its original casing is only used for the camel-case bonus.</param>
        /// <param name="outScore">Set to -100000 on entry. When a non-empty pattern matches it receives the
        /// score of the best alignment; a higher score means a better match.</param>
        /// <param name="matches">Cleared on entry. On a successful match it receives the indices into
        /// <paramref name="str_origin"/> of the matched characters, in ascending order. May be null, in which
        /// case only the score is produced.</param>
        /// <returns>
        /// false when <paramref name="str_origin"/> is null/empty or the pattern cannot be matched;
        /// true when <paramref name="pattern"/> is null/empty (score left at -100000) or a match was found.
        /// </returns>
        public static bool FuzzyMatch(string pattern, string str_origin, ref long outScore, List<int> matches)
        {
            int str_n;
            int pattern_n;
            int str_start;

            string str;

            using (new ScopedProfiler("[FM] Init"))
            {
                outScore = -100000;
                matches?.Clear();

                if (string.IsNullOrEmpty(str_origin))
                {
                    return false;
                }

                if (string.IsNullOrEmpty(pattern))
                {
                    return true;
                }

                str       = str_origin.ToLowerInvariant();
                pattern_n = pattern.Length;

                // Find the window [str_start..str_end) bounded by the first occurrence of the
                // pattern's first letter and the last occurrence of the pattern's last letter.
                str_start = 0;
                var str_end = str.Length - 1;

                var pattern_first = pattern[0];
                var pattern_last  = pattern[pattern_n - 1];

                for (; str_start < str.Length; ++str_start)
                {
                    if (pattern_first == str[str_start])
                    {
                        break;
                    }
                }

                for (; str_end >= 0; --str_end)
                {
                    if (pattern_last == str[str_end])
                    {
                        break;
                    }
                }
                ++str_end; // make the end exclusive

                str_n = str_end - str_start;

                // The window is shorter than the pattern (also covers "letter not found").
                if (str_n < pattern_n)
                {
                    return false;
                }

                // Cheap rejection: the pattern must be an in-order subsequence of the window.
                var pattern_i = 0;
                var str_i     = str_start;
                while (pattern_i < pattern_n && str_i < str_end)
                {
                    if (pattern[pattern_i] == str[str_i])
                    {
                        ++pattern_i;
                    }
                    ++str_i;
                }

                if (pattern_i < pattern_n)
                {
                    return false;
                }
            }

            using (new ScopedProfiler("[FM] Body"))
                using (var d = FuzzyMatchData.Request(str_n, pattern_n))
                {
                    var str_n_minus_pattern_n_plus_1 = str_n - pattern_n + 1;

                    var prev_min_i = 0;

                    // Collect, for every pattern letter j, all window positions i where it matches.
                    using (new ScopedProfiler("[FM] Match loop"))
                    {
                        for (var j = 0; j < pattern_n; ++j)
                        {
                            var min_i       = str_n + 1;
                            var first_match = true;

                            for (int i = Math.Max(j, prev_min_i), end_i = str_n_minus_pattern_n_plus_1 + j; i < end_i; ++i)
                            {
                                // Skip existing <> tags. i is relative to the window, so the character
                                // under inspection is str[i + str_start] (the same one matched below).
                                if (str[i + str_start] == '<')
                                {
                                    while (i < end_i && str[i + str_start] != '>')
                                    {
                                        ++i;
                                    }

                                    // Unterminated tag: nothing left to examine for this pattern letter,
                                    // and reading further would run past the window.
                                    if (i >= end_i)
                                    {
                                        break;
                                    }
                                }

                                var si    = i + str_start;
                                var match = pattern[j] == str[si];

                                // Defensive: bail out if the scratch matrix is smaller than the window.
                                if (i >= d.matchData.GetLength(0) || j >= d.matchData.GetLength(1))
                                {
                                    return false;
                                }
                                d.matchData[i, j] = match;

                                if (match)
                                {
                                    if (first_match)
                                    {
                                        min_i       = i;
                                        first_match = false;
                                    }

                                    d.matches_indx[j].Add(new ScoreIndx {
                                        i = i, score = 1, prev_mi = -1
                                    });
                                }
                            }

                            if (first_match)
                            {
                                return false; // no match for pattern[j]
                            }
                            prev_min_i = min_i;
                        }
                    }

                    const int sequential_bonus           = 75;  // bonus for adjacent matches
                    const int separator_bonus            = 30;  // bonus if match occurs after a separator
                    const int camel_bonus                = 30;  // bonus if match is uppercase and prev is lower or symbol
                    const int first_letter_bonus         = 35;  // bonus if the first letter is matched
                    const int leading_letter_penalty     = -5;  // penalty applied for every letter in str before the first match
                    const int max_leading_letter_penalty = -15; // maximum penalty for leading letters
                    const int unmatched_letter_penalty   = -1;  // penalty for every letter that doesn't match

                    // NOTE(review): this used to read "str_n - matches.Count", but matches is always empty
                    // here (cleared in Init, filled only at the very end), so it has always been str_n.
                    // Kept numerically identical so existing scores do not shift; str_n - pattern_n may
                    // have been the intent - confirm before changing.
                    int unmatched = str_n;

                    // Score every candidate position of the first pattern letter.
                    using (new ScopedProfiler("[FM] Best score 0"))
                    {
                        for (var mi = 0; mi < d.matches_indx[0].Count; ++mi)
                        {
                            var i  = d.matches_indx[0][mi].i;
                            var si = str_start + i;
                            var s  = 100 + unmatched_letter_penalty * unmatched;

                            // Penalties are negative, so "<" clamps to the largest allowed penalty.
                            var penalty = leading_letter_penalty * si;
                            if (penalty < max_leading_letter_penalty)
                            {
                                penalty = max_leading_letter_penalty;
                            }

                            s += penalty;
                            if (si == 0)
                            {
                                s += first_letter_bonus;
                            }
                            else
                            {
                                var currOrigI = str_origin[si];
                                var prevOrigI = str_origin[si - 1];

                                if (char.IsUpper(currOrigI) && char.IsUpper(prevOrigI) == false)
                                {
                                    s += camel_bonus;
                                }
                                else if (prevOrigI == '_' || prevOrigI == ' ')
                                {
                                    s += separator_bonus;
                                }
                            }

                            d.matches_indx[0][mi] = new ScoreIndx
                            {
                                i       = i,
                                score   = s,
                                prev_mi = -1
                            };
                        }
                    }

                    // For each later pattern letter, chain every candidate to its best-scoring predecessor.
                    using (new ScopedProfiler("[FM] Best score 1..pattern_n"))
                    {
                        for (var j = 1; j < pattern_n; ++j)
                        {
                            for (var mi = 0; mi < d.matches_indx[j].Count; ++mi)
                            {
                                var match = d.matches_indx[j][mi];

                                // match.i >= j >= 1 here, so si - 1 is always a valid index.
                                var si = str_start + match.i;

                                var currOrigI = str_origin[si];
                                var prevOrigI = str_origin[si - 1];

                                if (char.IsUpper(currOrigI) && char.IsUpper(prevOrigI) == false)
                                {
                                    match.score += camel_bonus;
                                }
                                else if (prevOrigI == '_' || prevOrigI == ' ')
                                {
                                    match.score += separator_bonus;
                                }

                                // Select the best predecessor among the candidates of pattern[j - 1].
                                // NOTE(review): with the -1 seed, predecessors scoring below -1 are never
                                // selected; best_pmi then stays 0 and -1 is added instead. Left unchanged.
                                var best_pmi   = 0;
                                var best_score = -1;
                                for (var pmi = 0; pmi < d.matches_indx[j - 1].Count; ++pmi)
                                {
                                    var prev_i = d.matches_indx[j - 1][pmi].i;
                                    if (prev_i >= match.i)
                                    {
                                        break; // candidates are stored in ascending i; the rest cannot precede
                                    }

                                    var pmi_score = d.matches_indx[j - 1][pmi].score;

                                    if (prev_i == match.i - 1)
                                    {
                                        pmi_score += sequential_bonus;
                                    }

                                    if (best_score < pmi_score)
                                    {
                                        best_score = pmi_score;
                                        best_pmi   = pmi;
                                    }
                                }

                                match.score  += best_score;
                                match.prev_mi = best_pmi;

                                d.matches_indx[j][mi] = match;
                            }
                        }
                    }

                    // Pick the best-scoring candidate of the last pattern letter (first one wins ties).
                    var best_mi = 0;
                    var max_j   = pattern_n - 1;
                    for (var mi = 1; mi < d.matches_indx[max_j].Count; ++mi)
                    {
                        if (d.matches_indx[max_j][best_mi].score < d.matches_indx[max_j][mi].score)
                        {
                            best_mi = mi;
                        }
                    }

                    var bestScore = d.matches_indx[max_j][best_mi];

                    outScore = bestScore.score;
                    if (matches != null)
                    {
                        using (new ScopedProfiler("[FM] Matches calc"))
                        {
                            // Walk the prev_mi chain backwards from the last letter, then reverse.
                            matches.Capacity = pattern_n;
                            matches.Add(bestScore.i + str_start);

                            var mi = bestScore.prev_mi;
                            for (var j = pattern_n - 2; j >= 0; --j)
                            {
                                matches.Add(d.matches_indx[j][mi].i + str_start);
                                mi = d.matches_indx[j][mi].prev_mi;
                            }

                            matches.Reverse();
                        }
                    }

                    return true;
                }
        }
예제 #2
0
        /// <summary>
        /// Performs a fuzzy search on a string to see if it matches a pattern.
        /// </summary>
        /// <param name="pattern">Pattern that we try to match against the source string. It is compared
        /// character-by-character against the lower-cased source, so it presumably should already be lower-case.</param>
        /// <param name="origin">String we are looking into for a match. Its original casing is only used
        /// for the camel-case bonus.</param>
        /// <param name="outScore">Score of the match. Set to -100000 on entry and overwritten only when a
        /// non-empty pattern matches. A higher score means the pattern is a better match for the string.</param>
        /// <param name="matches">Optional list, cleared on entry, that receives the indices in the source
        /// string where the pattern letters were matched, in ascending order.</param>
        /// <returns>
        /// Returns true if a match was found or if <paramref name="pattern"/> is null/empty;
        /// false if <paramref name="origin"/> is null/empty or the pattern cannot be matched.
        /// </returns>
        public static bool FuzzyMatch(string pattern, string origin, ref long outScore, List<int> matches = null)
        {
            int str_n;
            int pattern_n;
            int str_start;

            string str;

            using (new ScopedProfiler("[FM] Init"))
            {
                outScore = -100000;
                matches?.Clear();

                if (string.IsNullOrEmpty(origin))
                {
                    return false;
                }

                if (string.IsNullOrEmpty(pattern))
                {
                    return true;
                }

                str       = origin.ToLowerInvariant();
                pattern_n = pattern.Length;

                // Find the window [str_start..str_end) bounded by the first occurrence of the
                // pattern's first letter and the last occurrence of the pattern's last letter.
                str_start = 0;
                var str_end = str.Length - 1;

                var pattern_first = pattern[0];
                var pattern_last  = pattern[pattern_n - 1];

                for (; str_start < str.Length; ++str_start)
                {
                    if (pattern_first == str[str_start])
                    {
                        break;
                    }
                }

                for (; str_end >= 0; --str_end)
                {
                    if (pattern_last == str[str_end])
                    {
                        break;
                    }
                }
                ++str_end; // make the end exclusive

                str_n = str_end - str_start;

                // The window is shorter than the pattern (also covers "letter not found").
                if (str_n < pattern_n)
                {
                    return false;
                }

                // Cheap rejection: the pattern must be an in-order subsequence of the window.
                var pattern_i = 0;
                var str_i     = str_start;
                while (pattern_i < pattern_n && str_i < str_end)
                {
                    if (pattern[pattern_i] == str[str_i])
                    {
                        ++pattern_i;
                    }
                    ++str_i;
                }

                if (pattern_i < pattern_n)
                {
                    return false;
                }
            }

            using (new ScopedProfiler("[FM] Body"))
                using (var d = FuzzyMatchData.Request(str_n, pattern_n))
                {
                    var str_n_minus_pattern_n_plus_1 = str_n - pattern_n + 1;

                    var prev_min_i = 0;

                    // Collect, for every pattern letter j, all window positions i where it matches.
                    using (new ScopedProfiler("[FM] Match loop"))
                    {
                        for (var j = 0; j < pattern_n; ++j)
                        {
                            var min_i       = str_n + 1;
                            var first_match = true;

                            for (int i = Math.Max(j, prev_min_i), end_i = str_n_minus_pattern_n_plus_1 + j; i < end_i; ++i)
                            {
                                // Skip existing <> tags. i is relative to the window, so the character
                                // under inspection is str[i + str_start] (the same one matched below).
                                if (str[i + str_start] == '<')
                                {
                                    while (i < end_i && str[i + str_start] != '>')
                                    {
                                        ++i;
                                    }

                                    // Unterminated tag: nothing left to examine for this pattern letter,
                                    // and reading further would run past the window.
                                    if (i >= end_i)
                                    {
                                        break;
                                    }
                                }

                                var si    = i + str_start;
                                var match = pattern[j] == str[si];

                                // Defensive: bail out if the scratch matrix is smaller than the window.
                                if (i >= d.matchData.GetLength(0) || j >= d.matchData.GetLength(1))
                                {
                                    return false;
                                }
                                d.matchData[i, j] = match;

                                if (match)
                                {
                                    if (first_match)
                                    {
                                        min_i       = i;
                                        first_match = false;
                                    }

                                    d.matches_indx[j].Add(new ScoreIndx {
                                        i = i, score = 1, prev_mi = -1
                                    });
                                }
                            }

                            if (first_match)
                            {
                                return false; // no match for pattern[j]
                            }
                            prev_min_i = min_i;
                        }
                    }

                    const int sequential_bonus           = 75;  // bonus for adjacent matches
                    const int separator_bonus            = 30;  // bonus if match occurs after a separator
                    const int camel_bonus                = 30;  // bonus if match is uppercase and prev is lower or symbol
                    const int first_letter_bonus         = 35;  // bonus if the first letter is matched
                    const int leading_letter_penalty     = -5;  // penalty applied for every letter in str before the first match
                    const int max_leading_letter_penalty = -15; // maximum penalty for leading letters
                    const int unmatched_letter_penalty   = -1;  // penalty for every letter that doesn't match

                    // NOTE(review): this used to read "str_n - (matches?.Count ?? 0)", but matches is either
                    // null or empty here (cleared in Init, filled only at the very end), so it has always
                    // been str_n. Kept numerically identical so existing scores do not shift;
                    // str_n - pattern_n may have been the intent - confirm before changing.
                    int unmatched = str_n;

                    // Score every candidate position of the first pattern letter.
                    using (new ScopedProfiler("[FM] Best score 0"))
                    {
                        for (var mi = 0; mi < d.matches_indx[0].Count; ++mi)
                        {
                            var i  = d.matches_indx[0][mi].i;
                            var si = str_start + i;
                            var s  = 100 + unmatched_letter_penalty * unmatched;

                            // Penalties are negative, so "<" clamps to the largest allowed penalty.
                            var penalty = leading_letter_penalty * si;
                            if (penalty < max_leading_letter_penalty)
                            {
                                penalty = max_leading_letter_penalty;
                            }

                            s += penalty;
                            if (si == 0)
                            {
                                s += first_letter_bonus;
                            }
                            else
                            {
                                var currOrigI = origin[si];
                                var prevOrigI = origin[si - 1];

                                if (char.IsUpper(currOrigI) && char.IsUpper(prevOrigI) == false)
                                {
                                    s += camel_bonus;
                                }
                                else if (prevOrigI == '_' || prevOrigI == ' ')
                                {
                                    s += separator_bonus;
                                }
                            }

                            d.matches_indx[0][mi] = new ScoreIndx
                            {
                                i       = i,
                                score   = s,
                                prev_mi = -1
                            };
                        }
                    }

                    // For each later pattern letter, chain every candidate to its best-scoring predecessor.
                    using (new ScopedProfiler("[FM] Best score 1..pattern_n"))
                    {
                        for (var j = 1; j < pattern_n; ++j)
                        {
                            for (var mi = 0; mi < d.matches_indx[j].Count; ++mi)
                            {
                                var match = d.matches_indx[j][mi];

                                // match.i >= j >= 1 here, so si - 1 is always a valid index.
                                var si = str_start + match.i;

                                var currOrigI = origin[si];
                                var prevOrigI = origin[si - 1];

                                if (char.IsUpper(currOrigI) && char.IsUpper(prevOrigI) == false)
                                {
                                    match.score += camel_bonus;
                                }
                                else if (prevOrigI == '_' || prevOrigI == ' ')
                                {
                                    match.score += separator_bonus;
                                }

                                // Select the best predecessor among the candidates of pattern[j - 1].
                                // NOTE(review): with the -1 seed, predecessors scoring below -1 are never
                                // selected; best_pmi then stays 0 and -1 is added instead. Left unchanged.
                                var best_pmi   = 0;
                                var best_score = -1;
                                for (var pmi = 0; pmi < d.matches_indx[j - 1].Count; ++pmi)
                                {
                                    var prev_i = d.matches_indx[j - 1][pmi].i;
                                    if (prev_i >= match.i)
                                    {
                                        break; // candidates are stored in ascending i; the rest cannot precede
                                    }

                                    var pmi_score = d.matches_indx[j - 1][pmi].score;

                                    if (prev_i == match.i - 1)
                                    {
                                        pmi_score += sequential_bonus;
                                    }

                                    if (best_score < pmi_score)
                                    {
                                        best_score = pmi_score;
                                        best_pmi   = pmi;
                                    }
                                }

                                match.score  += best_score;
                                match.prev_mi = best_pmi;

                                d.matches_indx[j][mi] = match;
                            }
                        }
                    }

                    // Pick the best-scoring candidate of the last pattern letter (first one wins ties).
                    var best_mi = 0;
                    var max_j   = pattern_n - 1;
                    for (var mi = 1; mi < d.matches_indx[max_j].Count; ++mi)
                    {
                        if (d.matches_indx[max_j][best_mi].score < d.matches_indx[max_j][mi].score)
                        {
                            best_mi = mi;
                        }
                    }

                    var bestScore = d.matches_indx[max_j][best_mi];

                    outScore = bestScore.score;
                    if (matches != null)
                    {
                        using (new ScopedProfiler("[FM] Matches calc"))
                        {
                            // Walk the prev_mi chain backwards from the last letter, then reverse.
                            matches.Capacity = pattern_n;
                            matches.Add(bestScore.i + str_start);

                            var mi = bestScore.prev_mi;
                            for (var j = pattern_n - 2; j >= 0; --j)
                            {
                                matches.Add(d.matches_indx[j][mi].i + str_start);
                                mi = d.matches_indx[j][mi].prev_mi;
                            }

                            matches.Reverse();
                        }
                    }

                    return true;
                }
        }