Ejemplo n.º 1
0
        // Exercises the different ways of building an AhoCorasick instance that are
        // used below: from a word list, from a comparer plus a word list, empty and
        // filled through Add/BuildFail, and from a comparer plus inline words.
        public void OverloadsTest()
        {
            // 1) Word list only.
            var fromList = new AhoCorasick(new List<string> { "a" });

            CollectionAssert.AreEqual(
                new WordMatchList { { 0, "a" } },
                fromList.Search("a").ToList());
            Assert.AreEqual(0, fromList.Search("b").Count());

            // 2) Case-insensitive comparer + word list. The text mixes upper and lower
            //    case; the expected words are spelled the way they were registered.
            var dictionary = new List<string> { "a", "ab", "bab", "bC", "bca", "c", "caa" };
            var ignoreCase = new AhoCorasick(CharComparer.OrdinalIgnoreCase, dictionary);

            var ignoreCaseHits     = ignoreCase.Search("abCcab").ToList();
            var ignoreCaseExpected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bC" },
                { 2, "c" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(ignoreCaseExpected, ignoreCaseHits);

            // 3) Parameterless constructor: words are added one by one and the failure
            //    links are built explicitly before searching.
            var incremental = new AhoCorasick();
            incremental.Add("a");
            incremental.BuildFail();

            CollectionAssert.AreEqual(
                new WordMatchList { { 0, "a" } },
                incremental.Search("a").ToList());
            Assert.AreEqual(0, incremental.Search("b").Count());

            // 4) Comparer created from a culture + words passed inline.
            var invariant = new AhoCorasick(
                CharComparer.Create(CultureInfo.InvariantCulture, true),
                "a", "ab", "bab", "bc", "bca", "c", "caa");

            var invariantHits     = invariant.Search("abccab").ToList();
            var invariantExpected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bc" },
                { 2, "c" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(invariantExpected, invariantHits);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Registers every rule under a keyword derived from its longest wildcard-free
        /// part and returns an Aho–Corasick tree built from those keywords.
        /// (https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm)
        /// </summary>
        /// <remarks>
        /// The tree itself is only fed fixed keywords here; wildcards ('*') are kept
        /// out of it. For a rule containing a wildcard, a regex-style pattern is stored
        /// on its <see cref="Expression"/> so the wildcard can be resolved later; a rule
        /// without wildcard is flagged as exact and gets no pattern.
        /// </remarks>
        /// <param name="rules">Raw rules; '*' is treated as the wildcard character.</param>
        /// <param name="groupByLonguestCommonExpression">
        /// Filled by this method: keyword -> expressions registered under that keyword.
        /// Keys that are already present on entry are also added to the returned tree.
        /// </param>
        /// <returns>The tree containing every key of the dictionary, with BuildFail already called.</returns>
        /// <exception cref="ArgumentNullException">An argument is null.</exception>
        /// <exception cref="InvalidOperationException">
        /// A rule has no fixed part at all once normalised (it is empty or made only of '*').
        /// </exception>
        public static AhoCorasick ComputeRules(
            List<string> rules,
            Dictionary<string, List<Expression>> groupByLonguestCommonExpression)
        {
            if (rules == null)
            {
                throw new ArgumentNullException(nameof(rules));
            }

            if (groupByLonguestCommonExpression == null)
            {
                throw new ArgumentNullException(nameof(groupByLonguestCommonExpression));
            }

            int nextId = 0;

            foreach (var rule in rules)
            {
                // NOTE(review): ToLower() is culture-sensitive. It is kept as-is because
                // the text that gets searched has to be normalised the same way; confirm
                // that side before switching to ToLowerInvariant().
                var pattern = RemoveAccents(rule).ToLower();

                // Keep the longest piece between wildcards. First() throws when the
                // rule has no such piece (see <exception> above).
                var longestFixedPart = pattern
                    .Split(new[] { '*' }, StringSplitOptions.RemoveEmptyEntries)
                    .OrderByDescending(part => part.Length)
                    .First();

                // Re-join whatever regexKeepWordOnly matches with single spaces.
                var keyword = string.Join(
                    ' ',
                    regexKeepWordOnly.Matches(longestFixedPart).Select(m => m.Value));

                // Exact expression is faster: no wildcard means nothing to resolve later.
                var isExact = !pattern.Contains("*");

                if (isExact)
                {
                    // Exact word or group of words: surround the keyword with spaces
                    // (the original comment says this is "to prevent matching").
                    // A wildcard rule only contributes a part of the expression, so
                    // its keyword is left without the extra spaces.
                    keyword = " " + keyword + " ";
                }

                var expr = new Expression
                {
                    Id   = nextId++,
                    Expr = rule,
                    IsExactExpression = isExact
                };

                if (!isExact)
                {
                    // Prepare the future Regex: each '*' becomes "one or more letters".
                    // Only wildcard rules need it, so it is not computed for exact ones.
                    var regexParts = regexKeepWordOnlyAndSpecialCharacters
                        .Matches(pattern)
                        .Select(m => m.Value.Replace("*", "[\\p{L}]+"));

                    expr.Pattern = string.Join(' ', regexParts);
                }

                // Single lookup instead of ContainsKey + indexer set + indexer get.
                if (!groupByLonguestCommonExpression.TryGetValue(keyword, out var group))
                {
                    group = new List<Expression>();
                    groupByLonguestCommonExpression[keyword] = group;
                }

                group.Add(expr);
            }

            // The tree reports which keyword was hit; the dictionary then tells the
            // caller which expressions (and therefore which regex, if any) to evaluate.
            var tree = new AhoCorasick();

            tree.Add(groupByLonguestCommonExpression.Keys.ToList());
            tree.BuildFail();
            return tree;
        }