/// <summary>
/// Exercises the different ways of building an <c>AhoCorasick</c> instance
/// (word list, comparer + word list, empty constructor followed by
/// <c>Add</c>/<c>BuildFail</c>, comparer + inline words) and checks the
/// matches returned by <c>Search</c> for each of them.
/// </summary>
public void OverloadsTest()
{
    // 1) Built from a list of words only.
    var fromList = new AhoCorasick(new List<string> { "a" });
    var singleHit = new WordMatchList
    {
        { 0, "a" },
    };
    CollectionAssert.AreEqual(singleHit, fromList.Search("a").ToList());
    Assert.AreEqual(0, fromList.Search("b").Count());

    // 2) Built from a comparer plus a list of words; the input text and the
    //    keyword "bC" deliberately mix upper and lower case.
    var ignoreCaseWords = new List<string> { "a", "ab", "bab", "bC", "bca", "c", "caa" };
    var fromComparerAndList = new AhoCorasick(CharComparer.OrdinalIgnoreCase, ignoreCaseWords);
    var ignoreCaseHits = fromComparerAndList.Search("abCcab").ToList();
    var ignoreCaseExpected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bC" },
        { 2, "c" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" },
    };
    CollectionAssert.AreEqual(ignoreCaseExpected, ignoreCaseHits);

    // 3) Built empty, then populated by hand and finalized with BuildFail().
    var builtManually = new AhoCorasick();
    builtManually.Add("a");
    builtManually.BuildFail();
    var manualExpected = new WordMatchList
    {
        { 0, "a" },
    };
    CollectionAssert.AreEqual(manualExpected, builtManually.Search("a").ToList());
    Assert.AreEqual(0, builtManually.Search("b").Count());

    // 4) Built from a culture-based comparer plus words passed inline.
    var invariantComparer = CharComparer.Create(CultureInfo.InvariantCulture, true);
    var fromComparerAndWords = new AhoCorasick(invariantComparer, "a", "ab", "bab", "bc", "bca", "c", "caa");
    var invariantHits = fromComparerAndWords.Search("abccab").ToList();
    var invariantExpected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bc" },
        { 2, "c" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" },
    };
    CollectionAssert.AreEqual(invariantExpected, invariantHits);
}
/// <summary>
/// Indexes the given rules by their longest wildcard-free fragment and builds
/// an Aho–Corasick automaton over those fragments.
/// </summary>
/// <remarks>
/// Aho–Corasick (https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm)
/// does not support wildcards, so each rule is represented in the automaton by
/// its longest fixed fragment only. A hit on a fragment then tells the caller
/// which rules (and, for wildcard rules, which regex patterns) are worth
/// evaluating; exact expressions need no extra processing.
/// </remarks>
/// <param name="rules">
/// Rule expressions. A <c>*</c> inside a rule is treated as a wildcard.
/// </param>
/// <param name="groupByLonguestCommonExpression">
/// Filled in place: maps each keyword (the normalized longest fixed fragment,
/// padded with one space on each side for exact expressions) to the rules
/// sharing it. Entries already present in the dictionary are kept and their
/// keys are also added to the automaton.
/// </param>
/// <returns>The automaton built from every key of the dictionary.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="rules"/> or
/// <paramref name="groupByLonguestCommonExpression"/> is <c>null</c>.
/// </exception>
/// <exception cref="InvalidOperationException">
/// A rule has no fixed fragment at all (it is empty or made only of <c>*</c>).
/// </exception>
public static AhoCorasick ComputeRules(
    List<string> rules,
    Dictionary<string, List<Expression>> groupByLonguestCommonExpression)
{
    if (rules == null)
    {
        throw new ArgumentNullException(nameof(rules));
    }

    if (groupByLonguestCommonExpression == null)
    {
        throw new ArgumentNullException(nameof(groupByLonguestCommonExpression));
    }

    // Hoisted out of the loop: the separator array never changes.
    char[] wildcardSeparator = { '*' };
    int nextId = 0;

    foreach (var rule in rules)
    {
        // NOTE(review): ToLower() is culture-sensitive. It is kept as-is because
        // the text being searched is presumably normalized the same way elsewhere;
        // confirm before switching both sides to ToLowerInvariant().
        var pattern = RemoveAccents(rule).ToLower();

        // Keep the longest fixed (wildcard-free) fragment of the rule.
        var longestFragment = pattern
            .Split(wildcardSeparator, StringSplitOptions.RemoveEmptyEntries)
            .OrderByDescending(fragment => fragment.Length)
            .First();

        var keyword = string.Join(' ', regexKeepWordOnly.Matches(longestFragment).Select(m => m.Value));

        // Exact expressions are faster: a keyword hit is enough, no regex needed.
        var exactExpression = !pattern.Contains("*");
        if (exactExpression)
        {
            // Exact word or group of words: surround with spaces so the keyword
            // does not match inside a longer word.
            keyword = " " + keyword + " ";
        }

        // Single lookup instead of ContainsKey followed by two indexer accesses.
        if (!groupByLonguestCommonExpression.TryGetValue(keyword, out var expressions))
        {
            expressions = new List<Expression>();
            groupByLonguestCommonExpression[keyword] = expressions;
        }

        var expr = new Expression
        {
            Id = nextId++,
            Expr = rule,
            IsExactExpression = exactExpression
        };

        if (!exactExpression)
        {
            // The regex pattern is only needed to resolve the wildcard, so it is
            // only built for non-exact expressions: every '*' becomes
            // "one or more letters".
            var regexParts = regexKeepWordOnlyAndSpecialCharacters
                .Matches(pattern)
                .Select(m => m.Value.Replace("*", "[\\p{L}]+"));
            expr.Pattern = string.Join(' ', regexParts);
        }

        expressions.Add(expr);
    }

    var keywords = groupByLonguestCommonExpression.Keys.ToList();

    AhoCorasick treeAhoCorasick = new AhoCorasick();
    treeAhoCorasick.Add(keywords);
    treeAhoCorasick.BuildFail();
    return treeAhoCorasick;
}