/// <summary>
/// Builds an automaton from a dictionary file, then reports how long the build
/// and a full parse of a text file take.
/// </summary>
/// <param name="dictionaryPath">Path handed to <c>loadDictionary</c>.</param>
/// <param name="textPath">Path handed to <c>loadText</c>.</param>
private void runTest(String dictionaryPath, String textPath)
{
    HashSet<String> dictionary = loadDictionary(dictionaryPath);
    String text = loadText(textPath);

    var ahoCorasickDoubleArrayTrie = new AhoCorasickDoubleArrayTrie<String>();
    var dictionaryMap = new Dictionary<String, String>();
    foreach (String word in dictionary)
    {
        dictionaryMap[word] = word; // we use the same text as the property of a word
    }

    // The stopwatch has to be running while Build executes; a stopwatch that is
    // only constructed (never started) always reports 0 ms.
    var swBuild = Stopwatch.StartNew();
    ahoCorasickDoubleArrayTrie.Build(dictionaryMap, true);
    swBuild.Stop();
    WriteLine("Automata build time: {0}ms.\n", swBuild.ElapsedMilliseconds);

    // Measure the parsing speed of the automaton
    WriteLine("Parsing document which contains {0} characters, with a dictionary of {1} words.\n", text.Length, dictionary.Count);
    var sw = Stopwatch.StartNew();
    int hitCount = 0;
    ahoCorasickDoubleArrayTrie.ParseText(text, (hit) => { hitCount++; });
    sw.Stop();

    Assert.True(hitCount > 0);

    // TotalSeconds keeps sub-millisecond precision, so a very fast parse does not
    // divide by a zero millisecond count.
    WriteLine("{0}ms, speed {1:0.##} char/s", sw.ElapsedMilliseconds, text.Length / sw.Elapsed.TotalSeconds);
}
/// <summary>
/// Parses the same text twice: once with a callback that returns <c>false</c>
/// and once with a callback that returns <c>true</c>. The test expects the first
/// callback to be invoked exactly once and the second to be invoked for both keywords.
/// </summary>
public void testCancellation()
{
    // Two keywords, each mapped to itself
    var keywordMap = new Dictionary<String, String>();
    keywordMap.Add("foo", "foo");
    keywordMap.Add("bar", "bar");

    // Build the automaton from the keyword map
    var trie = new AhoCorasickDoubleArrayTrie<String>();
    trie.Build(keywordMap);

    String haystack = "sfwtfoowercwbarqwrcq";
    int hitsSeenByCounter = 0;
    int hitsSeenByCanceller = 0;

    // Returns false on every hit
    Func<AhoCorasickDoubleArrayTrie<string>.Hit, bool> cancellingMatcher = hit =>
    {
        hitsSeenByCanceller++;
        return false;
    };

    // Returns true on every hit
    Func<AhoCorasickDoubleArrayTrie<string>.Hit, bool> countingMatcher = hit =>
    {
        hitsSeenByCounter++;
        return true;
    };

    trie.ParseText(haystack, cancellingMatcher);
    trie.ParseText(haystack, countingMatcher);

    Assert.Equal(1, hitsSeenByCanceller);
    Assert.Equal(2, hitsSeenByCounter);
}
/// <summary>
/// Saves a small automaton to a memory stream, loads it into a fresh instance and
/// checks that the copy has the same count, the same value for "his" and the same
/// matches on a sample text. Then saves an automaton built from the large
/// dictionary file and reports the serialized size.
/// </summary>
public void testSaveLoad()
{
    var acdat = buildASimpleAhoCorasickDoubleArrayTrie("hers", "his", "she", "he");

    // Dispose the stream deterministically once the round trip has been verified.
    using (var memStream = new MemoryStream())
    {
        acdat.Save(memStream, true);
        WriteLine($"4 keywords, saved {memStream.Length} bytes");

        var acdat2 = new AhoCorasickDoubleArrayTrie<string>();
        memStream.Position = 0; // rewind so Load reads from the beginning
        acdat2.Load(memStream);

        Assert.Equal(acdat.Count, acdat2.Count);
        Assert.Equal("his", acdat2["his"]);
        validateASimpleAhoCorasickDoubleArrayTrie(acdat2, "uhers", new[] { "he", "hers" });
    }

    // large dictionary
    var dictionary = loadDictionary("dictionary.txt");
    var keywords = dictionary.Select(k => new KeyValuePair<string, string>(k, k));
    var acdat3 = new AhoCorasickDoubleArrayTrie<string>(keywords);

    using (var memStream2 = new MemoryStream())
    {
        // NOTE(review): the second argument presumably controls whether values are
        // written (the message below says "without values") - confirm against Save.
        acdat3.Save(memStream2, false);
        WriteLine($"{dictionary.Count} keywords, saved {memStream2.Length} bytes (without values)");
    }
}
/// <summary>
/// Parses <paramref name="text"/> with the given automaton in two ways: through a
/// per-hit callback, checking that each hit's value equals the slice of the text it
/// covers, and through the list-returning overload, checking that the hit values
/// match <paramref name="expected"/> in order.
/// </summary>
/// <param name="acdat">Automaton under test.</param>
/// <param name="text">Text to parse.</param>
/// <param name="expected">Expected hit values, in order.</param>
private void validateASimpleAhoCorasickDoubleArrayTrie(AhoCorasickDoubleArrayTrie<String> acdat, string text, string[] expected)
{
    // Callback form: log every hit and compare it with the covered slice
    acdat.ParseText(text, (hit) =>
    {
        WriteLine("[{0}:{1}]={2}", hit.Begin, hit.End, hit.Value);

        string coveredSlice = text.Substring(hit.Begin, hit.Length);
        Assert.Equal(coveredSlice, hit.Value);
    });

    // Collecting form: gather all hits, then compare their values
    var collectedHits = acdat.ParseText(text);
    var actualValues = collectedHits.Select(h => h.Value);
    AssertSeqEqual(expected, actualValues);
}
/// <summary>
/// Builds an automaton from mixed-case keywords, passing <c>true</c> as the second
/// argument of <c>Build</c>, and expects the lower-case sample text to produce hits
/// in the order "0", "1", "1", "0" (each keyword's value is its array index).
/// </summary>
public void testCaseInsensitive()
{
    var sampleText = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
    var mixedCaseKeywords = new[] { "doLor", "iT" };

    // Pair every keyword with its position in the array, rendered as a string
    var indexedKeywords = mixedCaseKeywords.Select(
        (k, i) => new KeyValuePair<string, string>(k, i.ToString()));

    var trie = new AhoCorasickDoubleArrayTrie<string>();
    trie.Build(indexedKeywords, true);

    var seenValues = new List<string>();
    trie.ParseText(sampleText, hit =>
    {
        seenValues.Add(hit.Value);
        return true;
    });

    var expectedValues = new[] { "0", "1", "1", "0" };
    AssertSeqEqual(expectedValues, seenValues);
}
/// <summary>
/// Builds an automaton in which every keyword is mapped to itself.
/// </summary>
/// <param name="keywords">Keywords to insert; a repeated keyword simply overwrites its own entry.</param>
/// <returns>The automaton built from the keyword map.</returns>
private AhoCorasickDoubleArrayTrie<string> buildASimpleAhoCorasickDoubleArrayTrie(params string[] keywords)
{
    // Key and value are the same string for every entry
    var selfMap = new Dictionary<String, String>();
    for (int i = 0; i < keywords.Length; i++)
    {
        string keyword = keywords[i];
        selfMap[keyword] = keyword;
    }

    var trie = new AhoCorasickDoubleArrayTrie<string>();
    trie.Build(selfMap);
    return trie;
}
/// <summary>
/// Parses a <c>char[]</c> with an automaton built through the constructor, first
/// over the whole array (expecting "0", "1", "1", "0") and then with the extra
/// arguments 14 and 10 (expecting only "1").
/// </summary>
public void testParseCharArray()
{
    var sampleChars = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.".ToCharArray();
    var keywords = new[] { "dolor", "it" };

    // Each keyword's value is its position in the array, rendered as a string
    var indexedKeywords = keywords.Select(
        (k, i) => new KeyValuePair<string, string>(k, i.ToString()));
    var trie = new AhoCorasickDoubleArrayTrie<string>(indexedKeywords);

    // Whole-array parse
    var fullParseValues = new List<string>();
    trie.ParseText(sampleChars, hit =>
    {
        fullParseValues.Add(hit.Value);
        return true;
    });
    AssertSeqEqual(new[] { "0", "1", "1", "0" }, fullParseValues);

    // Restricted parse
    // NOTE(review): 14 and 10 are presumably (start index, length) - confirm against ParseText.
    var windowParseValues = new List<string>();
    trie.ParseText(sampleChars, 14, 10, hit =>
    {
        windowParseValues.Add(hit.Value);
        return true;
    });
    AssertSeqEqual(new[] { "1" }, windowParseValues);
}
/// <summary>
/// Builds an automaton from "dictionary.txt" (each word mapped to itself), parses
/// "text.txt" with it, and checks that every hit's value equals the slice of the
/// text that the hit covers.
/// </summary>
public void testBuildAndParseWithBigFile()
{
    // Test data comes from disk
    var words = loadDictionary("dictionary.txt");
    var corpus = loadText("text.txt");

    // Key and value are the same string for every entry
    var selfMap = new Dictionary<String, String>();
    foreach (var word in words)
    {
        selfMap[word] = word;
    }

    var trie = new AhoCorasickDoubleArrayTrie<String>();
    trie.Build(selfMap);

    // Every reported hit must match the text it claims to cover
    trie.ParseText(corpus, (hit) =>
    {
        var coveredSlice = corpus.Substring(hit.Begin, hit.Length);
        Assert.Equal(coveredSlice, hit.Value);
    });
}
/// <summary>
/// Creates a builder and stores the given trie in the <c>trie</c> field.
/// </summary>
/// <param name="trie">Trie instance this builder keeps a reference to.</param>
internal Builder(AhoCorasickDoubleArrayTrie<V> trie)
{
    this.trie = trie;
}