Ejemplo n.º 1
0
        /// <summary>
        /// Builds an automaton from the dictionary file, scans the text file with it,
        /// and reports both the build time and the scan throughput.
        /// </summary>
        /// <param name="dictionaryPath">Path handed to <c>loadDictionary</c>.</param>
        /// <param name="textPath">Path handed to <c>loadText</c>.</param>
        private void runTest(String dictionaryPath, String textPath)
        {
            HashSet <String> dictionary = loadDictionary(dictionaryPath);
            String           text       = loadText(textPath);

            var ahoCorasickDoubleArrayTrie = new AhoCorasickDoubleArrayTrie <String>();
            var dictionaryMap = new Dictionary <String, String>();

            foreach (String word in dictionary)
            {
                dictionaryMap[word] = word;                  // we use the same text as the property of a word
            }

            // The stopwatch must actually be running while Build executes;
            // a merely constructed Stopwatch reports 0 ms.
            var swBuild = Stopwatch.StartNew();

            // NOTE(review): the second argument presumably enables case-insensitive matching - confirm against Build's signature.
            ahoCorasickDoubleArrayTrie.Build(dictionaryMap, true);
            swBuild.Stop();
            WriteLine("Automata build time: {0}ms.\n", swBuild.ElapsedMilliseconds);

            // Let's test the parsing speed of the Aho-Corasick automaton
            WriteLine("Parsing document which contains {0} characters, with a dictionary of {1} words.\n", text.Length, dictionary.Count);
            int hitCount = 0;
            var sw       = Stopwatch.StartNew();

            ahoCorasickDoubleArrayTrie.ParseText(text, (hit) => { hitCount++; });
            sw.Stop();
            Assert.True(hitCount > 0);

            // Use the high-resolution elapsed time so a sub-millisecond run does not
            // divide by zero seconds and print an infinite speed.
            WriteLine("{0}ms, speed {1:0.##} char/s", sw.ElapsedMilliseconds, text.Length / sw.Elapsed.TotalSeconds);
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Checks the bool-returning callback overload of ParseText: the test expects a
        /// callback that returns false to be invoked once, and a callback that returns
        /// true to be invoked for both keywords found in the input.
        /// </summary>
        public void testCancellation()
        {
            // Two keywords, each mapped to itself.
            var keywordMap = new Dictionary <String, String>();

            keywordMap.Add("foo", "foo");
            keywordMap.Add("bar", "bar");

            var trie = new AhoCorasickDoubleArrayTrie <String>();

            trie.Build(keywordMap);

            String input            = "sfwtfoowercwbarqwrcq";
            int    hitsWhenStopping = 0;
            int    hitsWhenCounting = 0;

            // Callback that asks for the scan to stop right after its first hit.
            Func <AhoCorasickDoubleArrayTrie <string> .Hit, bool> stopImmediately = hit =>
            {
                hitsWhenStopping++;
                return false;
            };
            // Callback that lets the scan run to the end of the input.
            Func <AhoCorasickDoubleArrayTrie <string> .Hit, bool> keepScanning = hit =>
            {
                hitsWhenCounting++;
                return true;
            };

            trie.ParseText(input, stopImmediately);
            trie.ParseText(input, keepScanning);

            Assert.Equal(1, hitsWhenStopping);
            Assert.Equal(2, hitsWhenCounting);
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Round-trips a small automaton through an in-memory stream and verifies the
        /// reloaded copy, then saves an automaton built from the large dictionary file
        /// and reports the serialized size.
        /// </summary>
        public void testSaveLoad()
        {
            var acdat = buildASimpleAhoCorasickDoubleArrayTrie("hers", "his", "she", "he");

            // Streams are IDisposable; scope each one with a using statement.
            using (var memStream = new MemoryStream())
            {
                acdat.Save(memStream, true);

                WriteLine($"4 keywords, saved {memStream.Length} bytes");

                var acdat2 = new AhoCorasickDoubleArrayTrie <string>();

                // Rewind so Load reads from the start of what Save just wrote.
                memStream.Position = 0;
                acdat2.Load(memStream);

                Assert.Equal(acdat.Count, acdat2.Count);
                Assert.Equal("his", acdat2["his"]);
                validateASimpleAhoCorasickDoubleArrayTrie(acdat2, "uhers", new[] { "he", "hers" });
            }

            // large dictionary
            var dictionary = loadDictionary("dictionary.txt");
            var keywords   = dictionary.Select(k => new KeyValuePair <string, string>(k, k));
            var acdat3     = new AhoCorasickDoubleArrayTrie <string>(keywords);

            using (var memStream2 = new MemoryStream())
            {
                // NOTE(review): passing false presumably omits the values from the output (see message below) - confirm against Save.
                acdat3.Save(memStream2, false);
                WriteLine($"{dictionary.Count} keywords, saved {memStream2.Length} bytes (without values)");
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Scans <paramref name="text"/> with <paramref name="acdat"/> twice: once through the
        /// callback overload, asserting that each hit's value equals the substring it spans,
        /// and once through the list-returning overload, asserting that the hit values equal
        /// <paramref name="expected"/> in order.
        /// </summary>
        /// <param name="acdat">Automaton under test.</param>
        /// <param name="text">Text to scan.</param>
        /// <param name="expected">Hit values expected from the scan, in order.</param>
        private void validateASimpleAhoCorasickDoubleArrayTrie(AhoCorasickDoubleArrayTrie <String> acdat, string text, string[] expected)
        {
            // Callback form: log every hit and check it against the source text.
            acdat.ParseText(text, (hit) =>
            {
                WriteLine("[{0}:{1}]={2}", hit.Begin, hit.End, hit.Value);

                var spanned = text.Substring(hit.Begin, hit.Length);

                Assert.Equal(spanned, hit.Value);
            });

            // Collection form: compare the values of all hits with the expectation.
            var actualValues = acdat.ParseText(text).Select(h => h.Value);

            AssertSeqEqual(expected, actualValues);
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Builds an automaton from mixed-case keywords with the second Build argument set
        /// to true and expects the lower-case text to produce hits for both keywords.
        /// Each keyword's value is its index in the keyword array.
        /// </summary>
        public void testCaseInsensitive()
        {
            var text     = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
            var keywords = new[] { "doLor", "iT" };

            // Pair each keyword with its position, rendered as a string.
            var indexedKeywords = keywords.Select((k, i) => new KeyValuePair <string, string>(k, i.ToString()));

            var acdat = new AhoCorasickDoubleArrayTrie <string>();

            acdat.Build(indexedKeywords, true);

            var seenValues = new List <string>();

            acdat.ParseText(text, hit =>
            {
                seenValues.Add(hit.Value);
                return true;
            });

            var expectedValues = new[] { "0", "1", "1", "0" };

            AssertSeqEqual(expectedValues, seenValues);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Builds an automaton in which every given keyword maps to itself.
        /// </summary>
        /// <param name="keywords">Keywords to insert; a repeated keyword simply overwrites its earlier entry.</param>
        /// <returns>The automaton built from the keyword map.</returns>
        private AhoCorasickDoubleArrayTrie <string> buildASimpleAhoCorasickDoubleArrayTrie(params string[] keywords)
        {
            var selfMap = new Dictionary <String, String>();

            for (int i = 0; i < keywords.Length; i++)
            {
                string keyword = keywords[i];

                selfMap[keyword] = keyword;
            }

            var trie = new AhoCorasickDoubleArrayTrie <string>();

            trie.Build(selfMap);

            return trie;
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Exercises the char[] overloads of ParseText: first over the whole array, then
        /// over the sub-range selected by the arguments (14, 10), where only one hit is expected.
        /// Each keyword's value is its index in the keyword array.
        /// </summary>
        public void testParseCharArray()
        {
            var sentence = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.";
            var chars    = sentence.ToCharArray();
            var keywords = new[] { "dolor", "it" };

            var indexedKeywords = keywords.Select((k, i) => new KeyValuePair <string, string>(k, i.ToString()));
            var acdat           = new AhoCorasickDoubleArrayTrie <string>(indexedKeywords);

            // Whole array.
            var fullScanValues = new List <string>();

            acdat.ParseText(chars, hit =>
            {
                fullScanValues.Add(hit.Value);
                return true;
            });
            AssertSeqEqual(new[] { "0", "1", "1", "0" }, fullScanValues);

            // Restricted range.
            var rangeScanValues = new List <string>();

            acdat.ParseText(chars, 14, 10, hit =>
            {
                rangeScanValues.Add(hit.Value);
                return true;
            });
            AssertSeqEqual(new[] { "1" }, rangeScanValues);
        }
Ejemplo n.º 8
0
        /// <summary>
        /// Builds an automaton from the words in "dictionary.txt", scans "text.txt" with it,
        /// and asserts that every hit's value equals the substring of the text it spans.
        /// </summary>
        public void testBuildAndParseWithBigFile()
        {
            // Test fixtures read from disk.
            var words   = loadDictionary("dictionary.txt");
            var content = loadText("text.txt");

            // Every word is stored as its own value.
            var wordMap = new Dictionary <String, String>();

            foreach (var word in words)
            {
                wordMap[word] = word;
            }

            var trie = new AhoCorasickDoubleArrayTrie <String>();

            trie.Build(wordMap);

            trie.ParseText(content, (hit) =>
            {
                var spanned = content.Substring(hit.Begin, hit.Length);

                Assert.Equal(spanned, hit.Value);
            });
        }
Ejemplo n.º 9
0
 /// <summary>
 /// Creates a builder that keeps a reference to the given trie.
 /// </summary>
 /// <param name="trie">The trie instance stored in this builder's <c>trie</c> field.</param>
 internal Builder(AhoCorasickDoubleArrayTrie <V> trie)
 {
     // No validation or copying: the reference is stored as-is.
     this.trie = trie;
 }