/// <summary>
/// Creates a runner for the given rule patterns.
/// </summary>
/// <remarks>
/// A fresh, empty grouping dictionary is stored in
/// <c>GroupByLonguestCommonExpression</c> and handed to
/// <c>TextUtils.ComputeRules</c> together with the rules; the automaton it
/// returns is stored in <c>TreeAhoCorasick</c>.
/// </remarks>
/// <param name="rules">Rule patterns forwarded to <c>TextUtils.ComputeRules</c>.</param>
public Runner(List<string> rules)
{
    var groupedExpressions = new Dictionary<string, List<Expression>>();

    GroupByLonguestCommonExpression = groupedExpressions;
    TreeAhoCorasick = TextUtils.ComputeRules(rules, groupedExpressions);
}
示例#2
0
        /// <summary>
        /// Searches one file, line by line, for the words held in <paramref name="trie"/> and
        /// reports every whole-word hit through <c>OnFileMatch</c> and <c>OnMatch</c>.
        /// </summary>
        /// <remarks>
        /// The file is opened read-only with <c>FileShare.ReadWrite</c> and read completely into
        /// memory. The search stops silently (nothing is reported) as soon as
        /// <c>_checkForCancellation</c> returns true. Any exception is passed to
        /// <c>OnException</c> instead of being thrown to the caller.
        /// </remarks>
        /// <param name="file">Path of the file to search.</param>
        /// <param name="trie">Automaton used to locate the words on each line.</param>
        public async Task SearchFile(string file, AhoCorasick trie)
        {
            try
            {
                using (var fs = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
                using (var reader = new StreamReader(fs))
                {
                    var text = await reader.ReadToEndAsync();

                    // Every regex match is one line INCLUDING its terminator, so empty lines are
                    // preserved and line numbers / absolute offsets stay aligned with the file.
                    var matchLines        = Regex.Matches(text, @"[^\r\n]*(\n|\r\n?)", RegexOptions.Multiline | RegexOptions.Compiled);
                    var matches           = new List<Match>();
                    var currentLineNumber = 0;
                    var position          = 0;

                    foreach (System.Text.RegularExpressions.Match matchLine in matchLines)
                    {
                        if (_checkForCancellation())
                        {
                            return;
                        }

                        var line = matchLine.Value;
                        matches.AddRange(FindWholeWordMatchesInLine(trie, file, line, currentLineNumber, position));

                        position += line.Length;
                        currentLineNumber++;
                    }

                    // The pattern above only matches terminated lines, so a last line without a
                    // trailing newline is left over and has to be searched separately.
                    if (position < text.Length)
                    {
                        if (_checkForCancellation())
                        {
                            return;
                        }

                        var lastLine = text.Substring(position);
                        matches.AddRange(FindWholeWordMatchesInLine(trie, file, lastLine, currentLineNumber, position));
                    }

                    if (matches.Count > 0)
                    {
                        var matchCount = matches.Count;
                        OnFileMatch(file, matchCount);
                        foreach (var m in matches)
                        {
                            OnMatch(m, matchCount);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                OnException(file, ex);
            }
        }

        /// <summary>
        /// Returns the hits of <paramref name="trie"/> on a single line that are delimited on both
        /// sides by the line boundary or by a character accepted by <c>IsWhiteSpace</c>.
        /// </summary>
        /// <param name="trie">Automaton used to search the line.</param>
        /// <param name="file">Path of the file the line belongs to (stored in each result).</param>
        /// <param name="line">Line text, including its terminator when it has one.</param>
        /// <param name="lineIndex">Zero-based index of the line; results carry it one-based.</param>
        /// <param name="lineStart">Zero-based offset of the line's first character within the file text.</param>
        /// <returns>A lazily evaluated sequence of results; enumerate it immediately.</returns>
        private IEnumerable<Match> FindWholeWordMatchesInLine(AhoCorasick trie, string file, string line, int lineIndex, int lineStart)
        {
            var lineEnd = lineStart + line.Length;

            return trie
                   .Search(line)
                   .Where(m =>
                   {
                       var afterWord  = m.Index + m.Word.Length;
                       var leftSpace  = m.Index == 0 || IsWhiteSpace(line[m.Index - 1]);
                       // A word that runs up to the very end of the line is bounded by the line end.
                       var rightSpace = afterWord >= line.Length || IsWhiteSpace(line[afterWord]);
                       return leftSpace && rightSpace;
                   })
                   .Select(m => new Match(file, line, m.Word, 1 + lineIndex, m.Index + 1, lineStart + m.Index + 1, lineStart + 1, lineEnd + 1));
        }
示例#3
0
    /// <summary>
    /// Builds an <see cref="AhoCorasick"/> automaton from an XML word list and a comparison
    /// specification.
    /// </summary>
    /// <param name="xml">
    /// XML document; the value of the first attribute of every child element of the root
    /// becomes one search word.
    /// </param>
    /// <param name="culture">
    /// Comparison specification of the form <c>name</c> or <c>name:i</c>, where the <c>:i</c>
    /// suffix requests case-insensitive matching. <c>c</c> selects the current culture,
    /// <c>n</c> the invariant culture, <c>o</c> or an empty name ordinal comparison; any other
    /// name is resolved with <see cref="CultureInfo.GetCultureInfo(string)"/>. A SQL NULL is
    /// treated like an empty specification (ordinal, case-sensitive).
    /// </param>
    /// <returns>The automaton populated with the words from <paramref name="xml"/>.</returns>
    private static AhoCorasick BuildAhoCorasick(SqlXml xml, SqlString culture)
    {
        // XElement.Load materializes the whole tree, so the reader can be released right away.
        XElement root;
        using (var reader = xml.CreateReader())
        {
            root = XElement.Load(reader);
        }

        var words = root.Elements().Select(e => e.FirstAttribute.Value);

        // SqlString.Value throws on NULL; fall back to the same default as the empty string.
        var spec       = (culture.IsNull ? string.Empty : culture.Value).Split(':');
        var ignoreCase = spec.Length > 1 && spec[1] == "i";
        CharComparer comparer;

        switch (spec[0])
        {
        case "c":
            comparer = CharComparer.Create(CultureInfo.CurrentCulture, ignoreCase);
            break;

        case "n":
            comparer = CharComparer.Create(CultureInfo.InvariantCulture, ignoreCase);
            break;

        case "o":
        case "":
            comparer = ignoreCase ? CharComparer.OrdinalIgnoreCase : CharComparer.Ordinal;
            break;

        default:
            comparer = CharComparer.Create(CultureInfo.GetCultureInfo(spec[0]), ignoreCase);
            break;
        }

        return new AhoCorasick(comparer, words);
    }
示例#4
0
        /// <summary>
        /// Click handler that joins "their" click/impression rows with "our" name/tracking-id
        /// rows and writes the merged rows as comma-separated lines to a user-chosen file.
        /// </summary>
        /// <remarks>
        /// A row is written for every tracking id that the Aho-Corasick search finds inside a
        /// row's <c>IndexString</c>. Rows are produced in parallel, so their order in the file is
        /// not deterministic. The elapsed time is shown in a message box afterwards.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnSaveMergedData_Click(object sender, EventArgs e)
        {
            // Validate that we have the data we need
            if (!(TheirDataLoaded && OurDataLoaded))
            {
                MessageBox.Show(
                    "Please wait until both data sets are finished loading.",
                    "Loading Data",
                    MessageBoxButtons.OK);
                return;
            }

            if (saveMergedData.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            var sw             = Stopwatch.StartNew();
            var mergedFileInfo = new FileInfo(saveMergedData.FileName);

            using (var output = mergedFileInfo.CreateText())
            {
                var theirDataList = reader.TheirDataList
                                    .Cast<TheirClickAndImpressionData>()
                                    .ToList();
                var ourDataList = reader.OurDataList
                                  .Cast<OurNameAndTrackingIdData>()
                                  .Where(o => !string.IsNullOrWhiteSpace(o.TrackingID))
                                  .ToList();

                // Populate Aho-Corasick with the tracking ids and keep a lookup from id to row.
                // TryAdd keeps the first row when a tracking id occurs more than once.
                ahoCorasick = new AhoCorasick(ourDataList.Select(o => o.TrackingID));
                var acDictionary = new ConcurrentDictionary<string, OurNameAndTrackingIdData>();
                ourDataList.ForEach(o => acDictionary.TryAdd(o.TrackingID, o));

                var sb     = new StringBuilder();
                var sbGate = new object(); // dedicated lock: StringBuilder is not thread-safe

                Parallel.ForEach(theirDataList, theirData =>
                {
                    foreach (var item in ahoCorasick.Search(theirData.IndexString))
                    {
                        var value = acDictionary[item.Word];

                        // Format outside the lock so only the append itself is serialized.
                        var row = $"{value.ID},{value.Name},{value.TrackingID},{theirData.IndexString},{theirData.Clicks},{theirData.Impressions},{theirData.DateStamp}";
                        lock (sbGate)
                        {
                            sb.AppendLine(row);
                        }
                    }
                });
                output.WriteLine(sb.ToString());
            }

            sw.Stop();
            MessageBox.Show(
                $"This algorithm took {sw.ElapsedMilliseconds} ms.",
                "Algorithm Performance");
        }
示例#5
0
        public void SearchNullEmptyTest()
        {
            // Searching a null or an empty text must yield no matches.
            var automaton = new AhoCorasick("a");

            var fromNull = automaton.Search(null).ToList();
            Assert.AreEqual(0, fromNull.Count);

            var fromEmpty = automaton.Search("").ToList();
            Assert.AreEqual(0, fromEmpty.Count);
        }
示例#6
0
        public void SimpleTest()
        {
            // A single one-character keyword is found at index 0 of "a" and not at all in "b".
            var automaton = new AhoCorasick("a");

            var expectedInA = new WordMatchList();
            expectedInA.Add(0, "a");
            var foundInA = automaton.Search("a").ToList();
            CollectionAssert.AreEqual(expectedInA, foundInA);

            var hitsInB = automaton.Search("b").Count();
            Assert.AreEqual(0, hitsInB);
        }
示例#7
0
        public void SearchMultipleTest()
        {
            // Overlapping keywords: every occurrence is expected, listed by start index.
            var automaton = new AhoCorasick("her", "their", "eye", "iris", "he", "is", "si");
            var actual    = automaton.Search("theye iris irisis").ToList();

            var expected = new WordMatchList
            {
                { 1, "he" },
                { 2, "eye" },
                { 6, "iris" },
                { 8, "is" },
                { 11, "iris" },
                { 13, "is" },
                { 14, "si" },
                { 15, "is" }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
示例#8
0
        public void UpperCaseTest()
        {
            // With the default comparer the keyword "c" is expected only at index 3:
            // the upper-case 'C' at index 2 is matched by "bC" but not by "c".
            var automaton = new AhoCorasick("a", "ab", "bab", "bC", "bca", "c", "caa");
            var actual    = automaton.Search("abCcab").ToList();

            var expected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bC" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
示例#9
0
        public void OrdinalIgnoreCaseTest()
        {
            // Same keywords and text as the case-sensitive test, but with the
            // ordinal ignore-case comparer "c" is also expected at the upper-case 'C' (index 2).
            var automaton = new AhoCorasick(CharComparer.OrdinalIgnoreCase, "a", "ab", "bab", "bC", "bca", "c", "caa");
            var actual    = automaton.Search("abCcab").ToList();

            var expected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bC" },
                { 2, "c" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
示例#10
0
        public void SearchIvankTest()
        {
            // Sample taken from http://blog.ivank.net/aho-corasick-algorithm-in-as3.html
            var automaton = new AhoCorasick("take", "fast", "sofa");
            var actual    = automaton.Search("takeso fasofast fassofatake sosso sofastake so").ToList();

            var expected = new WordMatchList
            {
                { 0, "take" },
                { 9, "sofa" },
                { 11, "fast" },
                { 19, "sofa" },
                { 23, "take" },
                { 34, "sofa" },
                { 36, "fast" },
                { 39, "take" }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
示例#11
0
        public void SearchWikipediaTest()
        {
            // Sample taken from https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
            var automaton = new AhoCorasick("a", "ab", "bab", "bc", "bca", "c", "caa");
            var actual    = automaton.Search("abccab").ToList();

            var expected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bc" },
                { 2, "c" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(expected, actual);
        }
示例#12
0
        /// <summary>
        /// Finds the rule expressions that apply to <paramref name="text"/>.
        /// </summary>
        /// <remarks>
        /// The text is normalized (accents removed, lower-cased, reduced to the tokens matched by
        /// <c>regexKeepWordOnly</c>, joined by single spaces and padded with one space on each
        /// side) and scanned with the automaton. For every keyword occurrence the rules grouped
        /// under that keyword are looked up: if the group contains an exact rule, that rule is
        /// added; otherwise each rule of the group is added when its regex (created on first use
        /// and cached on the rule) matches the normalized text. A rule is added once per
        /// keyword occurrence, so the result may contain repeated entries.
        /// </remarks>
        /// <param name="groupByLonguestCommonExpression">Rules grouped by automaton keyword.</param>
        /// <param name="treeAhoCorasick">Automaton containing the keywords of the groups.</param>
        /// <param name="text">Raw text to inspect.</param>
        /// <returns>The applicable rules, in the order the keyword occurrences were reported.</returns>
        public static List<Expression> FindAwkwardExpressions(
            Dictionary<string, List<Expression>> groupByLonguestCommonExpression,
            AhoCorasick treeAhoCorasick,
            string text)
        {
            var found = new List<Expression>();

            // Bring the text into the same shape as the keywords.
            var normalized = RemoveAccents(text).ToLower();
            var tokens     = regexKeepWordOnly.Matches(normalized).Select(m => m.Value);
            normalized = " " + string.Join(' ', tokens) + " ";

            foreach (var hit in treeAhoCorasick.Search(normalized).ToList())
            {
                var candidates = groupByLonguestCommonExpression[hit.Word];
                var exactRule  = candidates.FirstOrDefault(r => r.IsExactExpression);

                if (exactRule != null)
                {
                    // An exact rule needs no further check: the keyword hit is the match.
                    found.Add(exactRule);
                    continue;
                }

                foreach (var wildcardRule in candidates.Where(r => !r.IsExactExpression))
                {
                    if (wildcardRule.Regexp == null)
                    {
                        // Build the regex only the first time the rule is actually needed.
                        wildcardRule.Regexp = new Regex(
                            wildcardRule.Pattern,
                            RegexOptions.Compiled | RegexOptions.IgnoreCase);
                    }

                    if (wildcardRule.Regexp.IsMatch(normalized))
                    {
                        found.Add(wildcardRule);
                    }
                }
            }

            return found;
        }
示例#13
0
        /// <summary>
        /// Entry point: loads the settings named by the first command-line argument, runs the
        /// byte-pattern conversion over the target file and replaces the target with the result.
        /// </summary>
        /// <remarks>
        /// The converted bytes are first written to <c>target + ".out"</c>; that file then
        /// replaces the target, and the previous content is kept as <c>target + ".old"</c>.
        /// </remarks>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is passed to <c>Setting</c>.</param>
        static void Main(string[] args)
        {
            var settings = new Setting(args[0]);

            // The failure links must be built before the pattern can be used for conversion.
            settings.pattern.makeFailure();

            // NOTE(review): convert presumably rewrites the buffer in place, since the same
            // array is written out afterwards — confirm against AhoCorasick<byte>.convert.
            byte[] content = File.ReadAllBytes(settings.target);
            AhoCorasick<byte>.convert(settings.pattern, content);

            var convertedPath = settings.target + @".out";
            var backupPath    = settings.target + @".old";

            File.WriteAllBytes(convertedPath, content);
            File.Replace(convertedPath, settings.target, backupPath);
        }
示例#14
0
        /// <summary>
        /// Searches all given files for the given words (ordinal, case-insensitive) and blocks
        /// until every file has been processed.
        /// </summary>
        /// <remarks>
        /// <c>OnFile</c> is raised for each file before its search is queued and
        /// <c>OnComplete</c> once all searches have finished. Nothing is raised when
        /// <paramref name="files"/> is null or empty. If <c>_checkForCancellation</c> reports
        /// cancellation while queuing, the method returns immediately without waiting for the
        /// searches already started and without raising <c>OnComplete</c>.
        /// </remarks>
        /// <param name="files">Paths of the files to search.</param>
        /// <param name="words">Words to look for.</param>
        public void SearchFiles(string[] files, IList<string> words)
        {
            var trie = new AhoCorasick(CharComparer.OrdinalIgnoreCase, words);

            if (files == null || files.Length == 0)
            {
                return;
            }

            var tasks = new List<Task>();
            foreach (var file in files)
            {
                if (_checkForCancellation())
                {
                    return;
                }

                OnFile(file);

                // Hand Task.Run a Func<Task> so the returned task represents the whole search.
                // An async lambda stored in an Action is "async void": its task would complete
                // at the first await and WaitAll below would return before the search is done.
                tasks.Add(Task.Run(() => SearchFile(file, trie)));
            }

            Task.WaitAll(tasks.ToArray());
            OnComplete();
        }
示例#15
0
        public void OverloadsTest()
        {
            // 1) Constructor taking a word list.
            var fromList = new AhoCorasick(new List<string> { "a" });

            var singleA = new WordMatchList();
            singleA.Add(0, "a");
            CollectionAssert.AreEqual(singleA, fromList.Search("a").ToList());
            Assert.AreEqual(0, fromList.Search("b").Count());

            // 2) Constructor taking a comparer and a word list.
            var mixedCaseWords = new List<string> { "a", "ab", "bab", "bC", "bca", "c", "caa" };
            var ignoreCase     = new AhoCorasick(CharComparer.OrdinalIgnoreCase, mixedCaseWords);
            var ignoreCaseHits = ignoreCase.Search("abCcab").ToList();
            var ignoreCaseExpected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bC" },
                { 2, "c" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(ignoreCaseExpected, ignoreCaseHits);

            // 3) Default constructor followed by Add and BuildFail.
            var incremental = new AhoCorasick();
            incremental.Add("a");
            incremental.BuildFail();

            var singleAAgain = new WordMatchList();
            singleAAgain.Add(0, "a");
            CollectionAssert.AreEqual(singleAAgain, incremental.Search("a").ToList());
            Assert.AreEqual(0, incremental.Search("b").Count());

            // 4) Constructor taking a culture-based comparer and a params word array.
            var invariantComparer = CharComparer.Create(CultureInfo.InvariantCulture, true);
            var invariant         = new AhoCorasick(invariantComparer, "a", "ab", "bab", "bc", "bca", "c", "caa");
            var invariantHits     = invariant.Search("abccab").ToList();
            var invariantExpected = new WordMatchList
            {
                { 0, "a" },
                { 0, "ab" },
                { 1, "bc" },
                { 2, "c" },
                { 3, "c" },
                { 4, "a" },
                { 4, "ab" }
            };

            CollectionAssert.AreEqual(invariantExpected, invariantHits);
        }
示例#16
0
        /// <summary>
        /// Turns rule patterns into an Aho-Corasick automaton plus a lookup from automaton
        /// keyword to the rules that share it.
        /// </summary>
        /// <remarks>
        /// Aho-Corasick cannot handle the <c>*</c> wildcard, so each rule is indexed by its
        /// longest wildcard-free part. A hit of the automaton then tells which rules need a
        /// regex check; rules without a wildcard ("exact" rules) need no further check. Exact
        /// keywords are padded with a space on both sides, wildcard keywords are not. Each rule
        /// gets a sequential id, and wildcard rules additionally get a regex pattern in which
        /// <c>*</c> is replaced by <c>[\p{L}]+</c>.
        /// </remarks>
        /// <param name="rules">Raw rule patterns; <c>*</c> acts as a wildcard.</param>
        /// <param name="groupByLonguestCommonExpression">
        /// Dictionary that receives the rules, grouped by keyword. All of its keys, including
        /// any present before the call, become automaton keywords.
        /// </param>
        /// <returns>The automaton built from the dictionary keys.</returns>
        public static AhoCorasick ComputeRules(
            List<string> rules,
            Dictionary<string, List<Expression>> groupByLonguestCommonExpression)
        {
            var nextId = 0;

            foreach (var rawRule in rules)
            {
                var normalized = RemoveAccents(rawRule).ToLower();

                // Index the rule by its longest wildcard-free part, reduced to plain words.
                var longestFixedPart = normalized
                                       .Split(new[] { '*' }, StringSplitOptions.RemoveEmptyEntries)
                                       .OrderByDescending(part => part.Length)
                                       .First();
                var keywordTokens = regexKeepWordOnly.Matches(longestFixedPart).Select(m => m.Value);
                var keyword       = string.Join(' ', keywordTokens);

                // Exact expression is faster.
                var isExact = !normalized.Contains("*");
                if (isExact)
                {
                    // Exact word or group of words. (add extra space to prevent matching)
                    // Wildcard rules only use a part of the expression and get no padding.
                    keyword = " " + keyword + " ";
                }

                // Prepare the future Regex.
                var regexTokens = regexKeepWordOnlyAndSpecialCharacters
                                  .Matches(normalized)
                                  .Select(m => m.Value.Replace("*", "[\\p{L}]+"));
                var regexPattern = string.Join(' ', regexTokens);

                if (!groupByLonguestCommonExpression.TryGetValue(keyword, out var bucket))
                {
                    bucket = new List<Expression>();
                    groupByLonguestCommonExpression[keyword] = bucket;
                }

                var expression = new Expression
                {
                    Id   = nextId++,
                    Expr = rawRule,
                    IsExactExpression = isExact
                };

                if (!isExact)
                {
                    // Only wildcard rules need the pattern, to resolve the wildcard later.
                    expression.Pattern = regexPattern;
                }

                bucket.Add(expression);
            }

            // We are using the Aho–Corasick algorithm.
            // (https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm)
            var automaton = new AhoCorasick();

            automaton.Add(groupByLonguestCommonExpression.Keys.ToList());
            automaton.BuildFail();
            return automaton;
        }
示例#17
0
        /// <summary>
        /// Entry point: collects up to <c>exampleCount</c> concordance lines for every word of
        /// <c>WordList.txt</c> from all <c>*.txt</c> files below the corpus directory.
        /// </summary>
        /// <remarks>
        /// Words that never occurred are written to <c>NotFound.txt</c>, the collected lines to
        /// <c>ConcordanceLines.txt</c>. A word that has reached its quota is dropped from the
        /// search list before the next file is processed.
        /// </remarks>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string        textFiles    = @"D:\Corpus Related\UrduCorpus\Complete\OCR\";
            string        output       = @"D:\Corpus Related\UrduOCR\";
            int           exampleCount = 5;
            int           spaceCount   = 25;
            List<string>  words        = new List<string>();

            //word list for search
            words.AddRange(File.ReadAllLines(output + "WordList.txt"));
            //final output will be stored here
            Dictionary<string, concLines> concordanceLinesDict = populateDict(words);

            //Loop through all available text files
            foreach (string file in Directory.GetFiles(textFiles, "*.txt", SearchOption.AllDirectories))
            {
                //Remove new lines and create single line text
                string text = Regex.Replace(File.ReadAllText(file), @"[\r\n]+", " ");
                text = Regex.Replace(text, " {2,}", " ");
                //build AhoCorasick tree (rebuilt per file because the word list shrinks)
                var ac = new AhoCorasick(words);
                //words that are there in text string
                var results = ac.Search(text).ToList();
                //words to be removed at the end of each text file
                var toRemove = new HashSet<string>();
                if (text != "")
                {
                    // Processed sequentially on purpose: the loop body updates the shared
                    // dictionary, the per-word line lists, toRemove and the counter j, none of
                    // which is safe to mutate from several threads at once.
                    int j = 0;
                    foreach (var result in results)
                    {
                        string word = result.Word;
                        Console.WriteLine(file + "\t" + word + "\t" + results.Count + "\t" + j + "\t" + words.Count);

                        concLines tmpConcLines;
                        if (concordanceLinesDict.TryGetValue(word, out tmpConcLines) && tmpConcLines.count < exampleCount)
                        {
                            string m = getConcordance(word, text, result.Index, spaceCount);
                            if (m != "")
                            {
                                tmpConcLines.lines.Add(m);
                                tmpConcLines.count = tmpConcLines.count + 1;
                                if (tmpConcLines.count >= exampleCount)
                                {
                                    //add word for removal at the end of loop
                                    toRemove.Add(word);
                                }
                            }
                            // Written back in case concLines is a value type.
                            concordanceLinesDict[word] = tmpConcLines;
                            Console.WriteLine(word + "\t" + tmpConcLines.count);
                        }
                        j++;
                    }
                }
                //remove all completed strings
                words.RemoveAll(item => toRemove.Contains(item));
            }

            //Write down words which are not found in the given list of text files
            using (StreamWriter sw = new StreamWriter(output + "NotFound.txt"))
            {
                foreach (KeyValuePair<string, concLines> kvp in concordanceLinesDict)
                {
                    if (kvp.Value.count == 0)
                    {
                        sw.WriteLine(kvp.Key);
                    }
                }
            }

            //Write the concordance lines generated using the main method
            concordanceLinesDict.WriteAllLines(output + "ConcordanceLines.txt");
        }
示例#18
0
        /// <summary>
        /// Searches the first entry of <paramref name="inputs"/> for the keywords and their
        /// synonyms, falling back to fuzzy matching when there is no exact hit.
        /// </summary>
        /// <remarks>
        /// Keywords without synonyms are searched as-is; for keywords with synonyms the words of
        /// all their synonym groups are searched instead. The exact search runs on the
        /// lower-cased input; hits are de-duplicated with <c>WordMatchComparer.Instance</c>,
        /// ordered by index and converted by <c>ToCustomWordMatch</c>. Without exact hits, every
        /// space-separated token of the (original-case) input whose Levenshtein distance to a
        /// keyword is within the configured <c>SearchFuzziness</c> is returned, once per
        /// matching keyword.
        /// </remarks>
        /// <param name="keywords">Keywords to look for.</param>
        /// <param name="inputs">Texts to search; see the review note in the body.</param>
        /// <returns>The matches for the first input, or an empty list when there are no inputs.</returns>
        public IEnumerable<CustomWordMatch> Search(List<string> keywords, List<string> inputs)
        {
            var words    = new List<string>();
            var synonyms = new List<Synonyms>();

            foreach (var k in keywords)
            {
                var syns = GetSynonyms(k);
                if (syns.Count == 0)
                {
                    words.Add(k);
                }
                else
                {
                    synonyms.AddRange(syns);
                    words.AddRange(syns.SelectMany(s => s.Words).Distinct());
                }
            }

            var query = new AhoCorasick(words);

            // NOTE(review): both branches below return, so only the first input is ever
            // examined. Kept as-is because callers may rely on it — confirm the intent.
            foreach (var i in inputs)
            {
                // Materialize once: the result is needed both for the emptiness check and for
                // the pipeline below, and re-enumerating would repeat the whole scan.
                var matches = query.Search(i.ToLower()).ToList();

                if (matches.Count > 0)
                {
                    return ToCustomWordMatch(
                        matches
                        .Distinct(WordMatchComparer.Instance)
                        .OrderBy(m => m.Index),
                        keywords,
                        synonyms);
                }

                var fuzzyMatches = new List<CustomWordMatch>();
                var i_split      = i.Split(' ');

                for (int k = 0; k < i_split.Length; ++k)
                {
                    var w = i_split[k];
                    foreach (var keyword in keywords)
                    {
                        if (LevenshteinDistance(w, keyword) <= ConfigManager.GetConfig().SearchFuzziness)
                        {
                            fuzzyMatches.Add(
                                new CustomWordMatch(new WordMatch()
                                {
                                    Index = FindWordIndexFromSplit(k, w, i_split),
                                    Word  = w
                                }));
                        }
                    }
                }

                return fuzzyMatches;
            }

            return new List<CustomWordMatch>();
        }