/// <summary>
/// Searching a null text or an empty text must yield no matches.
/// </summary>
public void SearchNullEmptyTest()
{
    var automaton = new AhoCorasick("a");

    var fromNull = automaton.Search(null).ToList();
    Assert.AreEqual(0, fromNull.Count);

    var fromEmpty = automaton.Search("").ToList();
    Assert.AreEqual(0, fromEmpty.Count);
}
/// <summary>
/// A single-word automaton reports that word at index 0 and reports nothing
/// for a text that does not contain it.
/// </summary>
public void SimpleTest()
{
    var automaton = new AhoCorasick("a");

    var expectedHit = new WordMatchList
    {
        { 0, "a" }
    };
    var actualHit = automaton.Search("a").ToList();
    CollectionAssert.AreEqual(expectedHit, actualHit);

    var missCount = automaton.Search("b").Count();
    Assert.AreEqual(0, missCount);
}
/// <summary>
/// Searches a file, line by line, for whitespace-delimited occurrences of the words
/// held by <paramref name="trie"/> and reports them through <c>OnFileMatch</c> and <c>OnMatch</c>.
/// </summary>
/// <param name="file">Path of the file to read (opened read-only, shared for read/write).</param>
/// <param name="trie">Aho-Corasick automaton holding the search word and its synonyms.</param>
/// <remarks>
/// Any exception raised while reading or searching is handed to <c>OnException</c> instead of
/// being propagated. When <c>_checkForCancellation</c> returns true the method returns
/// immediately without reporting the matches collected so far.
/// </remarks>
public async Task SearchFile(string file, AhoCorasick trie)
{
    try
    {
        using (var fs = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (var reader = new StreamReader(fs))
        {
            var text = await reader.ReadToEndAsync();

            // One regex match per line, terminator included, so that offsets add up to
            // absolute positions in the file. The second alternative picks up a final line
            // that has no trailing newline; without it that line was never searched.
            var matchLines = Regex.Matches(
                text,
                @"[^\r\n]*(\n|\r\n?)|[^\r\n]+\z",
                RegexOptions.Multiline | RegexOptions.Compiled);

            var matches = new List<Match>();
            var currentLineNumber = 0;
            int position = 0;

            foreach (System.Text.RegularExpressions.Match matchLine in matchLines)
            {
                if (_checkForCancellation())
                {
                    return;
                }

                var line = matchLine.Value;
                int endOfLinePosition = position + line.Length;

                matches.AddRange(
                    trie
                        .Search(line)
                        .Where(m =>
                        {
                            // Keep only hits delimited by whitespace (or by the line edges) on both sides.
                            var end = m.Index + m.Word.Length;
                            var leftSpace = m.Index == 0 || IsWhiteSpace(line[m.Index - 1]);
                            // Reaching the end of the line counts as a boundary; this matters for an
                            // unterminated last line, where there is no newline character to test.
                            var rightSpace = end >= line.Length || IsWhiteSpace(line[end]);
                            return leftSpace && rightSpace;
                        })
                        // Line number, column and absolute offsets are all passed offset by one.
                        .Select(m => new Match(
                            file,
                            line,
                            m.Word,
                            1 + currentLineNumber,
                            m.Index + 1,
                            position + m.Index + 1,
                            position + 1,
                            endOfLinePosition + 1)));

                position = endOfLinePosition;
                currentLineNumber++;
            }

            if (matches.Count > 0)
            {
                var matchCount = matches.Count;
                OnFileMatch(file, matchCount);
                foreach (var m in matches)
                {
                    OnMatch(m, matchCount);
                }
            }
        }
    }
    catch (Exception ex)
    {
        OnException(file, ex);
    }
}
/// <summary>
/// Click handler that merges "their" click/impression rows with "our" name/tracking-id rows
/// and writes the result as comma-separated lines to the file chosen in the save dialog.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
/// <remarks>
/// A row is emitted for every tracking id found inside a their-data <c>IndexString</c>.
/// Rows are produced in parallel, so their order in the output file is not deterministic.
/// The elapsed time is shown in a message box once the file has been written.
/// </remarks>
private void btnSaveMergedData_Click(object sender, EventArgs e)
{
    // Both data sets must be fully loaded before they can be merged.
    if (!(TheirDataLoaded && OurDataLoaded))
    {
        MessageBox.Show(
            "Please wait until both data sets are finished loading.",
            "Loading Data",
            MessageBoxButtons.OK);
        return;
    }

    if (saveMergedData.ShowDialog() != DialogResult.OK)
    {
        return;
    }

    Stopwatch sw = Stopwatch.StartNew();
    FileInfo mergedFileInfo = new FileInfo(saveMergedData.FileName);
    using (var output = mergedFileInfo.CreateText())
    {
        var theirDataList = reader.TheirDataList
            .Cast<TheirClickAndImpressionData>()
            .ToList();
        var ourDataList = reader.OurDataList
            .Cast<OurNameAndTrackingIdData>()
            .Where(o => !string.IsNullOrWhiteSpace(o.TrackingID))
            .ToList();

        // Populate Aho-Corasick with every tracking id, and index our rows by tracking id
        // (TryAdd keeps the first row when a tracking id occurs more than once).
        ahoCorasick = new AhoCorasick(ourDataList.Select(o => o.TrackingID));
        var acDictionary = new ConcurrentDictionary<string, OurNameAndTrackingIdData>();
        ourDataList.ForEach(o => acDictionary.TryAdd(o.TrackingID, o));

        // StringBuilder is not thread-safe; guard it with a dedicated lock object rather
        // than locking on the writer, which is a resource and not a synchronisation token.
        var gate = new object();
        StringBuilder sb = new StringBuilder();
        Parallel.ForEach(theirDataList, theirData =>
        {
            foreach (var item in ahoCorasick.Search(theirData.IndexString))
            {
                var value = acDictionary[item.Word];

                // Format outside the lock so that only the append itself is serialised.
                var row = $"{value.ID},{value.Name},{value.TrackingID},{theirData.IndexString},{theirData.Clicks},{theirData.Impressions},{theirData.DateStamp}";
                lock (gate)
                {
                    sb.AppendLine(row);
                }
            }
        });

        output.WriteLine(sb.ToString());
    }

    sw.Stop();
    MessageBox.Show(
        $"This algorithm took {sw.ElapsedMilliseconds} ms.",
        "Algorithm Performance");
}
/// <summary>
/// Several overlapping words are all reported, in the order the automaton yields them.
/// </summary>
public void SearchMultipleTest()
{
    var expected = new WordMatchList
    {
        { 1, "he" },
        { 2, "eye" },
        { 6, "iris" },
        { 8, "is" },
        { 11, "iris" },
        { 13, "is" },
        { 14, "si" },
        { 15, "is" }
    };
    var automaton = new AhoCorasick("her", "their", "eye", "iris", "he", "is", "si");

    var actual = automaton.Search("theye iris irisis").ToList();

    CollectionAssert.AreEqual(expected, actual);
}
/// <summary>
/// With <c>CharComparer.OrdinalIgnoreCase</c> the upper-case 'C' in the text matches the
/// lower-case "c" word, and "bC" is reported with the casing it was added with.
/// </summary>
public void OrdinalIgnoreCaseTest()
{
    var expected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bC" },
        { 2, "c" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" }
    };
    var automaton = new AhoCorasick(
        CharComparer.OrdinalIgnoreCase,
        "a", "ab", "bab", "bC", "bca", "c", "caa");

    var actual = automaton.Search("abCcab").ToList();

    CollectionAssert.AreEqual(expected, actual);
}
/// <summary>
/// With the default comparer the upper-case 'C' at index 2 does not match the
/// lower-case "c" word, so only the 'c' at index 3 is reported.
/// </summary>
public void UpperCaseTest()
{
    var expected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bC" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" }
    };
    var automaton = new AhoCorasick("a", "ab", "bab", "bC", "bca", "c", "caa");

    var actual = automaton.Search("abCcab").ToList();

    CollectionAssert.AreEqual(expected, actual);
}
/// <summary>
/// Reproduces the worked example from
/// http://blog.ivank.net/aho-corasick-algorithm-in-as3.html
/// </summary>
public void SearchIvankTest()
{
    var expected = new WordMatchList
    {
        { 0, "take" },
        { 9, "sofa" },
        { 11, "fast" },
        { 19, "sofa" },
        { 23, "take" },
        { 34, "sofa" },
        { 36, "fast" },
        { 39, "take" }
    };
    var automaton = new AhoCorasick("take", "fast", "sofa");

    var actual = automaton
        .Search("takeso fasofast fassofatake sosso sofastake so")
        .ToList();

    CollectionAssert.AreEqual(expected, actual);
}
/// <summary>
/// Reproduces the worked example from
/// https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
/// </summary>
public void SearchWikipediaTest()
{
    var expected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bc" },
        { 2, "c" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" }
    };
    var automaton = new AhoCorasick("a", "ab", "bab", "bc", "bca", "c", "caa");

    var actual = automaton.Search("abccab").ToList();

    CollectionAssert.AreEqual(expected, actual);
}
/// <summary>
/// Collects the expressions triggered by the words the automaton finds in <paramref name="text"/>.
/// </summary>
/// <param name="groupByLonguestCommonExpression">
/// Candidate expressions keyed by the word searched for; every word the automaton can
/// report must be present as a key, otherwise the indexer throws.
/// </param>
/// <param name="treeAhoCorasick">Automaton used to locate the trigger words.</param>
/// <param name="text">Raw text to inspect.</param>
/// <returns>
/// One entry per hit: the first exact expression of the hit's group when there is one,
/// otherwise every rule of the group whose regular expression matches the normalised text.
/// The same expression can appear several times when its word is found several times.
/// </returns>
public static List<Expression> FindAwkwardExpressions(
    Dictionary<string, List<Expression>> groupByLonguestCommonExpression,
    AhoCorasick treeAhoCorasick,
    string text)
{
    // Normalise the input: accents removed, lower-cased, reduced to the tokens matched by
    // regexKeepWordOnly, joined by single spaces and padded with one space on each side.
    var lowered = RemoveAccents(text).ToLower();
    var tokens = regexKeepWordOnly.Matches(lowered).Select(m => m.Value);
    var normalized = " " + string.Join(' ', tokens) + " ";

    var found = new List<Expression>();
    foreach (var hit in treeAhoCorasick.Search(normalized).ToList())
    {
        var candidates = groupByLonguestCommonExpression[hit.Word];

        // An exact expression wins outright for this hit.
        var exact = candidates.FirstOrDefault(c => c.IsExactExpression);
        if (exact != null)
        {
            found.Add(exact);
            continue;
        }

        foreach (var candidate in candidates)
        {
            if (candidate.IsExactExpression)
            {
                continue;
            }

            // Lazy load Regex.
            if (candidate.Regexp == null)
            {
                candidate.Regexp = new Regex(
                    candidate.Pattern,
                    RegexOptions.Compiled | RegexOptions.IgnoreCase);
            }

            if (candidate.Regexp.IsMatch(normalized))
            {
                found.Add(candidate);
            }
        }
    }

    return found;
}
/// <summary>
/// Exercises the different ways of building an automaton: from a word list, from a
/// comparer plus a word list, incrementally via <c>Add</c> and <c>BuildFail</c>, and from a
/// culture-based comparer plus a parameter list of words.
/// </summary>
public void OverloadsTest()
{
    // Word list only.
    var fromList = new AhoCorasick(new List<string> { "a" });
    var singleHit = new WordMatchList { { 0, "a" } };
    CollectionAssert.AreEqual(singleHit, fromList.Search("a").ToList());
    Assert.AreEqual(0, fromList.Search("b").Count());

    // Comparer plus word list.
    var ignoreCaseFromList = new AhoCorasick(
        CharComparer.OrdinalIgnoreCase,
        new List<string> { "a", "ab", "bab", "bC", "bca", "c", "caa" });
    var ignoreCaseActual = ignoreCaseFromList.Search("abCcab").ToList();
    var ignoreCaseExpected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bC" },
        { 2, "c" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" }
    };
    CollectionAssert.AreEqual(ignoreCaseExpected, ignoreCaseActual);

    // Empty automaton filled by hand.
    var manual = new AhoCorasick();
    manual.Add("a");
    manual.BuildFail();
    var manualHit = new WordMatchList { { 0, "a" } };
    CollectionAssert.AreEqual(manualHit, manual.Search("a").ToList());
    Assert.AreEqual(0, manual.Search("b").Count());

    // Culture-based comparer plus individual words.
    var cultureAware = new AhoCorasick(
        CharComparer.Create(CultureInfo.InvariantCulture, true),
        "a", "ab", "bab", "bc", "bca", "c", "caa");
    var cultureActual = cultureAware.Search("abccab").ToList();
    var cultureExpected = new WordMatchList
    {
        { 0, "a" },
        { 0, "ab" },
        { 1, "bc" },
        { 2, "c" },
        { 3, "c" },
        { 4, "a" },
        { 4, "ab" }
    };
    CollectionAssert.AreEqual(cultureExpected, cultureActual);
}
/// <summary>
/// Gathers up to <c>exampleCount</c> concordance lines for every word of WordList.txt from
/// all *.txt files under the corpus directory, then writes the words that were never found
/// to NotFound.txt and the collected lines to ConcordanceLines.txt.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    string textFiles = @"D:\Corpus Related\UrduCorpus\Complete\OCR\";
    string output = @"D:\Corpus Related\UrduOCR\";
    int exampleCount = 5;
    int spaceCount = 25;

    // Word list for search; it shrinks as words reach their quota of examples.
    List<string> words = new List<string>();
    words.AddRange(File.ReadAllLines(output + "WordList.txt"));

    // Final output will be stored here.
    Dictionary<string, concLines> concordanceLinesDict = populateDict(words);

    var parallelOptions = new ParallelOptions
    {
        MaxDegreeOfParallelism = Convert.ToInt32(Math.Ceiling((Environment.ProcessorCount * 0.75) * 1.0))
    };

    // Guards every access to the shared dictionary, its line collections and toRemove:
    // none of them is thread-safe, and they are touched from the parallel loop below.
    object gate = new object();

    // Loop through all available text files.
    foreach (string file in Directory.GetFiles(textFiles, "*.txt", SearchOption.AllDirectories))
    {
        // Remove new lines and create single-line text.
        string text = Regex.Replace(File.ReadAllText(file), @"[\r\n]+", " ");
        text = Regex.Replace(text, " {2,}", " ");

        // Build the Aho-Corasick tree from the words still being looked for.
        var ac = new AhoCorasick(words);

        // Words that are present in the text.
        var results = ac.Search(text).ToList();

        // Words that reached their quota in this file; removed from the word list afterwards.
        HashSet<string> toRemove = new HashSet<string>();

        if (text != "")
        {
            // Number of results fully processed so far (progress output only).
            int j = 0;

            Parallel.ForEach(results, parallelOptions, (result) =>
            {
                string word = result.Word;
                Console.WriteLine(file + "\t" + word + "\t" + results.Count + "\t" + System.Threading.Volatile.Read(ref j) + "\t" + words.Count);

                bool wanted;
                lock (gate)
                {
                    concLines current;
                    wanted = concordanceLinesDict.TryGetValue(word, out current) && current.count < exampleCount;
                }

                if (wanted)
                {
                    // Extracted outside the lock so that this work still runs in parallel.
                    string m = getConcordance(word, text, result.Index, spaceCount);

                    lock (gate)
                    {
                        concLines tmpConcLines = concordanceLinesDict[word];

                        // Re-check the quota: another thread may have filled it while the
                        // concordance line was being extracted.
                        if (m != "" && tmpConcLines.count < exampleCount)
                        {
                            tmpConcLines.lines.Add(m);
                            tmpConcLines.count = tmpConcLines.count + 1;
                            if (tmpConcLines.count >= exampleCount)
                            {
                                // Add word for removal at the end of the loop.
                                toRemove.Add(word);
                            }
                        }

                        concordanceLinesDict[word] = tmpConcLines;
                        Console.WriteLine(word + "\t" + tmpConcLines.count);
                    }
                }

                System.Threading.Interlocked.Increment(ref j);
            });
        }

        // Remove all completed words.
        words.RemoveAll(toRemove.Contains);
    }

    // Write down words which are not found in the given list of text files.
    using (StreamWriter sw = new StreamWriter(output + "NotFound.txt"))
    {
        foreach (KeyValuePair<string, concLines> kvp in concordanceLinesDict)
        {
            if (kvp.Value.count == 0)
            {
                sw.WriteLine(kvp.Key);
            }
        }
    }

    // Write the concordance lines generated using the main method.
    concordanceLinesDict.WriteAllLines(output + "ConcordanceLines.txt");
}
/// <summary>
/// Looks for the given keywords, or their synonyms, in the first input text, falling back
/// to a per-word fuzzy comparison when there is no exact hit.
/// </summary>
/// <param name="keywords">Keywords to look for; each is expanded through <c>GetSynonyms</c>.</param>
/// <param name="inputs">Texts to search.</param>
/// <returns>
/// The distinct exact matches ordered by index, converted by <c>ToCustomWordMatch</c>; or,
/// when there are none, one match per (word, keyword) pair whose Levenshtein distance is
/// within the configured <c>SearchFuzziness</c>. An empty list when <paramref name="inputs"/>
/// is empty.
/// </returns>
public IEnumerable<CustomWordMatch> Search(List<string> keywords, List<string> inputs)
{
    // A keyword without synonyms is searched as-is; otherwise every word of every synonym
    // group returned for it is searched.
    List<string> words = new List<string>();
    List<Synonyms> synonyms = new List<Synonyms>();
    foreach (var k in keywords)
    {
        var syns = GetSynonyms(k);
        if (syns.Count == 0)
        {
            words.Add(k);
        }
        else
        {
            synonyms.AddRange(syns);
            words.AddRange(syns.SelectMany(s => s.Words).Distinct());
        }
    }

    var query = new AhoCorasick(words);

    // NOTE(review): both branches below return, so only the first element of inputs is ever
    // examined. Confirm whether the remaining inputs are meant to be searched as well.
    foreach (var i in inputs)
    {
        // Materialise once: the result is both tested for emptiness and passed on.
        var matches = query.Search(i.ToLower()).ToList();
        if (matches.Count > 0)
        {
            return ToCustomWordMatch(
                matches
                    .Distinct(WordMatchComparer.Instance)
                    .OrderBy(m => m.Index),
                keywords,
                synonyms);
        }

        // No exact hit: compare each space-separated word of the (non-lower-cased) input
        // with each keyword.
        var fuzziness = ConfigManager.GetConfig().SearchFuzziness;
        List<CustomWordMatch> fuzzyMatches = new List<CustomWordMatch>();
        var i_split = i.Split(' ');
        for (int k = 0; k < i_split.Length; ++k)
        {
            var w = i_split[k];
            foreach (var keyword in keywords)
            {
                if (LevenshteinDistance(w, keyword) <= fuzziness)
                {
                    fuzzyMatches.Add(
                        new CustomWordMatch(new WordMatch()
                        {
                            Index = FindWordIndexFromSplit(k, w, i_split),
                            Word = w
                        }));
                }
            }
        }

        return fuzzyMatches;
    }

    return new List<CustomWordMatch>();
}