/// <summary>
/// Debug helper: builds a <c>StatsBasedParser</c> over <paramref name="trgNameList"/>, asks it for the
/// closest matches of every entry in <paramref name="srcNameList"/>, and writes each
/// "source => match" pair to the debug output. Breaks into the debugger at the end when one is attached.
/// </summary>
/// <param name="srcNameList">Source names to look up, processed in order.</param>
/// <param name="trgNameList">Target names handed to the parser's constructor.</param>
public static void TestParser(string[] srcNameList, List<string> trgNameList)
{
    var parser = new StatsBasedParser(trgNameList);

    // NOTE(review): 'z' is not declared in this method (the local "int z = 0;" was commented out);
    // it presumably resolves to a static member declared elsewhere - confirm.
    foreach (var srcName in srcNameList)
    {
        // Running counter line; the increment lives inside the call so it is compiled out
        // together with the Debug.WriteLine call in non-DEBUG builds.
        Debug.WriteLine("{0}", ++z);

        foreach (var match in parser.ClosestMatches(srcName))
        {
            Debug.WriteLine("{0,64} => {1}", srcName, match);
        }
    }

    // Nothing more to do unless a debugger is listening.
    if (!Debugger.IsAttached)
    {
        return;
    }

    Debugger.Break();
}
/// <summary>
/// Returns the entries that best match <paramref name="src"/>, trying a sequence of progressively
/// looser strategies and returning as soon as one of them yields a result.
/// </summary>
/// <remarks>
/// Strategies, in order:
/// 0. a target equal to <paramref name="src"/> (ordinal, ignoring case);
/// 1. exactly one source word has <c>Usage == 1</c> in the word stats: the first target containing that word;
/// 2. several source words have <c>Usage == 1</c>: those words themselves (see the review note in the body);
/// 3. every target for which <c>ContainsAllWordsOpt</c> accepts all source words;
/// 4. every target for which <c>ContainsAllWordsOpt</c> accepts the first <c>i</c> words ordered by ascending
///    <c>Usage</c>, with <c>i</c> shrinking from the full word count down to 1.
/// On every successful strategy <c>MatchBaseCsv</c> is set to the text/words the match was based on;
/// when nothing matches it is left untouched and an empty list is returned.
/// </remarks>
/// <param name="src">Source name to look up; it is split into words with <c>OlpPresets.DelimAll</c>.</param>
/// <returns>The matches found by the first successful strategy, or an empty list.</returns>
/// <exception cref="InvalidOperationException">
/// Strategy 1 applies but no target contains the single-use word.
/// </exception>
public List<string> ClosestMatches(string src)
{
    //wp>Debug.WriteLine(src, "\n::");
    var matches = new List<string>();

    //0: if there is an exact match:
    // One ordinal-ignore-case pass both detects and fetches the match. The previous code detected with
    // OrdinalIgnoreCase but fetched with a culture-sensitive compare, which could disagree and yield null.
    //todo: remove ToLower from word stats view
    var exactMatch = _trgList.FirstOrDefault(r => string.Equals(r, src, StringComparison.OrdinalIgnoreCase));
    if (exactMatch != null)
    {
        MatchBaseCsv = exactMatch;
        matches.Add(exactMatch);
        return matches;
    }

    //1a: order by FoU (presumably "frequency of use" - the Usage value of each word stat):
    var wa = src.Split(OlpPresets.DelimAll, StringSplitOptions.RemoveEmptyEntries);

    // Materialize once: the stats are inspected several times below.
    var srcWordsByFoU = GetWordStats(wa, _wordStatCollection).ToList();
    var singleUseWords = srcWordsByFoU.Where(r => r.Usage == 1).ToList();

    //1: if there is a SINGLE match by a single word with FoU==1:
    if (singleUseWords.Count == 1)
    {
        var singleUseWord = singleUseWords[0].Word;

        // First() is kept on purpose: it throws when no target contains the word, as before.
        var trgMatch = _trgList.First(r => r.IndexOf(singleUseWord, StringComparison.OrdinalIgnoreCase) >= 0);

        matches.Add(trgMatch);
        MatchBaseCsv = string.Join("·", srcWordsByFoU.Select(r => r.Word));
        return matches;
    }

    //2: if there is more than 1 match by a single word with FoU==1:
    if (singleUseWords.Count > 1)
    {
        // NOTE(review): this branch returns the distinct source WORDS, not target names, unlike every
        // other branch. The assert below suggests it is unfinished - confirm the intended behaviour.
        foreach (var matchByFoU1 in singleUseWords)
        {
            if (!matches.Contains(matchByFoU1.Word, StringComparer.OrdinalIgnoreCase))
            {
                matches.Add(matchByFoU1.Word);
            }
        }

        Debug.Assert(matches.Count == 1, "Need more wits to find a better way to tell which one matches");
        MatchBaseCsv = string.Join("·", srcWordsByFoU.Select(r => r.Word));
        return matches;
    }

    //3: if contains all the words from src name, ignoring the org order
    var matchByAllWords = _trgList.Where(r => StatsBasedParser.ContainsAllWordsOpt(r, wa)).ToList();
    if (matchByAllWords.Count > 0)
    {
        MatchBaseCsv = string.Join("·", wa);
        return matchByAllWords;
    }

    //4: if contains the words from src name and stats array
    if (srcWordsByFoU.Count > 1)
    {
        // Ordering does not depend on the loop variable, so sort once (OrderBy is stable).
        var wordsByUsage = srcWordsByFoU.OrderBy(r => r.Usage).Select(r => r.Word).ToArray();

        for (int i = wordsByUsage.Length; i > 0; i--)
        {
            var notAllOrdered = wordsByUsage.Take(i).ToArray();
            var matchBy1stWords = _trgList
                .Where(r => StatsBasedParser.ContainsAllWordsOpt(r, notAllOrdered))
                .ToList();

            if (matchBy1stWords.Count > 0)
            {
                MatchBaseCsv = string.Join("·", notAllOrdered);
                return matchBy1stWords;
            }
        }
    }

    // Nothing matched: empty list, MatchBaseCsv unchanged.
    return matches;
}