/// <summary>
/// Projects every element of <paramref name="seq"/> to a key and a value and
/// collects the results into a new <c>Pairs</c> instance.
/// </summary>
/// <param name="seq">Source sequence; it is enumerated exactly once.</param>
/// <param name="getKey">Produces the key for an element (invoked before <paramref name="getValue"/>).</param>
/// <param name="getValue">Produces the value for an element.</param>
/// <returns>A <c>Pairs</c> holding one entry per source element, added in enumeration order.</returns>
public static Pairs<TKey, TValue> ToPairs<T, TKey, TValue>(this IEnumerable<T> seq, Func<T, TKey> getKey, Func<T, TValue> getValue)
{
    var collected = new Pairs<TKey, TValue>();

    foreach (var element in seq)
    {
        // Key first, then value - the same evaluation order as a single Add(getKey(..), getValue(..)) call.
        var key = getKey(element);
        var value = getValue(element);
        collected.Add(key, value);
    }

    return collected;
}
/// <summary>
/// Searches <paramref name="words"/> for affix substitutions ("replace affix A with affix B,
/// or drop A entirely") that mostly turn one known word into another known word, prints every
/// accepted substitution to the console and writes the list to "prefix.json" or "suffix.json"
/// in the directory held by the <c>dir</c> member (declared outside this method).
/// </summary>
/// <param name="words">Word list that is both the source of affixes and the set of valid target words.</param>
/// <param name="isPrefixNotSuffix">
/// <c>true</c> to glue the replacement affix in front of the base, <c>false</c> to append it.
/// Also selects the "prefix"/"suffix" label used in the console output and the file name.
/// </param>
/// <param name="toBase">Given a word and an affix length, returns the word without its affix.</param>
/// <param name="toAffix">Given a word and an affix length, returns the affix of that length.</param>
public void CalcAffixSubstitutions(string[] words, bool isPrefixNotSuffix, Func<string, int, string> toBase, Func<string, int, string> toAffix)
{
    const int minLenRemaining = 1;          // characters that must remain once the affix is taken away
    const int minRatio = 2;                 // accept when successes >= failures * minRatio
    const int minPopularAffixesCount = 30;  // an affix must occur in at least this many words
    const int maxAffixLen = 12;

    var knownWords = new HashSet<string>(words);

    // affix -> all words that carry it; only sufficiently popular affixes are kept.
    var wordsByAffix = new Dictionary<string, string[]>();
    for (var len = 1; len <= maxAffixLen; len++)
    {
        var popular = words
            .Where(w => w.Length - len >= minLenRemaining)
            .GroupBy(w => toAffix(w, len))
            .Select(g => new KeyValuePair<string, string[]>(g.Key, g.ToArray()))
            .Where(p => p.Value.Length >= minPopularAffixesCount)
            .OrderByDescending(p => p.Value.Length)
            .ToList();

        // Within one length the most frequent affixes are registered first.
        foreach (var entry in popular)
        {
            wordsByAffix.Add(entry.Key, entry.Value);
        }
    }

    var affixes = wordsByAffix.Keys.ToArray();
    var accepted = new Pairs<string, string>();
    var result = new List<SubstData>();
    var name = isPrefixNotSuffix ? "prefix" : "suffix";

    for (var i = 0; i < affixes.Length; i++)
    {
        var affixFrom = affixes[i];
        var sourceWords = wordsByAffix[affixFrom];

        for (var j = 0; j < affixes.Length; j++)
        {
            // Pairing an affix with itself stands for "remove the affix".
            var affixTo = j == i ? string.Empty : affixes[j];
            if (affixFrom.Length < affixTo.Length)
            {
                continue;
            }

            // Count how often the substitution lands on a known word.
            var okCount = 0;
            var failCount = 0;
            foreach (var wordFrom in sourceWords)
            {
                var stem = toBase(wordFrom, affixFrom.Length);
                var candidate = isPrefixNotSuffix ? affixTo + stem : stem + affixTo;
                if (knownWords.Contains(candidate))
                {
                    okCount++;
                }
                else
                {
                    failCount++;
                }
            }

            if (okCount < failCount * minRatio)
            {
                continue;
            }

            // Reject the pair when an already accepted substitution is obtained by trimming the
            // same number of characters from the base-side end of both affixes.
            var alreadyCovered = false;
            var cutLen = Math.Min(affixFrom.Length, affixTo.Length);
            for (var c = 1; c <= cutLen && !alreadyCovered; c++)
            {
                string aFrom;
                string aTo;
                if (isPrefixNotSuffix)
                {
                    aFrom = affixFrom.Substring(0, affixFrom.Length - c);
                    aTo = affixTo.Substring(0, affixTo.Length - c);
                }
                else
                {
                    aFrom = affixFrom.Substring(c);
                    aTo = affixTo.Substring(c);
                }

                alreadyCovered = accepted.ContainsKey(aFrom)
                    && accepted.GetValuesByKey(aFrom).Any(v => v == aTo);
            }

            if (alreadyCovered)
            {
                continue;
            }

            accepted.Add(affixFrom, affixTo);

            // Floating-point division: a zero failure count gives infinity, not an exception.
            var okRatio = ((double)okCount / failCount).ToString("0.00");
            Console.WriteLine($"{name} subst: {{\"{affixFrom}\", \"{affixTo}\"}}, // Ok {okCount}, fail {failCount}, r {okRatio}");

            var record = new SubstData
            {
                SuffixFrom = affixFrom,
                SuffixTo = affixTo,
                Ok = okCount,
                Fail = failCount,
            };
            result.Add(record);
        }
    }

    var outputPath = Path.Combine(dir, $"{name}.json");
    File.WriteAllText(outputPath, jsonSerializer.SerializeUserFriendly(result));
}
/// <summary>
/// Searches <paramref name="words"/> for infix substitutions ("replace inner fragment A with
/// fragment B, or delete A") that mostly turn one known word into another known word, prints
/// progress and every accepted substitution to the console, and writes the accepted list to
/// "infix.json" in the directory held by the <c>dir</c> member (declared outside this method).
/// </summary>
/// <param name="words">Word list that is both the source of infixes and the set of valid target words.</param>
public void CalcInfixSubstitutions(string[] words)
{
    var wordsHash = words.ToHashSet();
    const int minLenRemaining = 3;   // characters that must remain besides the infix
    const int minRatio = 3;          // accept when successes >= failures * minRatio
    const int minPopularCount = 50;  // an infix needs at least this many recorded occurrences

    // infix -> words containing it. Only strictly interior fragments are collected: the start
    // index begins at 1 and the fragment must end before the last character of the word.
    var ffixesDict = new DictionaryList<string, string>();
    for (var len = 1; len <= 8; len++)
    {
        foreach (var word in words)
        {
            if (word.Length - len >= minLenRemaining)
            {
                for (var i = 1; i + len < word.Length; i++)
                {
                    ffixesDict.AddToList(word.Substring(i, len), word);
                }
            }
        }
    }

    ffixesDict.RemoveWhere((k, v) => v.Count < minPopularCount);

    var infixes = ffixesDict.Keys.OrderBy(s => s).ToArray();
    var result = new List<SubstData>();

    for (var i = 0; i < infixes.Length; i++)
    {
        var infixFrom = infixes[i];
        Console.WriteLine(infixFrom);

        // The word list depends only on infixFrom, so it is looked up once per outer iteration.
        var wordsWithInfix = ffixesDict[infixFrom];

        for (var j = 0; j < infixes.Length; j++)
        {
            // Pairing an infix with itself stands for "delete the infix".
            var infixTo = i == j ? "" : infixes[j];
            if (infixTo.Length > infixFrom.Length)
            {
                continue;
            }

            var substOk = 0;
            var substFail = 0;
            foreach (var wordFrom in wordsWithInfix)
            {
                // Not very precise: every occurrence in the middle part is replaced at once,
                // and these simultaneous replacements may affect the result.
                // The first and last characters are kept out of the replacement.
                var first = wordFrom[0];
                var middle = wordFrom.Substring(1, wordFrom.Length - 2);
                var last = wordFrom[wordFrom.Length - 1];
                var wordTo = first.ToString() + middle.Replace(infixFrom, infixTo) + last.ToString();
                if (wordsHash.Contains(wordTo))
                {
                    substOk++;
                }
                else
                {
                    substFail++;
                }
            }

            if (substOk >= substFail * minRatio)
            {
                // Floating-point division: a zero failure count gives infinity, not an exception.
                var okRatio = ((double)substOk / substFail).ToString("0.00");
                Console.WriteLine($"Infix subst: {{\"{infixFrom}\", \"{infixTo}\"}}, // Ok {substOk}, fail {substFail}, r {okRatio}");
                result.Add(new SubstData
                {
                    SuffixFrom = infixFrom,
                    SuffixTo = infixTo,
                    Ok = substOk,
                    Fail = substFail,
                });
            }
        }
    }

    File.WriteAllText(Path.Combine(dir, "infix.json"), jsonSerializer.Serialize(result));
}