/// <summary>
/// Searches <paramref name="words"/> for infix substitutions (replace one inner
/// substring with another, or remove it) that mostly turn a known word into
/// another known word, prints each accepted substitution to the console and
/// writes all accepted ones to "infix.json".
/// </summary>
/// <param name="words">Word list; it serves both as the source of infixes and
/// as the dictionary that substitution results are checked against.</param>
/// <remarks>
/// The output directory (<c>dir</c>) and the serializer (<c>jsonSerializer</c>)
/// are members declared outside this method.
/// </remarks>
public void CalcInfixSubstitutions(string[] words)
{
    // A substitution is accepted when successes >= failures * minRatio.
    const int minRatio = 3;

    var wordsHash = words.ToHashSet();
    var ffixesDict = CollectPopularInfixes(words);
    var infixes = ffixesDict.Keys.OrderBy(s => s).ToArray();
    var result = new List<SubstData>();

    for (var i = 0; i < infixes.Length; i++)
    {
        var infixFrom = infixes[i];
        Console.WriteLine(infixFrom);

        // The word list depends only on infixFrom, so look it up once per outer iteration.
        var wordsFrom = ffixesDict[infixFrom];

        for (var j = 0; j < infixes.Length; j++)
        {
            // The diagonal (i == j) is used to test plain removal of the infix.
            var infixTo = i == j ? "" : infixes[j];
            if (infixTo.Length > infixFrom.Length)
            {
                continue;
            }

            var substOk = 0;
            var substFail = 0;
            foreach (var wordFrom in wordsFrom)
            {
                var wordTo = ReplaceInfixInsideWord(wordFrom, infixFrom, infixTo);
                if (wordsHash.Contains(wordTo))
                {
                    substOk++;
                }
                else
                {
                    substFail++;
                }
            }

            if (substOk >= substFail * minRatio)
            {
                // substFail may be 0 here; floating-point division does not throw in that case.
                var okRatio = ((double)substOk / substFail).ToString("0.00");
                Console.WriteLine($"Infix subst: {{\"{infixFrom}\", \"{infixTo}\"}}, // Ok {substOk}, fail {substFail}, r {okRatio}");
                result.Add(new SubstData
                {
                    SuffixFrom = infixFrom,
                    SuffixTo = infixTo,
                    Ok = substOk,
                    Fail = substFail,
                });
            }
        }
    }

    File.WriteAllText(Path.Combine(dir, "infix.json"), jsonSerializer.Serialize(result));
}

/// <summary>
/// Maps every inner substring of length 1..8 to the words that contain it, then
/// applies the popularity filter (entries with fewer than 50 words are passed
/// to <c>RemoveWhere</c>).
/// </summary>
private static DictionaryList<string, string> CollectPopularInfixes(string[] words)
{
    const int minLenRemaining = 3;
    const int minPopularCount = 50;
    const int maxInfixLen = 8;

    var ffixesDict = new DictionaryList<string, string>();
    for (var len = 1; len <= maxInfixLen; len++)
    {
        foreach (var word in words)
        {
            // Only consider words that keep at least minLenRemaining characters besides the infix.
            if (word.Length - len < minLenRemaining)
            {
                continue;
            }

            // Start at 1 and stop before the final character: an infix never
            // includes the first or the last character of the word.
            for (var i = 1; i + len < word.Length; i++)
            {
                ffixesDict.AddToList(word.Substring(i, len), word);
            }
        }
    }

    ffixesDict.RemoveWhere((k, v) => v.Count < minPopularCount);
    return ffixesDict;
}

/// <summary>
/// Replaces every occurrence of <paramref name="infixFrom"/> with
/// <paramref name="infixTo"/> in the interior of <paramref name="word"/>,
/// leaving its first and last characters untouched.
/// </summary>
private static string ReplaceInfixInsideWord(string word, string infixFrom, string infixTo)
{
    // Not very precise: all occurrences are replaced at once, and simultaneous
    // replacements may affect the result.
    var first = word[0];
    var middle = word.Substring(1, word.Length - 2);
    var last = word[word.Length - 1];
    return first.ToString() + middle.Replace(infixFrom, infixTo) + last.ToString();
}
/// <summary>
/// Converts <paramref name="words"/> with <c>Convert</c>, serializes the result
/// and writes it to "word_features.json" inside <paramref name="dir"/>.
/// </summary>
/// <param name="jsonSerializer">Serializer used to produce the file contents.</param>
/// <param name="dir">Directory the output file is written to.</param>
/// <param name="words">Words passed to <c>Convert</c>.</param>
public static void Save(JsonSerializerMaster jsonSerializer, string dir, string[] words)
{
    // Keep the original evaluation order: convert, build the path, serialize, write.
    var features = Convert(words);
    var outputPath = Path.Combine(dir, "word_features.json");
    var serialized = jsonSerializer.Serialize(features);
    File.WriteAllText(outputPath, serialized);
}