예제 #1
0
        /// <summary>
        /// Projects every element of <paramref name="seq"/> to a key and a value and
        /// collects the results, in enumeration order, into a new <c>Pairs</c> instance.
        /// </summary>
        /// <param name="seq">Source sequence; enumerated exactly once.</param>
        /// <param name="getKey">Produces the key for an element; invoked before <paramref name="getValue"/>.</param>
        /// <param name="getValue">Produces the value for an element.</param>
        /// <returns>A new <c>Pairs</c> holding one entry per source element.</returns>
        public static Pairs<TKey, TValue> ToPairs<T, TKey, TValue>(this IEnumerable<T> seq, Func<T, TKey> getKey, Func<T, TValue> getValue)
        {
            var collected = new Pairs<TKey, TValue>();

            using (var cursor = seq.GetEnumerator())
            {
                while (cursor.MoveNext())
                {
                    var element = cursor.Current;

                    // Key first, then value: same call order as a single Add(getKey(x), getValue(x)).
                    var key   = getKey(element);
                    var value = getValue(element);

                    collected.Add(key, value);
                }
            }

            return collected;
        }
        /// <summary>
        /// Rewrites the interior of each word (everything except its first and last
        /// character) using a fixed table of infix substitutions.
        /// </summary>
        /// <remarks>
        /// Words shorter than three characters are left out of the result entirely.
        /// For every other word only the first rule whose key occurs in the interior is
        /// applied, but that rule replaces every occurrence of its key. The number of
        /// rewritten words is printed to the console.
        /// </remarks>
        /// <param name="words">Words to process.</param>
        /// <returns>The processed words, in input order, without the too-short ones.</returns>
        public string[] ReplaceInfix(string[] words)
        {
            // Active rules. The trailing figures are the "ok"/"fail" numbers that were
            // recorded next to each rule (presumably taken from the infix analysis output).
            // Rules that are present in the table but switched off:
            //   "cativ"  -> "cat"     (ok 61,  fail 16)
            //   "iase"   -> "iasi"    (ok 41,  fail 12)
            //   "igerou" -> "iferou"  (ok 58,  fail 18)
            //   "rrhaph" -> "tom"     (ok 39,  fail 12)
            //   "tmen"   -> ""        (ok 106, fail 29)
            var rules = new Pairs<string, string>
            {
                { "eabl", "" },          // ok 172, fail 22
                { "erativ", "erat" },    // ok 51,  fail 5
                { "rativ", "rat" },      // ok 123, fail 23
                { "vabl", "v" },         // ok 53,  fail 9
            };

            var rewritten     = new List<string>();
            var replacedCount = 0;

            for (var index = 0; index < words.Length; index++)
            {
                var word = words[index];

                // A word needs a first char, a last char and at least one interior char.
                if (word.Length >= 3)
                {
                    var head   = word.Substring(0, 1);
                    var inner  = word.Substring(1, word.Length - 2);
                    var tail   = word.Substring(word.Length - 1);
                    var output = word;

                    foreach (var rule in rules)
                    {
                        if (!inner.Contains(rule.Key))
                        {
                            continue;
                        }

                        output = string.Concat(head, inner.Replace(rule.Key, rule.Value), tail);
                        replacedCount++;
                        break; // first matching rule wins
                    }

                    rewritten.Add(output);
                }
            }

            Console.WriteLine($"Infix replace {replacedCount} words");
            return rewritten.ToArray();
        }
예제 #3
0
        /// <summary>
        /// Searches for affix substitution rules. Words are grouped by affix, and for every
        /// ordered pair of frequent affixes (plus "remove the affix") the method counts how
        /// many words carrying the first affix become another entry of
        /// <paramref name="words"/> once the affix is swapped. Accepted rules are printed to
        /// the console and written to "prefix.json" or "suffix.json" under <c>dir</c>.
        /// </summary>
        /// <param name="words">Word list; source of the affix groups and the lookup set for rewritten words.</param>
        /// <param name="isPrefixNotSuffix">
        /// <c>true</c>: the replacement affix is prepended to the base and output is labelled "prefix";
        /// <c>false</c>: it is appended and output is labelled "suffix".
        /// </param>
        /// <param name="toBase">Called as <c>toBase(word, affixFrom.Length)</c>; its result is used as the word without its affix.</param>
        /// <param name="toAffix">Called as <c>toAffix(word, len)</c>; its result is the grouping key (presumably the affix of length <c>len</c> - confirm with callers).</param>
        public void CalcAffixSubstitutions(string[] words, bool isPrefixNotSuffix, Func <string, int, string> toBase, Func <string, int, string> toAffix)
        {
            var wordsHash = new HashSet <string>(words);

            // A word qualifies for length `len` only if it is at least minLenRemaining chars longer than `len`.
            const int minLenRemaining        = 1;
            // A rule is accepted when ok >= fail * minRatio.
            const int minRatio               = 2;
            // Affix groups with fewer words than this are discarded.
            const int minPopularAffixesCount = 30;

            // affix -> all words that produced that affix
            var affixesDict = new Dictionary <string, string[]>();

            for (var len = 1; len <= 12; len++)
            {
                // NOTE(review): the OrderByDescending is only observable if the dictionaries below
                // enumerate in insertion order, which Dictionary does not document - confirm intent.
                var groups = words.Where(w => w.Length - len >= minLenRemaining)
                             .GroupBy(w => toAffix(w, len))
                             .Select(gr => new { affix = gr.Key, words = gr.ToArray() })
                             .Where(a => a.words.Length >= minPopularAffixesCount)
                             .OrderByDescending(a => a.words.Length)
                             .ToDictionary(a => a.affix, a => a.words);

                foreach (var a in groups)
                {
                    // Dictionary.Add throws if toAffix ever yields the same key for two different `len` values.
                    affixesDict.Add(a.Key, a.Value);
                }
            }

            var affixes     = affixesDict.Keys.ToArray();
            // Rules accepted so far; consulted below to suppress rules implied by a shorter accepted one.
            var affixSubsts = new Pairs <string, string>();
            var result      = new List <SubstData>();
            var name        = isPrefixNotSuffix ? "prefix" : "suffix";

            for (var i = 0; i < affixes.Length; i++)
            {
                for (var j = 0; j < affixes.Length; j++)
                {
                    var affixFrom = affixes[i];
                    // The diagonal (i == j) stands for "replace with nothing", i.e. dropping the affix.
                    var affixTo   = i == j ? "" : affixes[j];

                    // Only substitutions that do not lengthen the affix are considered.
                    if (affixTo.Length > affixFrom.Length)
                    {
                        continue;
                    }

                    var substOk   = 0;
                    var substFail = 0;

                    // Count how many rewritten words are themselves present in the word list.
                    foreach (var wordFrom in affixesDict[affixFrom])
                    {
                        var subWord = toBase(wordFrom, affixFrom.Length);

                        var wordTo = isPrefixNotSuffix
                                                        ? (affixTo + subWord)
                                                        : (subWord + affixTo);

                        if (wordsHash.Contains(wordTo))
                        {
                            substOk++;
                        }
                        else
                        {
                            substFail++;
                        }
                    }

                    if (substOk >= substFail * minRatio)
                    {
                        // Trim c characters off both affixes (from the end in prefix mode, from the
                        // start in suffix mode) and reject this rule if the trimmed pair was already accepted.
                        var cutLen = Math.Min(affixFrom.Length, affixTo.Length);
                        var isOk   = true;
                        for (var c = 1; c <= cutLen; c++)
                        {
                            var aFrom = isPrefixNotSuffix ? affixFrom.Substring(0, affixFrom.Length - c) : affixFrom.Substring(c);
                            var aTo   = isPrefixNotSuffix ? affixTo.Substring(0, affixTo.Length - c) : affixTo.Substring(c);

                            if (affixSubsts.ContainsKey(aFrom) && affixSubsts.GetValuesByKey(aFrom).Any(v => v == aTo))
                            {
                                //Console.WriteLine($"// Skip because exist: {aFrom} -> {aTo}");
                                isOk = false;
                                break;
                            }
                        }
                        if (isOk)
                        {
                            affixSubsts.Add(affixFrom, affixTo);
                            // Floating-point division: with substFail == 0 this yields infinity rather than throwing.
                            var okRatio = ((double)substOk / substFail).ToString("0.00");
                            Console.WriteLine($"{name} subst: {{\"{affixFrom}\", \"{affixTo}\"}},  // Ok {substOk}, fail {substFail}, r {okRatio}");
                            // The SuffixFrom/SuffixTo properties are used for prefixes as well.
                            result.Add(new SubstData
                            {
                                SuffixFrom = affixFrom,
                                SuffixTo   = affixTo,
                                Ok         = substOk,
                                Fail       = substFail,
                            });
                        }
                    }
                }
            }

            // `dir` and `jsonSerializer` are not declared in this method; they come from the enclosing type.
            File.WriteAllText(Path.Combine(dir, $"{name}.json"), jsonSerializer.SerializeUserFriendly(result));
        }
        /// <summary>
        /// Searches for infix substitution rules. Every substring of length 1..8 that lies
        /// strictly between a word's first and last character is indexed; infixes that occur
        /// fewer than 50 times are discarded. For every ordered pair of remaining infixes
        /// (plus "remove the infix") the method counts how many rewritten words are again
        /// members of <paramref name="words"/>, prints accepted rules to the console and
        /// writes them to "infix.json" under <c>dir</c>.
        /// </summary>
        /// <param name="words">Word list; source of the infixes and the lookup set for rewritten words.</param>
        public void CalcInfixSubstitutions(string[] words)
        {
            var knownWords = words.ToHashSet();

            const int minLenRemaining = 3;   // chars a word must have beyond the infix length
            const int minRatio        = 3;   // accept when ok >= fail * minRatio
            const int minPopularCount = 50;  // minimum number of occurrences for an infix to be kept

            // infix -> words containing it (one entry per occurrence position)
            var wordsByInfix = new DictionaryList<string, string>();

            for (var infixLen = 1; infixLen <= 8; infixLen++)
            {
                foreach (var candidate in words)
                {
                    if (candidate.Length - infixLen < minLenRemaining)
                    {
                        continue;
                    }

                    // Start positions 1 .. lastStart keep the infix clear of the first and last character.
                    var lastStart = candidate.Length - infixLen - 1;
                    for (var start = 1; start <= lastStart; start++)
                    {
                        wordsByInfix.AddToList(candidate.Substring(start, infixLen), candidate);
                    }
                }
            }

            wordsByInfix.RemoveWhere((infix, owners) => owners.Count < minPopularCount);

            var sortedInfixes = wordsByInfix.Keys.OrderBy(s => s).ToArray();
            // Accepted rules; only written to within this method.
            var accepted      = new Pairs<string, string>();
            var report        = new List<SubstData>();

            for (var fromIdx = 0; fromIdx < sortedInfixes.Length; fromIdx++)
            {
                var infixFrom = sortedInfixes[fromIdx];
                Console.WriteLine(infixFrom);

                for (var toIdx = 0; toIdx < sortedInfixes.Length; toIdx++)
                {
                    // The diagonal stands for "replace with nothing", i.e. dropping the infix.
                    var infixTo = sortedInfixes[toIdx];
                    if (toIdx == fromIdx)
                    {
                        infixTo = "";
                    }

                    // Only substitutions that do not lengthen the infix are considered.
                    if (infixTo.Length > infixFrom.Length)
                    {
                        continue;
                    }

                    var hits   = 0;
                    var misses = 0;

                    foreach (var source in wordsByInfix[infixFrom])
                    {
                        // Not very precise: Replace rewrites every occurrence at once,
                        // and those simultaneous replacements can affect the outcome.
                        var head     = source.Substring(0, 1);
                        var interior = source.Substring(1, source.Length - 2);
                        var tail     = source.Substring(source.Length - 1);

                        var rewritten = string.Concat(head, interior.Replace(infixFrom, infixTo), tail);
                        if (knownWords.Contains(rewritten))
                        {
                            hits++;
                        }
                        else
                        {
                            misses++;
                        }
                    }

                    if (hits < misses * minRatio)
                    {
                        continue;
                    }

                    accepted.Add(infixFrom, infixTo);

                    // Floating-point division: with zero misses this yields infinity rather than throwing.
                    var okRatio = ((double)hits / misses).ToString("0.00");
                    Console.WriteLine($"Infix subst: {{\"{infixFrom}\", \"{infixTo}\"}},  // Ok {hits}, fail {misses}, r {okRatio}");

                    report.Add(new SubstData
                    {
                        SuffixFrom = infixFrom,
                        SuffixTo   = infixTo,
                        Ok         = hits,
                        Fail       = misses,
                    });
                }
            }

            // `dir` and `jsonSerializer` come from the enclosing type.
            var outputPath = Path.Combine(dir, "infix.json");
            File.WriteAllText(outputPath, jsonSerializer.Serialize(report));
        }