コード例 #1
0
        /// <summary>
        /// Searches <paramref name="words"/> for frequent inner substrings ("infixes") and, for every
        /// ordered pair of infixes, counts how often replacing one with the other (or deleting it)
        /// inside a word produces another word from <paramref name="words"/>. Pairs whose success
        /// count is at least <c>minRatio</c> times the failure count are printed to the console and
        /// written as serialized <c>SubstData</c> records to <c>infix.json</c>.
        /// </summary>
        /// <param name="words">The word list that serves both as the infix source and as the dictionary
        /// against which substituted words are checked.</param>
        /// <remarks>
        /// The output directory <c>dir</c> and <c>jsonSerializer</c> are not declared in this method;
        /// presumably they are members of the enclosing class (confirm against the class definition).
        /// </remarks>
        public void CalcInfixSubstitutions(string[] words)
        {
            var wordsHash = words.ToHashSet();

            const int minLenRemaining = 3;
            const int minRatio        = 3;
            const int minPopularCount = 50;

            var ffixesDict = new DictionaryList <string, string>();

            // Register every substring of length 1..8 that touches neither the first nor the last
            // character of a word, provided the word keeps at least minLenRemaining other characters.
            for (var len = 1; len <= 8; len++)
            {
                foreach (var word in words)
                {
                    if (word.Length - len >= minLenRemaining)
                    {
                        for (var i = 1; i + len < word.Length; i++)
                        {
                            ffixesDict.AddToList(word.Substring(i, len), word);
                        }
                    }
                }
            }

            // Keep only popular infixes (presumably RemoveWhere drops the entries matching the predicate).
            ffixesDict.RemoveWhere((k, v) => v.Count < minPopularCount);

            var infixes = ffixesDict.Keys.OrderBy(s => s).ToArray();
            var result  = new List <SubstData>();

            for (var i = 0; i < infixes.Length; i++)
            {
                var infixFrom = infixes[i];
                Console.WriteLine(infixFrom);

                // The split of each source word does not depend on the replacement candidate,
                // so compute it once per infixFrom instead of once per (infixFrom, infixTo) pair.
                var sourceWords = new List <string>();
                var middles     = new List <string>();

                foreach (var wordFrom in ffixesDict[infixFrom])
                {
                    sourceWords.Add(wordFrom);
                    middles.Add(wordFrom.Substring(1, wordFrom.Length - 2));
                }

                for (var j = 0; j < infixes.Length; j++)
                {
                    // Pairing an infix with itself is used to test plain deletion of the infix.
                    var infixTo = i == j ? "" : infixes[j];

                    // Only replacements that do not lengthen the infix are considered.
                    if (infixTo.Length > infixFrom.Length)
                    {
                        continue;
                    }

                    var substOk   = 0;
                    var substFail = 0;

                    for (var k = 0; k < sourceWords.Count; k++)
                    {
                        // Not very precise: Replace substitutes every occurrence in the middle part,
                        // so simultaneous replacements can affect the result.
                        var wordFrom = sourceWords[k];
                        var first    = wordFrom[0];
                        var last     = wordFrom[wordFrom.Length - 1];

                        var wordTo = first.ToString() + middles[k].Replace(infixFrom, infixTo) + last.ToString();
                        if (wordsHash.Contains(wordTo))
                        {
                            substOk++;
                        }
                        else
                        {
                            substFail++;
                        }
                    }

                    if (substOk >= substFail * minRatio)
                    {
                        // When substFail is 0 this floating-point division yields infinity rather than throwing.
                        var okRatio = ((double)substOk / substFail).ToString("0.00");
                        Console.WriteLine($"Infix subst: {{\"{infixFrom}\", \"{infixTo}\"}},  // Ok {substOk}, fail {substFail}, r {okRatio}");

                        // SubstData names its fields "Suffix*"; here they carry the infix pair.
                        result.Add(new SubstData
                        {
                            SuffixFrom = infixFrom,
                            SuffixTo   = infixTo,
                            Ok         = substOk,
                            Fail       = substFail,
                        });
                    }
                }
            }

            File.WriteAllText(Path.Combine(dir, "infix.json"), jsonSerializer.Serialize(result));
        }
コード例 #2
0
        /// <summary>
        /// Passes <paramref name="words"/> through <c>Convert</c>, serializes the result with
        /// <paramref name="jsonSerializer"/> and writes it to <c>word_features.json</c> inside
        /// <paramref name="dir"/>.
        /// </summary>
        /// <param name="jsonSerializer">Serializer used to turn the converted features into text.</param>
        /// <param name="dir">Directory in which <c>word_features.json</c> is written.</param>
        /// <param name="words">Words handed to <c>Convert</c>.</param>
        public static void Save(JsonSerializerMaster jsonSerializer, string dir, string[] words)
        {
            var features   = Convert(words);
            var targetPath = Path.Combine(dir, "word_features.json");
            var serialized = jsonSerializer.Serialize(features);

            File.WriteAllText(targetPath, serialized);
        }