Ejemplo n.º 1
0
        public async Task SetupAsync()
        {
            SimilarRadicals = new SafeMap <NiaiSimilarRadicalsEntry>();

            var fileInfo = await _dictionaryProvider.GetNiaiSimilarRadicalsAsync();

            using var fs = fileInfo.OpenRead();
            using var sr = new StreamReader(fs, Encoding.UTF8);

            NiaiSimilarRadicalsEntry GetEntry(string radical)
            {
                if (!SimilarRadicals.TryGetValue(radical, out var entry1))
                {
                    entry1 = SimilarRadicals[radical] = new NiaiSimilarRadicalsEntry
                    {
                        Radical = radical,
                        Similar = new System.Collections.Generic.List <(string Radical, double Score)>(),
                    };
                }
                return(entry1);
            }

            while (!sr.EndOfStream)
            {
                var line = await sr.ReadLineAsync();

                if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#"))
                {
                    continue;
                }

                // Format is: R1 R2 Score
                var r1    = line[0].ToString();
                var r2    = line[2].ToString();
                var score = double.Parse(line[4..]);

                var e1 = GetEntry(r1);
                var e2 = GetEntry(r2);

                if (e1.Similar.Any(x => x.Radical == r2))
                {
                    Console.WriteLine($"Found duplicates in niai/similar-radicals.txt: {r1} {r2}");
                    continue;
                }

                e1.Similar.Add((r2, score));
                e2.Similar.Add((r1, score));
            }
Ejemplo n.º 2
0
        /// <summary>
        /// Scores all similar kanjis of all models.
        /// How we score: percentage of shared radicals over the max of the total number of radicals.
        ///	This helps create distance between 2 similar kanjis having same number
        ///	of shared radicals but one is more complex than the other (i.e has more non-shared radicals).
        /// </summary>
        public Task ScoreCoreAsync()
        {
            var p            = _kradFileProvider;
            var model        = new SafeMap <List <SimilarKanji> >();
            var processedMap = new SafeMap <SafeMap <bool> >();

            // Score

            List <SimilarKanji> GetSimilarList(string kanji)
            {
                if (!model.TryGetValue(kanji, out var list))
                {
                    list = model[kanji] = new List <SimilarKanji>();
                }
                return(list);
            }

            var minScore = 0.0;
            var maxScore = 0.0;

            void TrackScore(double score)
            {
                minScore = Math.Min(minScore, score);
                maxScore = Math.Max(maxScore, score);
            }

            double ComputeScore(string k1, string k2)
            {
                //var weightRadicalSimilar = 0.1;
                var weightStrokes = 0.1;

                var k1Radicals = p.Model[k1].Radicals;
                var k2Radicals = p.Model[k2].Radicals;

                var sharedRadicalCount = k1Radicals.Count(r => k2Radicals.Contains(r));
                var totalRadicalCount  = Math.Max(k1Radicals.Count, k2Radicals.Count);
                //var totalRadicalCount = k1Radicals.Count + k2Radicals.Count;

                var score = (double)sharedRadicalCount / totalRadicalCount;

                // This won't work without taking into account the places of the radicals.
                //foreach (var r1 in k1Radicals)
                //{
                //	foreach (var r2 in k2Radicals.Where(r2 => r2 != r1))
                //	{
                //		var similar = _niaiFileProvider.SimilarRadicals[r1]
                //			?.Similar
                //			.FirstOrDefault(x => x.Radical == r2);
                //		if (similar == null) continue;

                //		score += similar.Value.Score * weightRadicalSimilar;
                //	}
                //}

                var k1Strokes = _kanjiDictionaryService.Kanjis[k1].Strokes;
                var k2Strokes = _kanjiDictionaryService.Kanjis[k2].Strokes;

                score += ((double)Math.Min(k1Strokes, k2Strokes) / Math.Max(k1Strokes, k2Strokes)) * weightStrokes;

                return(score);
            }

            void ProcessPair(string k1, string k2)
            {
                if (processedMap.ContainsKey(k2))
                {
                    // The potential similar kanji was already processed, this means
                    // this pair was already processed.
                    return;
                }

                if (!processedMap.TryGetValue(k1, out var innerMap))
                {
                    innerMap = processedMap[k1] = new SafeMap <bool>();
                }
                if (innerMap.ContainsKey(k2))
                {
                    return;
                }

                var score = ComputeScore(k1, k2);

                TrackScore(score);

                var similarList1 = GetSimilarList(k1);
                var similarList2 = GetSimilarList(k2);

                similarList1.Add(new SimilarKanji
                {
                    Kanji = k2,
                    Score = score,
                });
                similarList2.Add(new SimilarKanji
                {
                    Kanji = k1,
                    Score = score,
                });

                innerMap[k2] = true;
            }

            foreach (var e in p.Model.Values)
            {
                var radicals = e.Radicals;
                foreach (var radical in radicals)
                {
                    var radicalKanjis = p.InverseModel[radical].Kanjis;
                    foreach (var potentialSimilar in radicalKanjis)
                    {
                        if (potentialSimilar == e.Kanji)
                        {
                            continue;
                        }
                        ProcessPair(e.Kanji, potentialSimilar);
                    }
                }
            }

            // Translate scores. Make sure scores are always between 0 and 1
            // even no matter what the scoring logic does.
            foreach (var m in model.Values.SelectMany(x => x))
            {
                var score = m.Score;
                m.Score = (score - minScore) / (maxScore - minScore);
            }

            foreach (var m in model.ToList())
            {
                // Sort and remove below minimum
                model[m.Key] = m.Value
                               .OrderByDescending(x => x.Score)
                               .Take(20)
                               .ToList();
            }

            _model = model;

            WriteScoreSampleFile();

            return(Task.CompletedTask);
        }