/// <summary>
/// Builds a normalized form of <paramref name="input"/> by joining, with single spaces,
/// the dictionary forms of the entries the analyzer produces for it.
/// </summary>
/// <param name="analyzer">Analyzer used to split the input into entries.</param>
/// <param name="input">Text to normalize.</param>
/// <returns>
/// The dictionary forms of all entries that are regular and pass <c>IsNotStopWord</c>,
/// in analysis order, separated by a single space.
/// </returns>
public static string Normalize(this IMorphologicalAnalyzer<IpadicEntry> analyzer, string input)
        {
            // Only regular entries that are not stop words contribute to the result.
            var dictionaryForms =
                from entry in analyzer.ParseToEntries(input)
                where entry.IsRegular && IsNotStopWord(entry)
                select entry.DictionaryForm;

            return string.Join(" ", dictionaryForms);
        }
Пример #2
0
        /// <summary>
        /// Groups the regular entries the analyzer produces for <paramref name="input"/> into
        /// lists, starting a new list for every line break found between consecutive entries.
        /// </summary>
        /// <param name="input">
        /// Text to analyze. Null, empty or whitespace-only input produces an empty sequence.
        /// </param>
        /// <returns>
        /// A lazily evaluated sequence of word lists. One list is yielded per line break
        /// (so consecutive line breaks yield empty lists), followed by the final list if it
        /// is not empty.
        /// </returns>
        public IEnumerable<IEnumerable<WordInfo>> BreakIntoSentences(string input)
        {
            if (string.IsNullOrWhiteSpace(input))
            {
                yield break;
            }

            var entries = analyzer.ParseToEntries(input)
                          .Where(a => a.IsRegular);

            var list = new List<WordInfo>();

            // Index just past the most recently located surface form. Searching from here
            // (rather than from the start of the previous match) prevents a repeated or
            // prefix-sharing token from being matched at the previous token's position,
            // which would hide line breaks lying between the two tokens.
            int searchFrom = 0;

            foreach (var entry in entries)
            {
                int found = input.IndexOf(entry.SurfaceForm, searchFrom, StringComparison.Ordinal);

                // When the surface form cannot be located, keep the current position and
                // attach the entry to the current list instead of feeding -1 back into
                // IndexOf, which would throw ArgumentOutOfRangeException.
                if (found >= 0)
                {
                    // Count line breaks in the gap between the previous entry and this one;
                    // all line-ending styles are normalized to '\n' before counting.
                    var newlines = input.SubstringFromTo(searchFrom, found)
                                   .ReplaceLineEndings("\n")
                                   .Count(c => c == '\n');
                    for (int i = 0; i < newlines; i++)
                    {
                        yield return list;

                        list = new List<WordInfo>();
                    }

                    searchFrom = found + entry.SurfaceForm.Length;
                }

                list.Add(Map(entry));
            }

            if (list.Count != 0)
            {
                yield return list;
            }

            // Converts an analyzer entry into a WordInfo, falling back to a dictionary
            // lookup for the reading when the entry itself carries none.
            WordInfo Map(IEntry word)
            {
                var reading = word.Reading ??
                              lookup.Lookup(word.DictionaryForm ?? word.SurfaceForm)?.FirstOrDefault()
                              ?.ReadingEntries.First().Reading;

                return new WordInfo(
                    word.SurfaceForm,
                    word.PartOfSpeech,
                    word.DictionaryForm,
                    // Entries flagged as pronouns are reported as pronouns; otherwise the
                    // entry's own type is used.
                    word.GetPartOfSpeechInfo().Contains(PartOfSpeechInfo.Pronoun)
                        ? Option.Some(EdictPartOfSpeech.pn)
                        : word.Type,
                    reading,
                    // Only Unidic entries expose a dictionary-form reading.
                    word is UnidicEntry unidicEntry
                        ? unidicEntry.DictionaryFormReading
                        : null);
            }
        }
        /// <summary>
        /// Surrounds the part of <paramref name="haystack"/> that lines up best with
        /// <paramref name="needle"/> with a highlighter string.
        /// </summary>
        /// <param name="analyzer">Analyzer used to split both texts into entries.</param>
        /// <param name="haystack">Sentence to search in.</param>
        /// <param name="needle">Text to look for.</param>
        /// <returns>
        /// The rebuilt sentence together with the highlighter that was inserted. When either
        /// text has no regular entries, the unchanged <paramref name="haystack"/> and a null
        /// highlighter are returned. Otherwise the sentence is the concatenation of the surface
        /// forms of the haystack's regular entries, with the highlighter placed before and
        /// after the selected run.
        /// </returns>
        public static (string sentence, string? highlighter) Highlight(this IMorphologicalAnalyzer<IpadicEntry> analyzer, string haystack, string needle)
        {
            var needleMorphemes = analyzer.ParseToEntries(needle).Where(e => e.IsRegular).ToList();
            var haystackMorphemes = analyzer.ParseToEntries(haystack).Where(e => e.IsRegular).ToList();

            if (needleMorphemes.Count == 0 || haystackMorphemes.Count == 0)
            {
                return (haystack, null);
            }

            // For every starting offset in the haystack, count the positions at which the
            // haystack and the needle share a dictionary form when aligned at that offset.
            var matchCounts = Enumerable
                              .Range(0, haystackMorphemes.Count)
                              .Select(offset => Enumerable
                                                .Zip(haystackMorphemes.Skip(offset), needleMorphemes)
                                                .Count(pair => pair.First.DictionaryForm == pair.Second.DictionaryForm))
                              .ToList();

            // The offset with the highest count is where the highlight begins; the count
            // itself is used as the number of entries to highlight.
            var bestStart = EnumerableExt.MaxBy(matchCounts.Indexed(), p => p.element).index;
            var bestLength = matchCounts[bestStart];

            var highlighter = GetHighlighterFor(haystack);
            var surfaces = haystackMorphemes.Select(m => m.SurfaceForm).Materialize();

            var before = surfaces.Take(bestStart);
            var marked = surfaces.Skip(bestStart).Take(bestLength);
            var after = surfaces.Skip(bestStart + bestLength);

            var pieces = before
                         .Append(highlighter)
                         .Concat(marked)
                         .Append(highlighter)
                         .Concat(after);

            return (string.Concat(pieces), highlighter);
        }
Пример #4
0
        /// <summary>
        /// Converts <paramref name="input"/> to romaji. The text is split into regular
        /// entries, each entry's pronunciation (or its surface form when no pronunciation is
        /// available) is transliterated character by character, and the resulting words are
        /// separated by single spaces.
        /// </summary>
        /// <param name="input">Text to convert.</param>
        /// <returns>
        /// The transliterated text. Characters for which no romaji is found are copied
        /// through unchanged.
        /// </returns>
        public string ToRomaji(string input)
        {
            var words = mecab.ParseToEntries(input)
                        .Where(entry => entry.IsRegular)
                        .Select(entry => entry.Pronunciation ?? entry.SurfaceForm)
                        .ToList();

            bool first = true;
            var sb = new StringBuilder();

            // Pair every word with its successor (the last word is paired with "") so that a
            // word-final small tsu can look at the first character of the next word.
            foreach (var (word, nextWord) in Utility.Utils.EnumerableExt.Zip(words, words.Skip(1).Concat(EnumerableExt.OfSingle(""))))
            {
                if (!first)
                {
                    sb.Append(" ");
                }
                first = false;

                for (int i = 0; i < word.Length; ++i)
                {
                    var c = word[i];
                    var hasNext = i + 1 < word.Length;
                    var n = hasNext ? word[i + 1] : '\0';

                    // Long-vowel mark: repeat the previously emitted letter, writing a long
                    // 'o' as "ou". The builder must be non-empty to read its last letter.
                    if (c == 'ー' && i > 0 && sb.Length > 0)
                    {
                        var lastLetter = sb[sb.Length - 1];
                        if (lastLetter == 'o')
                        {
                            lastLetter = 'u';
                        }
                        sb.Append(lastLetter);
                        continue;
                    }

                    // Small tsu: double the first letter of the following character's romaji.
                    if (c == 'っ' || c == 'ッ')
                    {
                        var next = hasNext ? n : nextWord.ElementAtOrDefault(0);
                        var r = next != '\0' ? props.LookupRomaji(next.ToString()) : null;

                        // The lookup can come back empty-handed (other call sites in this
                        // method handle a null result); in that case fall through to the
                        // generic handling below instead of dereferencing null.
                        if (!string.IsNullOrEmpty(r))
                        {
                            sb.Append(r[0]);
                            continue;
                        }
                    }

                    // Syllabic n followed by a vowel: insert an apostrophe between the two
                    // and consume both characters.
                    if ((c == 'ん' || c == 'ン') &&
                        hasNext &&
                        "あいうえおアイウエオ".Contains(n.ToString()))
                    {
                        sb.Append(props.LookupRomaji(c.ToString()) ?? c.ToString());
                        sb.Append('\'');
                        sb.Append(props.LookupRomaji(n.ToString()) ?? n.ToString());
                        ++i;
                        continue;
                    }

                    // Try the two-character combination first; only meaningful when a next
                    // character exists, so the lookup is skipped at the end of a word.
                    var romaji = hasNext
                        ? props.LookupRomaji(new string(new[] { c, n }))
                        : null;
                    if (romaji != null)
                    {
                        sb.Append(romaji);
                        i++;
                    }
                    else
                    {
                        sb.Append(props.LookupRomaji(c.ToString()) ?? c.ToString());
                    }
                }
            }

            return sb.ToString();
        }