Пример #1
0
        /// <summary>
        /// Exploratory test: runs the tagger over the Japanese sentences loaded from
        /// <c>TestDataPaths.Tanaka</c> and prints, for each sentence, the raw parse
        /// result followed by the surface and feature string of every node that has
        /// a non-null feature.
        /// </summary>
        /// <remarks>
        /// The test calls <c>Assert.Fail()</c> as soon as the 20th node (counted across
        /// all sentences) has been visited, so it never passes on a corpus of that size.
        /// NOTE(review): presumably this is intentional, to stop early and surface the
        /// console output in the test runner - confirm before relying on this test.
        /// </remarks>
        public void Tanaka()
        {
            var sentences = new Tanaka(TestDataPaths.Tanaka, Encoding.UTF8).AllSentences();

            // Number of nodes visited so far, across all sentences.
            var visitedNodes = 0;

            foreach (var rawSentence in sentences.Select(s => s.JapaneseSentence))
            {
                Console.WriteLine(tagger.Parse(rawSentence));

                foreach (var morpheme in tagger.ParseToNodes(rawSentence))
                {
                    var feature = morpheme.Feature;
                    if (feature != null)
                    {
                        Console.WriteLine($"{morpheme.Surface} {feature}");
                    }

                    visitedNodes++;
                    if (visitedNodes == 20)
                    {
                        // Deliberate hard stop after a small sample of nodes.
                        Assert.Fail();
                    }
                }
            }
        }
Пример #2
0
        /// <summary>
        /// Lazily segments <paramref name="sentence"/> with the tagger and yields one
        /// <c>MecabWordInfo</c> for every node whose <c>CharType</c> is greater than zero.
        /// </summary>
        /// <param name="sentence">Text to segment.</param>
        /// <returns>
        /// A deferred sequence of word entries in tagger order. <c>Kana</c> is a single
        /// space when no reading applies: the feature string has fewer than eight
        /// comma-separated fields, the part of speech is "記号", or the surface is
        /// already hiragana or katakana according to <c>WanaKana</c>.
        /// </returns>
        public IEnumerable <MecabWordInfo> MecabWordEnumerable(string sentence)
        {
            foreach (var node in tagger.ParseToNodes(sentence))
            {
                if (node.CharType <= 0)
                {
                    continue;
                }

                // Split always yields at least one element, so index 0 is safe;
                // every higher index has to be length-checked.
                var features     = node.Feature.Split(',');
                var partOfSpeech = features[0];

                // Symbols and surfaces that are already kana do not need a reading.
                var readingNotNeeded = partOfSpeech == "記号" ||
                                       WanaKana.IsHiragana(node.Surface) ||
                                       WanaKana.IsKatakana(node.Surface);

                // Garbled input can produce a feature string with too few fields, so
                // both the description (index 1) and the reading (index 7) are guarded.
                var hasReading = features.Length >= 8;

                yield return new MecabWordInfo
                {
                    Word         = node.Surface,
                    PartOfSpeech = partOfSpeech,
                    Description  = features.Length > 1 ? features[1] : string.Empty,
                    Feature      = node.Feature,
                    Kana         = (!readingNotNeeded && hasReading) ? features[7] : " "
                };
            }
        }
Пример #3
0
        /// <summary>
        /// Processes a sentence: segments it with the tagger and returns the resulting words.
        /// </summary>
        /// <param name="sentence">Sentence to segment.</param>
        /// <returns>
        /// One <c>MecabWordInfo</c> per node whose <c>CharType</c> is greater than zero,
        /// in tagger order. <c>Kana</c> is only assigned when the feature string has at
        /// least eight comma-separated fields; <c>Description</c> is an empty string when
        /// the feature string has no second field.
        /// </returns>
        public List <MecabWordInfo> SentenceHandle(string sentence)
        {
            var words = new List<MecabWordInfo>();

            foreach (var node in Tagger.ParseToNodes(sentence))
            {
                if (node.CharType <= 0)
                {
                    continue;
                }

                // Split always yields at least one element, so index 0 is safe;
                // every higher index has to be length-checked.
                var features = node.Feature.Split(',');

                var word = new MecabWordInfo
                {
                    Word         = node.Surface,
                    PartOfSpeech = features[0],
                    Description  = features.Length > 1 ? features[1] : string.Empty,
                    Feature      = node.Feature
                };

                // Guards against garbled text reaching the tagger, in which case the
                // feature string is too short to contain a kana reading.
                if (features.Length >= 8)
                {
                    word.Kana = features[7];
                }

                words.Add(word);
            }

            return words;
        }
Пример #4
0
        /// <summary>
        /// Converts one sentence unit to romaji by tagging it and converting node by node.
        /// </summary>
        /// <param name="str">Text to convert.</param>
        /// <param name="isSpace">
        /// When <c>true</c>, a single space is appended after a converted token unless the
        /// following node has six or fewer feature fields or is tagged "記号"/"補助記号".
        /// Custom-dictionary results, punctuation and English tokens never get a separator.
        /// </param>
        /// <returns>The converted text, with at most one trailing space removed.</returns>
        public static string UnitToRomaji(string str, bool isSpace)
        {
            var nodes  = _tagger.ParseToNodes(str);
            var result = new System.Text.StringBuilder();

            foreach (var item in nodes)
            {
                // The separator depends on what FOLLOWS the current node: nothing is
                // inserted before symbols or before nodes with a short feature list
                // (including the case where there is no next node at all).
                var nextFeatures  = item.Next?.Feature?.Split(',') ?? new string[] { };
                var suppressSpace = !isSpace
                                    || nextFeatures.Length <= 6
                                    || nextFeatures[0] == "記号"
                                    || nextFeatures[0] == "補助記号";
                var space = suppressSpace ? "" : " ";

                if (item.CharType > 0)
                {
                    // Split always yields at least one element, so features[0] is safe.
                    var features = item.Feature.Split(',');

                    if (TryCustomConvert(item.Surface, out var customResult))
                    {
                        // User-defined dictionary entry.
                        result.Append(customResult);
                    }
                    else if (features[0] != "助詞" && IsJapanese(item.Surface))
                    {
                        // Pure kana (particles are excluded and fall through).
                        result.Append(WanaKana.ToRomaji(item.Surface)).Append(space);
                    }
                    else if (features.Length <= 6 || features[0] == "補助記号")
                    {
                        // Punctuation / symbols: copied through unchanged.
                        result.Append(item.Surface);
                    }
                    else if (IsEnglish(item.Surface))
                    {
                        // English text: copied through unchanged.
                        result.Append(item.Surface);
                    }
                    else
                    {
                        // Kanji (and anything else): convert the reading stored in the
                        // feature field selected by ChooseIndexByType for this part of speech.
                        result.Append(WanaKana.ToRomaji(features[ChooseIndexByType(features[0])]))
                              .Append(space);
                    }
                }
                else if (item.Stat != MeCabNodeStat.Bos)
                {
                    result.Append(item.Surface).Append(space);
                }
            }

            // Drop the single separator that may have been appended after the last token.
            if (result.Length > 0 && result[result.Length - 1] == ' ')
            {
                result.Length--;
            }

            return result.ToString();
        }
Пример #5
0
        /// <summary>
        /// Tags <paramref name="text"/> and projects every node to a <c>TMeCabEntry</c>
        /// via <c>ToEntry</c>. The projection is deferred until the result is enumerated.
        /// </summary>
        /// <param name="text">Text to tag.</param>
        /// <returns>
        /// One entry per node. The feature is passed to <c>ToEntry</c> as an optional
        /// value that is only present for nodes whose <c>Stat</c> is neither
        /// <c>Bos</c> nor <c>Eos</c>.
        /// </returns>
        public IEnumerable <TMeCabEntry> ParseToEntries(string text)
        {
            return tagger
                   .ParseToNodes(text)
                   .Select((MeCabNode node) =>
                   {
                       var surface = node.Surface;

                       // Sentence-boundary nodes carry no usable feature string.
                       var feature = node
                                     .SomeWhen(n => n.Stat != MeCabNodeStat.Eos && n.Stat != MeCabNodeStat.Bos)
                                     .Map(n => n.Feature);

                       return ToEntry(surface, feature);
                   });
        }
Пример #6
0
        /// <summary>
        /// Processes a sentence: segments it with the tagger and returns the resulting words.
        /// </summary>
        /// <param name="sentence">Sentence to segment.</param>
        /// <returns>
        /// One <c>MecabWordInfo</c> per node whose <c>CharType</c> is greater than zero,
        /// in tagger order. <c>Kana</c> is an empty string for symbols ("記号") and for
        /// surfaces that are already hiragana or katakana according to <c>WanaKana</c>;
        /// otherwise it is the eighth feature field when the feature string has one, and
        /// is left unassigned when it does not. <c>Description</c> is an empty string when
        /// the feature string has no second field.
        /// </returns>
        public List <MecabWordInfo> SentenceHandle(string sentence)
        {
            var words = new List<MecabWordInfo>();

            foreach (var node in tagger.ParseToNodes(sentence))
            {
                if (node.CharType <= 0)
                {
                    continue;
                }

                // Split always yields at least one element, so index 0 is safe;
                // every higher index has to be length-checked.
                var features     = node.Feature.Split(',');
                var partOfSpeech = features[0];

                var word = new MecabWordInfo
                {
                    Word         = node.Surface,
                    PartOfSpeech = partOfSpeech,
                    Description  = features.Length > 1 ? features[1] : string.Empty,
                    Feature      = node.Feature
                };

                // Symbols and surfaces that are already kana do not need a reading.
                var readingNotNeeded = partOfSpeech == "記号" ||
                                       WanaKana.IsHiragana(node.Surface) ||
                                       WanaKana.IsKatakana(node.Surface);

                if (readingNotNeeded)
                {
                    word.Kana = "";
                }
                else if (features.Length >= 8)
                {
                    // Guards against garbled text reaching the tagger, in which case the
                    // feature string is too short to contain a kana reading.
                    word.Kana = features[7];
                }

                words.Add(word);
            }

            return words;
        }