// Exploratory test: runs the tagger over the Japanese sentences of the Tanaka
// data set, prints the full parse of each sentence followed by the surface and
// feature string of every node, and fails once 20 nodes have been visited.
public void Tanaka()
{
    var sentences = new Tanaka(TestDataPaths.Tanaka, Encoding.UTF8).AllSentences();
    var n = 0;

    foreach (var rawSentence in sentences.Select(s => s.JapaneseSentence))
    {
        // Whole-sentence output first, then the node-by-node breakdown.
        Console.WriteLine(tagger.Parse(rawSentence));

        foreach (var morpheme in tagger.ParseToNodes(rawSentence))
        {
            var feature = morpheme.Feature;
            if (feature != null)
            {
                Console.WriteLine($"{morpheme.Surface} {feature}");
            }

            // Every node counts towards the limit, including those without a feature.
            n++;
            if (n == 20)
            {
                // NOTE(review): the unconditional failure looks like a deliberate way to
                // stop early and surface the console output in the test runner - confirm.
                Assert.Fail();
            }
        }
    }
}
/// <summary>
/// Lazily tokenizes <paramref name="sentence"/> with the tagger and yields one
/// <see cref="MecabWordInfo"/> for every node whose <c>CharType</c> is greater than zero.
/// </summary>
/// <param name="sentence">Text handed to the tagger.</param>
/// <returns>
/// A sequence of word descriptions. <c>Kana</c> is a single space unless the node has at least
/// eight feature fields and is neither tagged "記号" nor written purely in hiragana or katakana.
/// </returns>
public IEnumerable<MecabWordInfo> MecabWordEnumerable(string sentence)
{
    foreach (var token in tagger.ParseToNodes(sentence))
    {
        if (token.CharType <= 0)
        {
            continue;
        }

        var fields = token.Feature.Split(',');

        // Populate the MecabWordInfo properties; Kana starts out as the single-space placeholder.
        MecabWordInfo word = new MecabWordInfo
        {
            Word = token.Surface,
            PartOfSpeech = fields[0],
            Description = fields[1],
            Feature = token.Feature,
            Kana = " "
        };

        // Symbols and pure-kana surfaces keep the placeholder instead of a reading.
        bool keepPlaceholder = word.PartOfSpeech == "記号"
            || WanaKana.IsHiragana(token.Surface)
            || WanaKana.IsKatakana(token.Surface);

        // The length check guards against garbled input whose feature string
        // has too few fields to contain a reading.
        if (!keepPlaceholder && fields.Length >= 8)
        {
            word.Kana = fields[7];
        }

        yield return word;
    }
}
/// <summary>
/// Processes a sentence: runs it through the tagger and collects the resulting words.
/// </summary>
/// <param name="sentence">Text handed to the tagger.</param>
/// <returns>
/// One <see cref="MecabWordInfo"/> per node whose <c>CharType</c> is greater than zero,
/// in the order the tagger produced them.
/// </returns>
public List<MecabWordInfo> SentenceHandle(string sentence)
{
    var words = new List<MecabWordInfo>();

    foreach (var token in Tagger.ParseToNodes(sentence))
    {
        if (token.CharType <= 0)
        {
            continue;
        }

        var fields = token.Feature.Split(',');
        var info = new MecabWordInfo
        {
            Word = token.Surface,
            PartOfSpeech = fields[0],
            Description = fields[1],
            Feature = token.Feature
        };

        // Guard against garbled input whose feature string is too short to
        // contain a reading; Kana is left untouched in that case.
        if (fields.Length >= 8)
        {
            info.Kana = fields[7];
        }

        words.Add(info);
    }

    return words;
}
/// <summary>
/// Converts a clause to romaji by tokenizing it and converting it node by node.
/// </summary>
/// <param name="str">Text handed to the tagger.</param>
/// <param name="isSpace">
/// When <c>true</c>, a space is appended after a converted node, provided the following node
/// has more than six feature fields and is not tagged "記号" or "補助記号".
/// </param>
/// <returns>The converted text, with a single trailing space removed if one is present.</returns>
public static string UnitToRomaji(string str, bool isSpace)
{
    // Part-of-speech tags after which no separating space is emitted.
    var noSpaceBeforeTags = new[] { "記号", "補助記号" };

    // Fully qualified so the block does not depend on a `using System.Text;` directive.
    var builder = new global::System.Text.StringBuilder();

    foreach (var item in _tagger.ParseToNodes(str))
    {
        // Split never yields null elements, so the first field can be compared directly.
        var nextFeatures = item.Next?.Feature?.Split(',');
        var nextTakesSpace = nextFeatures != null
            && nextFeatures.Length > 6
            && !noSpaceBeforeTags.Contains(nextFeatures[0]);
        var space = isSpace && nextTakesSpace ? " " : "";

        if (item.CharType > 0)
        {
            // Split always returns at least one element, so features[0] is safe to read.
            var features = item.Feature.Split(',');

            if (TryCustomConvert(item.Surface, out var customResult))
            {
                // User-defined dictionary entry takes precedence.
                builder.Append(customResult);
            }
            else if (features[0] != "助詞" && IsJapanese(item.Surface))
            {
                // Pure kana: romanize the surface itself.
                builder.Append(WanaKana.ToRomaji(item.Surface)).Append(space);
            }
            else if (features.Length <= 6 || features[0] == "補助記号")
            {
                // Punctuation (or a node with too few feature fields): copy verbatim.
                builder.Append(item.Surface);
            }
            else if (IsEnglish(item.Surface))
            {
                // English text: copy verbatim.
                builder.Append(item.Surface);
            }
            else
            {
                // Kanji: romanize the feature field selected by ChooseIndexByType.
                builder.Append(WanaKana.ToRomaji(features[ChooseIndexByType(features[0])])).Append(space);
            }
        }
        else if (item.Stat != MeCabNodeStat.Bos)
        {
            builder.Append(item.Surface).Append(space);
        }
    }

    // Drop the single trailing separator, if any.
    if (builder.Length > 0 && builder[builder.Length - 1] == ' ')
    {
        builder.Length--;
    }

    return builder.ToString();
}
/// <summary>
/// Runs the tagger over <paramref name="text"/> and converts every node to an entry via <c>ToEntry</c>.
/// </summary>
/// <param name="text">Text handed to the tagger.</param>
/// <returns>
/// One entry per node. The feature string is passed on wrapped by <c>SomeWhen</c>, whose
/// predicate only accepts nodes that are neither BOS nor EOS.
/// </returns>
public IEnumerable<TMeCabEntry> ParseToEntries(string text)
{
    return tagger.ParseToNodes(text).Select(node =>
    {
        var surface = node.Surface;

        // Sentence-boundary nodes fail the predicate, so their feature is not forwarded as a value.
        var feature = node
            .SomeWhen(n => n.Stat != MeCabNodeStat.Eos && n.Stat != MeCabNodeStat.Bos)
            .Map(n => n.Feature);

        return ToEntry(surface, feature);
    });
}
/// <summary>
/// Processes a sentence: runs it through the tagger and collects the resulting words.
/// </summary>
/// <param name="sentence">Text handed to the tagger.</param>
/// <returns>
/// One <see cref="MecabWordInfo"/> per node whose <c>CharType</c> is greater than zero.
/// <c>Kana</c> is the empty string for nodes tagged "記号" and for surfaces written purely
/// in hiragana or katakana; otherwise it is the eighth feature field when that field exists.
/// </returns>
public List<MecabWordInfo> SentenceHandle(string sentence)
{
    var words = new List<MecabWordInfo>();

    foreach (var token in tagger.ParseToNodes(sentence))
    {
        if (token.CharType <= 0)
        {
            continue;
        }

        var fields = token.Feature.Split(',');

        // Populate the MecabWordInfo properties.
        MecabWordInfo word = new MecabWordInfo
        {
            Word = token.Surface,
            PartOfSpeech = fields[0],
            Description = fields[1],
            Feature = token.Feature
        };

        // Readings that are not needed: symbols and surfaces that are already kana.
        bool readingNotNeeded = word.PartOfSpeech == "記号"
            || WanaKana.IsHiragana(token.Surface)
            || WanaKana.IsKatakana(token.Surface);

        if (readingNotNeeded)
        {
            word.Kana = "";
        }
        else if (fields.Length >= 8)
        {
            // The length check guards against garbled input whose feature string
            // has too few fields to contain a reading.
            word.Kana = fields[7];
        }

        words.Add(word);
    }

    return words;
}