/// <summary>
/// Segments text into words (wakachi-gaki) with MeCab and returns the surface form of each token.
/// </summary>
/// <param name="rawDocument">Text to analyse.</param>
/// <param name="containIsNotPhrageStart">
/// When <c>true</c>, tokens whose first feature field (the part of speech) is "記号" (symbol) are omitted.
/// </param>
/// <returns>
/// The surface forms of the remaining tokens, in MeCab output order. The sequence is produced lazily,
/// so the tagger is not created until enumeration starts.
/// </returns>
private IEnumerable<string> SplitDocumentsWithMeCab(string rawDocument, bool containIsNotPhrageStart)
{
    MeCabParam param = new MeCabParam();
    param.DicDir = @"lib\MeCab\dic\ipadic";

    // Run the morphological analysis, then release the tagger as soon as its text output is captured.
    string result;
    using (MeCabTagger t = MeCabTagger.Create(param))
    {
        result = t.Parse(rawDocument);
    }

    // Accept both CRLF and LF so the split does not depend on the platform newline.
    var lines = result.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.None);
    foreach (var line in lines)
    {
        // A token line has the shape "surface<TAB>feature,feature,...". Lines without a tab
        // (the "EOS" terminator and blank lines) carry no token and are dropped. Checking the
        // structure instead of the text keeps a literal "EOS" token that appears in the input.
        int tabIndex = line.IndexOf('\t');
        if (tabIndex < 0)
        {
            continue;
        }

        // Take the surface before the tab rather than splitting the whole line on commas,
        // so a token whose surface is "," is not mistaken for an empty one.
        string surface = line.Substring(0, tabIndex);
        if (String.IsNullOrWhiteSpace(surface))
        {
            continue;
        }

        // The part of speech is the first comma-separated feature field.
        string features = line.Substring(tabIndex + 1);
        int commaIndex = features.IndexOf(',');
        string partOfSpeech = commaIndex < 0 ? features : features.Substring(0, commaIndex);

        // Only symbols are filtered; filters for particles ("助詞") and auxiliary verbs ("助動詞")
        // existed here as commented-out conditions and remain disabled.
        if (containIsNotPhrageStart && "記号" == partOfSpeech)
        {
            continue;
        }

        yield return surface;
    }
}
/// <summary>
/// Exploratory test that loads sentences through the <c>Tanaka</c> reader from
/// <c>TestDataPaths.Tanaka</c>, prints the tagger's text output for each Japanese sentence,
/// and prints the surface and feature of every morpheme that has a feature.
/// </summary>
public void Tanaka()
{
    var sentences = new Tanaka(TestDataPaths.Tanaka, Encoding.UTF8).AllSentences();
    var morphemeCount = 0;

    foreach (var rawSentence in sentences.Select(s => s.JapaneseSentence))
    {
        Console.WriteLine(tagger.Parse(rawSentence));

        foreach (var morpheme in tagger.ParseToNodes(rawSentence))
        {
            var feature = morpheme.Feature;
            if (feature != null)
            {
                Console.WriteLine($"{morpheme.Surface} {feature}");
            }

            morphemeCount++;

            // NOTE(review): the test fails unconditionally once 20 morphemes have been seen.
            // Presumably this is a deliberate stop so the runner shows the console output — confirm.
            if (morphemeCount == 20)
            {
                Assert.Fail();
            }
        }
    }
}
/// <summary>
/// Parses <paramref name="input"/> with a freshly created MeCab tagger configured for
/// "lattice" output and returns the tagger's text result.
/// </summary>
/// <param name="input">Text to analyse.</param>
/// <returns>
/// The text produced by the tagger, or the string form of the exception if anything fails.
/// This method does not propagate exceptions to the caller.
/// </returns>
public static string MeCabParse(string input)
{
    try
    {
        // A tagger is created for every call, so dispose it before returning
        // rather than leaving it to be reclaimed later.
        using (MeCabTagger tagger = MeCabTagger.Create())
        {
            tagger.LatticeLevel = MeCabLatticeLevel.Zero;
            tagger.OutPutFormatType = "lattice";
            tagger.AllMorphs = false;
            tagger.Partial = false;
            return tagger.Parse(input);
        }
    }
    catch (Exception ex)
    {
        // Existing contract: failures are reported through the returned text.
        return ex.ToString();
    }
}
/// <summary>
/// Hides any definition window that is currently open, analyses <paramref name="itext"/> with the
/// tagger, and shows a definition window at the cursor for the first token that is not filtered out,
/// provided its base form is found in <c>japDict</c> and has an English gloss.
/// </summary>
/// <param name="itext">Text to look up; only its first unfiltered token is used.</param>
private void FuriganaKanaLabel_SelectedTextChangedEvent(string itext)
{
    if (defWindow != null)
    {
        // NOTE(review): the previous window is only hidden, never closed or disposed, before the
        // field is overwritten below — confirm whether DefinitionWindow cleans itself up.
        defWindow.Hide();
    }

    if (string.IsNullOrEmpty(itext))
    {
        return;
    }

    // Token lines have the shape "surface<TAB>feature,feature,...". Lines without a tab
    // (such as "EOS" or blank lines) are not tokens and would otherwise fail at the
    // feature lookup. A trailing '\r' is stripped so it cannot leak into the last field.
    // Only the first token is ever displayed, so only that one is extracted.
    var firstToken = tagger.Parse(itext)
        .Split('\n')
        .Select(line => line.TrimEnd('\r'))
        .FirstOrDefault(line => line.IndexOf('\t') >= 0 && !filters.Contains(line.Split('\t')[0]));
    if (firstToken == null)
    {
        return;
    }

    var columns = firstToken.Split('\t');
    var word = columns[0];
    var features = columns[1].Split(',');

    // Feature index 6 is used as the base form; "*" or a missing field falls back to the surface.
    var baseWord = features.Length > 6 ? features[6] : "*";
    if (baseWord.Trim() == "*")
    {
        baseWord = word;
    }

    // Feature index 7 is used as the reading when present, otherwise the surface itself.
    var kana = features.Length > 7
        ? KanaConverter.KatakanaToHiragana(features[7])
        : word.Trim();

    string romaji;
    try
    {
        romaji = transliterator.GetRomaji(kana);
    }
    catch (TransliterationException)
    {
        // Fall back to the kana when it cannot be transliterated.
        romaji = kana;
    }

    if (!japDict.ContainsKey(baseWord))
    {
        // A per-kanji fallback through kanjiDict used to be sketched here in commented-out
        // code; it is not implemented.
        return;
    }

    // Pick the first gloss that is actually English. Taking the first gloss of a sense that
    // merely contains an English one could return a term in another language, and an entry
    // with no English gloss must not throw inside the event handler.
    var englishTerms = japDict[baseWord].Senses
        .SelectMany(sense => sense.Glosses)
        .Where(gloss => gloss.Language == Language.English)
        .Select(gloss => gloss.Term)
        .Take(1)
        .ToArray();
    if (englishTerms.Length == 0)
    {
        return;
    }

    defWindow = new DefinitionWindow(baseWord, word, romaji, englishTerms[0]);
    defWindow.Location = Cursor.Position;
    defWindow.TopMost = true;
    defWindow.Show();
}
/// <summary>
/// Console benchmark: times tagger creation, plain line reading, and several parsing modes over
/// the file named in the application settings, printing elapsed seconds and
/// <c>GC.GetTotalMemory(false)</c> after each phase, followed by size statistics for the file.
/// </summary>
static void Main(string[] args)
{
    Properties.Settings config = Properties.Settings.Default;
    string targetFile = config.TargetFile;
    Encoding encoding = Encoding.GetEncoding(config.TargetEncoding);
    Stopwatch timer = new Stopwatch();
    string line;

    // Wait for the user to start the run.
    Console.WriteLine("Press Enter key to start.");
    Console.ReadLine();

    Console.WriteLine("\t\t\tProcessTime\tTotalMemory");

    // Preparation: create the tagger.
    GC.Collect();
    timer.Start();
    MeCabTagger tagger = MeCabTagger.Create();
    timer.Stop();
    Console.WriteLine("OpenTagger:\t\t{0:0.000}sec\t{1:#,000}byte",
                      timer.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Baseline: read the file without parsing.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        GC.Collect();
        timer.Restart();
        while ((line = reader.ReadLine()) != null)
        {
        }
        timer.Stop();
    }
    Console.WriteLine("ReadLine:\t\t{0:0.000}sec\t{1:#,000}byte",
                      timer.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Parsing with node output.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        GC.Collect();
        timer.Restart();
        while ((line = reader.ReadLine()) != null)
        {
            tagger.ParseToNode(line);
        }
        timer.Stop();
    }
    Console.WriteLine("ParseToNode:\t\t{0:0.000}sec\t{1:#,000}byte",
                      timer.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Parsing with string output in lattice mode.
    tagger.OutPutFormatType = "lattice";
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        GC.Collect();
        timer.Restart();
        while ((line = reader.ReadLine()) != null)
        {
            tagger.Parse(line);
        }
        timer.Stop();
    }
    Console.WriteLine("Parse(lattice):\t\t{0:0.000}sec\t{1:#,000}byte",
                      timer.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // N-best parsing: pull results for each line and stop at the fifth one.
    tagger.LatticeLevel = MeCabLatticeLevel.One;
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        GC.Collect();
        timer.Restart();
        while ((line = reader.ReadLine()) != null)
        {
            int taken = 0;
            foreach (MeCabNode candidate in tagger.ParseNBestToNode(line))
            {
                taken++;
                if (taken == 5)
                {
                    break;
                }
            }
        }
        timer.Stop();
    }
    Console.WriteLine("ParseNBestToNode:\t{0:0.000}sec\t{1:#,000}byte",
                      timer.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Statistics about the target file (not timed).
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        long charCount = 0;
        long lineCount = 0;
        long wordCount = 0;

        while ((line = reader.ReadLine()) != null)
        {
            charCount += line.Length;
            lineCount++;

            // Start at the node after the first one and count every node that still has a successor.
            MeCabNode cursor = tagger.ParseToNode(line).Next;
            while (cursor.Next != null)
            {
                wordCount++;
                cursor = cursor.Next;
            }
        }

        Console.WriteLine();
        Console.WriteLine("Target: {0} {1:#,000}byte {2:#,000}char {3:#,000}line ({4:#,000}word)",
                          targetFile, reader.BaseStream.Position, charCount, lineCount, wordCount);
    }

    tagger.Dispose();

    // Report completion and wait before closing.
    Console.WriteLine();
    Console.WriteLine("Finish!");
    Console.WriteLine("Press Enter key to close.");
    Console.ReadLine();
}
/// <summary>
/// Parses <paramref name="text"/> with the tagger and exposes the resulting string through a reader.
/// </summary>
/// <param name="text">Text to analyse.</param>
/// <returns>A <see cref="StringReader"/> positioned at the start of the tagger's output.</returns>
public TextReader ParseText(string text)
{
    var parsed = _tagger.Parse(text);
    return new StringReader(parsed);
}