/// <summary>
/// Runs morphological analysis and saves the result.
/// </summary>
public void Execute()
{
    var allText = File.ReadAllText(FileName);
    var mecabParam = new MeCabParam
    {
        DicDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"..\..\..\Chapter04.Core\dic\ipadic")
    };
    MeCabTagger meCabTagger = MeCabTagger.Create(mecabParam);

    using (var writer = new StreamWriter(MecabFileName, false))
    {
        MeCabNode node = meCabTagger.ParseToNode(allText);
        while (node != null)
        {
            if (node.CharType > 0)
            {
                writer.WriteLine(node.Surface + "," + node.Feature);
            }
            node = node.Next;
        }
        writer.Flush();
    }
}
public static IEnumerable<POS> Extract(string text, ref NLPCount count)
{
    var segments = new List<POS>();
    if (string.IsNullOrEmpty(text))
    {
        return segments;
    }

    MeCabNode node = tagger.ParseToNode(text);
    while (node != null)
    {
        if (node.CharType > 0 && node.Surface.Length <= 100)
        {
            segments.Add(new POS { Text = node.Surface, PosTag = node.Feature.Split(',')[0] });
        }
        node = node.Next;
    }
    return segments;
}
public void Load(List<string> sentences)
{
    var words = new List<string>();
    using (MeCabTagger mecab = MeCabTagger.Create())
    {
        foreach (string sentence in sentences)
        {
            MeCabNode nodes = mecab.ParseToNode(sentence);
            words.Add(BOS);
            nodes = nodes.Next; // skip the first node returned by MeCab
            while (nodes != null)
            {
                words.Add(nodes.Surface);
                nodes = nodes.Next;
            }
            words.Add(EOS);
        }
    }

    for (int i = 0; i < words.Count / 3; i++)
    {
        triplets.Add(new Triplet(new[] { words[3 * i], words[3 * i + 1], words[3 * i + 2] }));
    }
}
public static IEnumerable<MeCabNode> ParseToNodes(this MeCabTagger tagger, string text)
{
    for (var node = tagger.ParseToNode(text); node != null; node = node.Next)
    {
        yield return node;
    }
}
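// Usage sketch for the ParseToNodes extension above (an illustrative addition, not part of
// the original source). GetSurfaces is a hypothetical helper; it assumes System.Linq is in
// scope and keeps only real morphemes (CharType > 0), dropping the BOS/EOS sentinels.
public static string[] GetSurfaces(MeCabTagger tagger, string text)
{
    return tagger.ParseToNodes(text)
                 .Where(n => n.CharType > 0)
                 .Select(n => n.Surface)
                 .ToArray();
}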
static public String GetHiragana(String dicPathFromExe, String str)
{
    MeCabParam param = new MeCabParam();
    param.DicDir = dicPathFromExe;
    MeCabTagger tagger = MeCabTagger.Create(param);
    MeCabNode node = tagger.ParseToNode(str);

    String hiragana = "";
    while (node != null)
    {
        if (node.CharType > 0)
        {
            // IPADIC feature field 7 is the reading; fall back to the surface form when it is missing.
            String[] splitStrArray = node.Feature.Split(',');
            String splitStr = splitStrArray.Length < 9 ? node.Surface : splitStrArray[7];
            hiragana = hiragana + splitStr;
        }
        node = node.Next;
    }
    return hiragana;
}
public void Run()
{
    try
    {
        string sentence = "ユーザが本明細書において提供れるような方法";
        MeCabParam param = new MeCabParam();
        param.DicDir = @"..\..\dic\ipadic";
        MeCabTagger t = MeCabTagger.Create(param);

        MeCabNode node = t.ParseToNode(sentence);
        while (node != null)
        {
            if (node.CharType > 0)
            {
                Console.WriteLine(node.Surface + "\t" + node.Feature);
            }
            node = node.Next;
        }
        Console.WriteLine();
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
    finally
    {
        Console.Read();
    }
}
public static void ParseSentences(string path, MeCabTagger tagger)
{
    using (StreamReader sr = File.OpenText(path))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            MeCabNode node = tagger.ParseToNode(line);
            while (node != null)
            {
                if (node.CharType > 0)
                {
                    Console.Write(node.Surface);
                    Console.Write("_");
                }
                node = node.Next;
            }
        }
    }
}
public static List<(string, string, string)> GetLemmatized(string sentence)
{
    int[] outputPos = null;
    if (StorageHelper.GetSetting<int>("LemmatizerMode") == 0)
    {
        outputPos = outputPos1;
    }
    else if (StorageHelper.GetSetting<int>("LemmatizerMode") == 1)
    {
        outputPos = outputPos2;
    }

    var err = new List<(string, string, string)> { (sentence, "", "") };
    try
    {
        if (!string.IsNullOrWhiteSpace(sentence))
        {
            MeCabNode node = t.ParseToNode(sentence);
            var lemmatized = new List<(string, string, string)>();
            while (node != null)
            {
                if (node.CharType > 0 && (outputPos == null || outputPos.Contains(node.PosId)))
                {
                    var features = node.Feature.Split(',');
                    if (node.Surface == "死ね")
                    {
                        lemmatized.Add(("死ぬ", "しぬ", "動詞"));
                    }
                    else if (node.Surface == "しね")
                    {
                        lemmatized.Add(("しぬ", "しぬ", "動詞"));
                    }
                    else
                    {
                        string str = features[features.Length - 3];
                        if (str != "ない" && str != "する")
                        {
                            if (node.PosId == 33 && str == "いる")
                            {
                                lemmatized.Add(("居る", features[features.Length - 2], features[0]));
                            }
                            else if (node.PosId == 37)
                            {
                                lemmatized.Add((str + "ない", features[features.Length - 2] + "ナイ", features[0]));
                            }
                            else
                            {
                                lemmatized.Add((str, features[features.Length - 2], features[0]));
                            }
                        }
                    }
                }
                node = node.Next;
            }
            return lemmatized;
        }
    }
    catch
    {
        // The tail of this method is a minimal reconstruction (the source snippet breaks off
        // above): return the parsed list on success, or the error placeholder otherwise.
    }
    return err;
}
public MeCab_kaiseki(string sentence, string file_name, string[] header_names)
{
    this.surrogate_sentence = new System.Globalization.StringInfo(sentence);
    w_tail = 0;
    goiso_w_tail = 0;
    w_array1 = new SortedList<int, int>();
    goiso_w_array1 = new SortedList<int, int>();
    w_array2 = new SortedList<int, string>();
    goiso_w_array2 = new SortedList<int, string>();
    tag = MeCabTagger.Create();
    node = tag.ParseToNode(sentence);
    this.file_name = file_name;
    this.header_names = header_names;
}
/// <summary>
/// Gets the keywords contained in the given text together with their counts.
/// </summary>
/// <param name="text">Text to scan.</param>
/// <returns>Dictionary keyed by keyword, with the occurrence count as the value.</returns>
static Dictionary<string, int> DetectKeywords(string text)
{
    var result = new Dictionary<string, int>();
    var receivers = new List<Func<string, bool>>();
    var node = MeCab.ParseToNode(text);
    var addToDic = new Action<string>(k =>
    {
        if (result.ContainsKey(k))
        {
            result[k]++;
        }
        else
        {
            result[k] = 1;
        }
    });

    while (node != null)
    {
        if (node.CharType != 0)
        {
            var keyword = node.Surface.ToLower();
            // Skip symbols and numeric nouns; everything else is counted as a keyword.
            if (!(node.Feature.StartsWith("記号,") || node.Feature.StartsWith("名詞,数,")))
            {
                SpecialKeywordProc(receivers, keyword);
                var r = SpecialKeyword(keyword, addToDic);
                if (r is null)
                {
                    addToDic(keyword);
                }
                else
                {
                    receivers.Add(r);
                }
            }
            else
            {
                SpecialKeywordProc(receivers, keyword);
            }
        }
        node = node.Next;
    }
    SpecialKeyword("", addToDic);
    return result;
}
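// Usage sketch (illustrative addition, not part of the original source): rank the keywords
// returned by DetectKeywords by frequency. PrintTopKeywords is a hypothetical helper assumed
// to live in the same class as DetectKeywords; System.Linq is required.
static void PrintTopKeywords(string text)
{
    foreach (var kv in DetectKeywords(text).OrderByDescending(kv => kv.Value).Take(10))
    {
        Console.WriteLine($"{kv.Key}: {kv.Value}");
    }
}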
private static string[] CheckMeCab(string sentence)
{
    var node = _meCabTagger.ParseToNode(sentence);
    var resultList = new List<string>();
    while (node != null)
    {
        if (node.CharType > 0)
        {
            resultList.Add(node.Surface);
        }
        node = node.Next;
    }
    return resultList.ToArray();
}
public static string[] SplitWord(string result, MeCabTagger tagger)
{
    var rlist = new List<string>();
    var tagresult = tagger.ParseToNode(result);
    while (tagresult != null)
    {
        // Skip the BOS/EOS sentinel nodes and keep only real morphemes.
        if (tagresult.Stat != MeCabNodeStat.Bos && tagresult.Stat != MeCabNodeStat.Eos)
        {
            rlist.Add(tagresult.Surface);
        }
        tagresult = tagresult.Next;
    }
    return rlist.ToArray();
}
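// Usage sketch (illustrative addition, not part of the original source): split a sentence
// into surface forms with SplitWord. MeCabTagger.Create() here assumes the default dictionary
// configuration, and the input sentence is only an example.
static void SplitWordDemo()
{
    using (var tagger = MeCabTagger.Create())
    {
        string[] words = SplitWord("今日はいい天気です", tagger);
        Console.WriteLine(string.Join(" / ", words));
    }
}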
/// <summary>
/// Gets the token list for the given text.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <returns>Token list.</returns>
public static List<TokenData> GetTokenList(string text)
{
    var tokenList = new List<TokenData>();
    var node = sTagger.ParseToNode(text.Replace("\0", ""));

    // The first node holds the original text, so skip it.
    node = node.Next;
    while (node != null)
    {
        tokenList.Add(new TokenData(node.Surface, node.Feature));
        node = node.Next;
    }

    //return RefineTokenList(tokenList);
    return tokenList;
}
public Dictionary<string, Dictionary<string, int>> getWeightedDistribution(string sentence)
{
    var dict = new Dictionary<string, Dictionary<string, int>>();
    string targetSentence = "xxxSTARTxxx " + sentence + " xxxENDxxx";

    MeCabNode node = mecabTagger.ParseToNode(targetSentence);
    string preWord = null;
    while (node != null)
    {
        if (node.CharType > 0)
        {
            if (preWord != null)
            {
                // Count how often each word follows the previous word (a bigram frequency table).
                Dictionary<string, int> wordDict;
                if (dict.ContainsKey(preWord))
                {
                    wordDict = dict[preWord];
                }
                else
                {
                    wordDict = new Dictionary<string, int>();
                    dict[preWord] = wordDict;
                }

                if (wordDict.ContainsKey(node.Surface))
                {
                    wordDict[node.Surface]++;
                }
                else
                {
                    wordDict[node.Surface] = 1;
                }
            }
            preWord = node.Surface;
        }
        node = node.Next;
    }
    return dict;
}
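// Usage sketch (illustrative addition, not part of the original source): draw the next word
// from the bigram counts built by getWeightedDistribution, proportionally to each count.
// SampleNextWord is a hypothetical helper and requires System.Linq.
static string SampleNextWord(Dictionary<string, Dictionary<string, int>> dist, string current, Random rng)
{
    Dictionary<string, int> candidates = dist[current];
    int pick = rng.Next(candidates.Values.Sum()); // weighted pick in [0, total)
    foreach (var kv in candidates)
    {
        pick -= kv.Value;
        if (pick < 0)
        {
            return kv.Key;
        }
    }
    return candidates.Keys.First();
}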
private void ButtonUpdateClick(object sender, EventArgs e)
{
    richTextBox.SuspendLayout();
    dataGridView.SuspendLayout();
    dataGridView.Rows.Clear();
    dataGridView.Columns.Clear();
    richTextBox.Clear();

    MeCabTagger meCabTagger = MeCabTagger.Create();
    MeCabNode node = meCabTagger.ParseToNode(textBoxArea.Text);
    while (node != null)
    {
        if (node.CharType > 0)
        {
            var data = new List<string> { node.Surface };
            data.AddRange(node.Feature.Split(','));
            while (dataGridView.Columns.Count < data.Count)
            {
                dataGridView.Columns.Add(new DataGridViewTextBoxColumn());
            }
            dataGridView.Rows.Add(data.ToArray());

            // Highlight proper nouns in the rich text view.
            richTextBox.SelectionBackColor = node.Feature.Contains("固有名詞")
                ? Color.Yellow
                : Color.Transparent;
            richTextBox.SelectedText = node.Surface;
        }
        node = node.Next;
    }
    richTextBox.ResumeLayout();
    dataGridView.ResumeLayout();
}
public IEnumerable<Word> ParseText(MeCabTagger meCabTagger, string message)
{
    var node = meCabTagger.ParseToNode(message);
    while (node != null)
    {
        if (node.CharType > 0)
        {
            yield return new Word
            {
                Surface = node.Surface,
                Elements = new List<string>(node.Feature.Split(','))
            };
        }
        node = node.Next;
    }
}
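// Usage sketch (illustrative addition, not part of the original source): ParseText is an
// iterator, so nodes are consumed lazily as the caller enumerates. This hypothetical demo is
// assumed to live in the same class as ParseText; the input sentence is only an example.
public void ParseTextDemo()
{
    using (var tagger = MeCabTagger.Create())
    {
        foreach (Word w in ParseText(tagger, "吾輩は猫である"))
        {
            Console.WriteLine(w.Surface + "\t" + string.Join(",", w.Elements));
        }
    }
}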
/// <summary>
/// Morphologically analyzes Message and stores the result in Words.
/// </summary>
public void ParseMessage(MeCabTagger meCabTagger)
{
    Words = new List<Word>();
    var node = meCabTagger.ParseToNode(Message);
    while (node != null)
    {
        if (node.CharType > 0)
        {
            Words.Add(new Word
            {
                Surface = node.Surface,
                Elements = new List<string>(node.Feature.Split(','))
            });
        }
        node = node.Next;
    }
}
private string parse(string sentence)
{
    MeCabParam mecabParam = new MeCabParam();
    mecabParam.DicDir = @"Assets/dic/ipadic";
    MeCabTagger t = MeCabTagger.Create(mecabParam);

    MeCabNode node = t.ParseToNode(sentence);
    string result = "";
    while (node != null)
    {
        if (node.CharType > 0)
        {
            result += node.Surface + "\t" + node.Feature + "\n";
        }
        node = node.Next;
    }
    return result;
}
/// <summary>
/// Converts a string into "caveman" speech.
/// </summary>
/// <param name="source"></param>
/// <returns></returns>
public String Convert(String source)
{
    var parts = new List<String>();
    var node = tagger.ParseToNode(source);
    node = node.Next; // skip the first node

    while (true)
    {
        if (node.Next == null)
        {
            break; // stop before the trailing node
        }

        var surfaces = node.Feature.Split(',');
        var surface = node.Surface;
        node = node.Next;

        if (surfaces[0] == "助詞")
        {
            continue; // drop particles
        }

        if (surfaces.Count() <= 7)
        {
            parts.Add(surface);
        }
        else
        {
            parts.Add(surfaces[typeSurfaces[type]]);
        }
    }

    if (type == GenshiType.Gien)
    {
        parts = parts.Select(m => Strings.StrConv(m, VbStrConv.Katakana)).ToList();
    }
    return String.Join(typeSeparator[type], parts.ToArray());
}
public Token GetToken(string sentence)
{
    var token = new Token();
    var node = mecab.ParseToNode(sentence);
    while (node != null)
    {
        if (node.Feature.StartsWith("名詞"))
        {
            token.Nouns.Add(node.Surface);
        }
        else if (node.Feature.StartsWith("動詞"))
        {
            // Record the verb's base (dictionary) form from the feature string.
            var features = node.Feature.Split(',');
            var verb = features[features.Length - 3];
            token.Verbs.Add(verb);
        }
        node = node.Next;
    }
    return token;
}
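// Usage sketch (illustrative addition, not part of the original source): GetToken separates
// nouns from verb base forms. This hypothetical demo is assumed to live in the same class as
// GetToken, and the input sentence is only an example.
public void GetTokenDemo()
{
    Token token = GetToken("猫が魚を食べた");
    Console.WriteLine("Nouns: " + string.Join(", ", token.Nouns));
    Console.WriteLine("Verbs: " + string.Join(", ", token.Verbs));
}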
public static string GetNMeCabToFurigana(string strInput)
{
    string str = string.Empty, strYomi;
    try
    {
        MeCabParam mcp = new MeCabParam();
        MeCabTagger mct = MeCabTagger.Create();
        MeCabNode mcn = mct.ParseToNode(strInput);
        MecabResult mcr = new MecabResult(mcn);

        foreach (MecabResult.MecabResultItem mcri in mcr.nodes)
        {
            strYomi = String.Empty;
            if (String.Compare(mcri.読み, "*") == 0)
            {
                // If even morphological analysis cannot produce a reading, fall back to OutputYomigana.
                strYomi = ModuleReuseClass.OutputYomigana(mcri.表層形);
            }
            else
            {
                strYomi = mcri.読み;
            }
            str += strYomi;
        }

        // Convert katakana to hiragana.
        str = Strings.StrConv(str, VbStrConv.Hiragana, 0x411); //あいうえおかきくけこサシスセソnaninuneno
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "エラー", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }
    return str;
}
/// <summary>
/// Morphologically analyzes the given string and returns the result.
/// </summary>
/// <param name="target">String to analyze.</param>
/// <returns>Analysis result as an enumerable sequence.</returns>
public static IEnumerable<MeCabResult> Parse(string target)
{
    MeCabNode node = tagger.ParseToNode(target);
    return node.ToMeCabResultEnumerable();
}
static void Main(string[] args)
{
    if (File.Exists("index.txt"))
    {
        Console.WriteLine("Detect index.txt");
    }
    else
    {
        Console.WriteLine("Make inverted index.");
        Console.WriteLine("Calculating Term Frequency ...");

        var weightList = new Dictionary<string, Dictionary<string, double>>(); // Dictionary<word, Dictionary<filename, weight>>
        var invertedIndex = new Dictionary<string, List<string>>();            // Dictionary<word, List<filename ordered by weight>>
        var targetFiles = Directory.GetFiles(dataDir, @"*.txt");

        MeCabParam param = new MeCabParam();
        param.DicDir = dicDir;
        MeCabTagger t = MeCabTagger.Create(param);

        Stopwatch sw = new Stopwatch();
        sw.Start();
        {
            Parallel.ForEach(targetFiles, fileName =>
            {
                Console.WriteLine("Processing " + fileName);
                var wordList = new Dictionary<string, int>(); // per-file word counts
                int wordCount = 0;
                var lockObject = new Object();

                Parallel.ForEach(File.ReadLines(fileName), line =>
                {
                    var node = t.ParseToNode(line);
                    while (node != null)
                    {
                        if (node.CharType > 0)
                        {
                            lock (lockObject)
                            {
                                ++wordCount;
                            }

                            // Use the base form as the representative when it exists; otherwise fall back to the surface form.
                            var normalized = node.Feature.Split(',')[6];
                            var originalForm = (normalized == null || normalized == "" || normalized == "*")
                                ? node.Surface
                                : normalized;

                            lock (wordList)
                            {
                                if (!wordList.ContainsKey(originalForm))
                                {
                                    wordList[originalForm] = 0;
                                }
                                ++wordList[originalForm];
                            }
                        }
                        node = node.Next;
                    }
                });

                Parallel.ForEach(wordList.Keys, word =>
                {
                    lock (weightList)
                    {
                        if (!weightList.ContainsKey(word))
                        {
                            weightList[word] = new Dictionary<string, double>();
                        }
                        weightList[word][fileName] = wordList[word] / (double)wordCount;
                    }
                });
            });
        }
        sw.Stop();
        Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elapsed.");

        Console.WriteLine("Constructing Inverted Index ...");
        sw.Restart();
        {
            /*
             * invertedIndex = weightList.Keys
             *     .AsParallel()
             *     .ToDictionary(
             *         word => word,
             *         word => weightList[word].Keys
             *             .OrderByDescending(fileName => weightList[word][fileName])
             *             .ThenBy(fileName => fileName)
             *             .ToList());
             */
            Parallel.ForEach(weightList.Keys, word =>
            {
                var ks = weightList[word].Keys
                    .OrderByDescending(fileName => weightList[word][fileName])
                    .ThenBy(fileName => fileName)
                    .ToList();
                lock (invertedIndex)
                {
                    invertedIndex[word] = ks;
                }
                if (!invertedIndex.ContainsKey(word))
                {
                    // "{word} is not contained in the inverted index"
                    Console.WriteLine($"{word}は転置インデックスに含まれていません");
                }
            });
        }
        sw.Stop();
        Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elapsed.");

        Console.WriteLine("Calculating Inverse Document Frequency and Recording Weight to weightList ...");
        sw.Restart();
        {
            weightList = weightList.AsParallel()
                .ToDictionary(
                    kv1 => kv1.Key,
                    kv1 =>
                    {
                        var idf = Math.Log(targetFiles.Length / kv1.Value.Count, 2) + 1;
                        return kv1.Value.ToDictionary(kv2 => kv2.Key, kv2 => kv2.Value * idf);
                    });
        }
        sw.Stop();
        Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elapsed.");

        StreamWriter writer = new StreamWriter(@"index.txt", false, Encoding.GetEncoding("utf-8"));
        foreach (var word in invertedIndex.Keys)
        {
            writer.Write($"{word}\t");
            foreach (var filename in invertedIndex[word])
            {
                writer.Write($"({filename}, {weightList[word][filename]}), ");
            }
            writer.WriteLine();
        }
        writer.Close();

        Console.WriteLine("Successfully finishing all procedures.");
    }
    Console.Read();
}
static void Main(string[] args)
{
    Properties.Settings settings = Properties.Settings.Default;
    string targetFile = settings.TargetFile;
    Encoding encoding = Encoding.GetEncoding(settings.TargetEncoding);
    Stopwatch sw = new Stopwatch();

    // Wait for the user to start.
    Console.WriteLine("Press Enter key to start.");
    Console.ReadLine();
    Console.WriteLine("\t\t\tProcessTime\tTotalMemory");

    // Prepare the analyzer.
    GC.Collect();
    sw.Start();
    MeCabTagger tagger = MeCabTagger.Create();
    sw.Stop();
    Console.WriteLine("OpenTagger:\t\t{0:0.000}sec\t{1:#,000}byte", sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Baseline: file reading only.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
        }
        sw.Stop();
    }
    Console.WriteLine("ReadLine:\t\t{0:0.000}sec\t{1:#,000}byte", sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Analysis: node output.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            MeCabNode node = tagger.ParseToNode(line);
        }
        sw.Stop();
    }
    Console.WriteLine("ParseToNode:\t\t{0:0.000}sec\t{1:#,000}byte", sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Analysis: string output in lattice mode.
    tagger.OutPutFormatType = "lattice";
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string ret = tagger.Parse(line);
        }
        sw.Stop();
    }
    Console.WriteLine("Parse(lattice):\t\t{0:0.000}sec\t{1:#,000}byte", sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Analysis: node output for the 5 best solutions.
    tagger.LatticeLevel = MeCabLatticeLevel.One;
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            int i = 0;
            foreach (MeCabNode node in tagger.ParseNBestToNode(line))
            {
                if (++i == 5)
                {
                    break;
                }
            }
        }
        sw.Stop();
    }
    Console.WriteLine("ParseNBestToNode:\t{0:0.000}sec\t{1:#,000}byte", sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Information about the target file.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        long charCount = 0;
        long lineCount = 0;
        long wordCount = 0;
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            charCount += line.Length;
            lineCount++;
            MeCabNode node = tagger.ParseToNode(line);
            for (node = node.Next; node.Next != null; node = node.Next)
            {
                wordCount++;
            }
        }
        Console.WriteLine();
        Console.WriteLine("Target: {0} {1:#,000}byte {2:#,000}char {3:#,000}line ({4:#,000}word)",
            targetFile, reader.BaseStream.Position, charCount, lineCount, wordCount);
    }
    tagger.Dispose();

    // Notify completion.
    Console.WriteLine();
    Console.WriteLine("Finish!");
    Console.WriteLine("Press Enter key to close.");
    Console.ReadLine();
}