Example #1
0
        /// <summary>
        /// Runs morphological analysis on the text read from <c>FileName</c> and
        /// saves the result to <c>MecabFileName</c>.
        /// </summary>
        /// <remarks>
        /// One line is written per accepted node in the form <c>surface,feature</c>.
        /// Nodes whose <c>CharType</c> is not positive are skipped (presumably the
        /// BOS/EOS sentinels; confirm against the NMeCab documentation).
        /// The output file is overwritten, not appended to.
        /// </remarks>
        public void Execute()
        {
            var allText    = File.ReadAllText(FileName);
            var mecabParam = new MeCabParam
            {
                DicDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory,
                                      @"..\..\..\Chapter04.Core\dic\ipadic")
            };

            // The tagger is disposable; release it deterministically together with the writer.
            // Disposing the writer also flushes it, so no explicit Flush is needed.
            using (MeCabTagger meCabTagger = MeCabTagger.Create(mecabParam))
            using (var writer = new StreamWriter(MecabFileName, false))
            {
                for (MeCabNode node = meCabTagger.ParseToNode(allText); node != null; node = node.Next)
                {
                    if (node.CharType > 0)
                    {
                        writer.WriteLine(node.Surface + "," + node.Feature);
                    }
                }
            }
        }
        /// <summary>
        /// Parses <paramref name="text"/> with the shared tagger and returns one
        /// <c>POS</c> entry per accepted node.
        /// </summary>
        /// <param name="text">Text to analyze; null or empty yields an empty result.</param>
        /// <param name="count">Neither read nor written by this method.</param>
        /// <returns>
        /// Entries carrying the node surface and the first comma-separated field of
        /// the node feature string.
        /// </returns>
        public static IEnumerable<POS> Extract(string text, ref NLPCount count)
        {
            var result = new List<POS>();

            if (!string.IsNullOrEmpty(text))
            {
                for (MeCabNode current = tagger.ParseToNode(text); current != null; current = current.Next)
                {
                    // Accept only nodes with a positive CharType whose surface is at most 100 characters.
                    bool accepted = current.CharType > 0 && current.Surface.Length <= 100;
                    if (!accepted)
                    {
                        continue;
                    }

                    result.Add(new POS()
                    {
                        Text = current.Surface, PosTag = current.Feature.Split(',')[0]
                    });
                }
            }

            return result;
        }
Example #3
0
    /// <summary>
    /// Tokenizes every sentence, wraps each token run in the <c>BOS</c>/<c>EOS</c>
    /// markers, and appends consecutive non-overlapping groups of three tokens to
    /// <c>triplets</c>.
    /// </summary>
    /// <param name="sentences">Sentences to tokenize.</param>
    public void Load(List<string> sentences)
    {
        var tokens = new List<string>();

        using (MeCabTagger mecab = MeCabTagger.Create())
        {
            foreach (string sentence in sentences)
            {
                var head = mecab.ParseToNode(sentence) as MeCabNode;
                tokens.Add(BOS);

                // The first node is skipped; every following node's surface is recorded.
                for (var current = head.Next; current != null; current = current.Next)
                {
                    tokens.Add(current.Surface);
                }

                tokens.Add(EOS);
            }
        }

        // Leftover tokens that do not fill a complete group of three are dropped.
        int groupCount = tokens.Count / 3;
        for (int group = 0, start = 0; group < groupCount; group++, start += 3)
        {
            triplets.Add(new Triplet(new string[] { tokens[start], tokens[start + 1], tokens[start + 2] }));
        }
    }
Example #4
0
 /// <summary>
 /// Parses <paramref name="text"/> and lazily yields every node of the resulting
 /// chain, following <c>Next</c> until it is null.
 /// </summary>
 /// <param name="tagger">Tagger used to parse the text.</param>
 /// <param name="text">Text to parse.</param>
 /// <returns>The nodes in chain order; parsing happens when enumeration starts.</returns>
 public static IEnumerable<MeCabNode> ParseToNodes(this MeCabTagger tagger, string text)
 {
     MeCabNode current = tagger.ParseToNode(text);
     while (current != null)
     {
         yield return current;
         current = current.Next;
     }
 }
Example #5
0
        /// <summary>
        /// Parses <paramref name="str"/> and concatenates, for every node with a
        /// positive <c>CharType</c>, the feature field at index 7, or the node
        /// surface when the feature string has fewer than 9 comma-separated fields.
        /// </summary>
        /// <param name="dicPathFromExe">Dictionary directory passed to <c>MeCabParam.DicDir</c>.</param>
        /// <param name="str">Text to convert.</param>
        /// <returns>The concatenated result (presumably the reading of the text; verify against the dictionary's feature layout).</returns>
        public static String GetHiragana(String dicPathFromExe, String str)
        {
            MeCabParam param = new MeCabParam();
            param.DicDir = dicPathFromExe;

            // StringBuilder avoids the quadratic cost of repeated string concatenation.
            var reading = new System.Text.StringBuilder();

            // The tagger is disposable; release it as soon as parsing is finished.
            using (MeCabTagger tagger = MeCabTagger.Create(param))
            {
                for (MeCabNode node = tagger.ParseToNode(str); node != null; node = node.Next)
                {
                    if (node.CharType > 0)
                    {
                        String[] fields = node.Feature.Split(',');
                        reading.Append(fields.Length < 9 ? node.Surface : fields[7]);
                    }
                }
            }

            return reading.ToString();
        }
Example #6
0
        /// <summary>
        /// Parses a fixed sample sentence and prints each accepted node as
        /// <c>surface&lt;TAB&gt;feature</c> to the console, then waits for a key press.
        /// </summary>
        /// <remarks>
        /// Any exception is reported by printing its message; the method never rethrows.
        /// </remarks>
        public void Run()
        {
            try
            {
                string sentence = "ユーザが本明細書において提供れるような方法";

                MeCabParam param = new MeCabParam();
                param.DicDir = @"..\..\dic\ipadic";

                // The tagger is disposable; dispose it once the output has been written.
                using (MeCabTagger t = MeCabTagger.Create(param))
                {
                    for (MeCabNode node = t.ParseToNode(sentence); node != null; node = node.Next)
                    {
                        if (node.CharType > 0)
                        {
                            Console.WriteLine(node.Surface + "\t" + node.Feature);
                        }
                    }
                }
                Console.WriteLine();
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
            finally
            {
                // Keep the console window open until the user presses a key.
                Console.Read();
            }
        }
Example #7
0
 /// <summary>
 /// Reads the first line of the file at <paramref name="path"/>, parses it, and
 /// writes the surface of every node with a positive <c>CharType</c> to the
 /// console, each followed by an underscore.
 /// </summary>
 /// <param name="path">Path of the text file to read.</param>
 /// <param name="tagger">Tagger used for parsing; owned by the caller.</param>
 public static void ParseSentences(string path, MeCabTagger tagger)
 {
     using (StreamReader sr = File.OpenText(path))
     {
         // NOTE(review): the original loop broke unconditionally after the first
         // line, so only one line is ever processed. That behavior is preserved
         // here; confirm whether all lines were meant to be parsed.
         string line = sr.ReadLine();
         if (line == null)
         {
             return;
         }

         for (MeCabNode node = tagger.ParseToNode(line); node != null; node = node.Next)
         {
             if (node.CharType > 0)
             {
                 Console.Write(node.Surface);
                 Console.Write("_");
             }
         }
     }
 }
Example #8
0
        public static List <(string, string, string)> GetLemmatized(string sentence)
        {
            int[] outputPos = null;
            if (StorageHelper.GetSetting <int>("LemmatizerMode") == 0)
            {
                outputPos = outputPos1;
            }
            else if (StorageHelper.GetSetting <int>("LemmatizerMode") == 1)
            {
                outputPos = outputPos2;
            }
            List <(string, string, string)> err = new List <(string, string, string)>();

            err.Add((sentence, "", ""));
            try
            {
                if (!string.IsNullOrWhiteSpace(sentence))
                {
                    MeCabNode node = t.ParseToNode(sentence);
                    List <(string, string, string)> lemmatized = new List <(string, string, string)>();
                    while (node != null)
                    {
                        if (node.CharType > 0)
                        {
                            if (outputPos == null || (outputPos != null && outputPos.Contains(node.PosId)))
                            {
                                var features = node.Feature.Split(',');
                                if (node.Surface == "死ね")
                                {
                                    lemmatized.Add(("死ぬ", "しぬ", "動詞"));
                                }
                                else if (node.Surface == "しね")
                                {
                                    lemmatized.Add(("しぬ", "しぬ", "動詞"));
                                }
                                else
                                {
                                    string str = features[features.Count() - 3];
                                    if (str != "ない" && str != "する")
                                    {
                                        if (node.PosId == 33 && str == "いる")
                                        {
                                            lemmatized.Add(("居る", features[features.Count() - 2], features[0]));
                                        }
                                        else if (node.PosId == 37)
                                        {
                                            lemmatized.Add((str + "ない", features[features.Count() - 2] + "ナイ", features[0]));
                                        }
                                        else
                                        {
                                            lemmatized.Add((str, features[features.Count() - 2], features[0]));
Example #9
0
        /// <summary>
        /// Initializes the analyzer state for <paramref name="sentence"/>: stores the
        /// sentence as a <c>StringInfo</c>, resets the position counters and lookup
        /// lists, creates a tagger and parses the sentence.
        /// </summary>
        /// <param name="sentence">Sentence to analyze.</param>
        /// <param name="file_name">Stored as-is in <c>file_name</c>.</param>
        /// <param name="header_names">Stored as-is in <c>header_names</c>.</param>
        public MeCab_kaiseki(string sentence, string file_name, string[] header_names)
        {
            this.surrogate_sentence = new System.Globalization.StringInfo(sentence);

            // Values handed in by the caller.
            this.file_name    = file_name;
            this.header_names = header_names;

            // Counters and lookup tables start out empty.
            w_tail   = 0;
            w_array1 = new SortedList<int, int>();
            w_array2 = new SortedList<int, string>();

            goiso_w_tail   = 0;
            goiso_w_array1 = new SortedList<int, int>();
            goiso_w_array2 = new SortedList<int, string>();

            // Parse once up front; the head node is kept for later use.
            tag  = MeCabTagger.Create();
            node = tag.ParseToNode(sentence);
        }
Example #10
0
        /// <summary>
        /// Collects the keywords contained in the given text together with their counts.
        /// </summary>
        /// <param name="text">Text to scan.</param>
        /// <returns>A dictionary keyed by keyword whose value is the occurrence count.</returns>
        /// <remarks>
        /// Keywords are lower-cased with the invariant culture and feature prefixes
        /// are matched ordinally, so results do not depend on the current culture.
        /// </remarks>
        static Dictionary<string, int> DetectKeywords(string text)
        {
            var result    = new Dictionary<string, int>();
            var receivers = new List<Func<string, bool>>();
            var node      = MeCab.ParseToNode(text);

            // Increments the count for a keyword with a single dictionary lookup.
            var addToDic = new Action<string>(k =>
            {
                int current;
                result.TryGetValue(k, out current);
                result[k] = current + 1;
            });

            for (; node != null; node = node.Next)
            {
                if (node.CharType == 0)
                {
                    continue;
                }

                var keyword = node.Surface.ToLowerInvariant();

                // Symbols and numeric nouns are never counted as keywords themselves.
                bool excluded = node.Feature.StartsWith("記号,", StringComparison.Ordinal)
                                || node.Feature.StartsWith("名詞,数,", StringComparison.Ordinal);

                // Pending receivers see every token, excluded or not.
                SpecialKeywordProc(receivers, keyword);

                if (excluded)
                {
                    continue;
                }

                var r = SpecialKeyword(keyword, addToDic);
                if (r is null)
                {
                    addToDic(keyword);
                }
                else
                {
                    receivers.Add(r);
                }
            }

            // Final call with an empty keyword, as in the original
            // (presumably flushes pending special-keyword state; confirm in SpecialKeyword).
            SpecialKeyword("", addToDic);

            return result;
        }
Example #11
0
        /// <summary>
        /// Parses <paramref name="sentence"/> with the shared tagger and returns the
        /// surfaces of all nodes whose <c>CharType</c> is positive, in chain order.
        /// </summary>
        /// <param name="sentence">Sentence to split.</param>
        /// <returns>The collected surfaces.</returns>
        private static string[] CheckMeCab(string sentence)
        {
            var surfaces = new List<string>();

            for (var current = _meCabTagger.ParseToNode(sentence); current != null; current = current.Next)
            {
                if (current.CharType > 0)
                {
                    surfaces.Add(current.Surface);
                }
            }

            return surfaces.ToArray();
        }
Example #12
0
        /// <summary>
        /// Parses <paramref name="result"/> and returns the surface of every node
        /// whose status is neither <c>Bos</c> nor <c>Eos</c>.
        /// </summary>
        /// <param name="result">Text to split into words.</param>
        /// <param name="tagger">Tagger used for parsing; owned by the caller.</param>
        /// <returns>The surfaces in chain order.</returns>
        public static string[] SplitWord(string result, MeCabTagger tagger)
        {
            var words = new List<string>();

            for (var node = tagger.ParseToNode(result); node != null; node = node.Next)
            {
                bool isSentinel = node.Stat == MeCabNodeStat.Bos || node.Stat == MeCabNodeStat.Eos;
                if (!isSentinel)
                {
                    words.Add(node.Surface);
                }
            }

            return words.ToArray();
        }
Example #13
0
        /// <summary>
        /// Builds the token list for a text.
        /// </summary>
        /// <param name="text">Text to tokenize; NUL characters are removed before parsing.</param>
        /// <returns>One <c>TokenData</c> (surface and feature) per node after the first.</returns>
        public static List<TokenData> GetTokenList(string text)
        {
            string sanitized = text.Replace("\0", "");
            var    head      = sTagger.ParseToNode(sanitized);
            var    tokens    = new List<TokenData>();

            // The first node is skipped (per the original author's note it holds the original text).
            for (var current = head.Next; current != null; current = current.Next)
            {
                tokens.Add(new TokenData(current.Surface, current.Feature));
            }

            // Post-processing via RefineTokenList is currently disabled.
            return tokens;
        }
Example #14
0
    /// <summary>
    /// Counts, for each token of the sentence, how often every other token
    /// immediately follows it.
    /// </summary>
    /// <param name="sentence">
    /// Sentence to analyze; it is wrapped in the <c>xxxSTARTxxx</c> and
    /// <c>xxxENDxxx</c> marker words before parsing.
    /// </param>
    /// <returns>A map from a token to the counts of the tokens that directly follow it.</returns>
    public Dictionary<string, Dictionary<string, int>> getWeightedDistribution(string sentence)
    {
        var distribution = new Dictionary<string, Dictionary<string, int>>();
        string wrapped   = "xxxSTARTxxx " + sentence + " xxxENDxxx";
        string previous  = null;

        for (MeCabNode node = mecabTagger.ParseToNode(wrapped); node != null; node = node.Next)
        {
            // Only nodes with a positive CharType take part in the bigram counts.
            if (!(node.CharType > 0))
            {
                continue;
            }

            string current = node.Surface;

            if (previous != null)
            {
                Dictionary<string, int> followers;
                if (!distribution.TryGetValue(previous, out followers))
                {
                    followers              = new Dictionary<string, int>();
                    distribution[previous] = followers;
                }

                int seen;
                followers[current] = followers.TryGetValue(current, out seen) ? seen + 1 : 1;
            }

            previous = current;
        }

        return distribution;
    }
Example #15
0
        /// <summary>
        /// Re-parses the text in <c>textBoxArea</c> and refreshes both views: the
        /// grid gets one row per accepted node (surface followed by the feature
        /// fields), and the rich text box gets the surfaces, with nodes whose
        /// feature contains "固有名詞" highlighted in yellow.
        /// </summary>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void ButtonUpdateClick(object sender, EventArgs e)
        {
            richTextBox.SuspendLayout();
            dataGridView.SuspendLayout();
            try
            {
                dataGridView.Rows.Clear();
                dataGridView.Columns.Clear();
                richTextBox.Clear();

                // A tagger is created per click, so it must be disposed per click as well.
                using (MeCabTagger meCabTagger = MeCabTagger.Create())
                {
                    for (MeCabNode node = meCabTagger.ParseToNode(textBoxArea.Text); node != null; node = node.Next)
                    {
                        if (!(node.CharType > 0))
                        {
                            continue;
                        }

                        var data = new List<string> { node.Surface };
                        data.AddRange(node.Feature.Split(','));

                        // Grow the grid until it has a column for every value of this row.
                        while (dataGridView.Columns.Count < data.Count)
                        {
                            dataGridView.Columns.Add(new DataGridViewTextBoxColumn());
                        }

                        dataGridView.Rows.Add(data.ToArray());

                        richTextBox.SelectionBackColor = node.Feature.Contains("固有名詞")
                            ? Color.Yellow
                            : Color.Transparent;

                        // Assigning SelectedText inserts the surface at the caret with the colour set above.
                        richTextBox.SelectedText = node.Surface;
                    }
                }
            }
            finally
            {
                // Always resume layout, even when parsing or rendering throws.
                richTextBox.ResumeLayout();
                dataGridView.ResumeLayout();
            }
        }
        /// <summary>
        /// Lazily parses <paramref name="message"/> and yields a <c>Word</c> for every
        /// node whose <c>CharType</c> is positive.
        /// </summary>
        /// <param name="meCabTagger">Tagger used for parsing; owned by the caller.</param>
        /// <param name="message">Text to parse.</param>
        /// <returns>Words carrying the node surface and the comma-split feature fields.</returns>
        public IEnumerable<Word> ParseText(MeCabTagger meCabTagger, string message)
        {
            for (var current = meCabTagger.ParseToNode(message); current != null; current = current.Next)
            {
                if (!(current.CharType > 0))
                {
                    continue;
                }

                string surface  = current.Surface;
                var    elements = new List<string>(current.Feature.Split(','));

                yield return new Word
                {
                    Surface  = surface,
                    Elements = elements
                };
            }
        }
Example #17
0
        /// <summary>
        /// Parses <c>Message</c> and replaces <c>Words</c> with one entry per node
        /// whose <c>CharType</c> is positive.
        /// </summary>
        /// <param name="meCabTagger">Tagger used for parsing; owned by the caller.</param>
        public void ParseMessage(MeCabTagger meCabTagger)
        {
            Words = new List<Word>();

            for (var current = meCabTagger.ParseToNode(Message); current != null; current = current.Next)
            {
                if (!(current.CharType > 0))
                {
                    continue;
                }

                string surface  = current.Surface;
                var    elements = new List<string>(current.Feature.Split(','));

                Words.Add(new Word
                {
                    Surface  = surface,
                    Elements = elements
                });
            }
        }
Example #18
0
    /// <summary>
    /// Parses <paramref name="sentence"/> and returns one
    /// <c>surface&lt;TAB&gt;feature</c> line per node whose <c>CharType</c> is positive.
    /// </summary>
    /// <param name="sentence">Sentence to analyze.</param>
    /// <returns>The newline-terminated lines, or an empty string when no node qualifies.</returns>
    private string parse(string sentence)
    {
        MeCabParam mecabParam = new MeCabParam();

        mecabParam.DicDir = @"Assets/dic/ipadic";

        // StringBuilder avoids the quadratic cost of repeated string concatenation.
        var result = new System.Text.StringBuilder();

        // A tagger is created per call, so it is disposed per call as well.
        using (MeCabTagger t = MeCabTagger.Create(mecabParam))
        {
            for (MeCabNode node = t.ParseToNode(sentence); node != null; node = node.Next)
            {
                if (node.CharType > 0)
                {
                    result.Append(node.Surface).Append("\t").Append(node.Feature).Append("\n");
                }
            }
        }

        return result.ToString();
    }
Example #19
0
        /// <summary>
        /// Converts a string into "primitive man" speech.
        /// </summary>
        /// <param name="source">Text to convert.</param>
        /// <returns>
        /// The remaining parts joined with the separator configured for the current
        /// <c>type</c>.
        /// </returns>
        public String Convert(String source)
        {
            var parts = new List<String>();

            // Start at the second node (the first is dropped) and stop before the
            // last node of the chain, which is never processed.
            for (var node = tagger.ParseToNode(source).Next; node.Next != null; node = node.Next)
            {
                var features = node.Feature.Split(',');

                // Particles are dropped entirely.
                if (features[0] == "助詞")
                {
                    continue;
                }

                // With 7 or fewer feature fields, fall back to the surface itself.
                parts.Add(features.Length <= 7 ? node.Surface : features[typeSurfaces[type]]);
            }

            if (type == GenshiType.Gien)
            {
                parts = parts.Select(m => Strings.StrConv(m, VbStrConv.Katakana)).ToList();
            }

            return String.Join(typeSeparator[type], parts.ToArray());
        }
Example #20
0
        /// <summary>
        /// Parses <paramref name="sentence"/> and collects its nouns and verbs into a
        /// new <c>Token</c>.
        /// </summary>
        /// <param name="sentence">Sentence to analyze.</param>
        /// <returns>
        /// A token whose <c>Nouns</c> holds the surface of every node whose feature
        /// starts with "名詞" and whose <c>Verbs</c> holds, for every node whose
        /// feature starts with "動詞", the third-from-last feature field
        /// (presumably the base form; verify against the dictionary's feature layout).
        /// </returns>
        public Token GetToken(string sentence)
        {
            var token = new Token();

            for (var node = mecab.ParseToNode(sentence); node != null; node = node.Next)
            {
                // Feature tags are fixed identifiers, so compare them ordinally
                // instead of with the current culture.
                if (node.Feature.StartsWith("名詞", StringComparison.Ordinal))
                {
                    token.Nouns.Add(node.Surface);
                }
                else if (node.Feature.StartsWith("動詞", StringComparison.Ordinal))
                {
                    var features = node.Feature.Split(',');
                    token.Verbs.Add(features[features.Length - 3]);
                }
            }

            return token;
        }
Example #21
0
        /// <summary>
        /// Parses <paramref name="strInput"/>, concatenates the reading of every
        /// result item and converts the result to hiragana.
        /// </summary>
        /// <param name="strInput">Text to convert.</param>
        /// <returns>
        /// The converted string. When an exception occurs, an error dialog is shown
        /// and whatever had been accumulated up to that point is returned.
        /// </returns>
        public static string GetNMeCabToFurigana(string strInput)
        {
            string str = string.Empty, strYomi;

            try
            {
                // The tagger is disposable; release it once the result has been consumed.
                using (MeCabTagger mct = MeCabTagger.Create())
                {
                    MeCabNode   mcn = mct.ParseToNode(strInput);
                    MecabResult mcr = new MecabResult(mcn);

                    foreach (MecabResult.MecabResultItem mcri in mcr.nodes)
                    {
                        if (String.Compare(mcri.読み, "*") == 0)
                        {
                            // The analyzer has no reading for this item; fall back to OutputYomigana.
                            strYomi = ModuleReuseClass.OutputYomigana(mcri.表層形);
                        }
                        else
                        {
                            strYomi = mcri.読み;
                        }
                        str += strYomi;
                    }
                }

                // Convert katakana to hiragana (LCID 0x411).
                str = Strings.StrConv(str, VbStrConv.Hiragana, 0x411);
            }
            catch (Exception ex)
            {
                MessageBox.Show(ex.Message, "エラー", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }

            return str;
        }
Example #22
0
        /// <summary>
        /// Runs morphological analysis on the given string and returns the result.
        /// </summary>
        /// <param name="target">String to analyze.</param>
        /// <returns>
        /// The analysis result, produced by <c>ToMeCabResultEnumerable</c> from the
        /// head node of the parse.
        /// </returns>
        public static IEnumerable<MeCabResult> Parse(string target)
        {
            return tagger.ParseToNode(target).ToMeCabResultEnumerable();
        }
Example #23
0
        /// <summary>
        /// Builds a TF-IDF weighted inverted index over the <c>*.txt</c> files in
        /// <c>dataDir</c> and writes it to <c>index.txt</c>, unless that file already
        /// exists. Waits for a key press before returning.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            if (File.Exists("index.txt"))
            {
                Console.WriteLine("Detect index.txt");
            }
            else
            {
                Console.WriteLine("Make inverted index.");
                Console.WriteLine("Calculating Term Frequency ...");
                var weightList    = new Dictionary<string, Dictionary<string, double>>(); // word -> (file name -> weight)
                var invertedIndex = new Dictionary<string, List<string>>();               // word -> file names ordered by weight
                var targetFiles   = Directory.GetFiles(dataDir, @"*.txt");

                MeCabParam param = new MeCabParam();
                param.DicDir = dicDir;

                Stopwatch sw = new Stopwatch();

                // The tagger is only needed for the term-frequency stage; dispose it afterwards.
                // NOTE(review): one tagger instance is shared by all parallel workers, as in
                // the original. Confirm that ParseToNode is safe for concurrent use.
                using (MeCabTagger t = MeCabTagger.Create(param))
                {
                    sw.Start();
                    Parallel.ForEach(targetFiles, fileName =>
                    {
                        Console.WriteLine("Processing " + fileName);

                        var wordList   = new Dictionary<string, int>(); // per-file word counts
                        int wordCount  = 0;
                        var lockObject = new Object();

                        Parallel.ForEach(File.ReadLines(fileName), line =>
                        {
                            for (var node = t.ParseToNode(line); node != null; node = node.Next)
                            {
                                if (!(node.CharType > 0))
                                {
                                    continue;
                                }

                                // Field 6 is read as the normalized form. When it is missing,
                                // empty or "*", the surface represents the word instead.
                                var features     = node.Feature.Split(',');
                                var normalized   = features.Length > 6 ? features[6] : null;
                                var originalForm = (string.IsNullOrEmpty(normalized) || normalized == "*") ? node.Surface : normalized;

                                // One lock guards both the total and the per-word counters.
                                lock (lockObject)
                                {
                                    ++wordCount;

                                    int seen;
                                    wordList.TryGetValue(originalForm, out seen);
                                    wordList[originalForm] = seen + 1;
                                }
                            }
                        });

                        // Every update needs the weightList lock, so a plain loop under a
                        // single lock does the same work as a parallel loop would.
                        lock (weightList)
                        {
                            foreach (var entry in wordList)
                            {
                                Dictionary<string, double> perFile;
                                if (!weightList.TryGetValue(entry.Key, out perFile))
                                {
                                    perFile               = new Dictionary<string, double>();
                                    weightList[entry.Key] = perFile;
                                }
                                perFile[fileName] = entry.Value / (double)wordCount;
                            }
                        }
                    });
                    sw.Stop();
                }
                Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elpsed.");

                Console.WriteLine("Constructing Inverted Index ...");
                sw.Restart();
                {
                    // weightList is only read here, so concurrent access to it is safe.
                    Parallel.ForEach(weightList.Keys, word =>
                    {
                        var ks = weightList[word].Keys.OrderByDescending(fileName => weightList[word][fileName]).ThenBy(fileName => fileName).ToList();
                        lock (invertedIndex)
                        {
                            invertedIndex[word] = ks;

                            // Sanity check kept from the original; it must run under the lock
                            // because Dictionary does not support reads concurrent with writes.
                            if (!invertedIndex.ContainsKey(word))
                            {
                                Console.WriteLine($"{word}は転置インデックスに含まれていません");
                            }
                        }
                    });
                }
                sw.Stop();
                Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elpsed.");

                Console.WriteLine("Calculating Inverse Document Frequency and Recording Weight to weightList ...");
                sw.Restart();
                {
                    weightList = weightList.AsParallel()
                                 .ToDictionary(
                        kv1 => kv1.Key,
                        kv1 =>
                    {
                        // Divide as doubles: integer division would truncate the ratio
                        // before the logarithm and distort the IDF.
                        var idf = Math.Log((double)targetFiles.Length / kv1.Value.Count, 2) + 1;
                        return kv1.Value.ToDictionary(kv2 => kv2.Key, kv2 => kv2.Value * idf);
                    });
                }
                sw.Stop();
                Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elpsed.");

                // 'using' closes the file even when writing fails part-way.
                using (StreamWriter writer = new StreamWriter(@"index.txt", false, Encoding.GetEncoding("utf-8")))
                {
                    foreach (var word in invertedIndex.Keys)
                    {
                        writer.Write($"{word}\t");
                        foreach (var filename in invertedIndex[word])
                        {
                            writer.Write($"({filename}, {weightList[word][filename]}), ");
                        }
                        writer.WriteLine();
                    }
                }

                Console.WriteLine("Successfully finishing all procedures.");
            }

            Console.Read();
        }
Example #24
0
        /// <summary>
        /// Benchmark driver: measures elapsed time and managed memory for creating a
        /// tagger and for several ways of parsing the configured target file line by
        /// line, then prints summary statistics about the file.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Properties.Settings settings = Properties.Settings.Default;
            string    targetFile         = settings.TargetFile;
            Encoding  encoding           = Encoding.GetEncoding(settings.TargetEncoding);
            Stopwatch sw = new Stopwatch();

            // Wait for the user's signal to start.
            Console.WriteLine("Press Enter key to start.");
            Console.ReadLine();

            Console.WriteLine("\t\t\tProcessTime\tTotalMemory");

            // Preparation: time the creation of the tagger.
            GC.Collect();
            sw.Start();
            MeCabTagger tagger = MeCabTagger.Create();

            sw.Stop();
            Console.WriteLine("OpenTagger:\t\t{0:0.000}sec\t{1:#,000}byte",
                              sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

            // Baseline: read the file without parsing anything.
            using (StreamReader reader = new StreamReader(targetFile, encoding))
            {
                sw.Reset();
                GC.Collect();
                sw.Start();
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                }
                sw.Stop();
            }
            Console.WriteLine("ReadLine:\t\t{0:0.000}sec\t{1:#,000}byte",
                              sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

            // Parsing (node output).
            using (StreamReader reader = new StreamReader(targetFile, encoding))
            {
                sw.Reset();
                GC.Collect();
                sw.Start();
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    // The result is not used; only the cost of the call is measured.
                    MeCabNode node = tagger.ParseToNode(line);
                }
                sw.Stop();
            }
            Console.WriteLine("ParseToNode:\t\t{0:0.000}sec\t{1:#,000}byte",
                              sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

            // Parsing (string output in lattice mode).
            tagger.OutPutFormatType = "lattice";
            using (StreamReader reader = new StreamReader(targetFile, encoding))
            {
                sw.Reset();
                GC.Collect();
                sw.Start();
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    // The result is not used; only the cost of the call is measured.
                    string ret = tagger.Parse(line);
                }
                sw.Stop();
            }
            Console.WriteLine("Parse(lattice):\t\t{0:0.000}sec\t{1:#,000}byte",
                              sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));


            // Parsing (node output for the 5 best solutions).
            tagger.LatticeLevel = MeCabLatticeLevel.One;
            using (StreamReader reader = new StreamReader(targetFile, encoding))
            {
                sw.Reset();
                GC.Collect();
                sw.Start();
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    int i = 0;
                    foreach (MeCabNode node in tagger.ParseNBestToNode(line))
                    {
                        // Stop enumerating once five results have been produced.
                        if (++i == 5)
                        {
                            break;
                        }
                    }
                }
                sw.Stop();
            }
            Console.WriteLine("ParseNBestToNode:\t{0:0.000}sec\t{1:#,000}byte",
                              sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

            // Statistics about the target file (not timed).
            using (StreamReader reader = new StreamReader(targetFile, encoding))
            {
                long charCount = 0;
                long lineCount = 0;
                long wordCount = 0;
                for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
                {
                    charCount += line.Length;
                    lineCount++;
                    MeCabNode node = tagger.ParseToNode(line);
                    // Count every node except the first and the last of the chain.
                    for (node = node.Next; node.Next != null; node = node.Next)
                    {
                        wordCount++;
                    }
                }
                Console.WriteLine();
                // The stream position after reading to the end is reported as the byte count.
                Console.WriteLine("Target: {0} {1:#,000}byte {2:#,000}char {3:#,000}line ({4:#,000}word)",
                                  targetFile, reader.BaseStream.Position, charCount, lineCount, wordCount);
            }

            tagger.Dispose();

            // Tell the user that the run has finished.
            Console.WriteLine();
            Console.WriteLine("Finish!");
            Console.WriteLine("Press Enter key to close.");
            Console.ReadLine();
        }