Esempio n. 1
0
        /// <summary>
        /// Word-segments (wakachi-gaki) a document with MeCab and yields the surface form
        /// of each morpheme in output order.
        /// </summary>
        /// <param name="rawDocument">Text to analyze.</param>
        /// <param name="containIsNotPhrageStart">
        /// When true, morphemes whose first feature field (part of speech) is "記号" (symbol) are skipped.
        /// </param>
        /// <returns>
        /// A lazily evaluated sequence of surface forms; the "EOS" marker and blank entries are omitted.
        /// </returns>
        private IEnumerable <string> SplitDocumentsWithMeCab(string rawDocument, bool containIsNotPhrageStart)
        {
            MeCabParam param = new MeCabParam();

            param.DicDir = @"lib\MeCab\dic\ipadic";

            // Run the morphological analysis and release the tagger as soon as the text result exists.
            string parsed;
            using (MeCabTagger tagger = MeCabTagger.Create(param))
            {
                parsed = tagger.Parse(rawDocument);
            }

            // Split on '\n' and trim a trailing '\r' so both CRLF and LF line endings are handled.
            foreach (var rawLine in parsed.Split('\n'))
            {
                var line = rawLine.TrimEnd('\r');

                // Each line is "<surface>\t<comma-separated features>". Cut at the tab first so that
                // a surface form that is itself a comma is not lost when the features are split.
                int tabIndex   = line.IndexOf('\t');
                string surface = tabIndex >= 0 ? line.Substring(0, tabIndex) : line;

                // Drop the end-of-sentence marker and blank entries.
                if ("EOS" == surface || String.IsNullOrWhiteSpace(surface))
                {
                    continue;
                }

                if (containIsNotPhrageStart)
                {
                    // The first feature field is the part of speech; a line without features has none.
                    string features     = tabIndex >= 0 ? line.Substring(tabIndex + 1) : String.Empty;
                    int    commaIndex   = features.IndexOf(',');
                    string partOfSpeech = commaIndex >= 0 ? features.Substring(0, commaIndex) : features;

                    if ("記号" == partOfSpeech)
                    {
                        continue;
                    }
                }

                // Emit the surface form.
                yield return surface;
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Prints the tagger output and the surface/feature of each morpheme for the Japanese
        /// sentences loaded through the <c>Tanaka</c> class, and calls <c>Assert.Fail()</c>
        /// when the 20th morpheme is reached.
        /// </summary>
        public void Tanaka()
        {
            var sentences = new Tanaka(TestDataPaths.Tanaka, Encoding.UTF8).AllSentences();
            var n         = 0; // number of morphemes visited so far, across all sentences

            foreach (var rawSentence in sentences.Select(s => s.JapaneseSentence))
            {
                Console.WriteLine(tagger.Parse(rawSentence));

                foreach (var morpheme in tagger.ParseToNodes(rawSentence))
                {
                    var feature = morpheme.Feature;
                    if (feature != null)
                    {
                        Console.WriteLine($"{morpheme.Surface} {feature}");
                    }

                    // Every morpheme counts, including those without a feature string.
                    n++;
                    if (n == 20)
                    {
                        Assert.Fail();
                    }
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Parses <paramref name="input"/> with a newly created MeCab tagger configured for
        /// "lattice" output at lattice level zero.
        /// </summary>
        /// <param name="input">Text to analyze.</param>
        /// <returns>
        /// The tagger's text output, or the <c>ToString()</c> of the exception if anything throws.
        /// </returns>
        public static string MeCabParse(string input)
        {
            try
            {
                // A tagger is created per call, so it must be disposed per call as well.
                using (MeCabTagger tagger = MeCabTagger.Create())
                {
                    tagger.LatticeLevel     = MeCabLatticeLevel.Zero;
                    tagger.OutPutFormatType = "lattice";
                    tagger.AllMorphs        = false;
                    tagger.Partial          = false;

                    return tagger.Parse(input);
                }
            }
            catch (Exception ex)
            {
                // Failures are reported to the caller as text rather than rethrown.
                return ex.ToString();
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Handles a change of the selected text: analyzes it with the tagger and, when the first
        /// non-filtered word is present in <c>japDict</c>, shows a definition window at the cursor
        /// position with the word, its reading in romaji and an English gloss.
        /// </summary>
        /// <param name="itext">The newly selected text.</param>
        private void FuriganaKanaLabel_SelectedTextChangedEvent(string itext)
        {
            // Hide the window shown for a previous selection, if any.
            if (defWindow != null)
            {
                defWindow.Hide();
            }

            // Only the first usable word is looked up, so stop at the first matching output line.
            // Lines without a tab carry no feature column and are skipped instead of causing an
            // index-out-of-range below.
            var firstLine = tagger.Parse(itext)
                                  .Split('\n')
                                  .FirstOrDefault(line => !string.IsNullOrEmpty(line) &&
                                                          line.IndexOf('\t') >= 0 &&
                                                          !filters.Contains(line.Split('\t')[0]));

            if (firstLine == null)
            {
                return;
            }

            // Line layout: "<surface>\t<comma-separated features>".
            var columns = firstLine.Split('\t');
            var word    = columns[0];
            var fields  = columns[1].Split(',');

            // Feature field 6 is used as the dictionary key; "*" (or a missing field) means there
            // is none, in which case the surface form is used instead.
            var baseWord = fields.Length > 6 ? fields[6] : "*";
            if (baseWord.Trim() == "*")
            {
                baseWord = word;
            }

            // Feature field 7 is converted from katakana to hiragana; fall back to the surface form.
            var kana = fields.Length > 7
                       ? KanaConverter.KatakanaToHiragana(fields[7])
                       : word.Trim();

            string romaji;
            try
            {
                romaji = transliterator.GetRomaji(kana);
            }
            catch (TransliterationException)
            {
                // Show the kana unchanged when it cannot be transliterated.
                romaji = kana;
            }

            if (japDict.ContainsKey(baseWord))
            {
                // Take the first English gloss of the first sense that has one. Take(1) keeps the
                // "no English gloss at all" case from throwing.
                var englishTerms = japDict[baseWord].Senses
                                                    .SelectMany(sense => sense.Glosses)
                                                    .Where(gloss => gloss.Language == Language.English)
                                                    .Select(gloss => gloss.Term)
                                                    .Take(1)
                                                    .ToArray();

                if (englishTerms.Length == 0)
                {
                    return;
                }

                defWindow          = new DefinitionWindow(baseWord, word, romaji, englishTerms[0]);
                defWindow.Location = Cursor.Position;
                defWindow.TopMost  = true;
                defWindow.Show();
            }

            // NOTE: a per-character fallback that listed kanji meanings from kanjiDict for words
            // missing from japDict used to be sketched here; it was disabled and is not implemented.
        }
Esempio n. 5
0
        /// <summary>
        /// Console benchmark: times tagger creation, plain line reading, <c>ParseToNode</c>,
        /// lattice-format <c>Parse</c> and <c>ParseNBestToNode</c> over the configured target file,
        /// printing elapsed seconds and <c>GC.GetTotalMemory</c> after each phase, followed by
        /// byte/char/line/word totals for the file.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Properties.Settings settings = Properties.Settings.Default;
            string    targetFile         = settings.TargetFile;
            Encoding  encoding           = Encoding.GetEncoding(settings.TargetEncoding);
            Stopwatch sw = new Stopwatch();

            // Wait for the user's start signal.
            Console.WriteLine("Press Enter key to start.");
            Console.ReadLine();

            Console.WriteLine("\t\t\tProcessTime\tTotalMemory");

            // Phase 0: tagger creation.
            GC.Collect();
            sw.Start();
            MeCabTagger tagger = MeCabTagger.Create();

            sw.Stop();

            try
            {
                Console.WriteLine("OpenTagger:\t\t{0:0.000}sec\t{1:#,000}byte",
                                  sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

                string line;

                // Phase 1: baseline - read the file without parsing anything.
                using (StreamReader reader = new StreamReader(targetFile, encoding))
                {
                    sw.Reset();
                    GC.Collect();
                    sw.Start();
                    while ((line = reader.ReadLine()) != null)
                    {
                    }
                    sw.Stop();
                }
                Console.WriteLine("ReadLine:\t\t{0:0.000}sec\t{1:#,000}byte",
                                  sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

                // Phase 2: parse every line to nodes; the result is discarded on purpose.
                using (StreamReader reader = new StreamReader(targetFile, encoding))
                {
                    sw.Reset();
                    GC.Collect();
                    sw.Start();
                    while ((line = reader.ReadLine()) != null)
                    {
                        tagger.ParseToNode(line);
                    }
                    sw.Stop();
                }
                Console.WriteLine("ParseToNode:\t\t{0:0.000}sec\t{1:#,000}byte",
                                  sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

                // Phase 3: parse every line to a "lattice"-format string; the result is discarded.
                tagger.OutPutFormatType = "lattice";
                using (StreamReader reader = new StreamReader(targetFile, encoding))
                {
                    sw.Reset();
                    GC.Collect();
                    sw.Start();
                    while ((line = reader.ReadLine()) != null)
                    {
                        tagger.Parse(line);
                    }
                    sw.Stop();
                }
                Console.WriteLine("Parse(lattice):\t\t{0:0.000}sec\t{1:#,000}byte",
                                  sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));


                // Phase 4: enumerate up to the 5 best parses of every line.
                tagger.LatticeLevel = MeCabLatticeLevel.One;
                using (StreamReader reader = new StreamReader(targetFile, encoding))
                {
                    sw.Reset();
                    GC.Collect();
                    sw.Start();
                    while ((line = reader.ReadLine()) != null)
                    {
                        int taken = 0;
                        foreach (MeCabNode node in tagger.ParseNBestToNode(line))
                        {
                            if (++taken == 5)
                            {
                                break;
                            }
                        }
                    }
                    sw.Stop();
                }
                Console.WriteLine("ParseNBestToNode:\t{0:0.000}sec\t{1:#,000}byte",
                                  sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

                // Summary of the target file (not timed).
                using (StreamReader reader = new StreamReader(targetFile, encoding))
                {
                    long charCount = 0;
                    long lineCount = 0;
                    long wordCount = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        charCount += line.Length;
                        lineCount++;

                        // Skip the first node and stop at the node whose Next is null, so only
                        // the nodes in between are counted as words.
                        MeCabNode node = tagger.ParseToNode(line);
                        for (node = node.Next; node.Next != null; node = node.Next)
                        {
                            wordCount++;
                        }
                    }
                    Console.WriteLine();
                    Console.WriteLine("Target: {0} {1:#,000}byte {2:#,000}char {3:#,000}line ({4:#,000}word)",
                                      targetFile, reader.BaseStream.Position, charCount, lineCount, wordCount);
                }
            }
            finally
            {
                // Release the tagger even when one of the phases above throws.
                tagger.Dispose();
            }

            // Tell the user we are done and wait before closing.
            Console.WriteLine();
            Console.WriteLine("Finish!");
            Console.WriteLine("Press Enter key to close.");
            Console.ReadLine();
        }
Esempio n. 6
0
 /// <summary>
 /// Parses <paramref name="text"/> with the tagger and exposes the textual result as a reader.
 /// </summary>
 /// <param name="text">Text to analyze.</param>
 /// <returns>A <see cref="StringReader"/> positioned at the start of the tagger output.</returns>
 public TextReader ParseText(string text)
 {
     var parsed = _tagger.Parse(text);
     return new StringReader(parsed);
 }