/// <summary>
/// Opens the tokenizer and the connector with <paramref name="param"/> and then
/// copies the analysis settings from it onto this instance.
/// </summary>
/// <param name="param">Parameter object the components are opened with and the settings are read from.</param>
public void Open(MeCabParam param)
{
    // Open the two collaborators first, tokenizer before connector.
    tokenizer.Open(param);
    connector.Open(param);

    // The assignment order below is intentionally left untouched.
    // NOTE(review): Theta may depend on costFactor already being set — confirm before reordering.
    costFactor = param.CostFactor;
    Theta = param.Theta;
    LatticeLevel = param.LatticeLevel;
    Partial = param.Partial;
    AllMorphs = param.AllMorphs;
}
/// <summary>
/// Creates the tagger stored in <c>_meCabTagger</c>, using the <c>dic\ipadic</c>
/// folder located next to the executing assembly as the dictionary directory.
/// </summary>
private void InitializeMeCabTagger()
{
    var taggerParam = new MeCabParam();

    // Resolve the dictionary folder relative to where this assembly was loaded from.
    string assemblyPath = Assembly.GetExecutingAssembly().Location;
    string assemblyDirectory = Path.GetDirectoryName(assemblyPath);
    taggerParam.DicDir = Path.Combine(assemblyDirectory, @"dic\ipadic");

    _meCabTagger = MeCabTagger.Create(taggerParam);
}
/// <summary>
/// Creates the <c>tagger</c> from the <c>mecab\ipadic</c> dictionary under <c>baseDir</c>
/// and then assigns lattice/output settings on the parameter object.
/// </summary>
public void SetUp()
{
    var mecabParam = new MeCabParam();
    mecabParam.DicDir = Path.Combine(baseDir, @"mecab\ipadic");

    tagger = MeCabTagger.Create(mecabParam);

    // NOTE(review): these values are assigned after the tagger has already been created;
    // whether the tagger still observes them depends on MeCabTagger.Create — confirm.
    mecabParam.LatticeLevel = MeCabLatticeLevel.Zero;
    mecabParam.OutputFormatType = "lattice";
    mecabParam.AllMorphs = false;
    mecabParam.Partial = true;
}
/// <summary>
/// Loads the character property, the unknown-word dictionary, the system dictionary and
/// every user dictionary named in <paramref name="param"/>, then caches the unknown-word
/// tokens per character category and copies the remaining settings.
/// </summary>
/// <param name="param">Supplies the dictionary directory, user dictionary names and feature settings.</param>
/// <exception cref="MeCabInvalidFileException">
/// A dictionary file has an unexpected type, a user dictionary is incompatible with the
/// system dictionary, or an UNK category is missing from the unknown-word dictionary.
/// </exception>
public void Open(MeCabParam param)
{
    // One slot for the system dictionary plus one per user dictionary.
    this.dic = new MeCabDictionary[param.UserDic.Length + 1];

    string dictionaryDirectory = param.DicDir;
    this.property.Open(dictionaryDirectory);

    // Unknown-word dictionary.
    this.unkDic.Open(Path.Combine(dictionaryDirectory, "unk.dic"));
    if (this.unkDic.Type != DictionaryType.Unk)
    {
        throw new MeCabInvalidFileException("not a unk dictionary", this.unkDic.FileName);
    }

    // System dictionary always occupies index 0.
    MeCabDictionary systemDictionary = new MeCabDictionary();
    systemDictionary.Open(Path.Combine(dictionaryDirectory, "sys.dic"));
    if (systemDictionary.Type != 0)
    {
        throw new MeCabInvalidFileException("not a system dictionary", systemDictionary.FileName);
    }
    this.dic[0] = systemDictionary;

    // User dictionaries follow, in the order they are listed in the parameter.
    for (int userIndex = 0; userIndex < param.UserDic.Length; userIndex++)
    {
        MeCabDictionary userDictionary = new MeCabDictionary();
        userDictionary.Open(Path.Combine(dictionaryDirectory, param.UserDic[userIndex]));

        if (userDictionary.Type != DictionaryType.Usr)
        {
            throw new MeCabInvalidFileException("not a user dictionary", userDictionary.FileName);
        }
        if (!systemDictionary.IsCompatible(userDictionary))
        {
            throw new MeCabInvalidFileException("incompatible dictionary", userDictionary.FileName);
        }

        this.dic[userIndex + 1] = userDictionary;
    }

    // Resolve the unknown-word token for every category known to the character property.
    this.unkTokens = new Token[this.property.Size][];
    for (int categoryIndex = 0; categoryIndex < this.unkTokens.Length; categoryIndex++)
    {
        string categoryName = this.property.Name(categoryIndex);
        DoubleArray.ResultPair match = this.unkDic.ExactMatchSearch(categoryName);
        if (match.Value == -1)
        {
            throw new MeCabInvalidFileException("cannot find UNK category: " + categoryName, this.unkDic.FileName);
        }
        this.unkTokens[categoryIndex] = this.unkDic.GetToken(match);
    }

    this.space = this.property.GetCharInfo(' ');
    this.bosFeature = param.BosFeature;
    this.unkFeature = param.UnkFeature;

    // A non-positive grouping size falls back to 24.
    this.maxGroupingSize = param.MaxGroupingSize;
    if (this.maxGroupingSize <= 0)
    {
        this.maxGroupingSize = 24;
    }
}
/// <summary>
/// Creates the <c>tagger</c> from the dictionary at <c>TestDataPaths.Unidic</c> with
/// memory-mapped file access enabled, then assigns lattice/output settings on the parameter object.
/// </summary>
public void SetUp()
{
    var mecabParam = new MeCabParam();
    mecabParam.DicDir = TestDataPaths.Unidic;
    mecabParam.UseMemoryMappedFile = true;

    tagger = MeCabTagger.Create(mecabParam);

    // NOTE(review): these values are assigned after the tagger has already been created;
    // whether the tagger still observes them depends on MeCabTagger.Create — confirm.
    mecabParam.LatticeLevel = MeCabLatticeLevel.Zero;
    mecabParam.OutputFormatType = "yomi";
    mecabParam.AllMorphs = false;
    mecabParam.Partial = true;
}
/// <summary>
/// Creates and opens a fresh tokenizer and connector for <paramref name="param"/>, then
/// copies the analysis settings from it onto this instance.
/// </summary>
/// <param name="param">
/// Parameter object; its <c>UseMemoryMappedFile</c> flag selects which connector implementation is created.
/// </param>
public void Open(MeCabParam param)
{
    tokenizer = new Tokenizer();
    tokenizer.Open(param);

    // Pick the connector implementation according to the memory-mapped-file switch.
    if (param.UseMemoryMappedFile)
    {
        connector = new ConnectorMMF();
    }
    else
    {
        connector = new Connector();
    }
    connector.Open(param);

    // The assignment order below is intentionally left untouched.
    // NOTE(review): Theta may depend on costFactor already being set — confirm before reordering.
    costFactor = param.CostFactor;
    Theta = param.Theta;
    LatticeLevel = param.LatticeLevel;
    Partial = param.Partial;
    AllMorphs = param.AllMorphs;
}
/// <summary>
/// Fetches every talk reachable through the scraping service, tokenizes each talk's
/// message and records the accepted words per user in <c>AllTalkDictionary</c>.
/// </summary>
/// <remarks>
/// A word is recorded only when its <c>Pos</c> is <c>"名詞"</c>, its <c>Base</c> is not
/// <c>"*"</c> and its <c>Pos1</c> is not <c>"非自立"</c>.
/// </remarks>
/// <returns>A task that completes once all talks have been processed.</returns>
public async Task CacheAllTalk()
{
    var list = await _webScraypingService.ToListRubyistHotlinksUrl();

    var allTalks = new List<Talk>();
    foreach (var item in list.ToList())
    {
        allTalks.AddRange(await _webScraypingService.ToListTalks(item));
    }

    var mecabParam = new MeCabParam
    {
        DicDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"..\..\..\RubyistHotlinksReports.Core\dic\ipadic")
    };

    // The tagger is disposable; release it deterministically once every talk has been parsed.
    using (var meCabTagger = MeCabTagger.Create(mecabParam))
    {
        foreach (var talk in allTalks)
        {
            // Materialize inside the using block so parsing never outlives the tagger.
            var words = ParseText(meCabTagger, talk.Message).ToList();
            foreach (var word in words)
            {
                if (word.Pos != "名詞" || word.Base == "*" || word.Pos1 == "非自立")
                {
                    continue;
                }

                if (!AllTalkDictionary.ContainsKey(talk.User))
                {
                    AllTalkDictionary.Add(talk.User, new List<Word>());
                }
                AllTalkDictionary[talk.User].Add(word);
            }
        }
    }
}
/// <summary>
/// Sample entry point: parses a fixed sentence with the ipadic dictionary plus one user
/// dictionary and prints the surface and feature of every node, tab-separated.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var sentence = "SOS団には涼宮ハルヒ、キョン、長門有希、朝比奈みくる、古泉一樹の5人がいる。";

    var param = new MeCabParam();
    param.DicDir = @"./dic/ipadic";
    param.UserDic = new[] { @"../userdic/haruhi.dic" };

    // The tagger is disposable; release it once the node list has been printed.
    using (var mecab = MeCabTagger.Create(param))
    {
        // Every node is printed; no CharType filter is applied here.
        for (var node = mecab.ParseToNode(sentence); node != null; node = node.Next)
        {
            Console.WriteLine(node.Surface + "\t" + node.Feature);
        }
    }
}
/// <summary>
/// Parses <paramref name="sentence"/> with the ipadic dictionary under <c>Assets/dic/ipadic</c>
/// and returns one line per node whose <c>CharType</c> is greater than zero.
/// </summary>
/// <param name="sentence">Text to analyze.</param>
/// <returns>
/// Lines of the form <c>surface TAB feature</c>, each terminated by <c>\n</c>;
/// an empty string when no node qualifies.
/// </returns>
private string parse(string sentence)
{
    MeCabParam mecabParam = new MeCabParam();
    mecabParam.DicDir = @"Assets/dic/ipadic";

    // Build the output with a StringBuilder instead of repeated string concatenation.
    var result = new System.Text.StringBuilder();

    // The tagger is disposable; release it as soon as the nodes have been read.
    using (MeCabTagger t = MeCabTagger.Create(mecabParam))
    {
        for (MeCabNode node = t.ParseToNode(sentence); node != null; node = node.Next)
        {
            if (node.CharType > 0)
            {
                result.Append(node.Surface).Append("\t").Append(node.Feature).Append("\n");
            }
        }
    }

    return result.ToString();
}
/// <summary>
/// Splits <paramref name="i_string"/> into morphemes and returns their surfaces, each
/// followed by a single space.
/// </summary>
/// <param name="i_string">Text to analyze.</param>
/// <returns>
/// The surfaces of all nodes whose <c>CharType</c> is greater than zero, each followed by
/// one space; an empty string when no node qualifies.
/// </returns>
public string Modified_PunctuationResult(string i_string)
{
    MeCabParam param = new MeCabParam();
    param.DicDir = @"C:\Program Files (x86)\MeCab\dic\ipadic";

    // Build the output with a StringBuilder instead of repeated string concatenation.
    var o_string = new System.Text.StringBuilder();

    using (var tagger = MeCabTagger.Create(param))
    {
        for (MeCabNode node = tagger.ParseToNode(i_string); node != null; node = node.Next)
        {
            if (node.CharType > 0)
            {
                o_string.Append(node.Surface).Append(" ");
            }
        }
    }

    return o_string.ToString();
}
/// <summary>
/// Converts <paramref name="strInput"/> to its hiragana reading by concatenating the
/// reading of every analyzed item and converting the result to hiragana.
/// </summary>
/// <param name="strInput">Text to convert.</param>
/// <returns>
/// The hiragana reading. If an exception occurs, a message box is shown and whatever had
/// been accumulated up to that point is returned (possibly an empty string).
/// </returns>
public static string GetNMeCabToFurigana(string strInput)
{
    string str = string.Empty;
    try
    {
        // The tagger is disposable; release it once the result items have been consumed.
        using (MeCabTagger mct = MeCabTagger.Create())
        {
            MeCabNode mcn = mct.ParseToNode(strInput);
            MecabResult mcr = new MecabResult(mcn);

            foreach (MecabResult.MecabResultItem mcri in mcr.nodes)
            {
                // "*" marks an item the analyzer could not provide a reading for;
                // fall back to OutputYomigana on the surface form in that case.
                // The marker is a fixed token, so it is compared ordinally.
                string strYomi = string.Equals(mcri.読み, "*", StringComparison.Ordinal)
                    ? ModuleReuseClass.OutputYomigana(mcri.表層形)
                    : mcri.読み;

                str += strYomi;
            }
        }

        // Convert katakana to hiragana (0x411 is the Japanese locale ID).
        str = Strings.StrConv(str, VbStrConv.Hiragana, 0x411);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message, "エラー", MessageBoxButtons.OK, MessageBoxIcon.Error);
    }

    return str;
}
/// <summary>
/// Reads lines from standard input until end of input and, for each line, prints the
/// surface and feature of every node whose <c>CharType</c> is greater than zero.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    var mPara = new MeCabParam();
    // Folder that contains the dictionary files.
    mPara.DicDir = @"c:\dic\mecab-ipadic-neologd";

    // The tagger is disposable; release it when input is exhausted.
    using (var mTagger = MeCabTagger.Create(mPara))
    {
        string line;
        while ((line = Console.ReadLine()) != null)
        {
            for (var node = mTagger.ParseToNode(line); node != null; node = node.Next)
            {
                if (node.CharType > 0)
                {
                    Console.WriteLine("{0}\t{1}", node.Surface, node.Feature);
                }
            }
        }
    }
}
/// <summary>
/// Initializes the helper: assigns a new <see cref="MeCabParam"/> to <c>Parameter</c>
/// and creates <c>Tagger</c> from it.
/// </summary>
public MecabHelper()
{
    this.Parameter = new MeCabParam();
    this.Tagger = MeCabTagger.Create(this.Parameter);
}
/// <summary>
/// Builds a TF-IDF weighted inverted index over every <c>*.txt</c> file in <c>dataDir</c>
/// and writes it to <c>index.txt</c>, unless that file already exists.
/// </summary>
/// <remarks>
/// Stages: (1) per-file term frequency, (2) per-word file list ordered by descending
/// weight then file name, (3) multiplication of every weight by the word's inverse
/// document frequency, (4) output.
/// </remarks>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    if (File.Exists("index.txt"))
    {
        Console.WriteLine("Detect index.txt");
    }
    else
    {
        Console.WriteLine("Make inverted index.");
        Console.WriteLine("Calculating Term Frequency ...");

        // word -> (file name -> weight)
        var weightList = new Dictionary<string, Dictionary<string, double>>();
        // word -> file names ordered by weight
        var invertedIndex = new Dictionary<string, List<string>>();
        var targetFiles = Directory.GetFiles(dataDir, @"*.txt");

        MeCabParam param = new MeCabParam();
        param.DicDir = dicDir;

        Stopwatch sw = new Stopwatch();

        // The tagger is created before the stopwatch starts (as before) and is now disposed
        // deterministically once the term-frequency stage is over.
        // NOTE(review): the tagger is shared by all parallel workers — confirm that
        // ParseToNode is safe to call concurrently on one instance.
        using (MeCabTagger t = MeCabTagger.Create(param))
        {
            sw.Start();
            Parallel.ForEach(targetFiles, fileName =>
            {
                Console.WriteLine("Processing " + fileName);

                // Occurrence count per representative word form in this file.
                var wordList = new Dictionary<string, int>();
                int wordCount = 0;
                var lockObject = new Object();

                Parallel.ForEach(File.ReadLines(fileName), line =>
                {
                    for (var node = t.ParseToNode(line); node != null; node = node.Next)
                    {
                        if (node.CharType <= 0)
                        {
                            continue;
                        }

                        // Field 6 of the feature string is used as the base form; guard against
                        // feature strings with fewer fields instead of throwing.
                        var features = node.Feature.Split(',');
                        var normalized = features.Length > 6 ? features[6] : null;

                        // Use the surface when no base form is available, otherwise the base form.
                        var originalForm = (string.IsNullOrEmpty(normalized) || normalized == "*")
                            ? node.Surface
                            : normalized;

                        // One lock protects both per-file counters.
                        lock (lockObject)
                        {
                            ++wordCount;

                            int occurrences;
                            wordList.TryGetValue(originalForm, out occurrences);
                            wordList[originalForm] = occurrences + 1;
                        }
                    }
                });

                // Publish this file's term frequencies. The whole update is serialized on
                // weightList anyway, so a plain loop under a single lock is used.
                lock (weightList)
                {
                    foreach (var entry in wordList)
                    {
                        Dictionary<string, double> perFile;
                        if (!weightList.TryGetValue(entry.Key, out perFile))
                        {
                            perFile = new Dictionary<string, double>();
                            weightList[entry.Key] = perFile;
                        }
                        perFile[fileName] = entry.Value / (double)wordCount;
                    }
                }
            });
            sw.Stop();
        }
        Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elpsed.");

        Console.WriteLine("Constructing Inverted Index ...");
        sw.Restart();
        Parallel.ForEach(weightList.Keys, word =>
        {
            var ks = weightList[word].Keys
                .OrderByDescending(fileName => weightList[word][fileName])
                .ThenBy(fileName => fileName)
                .ToList();

            // Both the write and the sanity check touch the shared dictionary, so both
            // must happen while holding the lock.
            lock (invertedIndex)
            {
                invertedIndex[word] = ks;
                if (!invertedIndex.ContainsKey(word))
                {
                    Console.WriteLine($"{word}は転置インデックスに含まれていません");
                }
            }
        });
        sw.Stop();
        Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elpsed.");

        Console.WriteLine("Calculating Inverse Document Frequency and Recording Weight to weightList ...");
        sw.Restart();
        weightList = weightList.AsParallel()
            .ToDictionary(
                kv1 => kv1.Key,
                kv1 =>
                {
                    // Divide as double: integer division would truncate the ratio before the logarithm.
                    var idf = Math.Log((double)targetFiles.Length / kv1.Value.Count, 2) + 1;
                    return kv1.Value.ToDictionary(kv2 => kv2.Key, kv2 => kv2.Value * idf);
                });
        sw.Stop();
        Console.WriteLine($"{sw.ElapsedMilliseconds} msec Elpsed.");

        // The writer is disposed (and therefore flushed) even if writing fails part-way.
        using (StreamWriter writer = new StreamWriter(@"index.txt", false, Encoding.GetEncoding("utf-8")))
        {
            foreach (var word in invertedIndex.Keys)
            {
                writer.Write($"{word}\t");
                foreach (var filename in invertedIndex[word])
                {
                    writer.Write($"({filename}, {weightList[word][filename]}), ");
                }
                writer.WriteLine();
            }
        }

        Console.WriteLine("Successfully finishing all procedures.");
    }

    Console.Read();
}
/// <summary>
/// Benchmark entry point: measures elapsed time and reported managed memory for opening
/// the tagger, reading the target file, and several ways of parsing every line of it,
/// then prints size statistics of the target file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Properties.Settings settings = Properties.Settings.Default;
    string targetFile = settings.TargetFile;
    Encoding encoding = Encoding.GetEncoding(settings.TargetEncoding);
    Stopwatch sw = new Stopwatch();

    // Wait for the start signal.
    Console.WriteLine("Press Enter key to start.");
    Console.ReadLine();

    Console.WriteLine("\t\t\tProcessTime\tTotalMemory");

    MeCabParam param = new MeCabParam();
    param.DicDir = @"D:\DidacticalEnigma-Data\mecab\ipadic";
#if MMF
    param.UseMemoryMappedFile = true;
#endif

    // Analysis preparation: time the creation of the tagger.
    GC.Collect();
    sw.Start();
    MeCabTagger tagger = MeCabTagger.Create(param);
    sw.Stop();
    Console.WriteLine("OpenTagger:\t\t{0:0.000}sec\t{1:#,000}byte",
                      sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Baseline: reading the file only, without any analysis.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine()) { }
        sw.Stop();
    }
    Console.WriteLine("ReadLine:\t\t{0:0.000}sec\t{1:#,000}byte",
                      sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Analysis (node output). The result is intentionally discarded; only the time matters.
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            MeCabNode node = tagger.ParseToNode(line);
        }
        sw.Stop();
    }
    Console.WriteLine("ParseToNode:\t\t{0:0.000}sec\t{1:#,000}byte",
                      sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Analysis (string output in lattice mode). The result is intentionally discarded.
    tagger.OutPutFormatType = "lattice";
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string ret = tagger.Parse(line);
        }
        sw.Stop();
    }
    Console.WriteLine("Parse(lattice):\t\t{0:0.000}sec\t{1:#,000}byte",
                      sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Analysis (node output of the N-best solutions); enumeration stops at the fifth result per line.
    tagger.LatticeLevel = MeCabLatticeLevel.One;
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        sw.Reset();
        GC.Collect();
        sw.Start();
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            int i = 0;
            foreach (MeCabNode node in tagger.ParseNBestToNode(line))
            {
                if (++i == 5) { break; }
            }
        }
        sw.Stop();
    }
    Console.WriteLine("ParseNBestToNode:\t{0:0.000}sec\t{1:#,000}byte",
                      sw.Elapsed.TotalSeconds, GC.GetTotalMemory(false));

    // Information about the target file (not timed).
    using (StreamReader reader = new StreamReader(targetFile, encoding))
    {
        long charCount = 0;
        long lineCount = 0;
        long wordCount = 0;
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            charCount += line.Length;
            lineCount++;
            MeCabNode node = tagger.ParseToNode(line);
            // Counts every node except the first and the last one of the list
            // (presumably the BOS/EOS sentinels — confirm).
            for (node = node.Next; node.Next != null; node = node.Next)
            {
                wordCount++;
            }
        }
        Console.WriteLine();
        Console.WriteLine("Target: {0} {1:#,000}byte {2:#,000}char {3:#,000}line ({4:#,000}word)",
                          targetFile, reader.BaseStream.Position, charCount, lineCount, wordCount);
    }

    tagger.Dispose();

    // Notify completion.
    Console.WriteLine();
    Console.WriteLine("Finish!");
    Console.WriteLine("Press Enter key to close.");
    Console.ReadLine();
}
/// <summary>
/// Builds the list of candidate nodes that start at the first position of
/// [<paramref name="begin"/>, <paramref name="end"/>) whose character type differs from
/// the space type, using every opened dictionary and, where applicable, unknown-word handling.
/// </summary>
/// <param name="begin">Start of the character span to look at.</param>
/// <param name="end">End of the character span; clamped to <c>begin + ushort.MaxValue</c>.</param>
/// <param name="bytesBegin">Start of the encoded byte span that corresponds to <paramref name="begin"/>.</param>
/// <param name="bytesEnd">End of the encoded byte span.</param>
/// <param name="param">Supplies <c>MaxGroupingSize</c> for grouped unknown words.</param>
/// <param name="nodeAllocator">Factory invoked once per dictionary-backed node.</param>
/// <returns>
/// The head of a list of nodes linked through <c>BNext</c>, or <c>null</c> when the seek
/// over leading characters reaches <paramref name="end"/>.
/// </returns>
public unsafe TNode Lookup(char *begin, char *end, byte *bytesBegin, byte *bytesEnd, MeCabParam param, Func <TNode> nodeAllocator)
{
    CharInfo cInfo;

    // Lengths are limited to what fits in a ushort.
    if (end - begin > ushort.MaxValue)
    {
        end = begin + ushort.MaxValue;
    }

    // Advance past leading characters of the space type; leftSpaceLen receives how many were passed over.
    int leftSpaceLen;
    char *begin2 = property.SeekToOtherType(begin, end, this.space, &cInfo, &leftSpaceLen);
    if (begin2 >= end)
    {
        return(null);
    }

    // Move the byte pointer forward by the encoded size of the characters that were passed over.
    byte *bytesBegin2 = bytesBegin + this.Encoding.GetByteCount(begin, leftSpaceLen);

    TNode resultNode = null;
    var daResults = stackalloc DoubleArray.ResultPair[DAResultSize];

    foreach (MeCabDictionary it in this.dic)
    {
        int n = it.CommonPrefixSearch(bytesBegin2, (int)(bytesEnd - bytesBegin2), daResults, DAResultSize);
        for (int i = 0; i < n; i++)
        {
            // Index into the result buffer instead of advancing the pointer itself: the
            // pointer is reused for the next dictionary and must still address the start
            // of the DAResultSize-element buffer, otherwise the next search could write
            // past its end.
            int length = this.Encoding.GetCharCount(bytesBegin2, daResults[i].Length);
            int rLength = (int)(begin2 - begin) + length;
            var tokenSize = it.GetTokenSize(daResults[i].Value);
            var tokens = it.GetTokens(daResults[i].Value);

            for (int j = 0; j < tokenSize; j++)
            {
                var newNode = nodeAllocator();
                newNode.Surface = new string(begin2, 0, length);
                newNode.Length = length;
                newNode.RLength = rLength;
                newNode.LCAttr = tokens->LcAttr;
                newNode.RCAttr = tokens->RcAttr;
                newNode.PosId = tokens->PosId;
                newNode.WCost = tokens->WCost;
                newNode.PFeature = it.GetFeature(tokens->Feature);
                tokens++;
                newNode.Encoding = this.Encoding;
                newNode.Stat = MeCabNodeStat.Nor;
                newNode.CharType = cInfo.DefaultType;

                // Push onto the front of the BNext-linked result list.
                newNode.BNext = resultNode;
                resultNode = newNode;
            }
        }
    }

    // Dictionary hits are sufficient unless the character category has Invoke set.
    if (resultNode != null && !cInfo.Invoke)
    {
        return(resultNode);
    }

    char *begin3 = begin2 + 1;
    char *groupBegin3 = null;

    if (cInfo.Group)
    {
        char *tmp = begin3;
        CharInfo fail;
        int cLen;
        begin3 = this.property.SeekToOtherType(begin3, end, cInfo, &fail, &cLen);

        // NOTE(review): the limit is read from param.MaxGroupingSize directly; confirm that
        // callers never pass a non-positive value here.
        if (cLen <= param.MaxGroupingSize)
        {
            this.AddUnknown(ref resultNode, cInfo, begin, begin2, begin3, nodeAllocator);
        }

        // Remember where the grouped candidate ended so the loop below does not add it again.
        groupBegin3 = begin3;
        begin3 = tmp;
    }

    for (int i = 1; i <= cInfo.Length; i++)
    {
        if (begin3 > end)
        {
            break;
        }
        if (begin3 == groupBegin3)
        {
            // Same end position as the grouped candidate. begin3 is not advanced here,
            // so the remaining iterations are skipped as well.
            continue;
        }

        this.AddUnknown(ref resultNode, cInfo, begin, begin2, begin3, nodeAllocator);

        // NOTE(review): begin3 may equal end at this point; this dereference assumes the
        // buffer is readable at end (e.g. a terminator) — confirm.
        if (!cInfo.IsKindOf(this.property.GetCharInfo(*begin3)))
        {
            break;
        }
        begin3 += 1;
    }

    // Always produce at least one candidate.
    if (resultNode == null)
    {
        this.AddUnknown(ref resultNode, cInfo, begin, begin2, begin3, nodeAllocator);
    }

    return(resultNode);
}
/// <summary>
/// Type initializer: creates the shared parameter object with the dictionary directory
/// read from the <c>mecabDicPath</c> application setting, then creates the shared tagger.
/// </summary>
static JapanesePOSExtractor()
{
    meCabParam = new MeCabParam();

    string configuredDicPath = ConfigurationManager.AppSettings["mecabDicPath"];
    meCabParam.DicDir = configuredDicPath;

    tagger = MeCabTagger.Create(meCabParam);
}
/// <summary>
/// Creates the parameter object and the tagger using the dictionary under
/// <c>Assets/dic/ipadic</c>.
/// </summary>
/// <param name="dictionaryPath">
/// Not read by this constructor.
/// NOTE(review): the dictionary directory is hard-coded below — confirm whether this
/// argument was meant to be used instead.
/// </param>
public Markov(string dictionaryPath)
{
    var newParam = new MeCabParam();
    mecabParam = newParam;
    mecabParam.DicDir = @"Assets/dic/ipadic";

    mecabTagger = MeCabTagger.Create(mecabParam);
}
/// <summary>
/// Opens the character property, the unknown-word dictionary, the system dictionary and
/// all user dictionaries listed in <paramref name="param"/>, caches the unknown-word
/// tokens for every character category and copies the remaining settings.
/// </summary>
/// <param name="param">Supplies the dictionary directory, user dictionary names and feature settings.</param>
/// <exception cref="MeCabInvalidFileException">
/// A dictionary has the wrong type, a user dictionary is incompatible with the system
/// dictionary, or an UNK category cannot be found.
/// </exception>
public void Open(MeCabParam param)
{
    // Index 0 holds the system dictionary; user dictionaries follow.
    this.dic = new MeCabDictionary[param.UserDic.Length + 1];

    string dicRoot = param.DicDir;
    this.property.Open(dicRoot);

    this.unkDic.Open(Path.Combine(dicRoot, UnkDicFile));
    if (this.unkDic.Type != DictionaryType.Unk)
    {
        throw new MeCabInvalidFileException("not a unk dictionary", this.unkDic.FileName);
    }

    MeCabDictionary systemDic = new MeCabDictionary();
    systemDic.Open(Path.Combine(dicRoot, SysDicFile));
    if (systemDic.Type != DictionaryType.Sys)
    {
        throw new MeCabInvalidFileException("not a system dictionary", systemDic.FileName);
    }
    this.dic[0] = systemDic;

    for (int userIdx = 0; userIdx < param.UserDic.Length; userIdx++)
    {
        MeCabDictionary userDic = new MeCabDictionary();
        userDic.Open(Path.Combine(dicRoot, param.UserDic[userIdx]));

        if (userDic.Type != DictionaryType.Usr)
        {
            throw new MeCabInvalidFileException("not a user dictionary", userDic.FileName);
        }
        if (!systemDic.IsCompatible(userDic))
        {
            throw new MeCabInvalidFileException("incompatible dictionary", userDic.FileName);
        }

        this.dic[userIdx + 1] = userDic;
    }

    // One token array per character category, looked up by category name.
    this.unkTokens = new Token[this.property.Size][];
    for (int categoryIdx = 0; categoryIdx < this.unkTokens.Length; categoryIdx++)
    {
        string categoryKey = this.property.Name(categoryIdx);
        DoubleArray.ResultPair found = this.unkDic.ExactMatchSearch(categoryKey);
        if (found.Value == -1)
        {
            throw new MeCabInvalidFileException("cannot find UNK category: " + categoryKey, this.unkDic.FileName);
        }
        this.unkTokens[categoryIdx] = this.unkDic.GetToken(found);
    }

    this.space = this.property.GetCharInfo(' ');
    this.bosFeature = param.BosFeature;
    this.unkFeature = param.UnkFeature;

    // A non-positive grouping size is replaced by the default.
    this.maxGroupingSize = param.MaxGroupingSize;
    if (this.maxGroupingSize <= 0)
    {
        this.maxGroupingSize = DefaltMaxGroupingSize;
    }
}
/// <summary>
/// Prepares this instance for analysis: opens the character property and all dictionaries
/// found under the directory named by <paramref name="param"/>, resolves the unknown-word
/// tokens of every character category and takes over the feature and grouping settings.
/// </summary>
/// <param name="param">Source of the dictionary directory, the user dictionary list and the settings.</param>
/// <exception cref="MeCabInvalidFileException">
/// Thrown when a dictionary is of the wrong type, a user dictionary is not compatible
/// with the system dictionary, or an UNK category is missing.
/// </exception>
public void Open(MeCabParam param)
{
    dic = new MeCabDictionary[param.UserDic.Length + 1];

    string baseDirectory = param.DicDir;
    property.Open(baseDirectory);

    // --- unknown-word dictionary ---
    unkDic.Open(Path.Combine(baseDirectory, UnkDicFile));
    if (unkDic.Type != DictionaryType.Unk)
    {
        throw new MeCabInvalidFileException("not a unk dictionary", unkDic.FileName);
    }

    // --- system dictionary (slot 0) ---
    MeCabDictionary mainDictionary = new MeCabDictionary();
    mainDictionary.Open(Path.Combine(baseDirectory, SysDicFile));
    if (mainDictionary.Type != DictionaryType.Sys)
    {
        throw new MeCabInvalidFileException("not a system dictionary", mainDictionary.FileName);
    }
    dic[0] = mainDictionary;

    // --- user dictionaries (slots 1..n) ---
    for (int slot = 0; slot < param.UserDic.Length; slot++)
    {
        MeCabDictionary extraDictionary = new MeCabDictionary();
        extraDictionary.Open(Path.Combine(baseDirectory, param.UserDic[slot]));

        if (extraDictionary.Type != DictionaryType.Usr)
        {
            throw new MeCabInvalidFileException("not a user dictionary", extraDictionary.FileName);
        }
        if (!mainDictionary.IsCompatible(extraDictionary))
        {
            throw new MeCabInvalidFileException("incompatible dictionary", extraDictionary.FileName);
        }

        dic[slot + 1] = extraDictionary;
    }

    // --- unknown-word tokens, one array per character category ---
    unkTokens = new Token[property.Size][];
    for (int category = 0; category < unkTokens.Length; category++)
    {
        string name = property.Name(category);
        DoubleArray.ResultPair hit = unkDic.ExactMatchSearch(name);
        if (hit.Value == -1)
        {
            throw new MeCabInvalidFileException("cannot find UNK category: " + name, unkDic.FileName);
        }
        unkTokens[category] = unkDic.GetToken(hit);
    }

    // --- remaining settings ---
    space = property.GetCharInfo(' ');
    bosFeature = param.BosFeature;
    unkFeature = param.UnkFeature;

    maxGroupingSize = param.MaxGroupingSize;
    if (maxGroupingSize <= 0)
    {
        // Fall back to the default when no positive size was configured.
        maxGroupingSize = DefaltMaxGroupingSize;
    }
}
/// <summary>
/// Takes over the output format type from <paramref name="param"/>.
/// </summary>
/// <param name="param">Parameter object whose <c>OutputFormatType</c> is copied.</param>
public void Open(MeCabParam param)
{
    OutputFormatType = param.OutputFormatType;
}
/// <summary>
/// Creates an instance by forwarding <paramref name="mecabParam"/> to the base class
/// constructor; no additional initialization is performed here.
/// </summary>
/// <param name="mecabParam">Parameter object passed unchanged to the base constructor.</param>
public MeCabUnidic(MeCabParam mecabParam) : base(mecabParam)
{
}
/// <summary>
/// Creates a MeCab tagger instance from a newly constructed <see cref="MeCabParam"/>.
/// </summary>
/// <returns>The MeCab tagger instance.</returns>
private static MeCabTagger Create()
{
    return MeCabTagger.Create(new MeCabParam());
}
/// <summary>
/// Opens the existing tokenizer and connector with <paramref name="param"/> and mirrors
/// the analysis settings of the parameter object on this instance.
/// </summary>
/// <param name="param">Parameter object used to open the components and to read the settings from.</param>
public void Open(MeCabParam param)
{
    tokenizer.Open(param);
    connector.Open(param);

    // Settings are copied in the original order.
    // NOTE(review): Theta may depend on costFactor already being set — confirm before reordering.
    costFactor = param.CostFactor;
    Theta = param.Theta;

    LatticeLevel = param.LatticeLevel;
    Partial = param.Partial;
    AllMorphs = param.AllMorphs;
}
/// <summary>
/// Console sample that parses a fixed string into a lattice and prints, in turn, the
/// best solution, the second and third best solutions, the nodes per start position with
/// their probability, and the cost stored on the end-of-sentence node.
/// </summary>
static void UseLattice()
{
    Console.WriteLine("----------------------------------------------------------------------");
    Console.WriteLine("Example of using Lattice :");
    Console.WriteLine();

    using (var tagger = NMeCabIpaDic.CreateTagger())
    {
        var latticeParam = new MeCabParam();
        latticeParam.LatticeLevel = MeCabLatticeLevel.Two;
        latticeParam.Theta = 1f / 800f / 2f;

        // Obtain the lattice.
        var lattice = tagger.ParseToLattice("東京大学", latticeParam);

        // Best solution.
        foreach (var bestNode in lattice.GetBestNodes())
        {
            Console.Write(bestNode.Surface);
            Console.CursorLeft = 10;
            Console.Write(bestNode.Feature);
            Console.WriteLine();
        }
        Console.WriteLine("--------");

        // Second and third best solutions.
        var runnersUp = lattice.GetNBestResults().Skip(1).Take(2);
        foreach (var solution in runnersUp)
        {
            foreach (var solutionNode in solution)
            {
                Console.Write(solutionNode.Surface);
                Console.CursorLeft = 10;
                Console.Write(solutionNode.Feature);
                Console.WriteLine();
            }
            Console.WriteLine("----");
        }
        Console.WriteLine("--------");

        // Morphemes grouped by start position; the last entry of the list is not visited.
        for (int position = 0; position < lattice.BeginNodeList.Length - 1; position++)
        {
            for (var candidate = lattice.BeginNodeList[position]; candidate != null; candidate = candidate.BNext)
            {
                // Skip candidates whose probability is at most 0.001.
                if (candidate.Prob <= 0.001f)
                {
                    continue;
                }

                Console.CursorLeft = position * 2;
                Console.Write(candidate.Surface);
                Console.CursorLeft = 10;
                Console.Write(candidate.Prob.ToString("F3"));
                Console.CursorLeft = 16;
                Console.Write(candidate.Feature);
                Console.WriteLine();
            }
        }
        Console.WriteLine("--------");

        // Only the cost recorded on the end-of-sentence node.
        Console.WriteLine(lattice.EosNode.Cost);
    }
}
/// <summary>
/// Creates an instance by forwarding <paramref name="mecabParam"/> to the base class
/// constructor; no additional initialization is performed here.
/// </summary>
/// <param name="mecabParam">Parameter object passed unchanged to the base constructor.</param>
public MeCabIpadic(MeCabParam mecabParam) : base(mecabParam)
{
}
/// <summary>
/// Opens the file named by <c>MatrixFile</c> inside the dictionary directory of
/// <paramref name="param"/> by delegating to the file-name overload of <c>Open</c>.
/// </summary>
/// <param name="param">Parameter object whose <c>DicDir</c> locates the file.</param>
public void Open(MeCabParam param)
{
    this.Open(Path.Combine(param.DicDir, MatrixFile));
}
/// <summary>
/// Initializes the helper by assigning a new <see cref="MeCabParam"/> to <c>parameter</c>.
/// No tagger is created here.
/// </summary>
public MecabHelper()
{
    parameter = new MeCabParam();
}