/// <summary>
/// Tokenizes a Japanese sentence while a <see cref="GraphvizFormatter"/> is attached
/// and verifies both the expected surface forms and that the rendered lattice
/// contains the expected connection cost ("22.0").
/// </summary>
public void TestLatticeToDot()
{
    GraphvizFormatter gv2 = new GraphvizFormatter(ConnectionCosts.Instance);
    Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        JapaneseTokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH)
        {
            GraphvizFormatter = gv2
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
    });

    const string input = "スペースステーションに行きます。うたがわしい。";
    string[] surfaceForms =
    {
        "スペース", "ステーション", "に", "行き", "ます", "。",
        "うたがわしい", "。"
    };
    AssertAnalyzesTo(analyzer, input, surfaceForms);

    // The DOT output produced during analysis must mention the expected cost.
    assertTrue(gv2.Finish().IndexOf("22.0", StringComparison.Ordinal) != -1);
}
/// <summary>
/// Creates the generator and wires up an analyzer that tokenizes Japanese text
/// in SEARCH mode and converts tokens to their reading form via
/// <see cref="JapaneseReadingFormFilter"/>.
/// </summary>
public JaRomajiTagGenerator(JaRomajiTagGeneratorConfig config)
    : base(config)
{
    analyzer = Analyzer.NewAnonymous((fieldName, reader) =>
    {
        var source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
        var sink = new JapaneseReadingFormFilter(source, false);
        return new TokenStreamComponents(source, sink);
    });
}
/// <summary>
/// Tokenizes a fixed Japanese string with <see cref="JapaneseTokenizer"/> in NORMAL
/// mode and prints every token attribute to the console, one block per token.
/// </summary>
public void Main()
{
    const string s = "関西国際空港";
    Console.WriteLine($"対象の文字列:{s}");
    using var reader = new StringReader(s);
    Tokenizer tokenizer = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.NORMAL);
    var tokenStreamComponents = new TokenStreamComponents(tokenizer, tokenizer);
    using var tokenStream = tokenStreamComponents.TokenStream;

    // Fetch each attribute instance once up front: Lucene reuses (mutates) the
    // same attribute objects for every token, so the per-iteration
    // GetAttribute<T>() calls in the loop were redundant lookups.
    var termAtt = tokenStream.GetAttribute<ICharTermAttribute>();
    var bytesAtt = tokenStream.GetAttribute<ITermToBytesRefAttribute>();
    var offsetAtt = tokenStream.GetAttribute<IOffsetAttribute>();
    var posIncAtt = tokenStream.GetAttribute<IPositionIncrementAttribute>();
    var posLenAtt = tokenStream.GetAttribute<IPositionLengthAttribute>();
    var baseFormAtt = tokenStream.GetAttribute<IBaseFormAttribute>();
    var posAtt = tokenStream.GetAttribute<IPartOfSpeechAttribute>();
    var readingAtt = tokenStream.GetAttribute<IReadingAttribute>();
    var inflectionAtt = tokenStream.GetAttribute<IInflectionAttribute>();

    // NOTE: Reset() must be called before consuming the stream.
    tokenStream.Reset();
    while (tokenStream.IncrementToken())
    {
        Console.WriteLine("---");
        Console.WriteLine(
            $"ICharTermAttribute=>{termAtt.ToString()}");
        Console.WriteLine(
            $"ITermToBytesRefAttribute#BytesRef=>{bytesAtt.BytesRef}");
        Console.WriteLine(
            $"IOffsetAttribute#StartOffset=>{offsetAtt.StartOffset}");
        Console.WriteLine(
            $"IOffsetAttribute#EndOffset=>{offsetAtt.EndOffset}");
        Console.WriteLine(
            $"IPositionIncrementAttribute=>{posIncAtt.PositionIncrement}");
        Console.WriteLine(
            $"IPositionLengthAttribute=>{posLenAtt.PositionLength}");
        Console.WriteLine(
            $"IBaseFormAttribute#GetBaseForm=>{baseFormAtt.GetBaseForm()}");
        Console.WriteLine(
            $"IPartOfSpeechAttribute#GetPartOfSpeech=>{posAtt.GetPartOfSpeech()}");
        Console.WriteLine(
            $"IReadingAttribute#GetReading=>{readingAtt.GetReading()}");
        Console.WriteLine(
            $"IReadingAttribute#GetPronunciation=>{readingAtt.GetPronunciation()}");
        Console.WriteLine(
            $"IInflectionAttribute#GetInflectionForm=>{inflectionAtt.GetInflectionForm()}");
        Console.WriteLine(
            $"IInflectionAttribute#GetInflectionType=>{inflectionAtt.GetInflectionType()}");
        Console.WriteLine("---");
    }
}
/// <summary>
/// Builds the Japanese analysis chain: tokenize, normalize to base form,
/// remove stop part-of-speech tags, normalize CJK width, remove stopwords,
/// stem katakana, and lowercase.
/// </summary>
protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
{
    Tokenizer source = new JapaneseTokenizer(reader, userDict, true, mode);
    TokenStream result = new JapaneseBaseFormFilter(source);
    result = new JapanesePartOfSpeechStopFilter(m_matchVersion, result, stoptags);
    result = new CJKWidthFilter(result);
    result = new StopFilter(m_matchVersion, result, m_stopwords);
    result = new JapaneseKatakanaStemFilter(result);
    result = new LowerCaseFilter(m_matchVersion, result);
    return new TokenStreamComponents(source, result);
}
/// <summary>
/// Backtraces another incremental fragment: records the best path for the
/// fragment and appends its DOT representation; on the final fragment, also
/// emits the invisible end node and the EOS arc.
/// </summary>
internal void OnBacktrace(JapaneseTokenizer tok, WrappedPositionArray positions, int lastBackTracePos, Position endPosData, int fromIDX, char[] fragment, bool isEnd)
{
    SetBestPathMap(positions, lastBackTracePos, endPosData, fromIDX);
    sb.Append(FormatNodes(tok, positions, lastBackTracePos, endPosData, fragment));

    if (!isEnd)
    {
        return;
    }

    // Terminal fragment: close the graph with an invisible "fini" node
    // and an arc from the last position labelled with EOS.
    sb.Append(" fini [style=invis]\n");
    sb.Append(" ");
    sb.Append(GetNodeID(endPosData.pos, fromIDX));
    sb.Append(" -> fini [label=\"" + EOS_LABEL + "\"]");
}
/// <summary>
/// Verifies romaji readings when the input is first width-normalized by
/// <see cref="CJKWidthFilter"/> (useRomaji = true).
/// </summary>
public void TestRomajiReadingsHalfWidth()
{
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
        TokenStream widthNormalized = new CJKWidthFilter(source);
        return new TokenStreamComponents(source, new JapaneseReadingFormFilter(widthNormalized, true));
    });

    string[] expected = { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" };
    AssertAnalyzesTo(a, "今夜はロバート先生と話した", expected);
}
/// <summary>
/// Verifies katakana readings when the input is first width-normalized by
/// <see cref="CJKWidthFilter"/> (useRomaji = false).
/// </summary>
public void TestKatakanaReadingsHalfWidth()
{
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizerMode.SEARCH);
        TokenStream widthNormalized = new CJKWidthFilter(source);
        return new TokenStreamComponents(source, new JapaneseReadingFormFilter(widthNormalized, false));
    });

    string[] expected = { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" };
    AssertAnalyzesTo(a, "今夜はロバート先生と話した", expected);
}
/// <summary>
/// Stress test: runs random huge strings through the tokenizer with graph
/// tokens randomly injected after it by <see cref="MockGraphTokenFilter"/>.
/// </summary>
public void TestRandomHugeStringsMockGraphAfter()
{
    // Randomly inject graph tokens after JapaneseTokenizer:
    Random random = Random;
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new JapaneseTokenizer(reader, ReadDict(), false, JapaneseTokenizerMode.SEARCH);
        TokenStream graph = new MockGraphTokenFilter(Random, source);
        return new TokenStreamComponents(source, graph);
    });
    CheckRandomData(random, a, 100 * RandomMultiplier, 8192);
}
/// <summary>
/// Verifies that tokens marked as keywords via <see cref="SetKeywordMarkerFilter"/>
/// are exempted from base-form normalization ("あり" stays inflected).
/// </summary>
public void TestKeyword()
{
    CharArraySet exclusionSet = new CharArraySet(TEST_VERSION_CURRENT, AsSet("あり"), false);
    Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
    {
        Tokenizer source = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.DEFAULT_MODE);
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new JapaneseBaseFormFilter(sink));
    });

    string[] expected = { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" };
    AssertAnalyzesTo(a, "それはまだ実験段階にあります", expected);
}
/// <summary>
/// Renders the Viterbi lattice span (startPos, endPosData.pos] as Graphviz DOT
/// text: first one node declaration per surviving position/index, then one arc
/// per back-pointer, labelled with the surface form and its word/bigram costs.
/// Arcs on the best path (per <c>bestPathMap</c>) are highlighted.
/// </summary>
/// <param name="tok">Tokenizer used to resolve the dictionary for each back-pointer type.</param>
/// <param name="positions">Lattice positions to read node/arc data from.</param>
/// <param name="startPos">Exclusive lower bound of the span; also the fragment's origin offset.</param>
/// <param name="endPosData">Position data whose <c>pos</c> is the inclusive upper bound.</param>
/// <param name="fragment">Characters backing this span; sliced to recover each surface form.</param>
/// <returns>The DOT fragment for this span's nodes and arcs.</returns>
private string FormatNodes(JapaneseTokenizer tok, WrappedPositionArray positions, int startPos, Position endPosData, char[] fragment)
{
    StringBuilder sb = new StringBuilder();
    // Output nodes
    for (int pos = startPos + 1; pos <= endPosData.pos; pos++)
    {
        Position posData = positions.Get(pos);
        for (int idx = 0; idx < posData.count; idx++)
        {
            sb.Append("  ");
            sb.Append(GetNodeID(pos, idx));
            // Node label shows the position and its last right context ID.
            sb.Append(" [label=\"");
            sb.Append(pos);
            sb.Append(": ");
            sb.Append(posData.lastRightID[idx]);
            sb.Append("\"]\n");
        }
    }

    // Output arcs
    // Walk positions backwards; each entry's back-pointer yields one arc.
    for (int pos = endPosData.pos; pos > startPos; pos--)
    {
        Position posData = positions.Get(pos);
        for (int idx = 0; idx < posData.count; idx++)
        {
            Position backPosData = positions.Get(posData.backPos[idx]);
            string toNodeID = GetNodeID(pos, idx);
            string fromNodeID = GetNodeID(posData.backPos[idx], posData.backIndex[idx]);

            sb.Append("  ");
            sb.Append(fromNodeID);
            sb.Append(" -> ");
            sb.Append(toNodeID);

            string attrs;
            // bestPathMap maps each best-path node to its successor; if this
            // arc's target matches the mapped successor, highlight the arc.
            bestPathMap.TryGetValue(fromNodeID, out string path);
            if (toNodeID.Equals(path, StringComparison.Ordinal))
            {
                // This arc is on best path
                attrs = " color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20";
            }
            else
            {
                attrs = "";
            }

            IDictionary dict = tok.GetDict(posData.backType[idx]);
            int wordCost = dict.GetWordCost(posData.backID[idx]);
            // Bigram (connection) cost between the previous right context and
            // this entry's left context.
            int bgCost = costs.Get(backPosData.lastRightID[posData.backIndex[idx]],
                                   dict.GetLeftId(posData.backID[idx]));

            // Recover the surface form by slicing the fragment relative to startPos.
            string surfaceForm = new string(fragment,
                                            posData.backPos[idx] - startPos,
                                            pos - posData.backPos[idx]);

            // Arc label: "<surface> <wordCost>+<bigramCost>" (sign kept for negatives).
            sb.Append(" [label=\"");
            sb.Append(surfaceForm);
            sb.Append(' ');
            sb.Append(wordCost);
            if (bgCost >= 0)
            {
                sb.Append('+');
            }
            sb.Append(bgCost);
            sb.Append("\"");
            sb.Append(attrs);
            sb.Append("]\n");
        }
    }
    return sb.ToString();
}