/// <summary>
/// Initializes this tokenizer from the supplied builder.
/// </summary>
/// <remarks>
/// <c>LoadDictionaries</c> is invoked on the builder before any other builder member is read.
/// The builder's token factory and dictionaries are then stored on this instance, and the
/// Viterbi builder, searcher and formatter are constructed from them.
/// </remarks>
/// <typeparam name="V">builder type, constrained to <c>Builder&lt;K&gt;</c></typeparam>
/// <typeparam name="K">tokenizer type, constrained to <c>TokenizerBase&lt;T&gt;</c></typeparam>
/// <param name="builder">builder supplying the dictionaries, costs, mode and split setting</param>
protected void Configure<V, K>(V builder)
    where V : Builder<K>
    where K : TokenizerBase<T>
{
    builder.LoadDictionaries();

    // Take over the components exposed by the builder.
    tokenFactory = builder.TokenFactory;
    tokenInfoDictionary = builder.TokenInfoDictionary;
    unknownDictionary = builder.UnknownDictionary;
    userDictionary = builder.UserDictionary;
    insertedDictionary = builder.InsertedDictionary;

    // The Viterbi components are built from the dictionaries stored just above.
    viterbiBuilder = new ViterbiBuilder(
        builder.Fst,
        tokenInfoDictionary,
        unknownDictionary,
        userDictionary,
        builder.Mode);

    viterbiSearcher = new ViterbiSearcher(
        builder.Mode,
        builder.ConnectionCosts,
        unknownDictionary,
        builder.Penalties);

    viterbiFormatter = new ViterbiFormatter(builder.ConnectionCosts);

    split = builder.Split;

    InitDictionaryMap();
}
/// <summary>
/// Tokenizes a single sentence by building its Viterbi lattice, searching the best path
/// and converting each node on that path into a token.
/// </summary>
/// <param name="offset">offset of sentence in original input text</param>
/// <param name="text">sentence to tokenize</param>
/// <returns>list of Token</returns>
List<T> CreateTokenList(int offset, string text)
{
    var tokens = new List<T>();

    var lattice = ViterbiBuilder.Build(text);
    var path = ViterbiSearcher.Search(lattice);

    foreach (var pathNode in path)
    {
        var wordId = pathNode.WordId;

        // A known node with word id -1 is BOS/EOS; only every other node becomes a token.
        if (pathNode.Type != ViterbiNode.NodeType.Known || wordId != -1)
        {
            // The token position is the node's start index shifted by the sentence offset.
            var token = TokenFactory.CreateToken(
                wordId,
                pathNode.Surface,
                pathNode.Type,
                offset + pathNode.StartIndex,
                DictionaryMap[pathNode.Type]);

            tokens.Add(token);
        }
    }

    return tokens;
}
/// <summary>
/// Tokenizes the provided text and writes the resulting Viterbi lattice, together with the
/// Viterbi path, to the provided output stream.
///
/// The output is written in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format.
///
/// This method is not thread safe
/// </summary>
/// <param name="output">output stream to write to</param>
/// <param name="text">text to tokenize</param>
public void DebugTokenize(Stream output, string text)
{
    var lattice = ViterbiBuilder.Build(text);
    var path = ViterbiSearcher.Search(lattice);

    // UTF-8 writer with a 1024-char buffer; the final 'true' is leaveOpen, so disposing
    // the writer flushes it without closing the caller's stream.
    using (var dotWriter = new StreamWriter(output, Encoding.UTF8, 1024, true))
    {
        var dot = ViterbiFormatter.Format(lattice, path);
        dotWriter.Write(dot);
    }
}
/// <summary>
/// Initializes this tokenizer from the supplied builder.
/// </summary>
/// <remarks>
/// <c>LoadDictionaries</c> is invoked on the builder before any other builder member is read.
/// The builder's token factory and dictionaries are then stored on this instance, and the
/// Viterbi builder, searcher and formatter are constructed from them.
/// </remarks>
/// <param name="builder">builder supplying the dictionaries, costs, mode and split setting</param>
protected void Configure(BuilderBase builder)
{
    builder.LoadDictionaries();

    // Take over the components exposed by the builder.
    TokenFactory = builder.TokenFactory;
    TokenInfoDictionary = builder.TokenInfoDictionary;
    UnknownDictionary = builder.UnknownDictionary;
    UserDictionary = builder.UserDictionary;
    InsertedDictionary = builder.InsertedDictionary;

    // The Viterbi components are built from the dictionaries stored just above.
    ViterbiBuilder = new ViterbiBuilder(
        builder.DoubleArrayTrie,
        TokenInfoDictionary,
        UnknownDictionary,
        UserDictionary,
        builder.Mode);

    ViterbiSearcher = new ViterbiSearcher(
        builder.Mode,
        builder.ConnectionCosts,
        UnknownDictionary,
        builder.Penalties);

    ViterbiFormatter = new ViterbiFormatter(builder.ConnectionCosts);

    Split = builder.Split;

    InitDictionaryMap();
}
/// <summary>
/// Disposes the Viterbi builder, searcher and formatter and the token info dictionary
/// held by this instance (each only if it is set), then clears those four references.
/// </summary>
public void Dispose()
{
    // Phase 1: dispose whatever is currently held.
    if (viterbiBuilder != null)
    {
        viterbiBuilder.Dispose();
    }

    if (viterbiSearcher != null)
    {
        viterbiSearcher.Dispose();
    }

    if (viterbiFormatter != null)
    {
        viterbiFormatter.Dispose();
    }

    if (tokenInfoDictionary != null)
    {
        tokenInfoDictionary.Dispose();
    }

    // Phase 2: clear the references; a later call then finds nothing left to dispose.
    viterbiBuilder = null;
    viterbiSearcher = null;
    viterbiFormatter = null;
    tokenInfoDictionary = null;
}
/// <summary>
/// Tokenizes the input sentence into multiple candidate paths. At most maxCount different
/// paths whose cost does not exceed OPT + costSlack are returned, ordered by ascending cost,
/// where OPT is the cost of the optimal solution.
/// </summary>
/// <param name="text">sentence to tokenize</param>
/// <param name="maxCount">maximum number of paths</param>
/// <param name="costSlack">maximum cost slack of a path</param>
/// <returns>instance of MultiSearchResult containing the tokenizations</returns>
MultiSearchResult CreateMultiSearchResult(string text, int maxCount, int costSlack)
{
    var lattice = ViterbiBuilder.Build(text);
    var searchResult = ViterbiSearcher.SearchMultiple(lattice, maxCount, costSlack);

    return searchResult;
}