/// <summary>
/// Tokenizes a single sentence: builds its Viterbi lattice, searches the lattice
/// for the best path and turns every node on that path into a token.
/// </summary>
/// <param name="offset">offset of the sentence in the original input text; added to each node's start index</param>
/// <param name="text">sentence to tokenize</param>
/// <returns>list of tokens in best-path order, with BOS/EOS nodes left out</returns>
List<T> CreateTokenList(int offset, string text)
{
    var tokens = new List<T>();

    var lattice = ViterbiBuilder.Build(text);
    var path = ViterbiSearcher.Search(lattice);

    foreach (var node in path)
    {
        var wordId = node.WordId;

        // A "known" node with word id -1 is a BOS/EOS marker and must not become a token.
        var isSentenceBoundary = node.Type == ViterbiNode.NodeType.Known && wordId == -1;

        if (!isSentenceBoundary)
        {
            tokens.Add(
                TokenFactory.CreateToken(
                    wordId,
                    node.Surface,
                    node.Type,
                    offset + node.StartIndex,
                    DictionaryMap[node.Type]));
        }
    }

    return tokens;
}
/// <summary>
/// Builds the Viterbi lattice for <paramref name="text"/> and writes its formatted
/// representation to <paramref name="output"/> as UTF-8 text.
///
/// The output is written in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format.
///
/// This method is not thread safe
/// </summary>
/// <param name="output">output stream to write to; it is left open when this method returns</param>
/// <param name="text">text to create lattice for</param>
public void DebugLattice(Stream output, string text)
{
    var lattice = ViterbiBuilder.Build(text);

    // leaveOpen: true - the caller owns the stream; disposing the writer only flushes it.
    // NOTE(review): Encoding.UTF8 may emit a byte-order mark at the start of the stream - confirm that is intended.
    using (var dotWriter = new StreamWriter(output, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
    {
        var dot = ViterbiFormatter.Format(lattice);
        dotWriter.Write(dot);
    }
}
/// <summary>
/// Tokenizes <paramref name="text"/> and writes the resulting Viterbi lattice,
/// together with the best path found through it, to <paramref name="output"/> as UTF-8 text.
///
/// The output is written in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format.
///
/// This method is not thread safe
/// </summary>
/// <param name="output">output stream to write to; it is left open when this method returns</param>
/// <param name="text">text to tokenize</param>
public void DebugTokenize(Stream output, string text)
{
    var lattice = ViterbiBuilder.Build(text);
    var path = ViterbiSearcher.Search(lattice);

    // leaveOpen: true - the caller owns the stream; disposing the writer only flushes it.
    // NOTE(review): Encoding.UTF8 may emit a byte-order mark at the start of the stream - confirm that is intended.
    using (var dotWriter = new StreamWriter(output, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
    {
        var dot = ViterbiFormatter.Format(lattice, path);
        dotWriter.Write(dot);
    }
}
/// <summary>
/// Tokenizes a sentence into several alternative paths. At most <paramref name="maxCount"/>
/// paths are returned, each costing no more than OPT + <paramref name="costSlack"/>, where OPT
/// is the cost of the optimal solution; paths are ordered by ascending cost.
/// </summary>
/// <param name="text">sentence to tokenize</param>
/// <param name="maxCount">maximum number of paths</param>
/// <param name="costSlack">maximum cost slack of a path</param>
/// <returns>instance of MultiSearchResult containing the tokenizations</returns>
MultiSearchResult CreateMultiSearchResult(string text, int maxCount, int costSlack)
{
    var sentenceLattice = ViterbiBuilder.Build(text);

    // The searcher does the whole multi-path search; this method only wires the lattice in.
    return ViterbiSearcher.SearchMultiple(sentenceLattice, maxCount, costSlack);
}