/// <summary>
/// Tokenize input sentence.
/// </summary>
/// <param name="offset">offset of sentence in original input text</param>
/// <param name="text">sentence to tokenize</param>
/// <returns>list of Token</returns>
List<T> CreateTokenList(int offset, string text)
{
    var lattice = ViterbiBuilder.Build(text);
    var tokens = new List<T>();

    foreach (var node in ViterbiSearcher.Search(lattice))
    {
        var id = node.WordId;

        // BOS/EOS markers surface as KNOWN nodes carrying word id -1;
        // they are lattice bookkeeping, not real tokens — skip them.
        if (id == -1 && node.Type == ViterbiNode.NodeType.Known)
        {
            continue;
        }

        tokens.Add(
            TokenFactory.CreateToken(
                id,
                node.Surface,
                node.Type,
                offset + node.StartIndex,    // shift to position in the original input text
                DictionaryMap[node.Type]));
    }

    return tokens;
}
/// <summary>
/// Tokenizes the provided text and outputs the corresponding Viterbi lattice
/// and the Viterbi path to the provided output stream.
///
/// The output is written in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format.
///
/// This method is not thread safe
/// </summary>
/// <param name="output">output stream to write to</param>
/// <param name="text">text to tokenize</param>
public void DebugTokenize(Stream output, string text)
{
    var lattice = ViterbiBuilder.Build(text);
    var bestPath = ViterbiSearcher.Search(lattice);
    var dot = ViterbiFormatter.Format(lattice, bestPath);

    // leaveOpen: true — the caller owns the stream, so only the writer is disposed here.
    using (var writer = new StreamWriter(output, Encoding.UTF8, 1024, true))
    {
        writer.Write(dot);
    }
}