/**
 * Tokenizes a single sentence into a list of tokens.
 * <p>
 * A lattice is built for the text, searched via viterbiSearcher, and every node on the
 * resulting path is converted to a token through tokenFactory. Nodes of type KNOWN whose
 * word id is -1 (the BOS/EOS markers) are left out of the result.
 *
 * @param offset offset of the sentence in the original input text; added to each node's
 *               start index to obtain the token position
 * @param text sentence to tokenize
 * @return list of tokens in the order the nodes appear on the searched path
 */
private List<T> CreateTokenList(int offset, string text)
{
    ViterbiLattice sentenceLattice = viterbiBuilder.Build(text);
    List<ViterbiNode> path = viterbiSearcher.Search(sentenceLattice);
    var tokens = new List<T>();

    foreach (ViterbiNode current in path)
    {
        int id = current.WordId;

        // BOS/EOS markers are KNOWN nodes carrying a word id of -1; they never become tokens.
        bool isSentenceBoundary = current.Type == ViterbiNode.NodeType.KNOWN && id == -1;

        if (!isSentenceBoundary)
        {
            tokens.Add(
                (T)tokenFactory.CreateToken(
                    id,
                    current.Surface,
                    current.Type,
                    offset + current.StartIndex,
                    dictionaryMap[current.Type]
                )
            );
        }
    }

    return tokens;
}
/**
 * Tokenizes a sentence into multiple candidate paths.
 * <p>
 * Up to maxCount different paths of cost at most OPT + costSlack are returned, ordered
 * ascending by cost, where OPT is the cost of the optimal solution.
 *
 * @param text sentence to tokenize
 * @param maxCount maximum number of paths
 * @param costSlack maximum cost slack of a path
 * @return instance of MultiSearchResult containing the tokenizations
 */
private MultiSearchResult CreateMultiSearchResult(string text, int maxCount, int costSlack)
{
    // The lattice is only needed as input to the multi-path search, so it is built inline.
    return viterbiSearcher.SearchMultiple(viterbiBuilder.Build(text), maxCount, costSlack);
}
/**
 * Writes the Viterbi lattice for the provided text to an output stream
 * <p>
 * The output is written in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format,
 * encoded as UTF-8 without a byte order mark.
 * <p>
 * The stream is flushed after writing but is NOT closed or disposed; it remains owned by the caller.
 * <p>
 * This method is not thread safe
 *
 * @param outputStream output stream to write to; must not be null
 * @param text text to create lattice for
 * @throws System.ArgumentNullException if outputStream is null
 * @throws System.IO.IOException if an error occurs when writing the lattice
 */
public void DebugLattice(Stream outputStream, string text)
{
    if (outputStream == null)
    {
        throw new System.ArgumentNullException(nameof(outputStream));
    }

    ViterbiLattice lattice = viterbiBuilder.Build(text);
    byte[] bytes = Encoding.UTF8.GetBytes(viterbiFormatter.Format(lattice));

    // Write straight to the caller's stream. No StreamWriter wrapper is used: disposing one
    // would close a stream this method does not own.
    outputStream.Write(bytes, 0, bytes.Length);
    outputStream.Flush();
}