示例#1
0
        /// <summary>
        /// Builds the Viterbi lattice for one sentence, searches it for the best path
        /// and turns every node on that path (except BOS/EOS) into a token.
        /// </summary>
        /// <param name="offset">offset of the sentence in the original input text; added to each node's start index</param>
        /// <param name="text">sentence to tokenize</param>
        /// <returns>tokens for the best path, in the order the path yields its nodes</returns>
        List<T> CreateTokenList(int offset, string text)
        {
            var builtLattice = ViterbiBuilder.Build(text);
            var path = ViterbiSearcher.Search(builtLattice);

            var tokens = new List<T>();

            foreach (var current in path)
            {
                var id = current.WordId;

                // A Known node carrying word id -1 is a BOS/EOS marker; those never become tokens.
                var isBoundaryMarker = current.Type == ViterbiNode.NodeType.Known && id == -1;

                if (!isBoundaryMarker)
                {
                    tokens.Add(
                        TokenFactory.CreateToken(
                            id,
                            current.Surface,
                            current.Type,
                            offset + current.StartIndex,
                            DictionaryMap[current.Type]));
                }
            }

            return tokens;
        }
示例#2
0
        /// <summary>
        /// Builds the Viterbi lattice for the given text and writes it to the output stream.
        ///
        /// The lattice is written as UTF-8 text in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format.
        /// The stream is left open when this method returns.
        ///
        /// This method is not thread safe
        /// </summary>
        /// <param name="output">output stream to write to</param>
        /// <param name="text">text to create lattice for</param>
        public void DebugLattice(Stream output, string text)
        {
            var builtLattice = ViterbiBuilder.Build(text);

            // leaveOpen: true - disposing the writer flushes it but does not close the caller's stream.
            using (var dotWriter = new StreamWriter(output, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
            {
                var formatted = ViterbiFormatter.Format(builtLattice);
                dotWriter.Write(formatted);
            }
        }
示例#3
0
        /// <summary>
        /// Tokenizes the given text and writes both the Viterbi lattice and the best Viterbi path to the output stream.
        ///
        /// The result is written as UTF-8 text in <a href="https://en.wikipedia.org/wiki/DOT_(graph_description_language)">DOT</a> format.
        /// The stream is left open when this method returns.
        ///
        /// This method is not thread safe
        /// </summary>
        /// <param name="output">output stream to write to</param>
        /// <param name="text">text to tokenize</param>
        public void DebugTokenize(Stream output, string text)
        {
            var builtLattice = ViterbiBuilder.Build(text);
            var path = ViterbiSearcher.Search(builtLattice);

            // leaveOpen: true - disposing the writer flushes it but does not close the caller's stream.
            using (var dotWriter = new StreamWriter(output, Encoding.UTF8, bufferSize: 1024, leaveOpen: true))
            {
                var formatted = ViterbiFormatter.Format(builtLattice, path);
                dotWriter.Write(formatted);
            }
        }
示例#4
0
        /// <summary>
        /// Tokenize input sentence into several alternative paths.
        /// Up to maxCount different paths are returned, each with cost at most OPT + costSlack,
        /// ordered in ascending order by cost, where OPT is the cost of the optimal solution.
        /// </summary>
        /// <param name="text">sentence to tokenize</param>
        /// <param name="maxCount">maximum number of paths</param>
        /// <param name="costSlack">maximum cost slack of a path</param>
        /// <returns>instance of MultiSearchResult containing the tokenizations</returns>
        MultiSearchResult CreateMultiSearchResult(string text, int maxCount, int costSlack)
        {
            // Build the lattice first, then hand it to the multi-path search.
            var builtLattice = ViterbiBuilder.Build(text);
            return ViterbiSearcher.SearchMultiple(builtLattice, maxCount, costSlack);
        }