/// <summary>
        /// Tokenizes the string and reports each token as a span of character offsets
        /// into <paramref name="input"/>.
        /// </summary>
        /// <remarks>
        /// The input is first split with <c>SplitOnWhitespaces</c>. Each resulting chunk is
        /// kept whole when it is shorter than two characters, or when
        /// <c>AlphaNumericOptimization</c> is enabled and <c>AlphaNumeric</c> matches it.
        /// Every other chunk is examined at each interior character position: the model is
        /// evaluated on the context for that position, and the chunk is cut there whenever
        /// the best outcome equals <c>TokenContextGenerator.SplitIndicator</c>.
        /// </remarks>
        /// <param name="input">The string to be tokenized</param>
        /// <returns>
        /// A span array containing individual tokens as elements; an empty array when
        /// <paramref name="input"/> is null or empty.
        /// </returns>
        public override Span[] TokenizePositions(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return new Span[0];
            }

            var tokens    = SplitOnWhitespaces(input);
            var newTokens = new List<Span>(tokens.Length);

            for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
            {
                var    tokenSpan  = tokens[currentToken];
                int    tokenStart = tokenSpan.Start;
                int    tokenEnd   = tokenSpan.End;
                string token      = input.Substring(tokenStart, tokenEnd - tokenStart);

                // A single character has no interior position to split at, and chunks
                // accepted by the alphanumeric shortcut are never sent to the model.
                if (token.Length < 2 || (AlphaNumericOptimization && AlphaNumeric.IsMatch(token)))
                {
                    newTokens.Add(tokenSpan);
                    continue;
                }

                // Start offset (in input coordinates) of the piece currently being built.
                int pieceStart = tokenStart;
                for (int currentPosition = tokenStart + 1; currentPosition < tokenEnd; currentPosition++)
                {
                    // The context generator receives the offset relative to the chunk, not to the input.
                    var      context       = _contextGenerator.GetContext(new Tuple<string, int>(token, currentPosition - tokenStart));
                    double[] probabilities = _model.Evaluate(context);
                    string   bestOutcome   = _model.GetBestOutcome(probabilities);

                    if (bestOutcome == TokenContextGenerator.SplitIndicator)
                    {
                        newTokens.Add(new Span(pieceStart, currentPosition));
                        pieceStart = currentPosition;
                    }
                }

                // Whatever follows the last split (or the whole chunk if none occurred).
                newTokens.Add(new Span(pieceStart, tokenEnd));
            }

            return newTokens.ToArray();
        }
Beispiel #2
0
        /// <summary>
        /// Detects sentence boundaries in a string.
        /// </summary>
        /// <remarks>
        /// Candidate positions come from <c>_scanner.GetPositions</c>. A candidate is skipped
        /// when the next candidate lies before the first whitespace that follows it, so only
        /// the last delimiter of such a run is evaluated. Each remaining candidate is scored
        /// by the model; it is accepted when the best outcome is <c>"T"</c> and
        /// <c>IsAcceptableBreak</c> agrees. <c>_sentenceProbs</c> is cleared on every call and
        /// receives one probability per reported position.
        /// </remarks>
        /// <param name="input">
        /// The string to be processed.
        /// </param>
        /// <returns>
        /// An integer array with one entry per accepted break: the value of
        /// <c>GetFirstNonWhitespace</c> applied to the first whitespace after the
        /// sentence-ending character (presumably the start of the following text; confirm
        /// against the helper). Empty when <paramref name="input"/> is null or empty.
        /// </returns>
        public virtual int[] SentencePositionDetect(string input)
        {
            _sentenceProbs.Clear();

            // Nothing to scan; also avoids handing a null string to the scanner.
            if (string.IsNullOrEmpty(input))
            {
                return new int[0];
            }

            var       buffer     = new StringBuilder(input);
            List<int> endersList = _scanner.GetPositions(input);
            var       positions  = new List<int>(endersList.Count);

            // Offset just past the most recently accepted break (0 before any break).
            int index = 0;

            for (int currentEnder = 0, enderCount = endersList.Count; currentEnder < enderCount; currentEnder++)
            {
                int candidate = endersList[currentEnder];

                // skip over the leading parts of non-token final delimiters
                int firstWhiteSpace = GetFirstWhitespace(input, candidate + 1);
                if (currentEnder + 1 < enderCount && endersList[currentEnder + 1] < firstWhiteSpace)
                {
                    continue;
                }

                var      context       = _contextGenerator.GetContext(new Tuple<StringBuilder, int>(buffer, candidate));
                double[] probabilities = _model.Evaluate(context);
                string   bestOutcome   = _model.GetBestOutcome(probabilities);

                if (bestOutcome == "T" && IsAcceptableBreak(input, index, candidate))
                {
                    // A break at the very start of the current sentence would yield an
                    // empty sentence, so it is not reported, but the start still advances.
                    if (index != candidate)
                    {
                        positions.Add(GetFirstNonWhitespace(input, firstWhiteSpace));
                        _sentenceProbs.Add(probabilities[_model.GetOutcomeIndex(bestOutcome)]);
                    }
                    index = candidate + 1;
                }
            }

            return positions.ToArray();
        }