Example #1
        /// <summary>
        /// Tokenizes the string.
        /// </summary>
        /// <param name="input">
        /// The string to be tokenized.
        /// </param>
        /// <returns>
        /// A span array containing individual tokens as elements.
        /// </returns>
        public virtual Util.Span[] TokenizePositions(string input)
        {
            if (mUnicodeMapping)
            {
                input = Utils.MapUnicodeChars(input);
            }

            Util.Span[] tokens = Split(input);
            mNewTokens.Clear();
            mTokenProbabilities.Clear();

            for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
            {
                Util.Span tokenSpan = tokens[currentToken];
                string    token     = input.Substring(tokenSpan.Start, tokenSpan.End - tokenSpan.Start);

                if (token.Length < 2)
                {
                    // Can't tokenize single characters any further.
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else if (AlphaNumericOptimization && AlphaNumeric.IsMatch(token))
                {
                    // Purely alphanumeric tokens are passed through unchanged.
                    mNewTokens.Add(tokenSpan);
                    mTokenProbabilities.Add(1.0);
                }
                else
                {
                    int    startPosition    = tokenSpan.Start;
                    int    endPosition      = tokenSpan.End;
                    int    originalStart    = tokenSpan.Start;
                    double tokenProbability = 1.0;

                    // Ask the model at every interior character boundary whether to split there.
                    for (int currentPosition = originalStart + 1; currentPosition < endPosition; currentPosition++)
                    {
                        if (mAlphaNumericOptimization)
                        {
                            // Never split between two alphanumeric characters.
                            char leftChar  = token[currentPosition - originalStart - 1];
                            char rightChar = token[currentPosition - originalStart];
                            if (char.IsLetterOrDigit(leftChar) && char.IsLetterOrDigit(rightChar))
                            {
                                continue;
                            }
                        }

                        double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(new Util.Pair<string, int>(token, currentPosition - originalStart)));
                        string   bestOutcome   = mModel.GetBestOutcome(probabilities);

                        tokenProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                        if (bestOutcome == TokenContextGenerator.SplitIndicator)
                        {
                            mNewTokens.Add(new Util.Span(startPosition, currentPosition));
                            mTokenProbabilities.Add(tokenProbability);
                            startPosition    = currentPosition;
                            tokenProbability = 1.0;
                        }
                    }

                    // Emit the trailing (or only) segment of this token.
                    mNewTokens.Add(new Util.Span(startPosition, endPosition));
                    mTokenProbabilities.Add(tokenProbability);
                }
            }

            return mNewTokens.ToArray();
        }
Example #2
        /// <summary>
        /// Tokenizes the string.
        /// </summary>
        /// <param name="input">
        /// The string to be tokenized.
        /// </param>
        /// <returns>
        /// A span array containing individual tokens as elements.
        /// </returns>
        public virtual Util.Span[] TokenizePositions(string input)
        {
            Util.Span[] whitespaceTokens = Split(input);
            mNewTokens.Clear();
            mTokenProbabilities.Clear();

            foreach (Util.Span span in whitespaceTokens)
            {
                string candidate = input.Substring(span.Start, span.End - span.Start);

                // Single characters cannot be split further, and purely alphanumeric
                // tokens are passed through unchanged when the optimization is on.
                if (candidate.Length < 2 || (AlphaNumericOptimization && AlphaNumeric.IsMatch(candidate)))
                {
                    mNewTokens.Add(span);
                    mTokenProbabilities.Add(1.0);
                    continue;
                }

                int    origin      = span.Start;
                int    splitStart  = span.Start;
                double probability = 1.0;

                // Ask the model at every interior character boundary whether to split there.
                for (int position = origin + 1; position < span.End; position++)
                {
                    double[] outcomeProbs = mModel.Evaluate(mContextGenerator.GetContext(new Util.Pair<string, int>(candidate, position - origin)));
                    string   outcome      = mModel.GetBestOutcome(outcomeProbs);

                    probability *= outcomeProbs[mModel.GetOutcomeIndex(outcome)];
                    if (outcome == TokenContextGenerator.SplitIndicator)
                    {
                        mNewTokens.Add(new Util.Span(splitStart, position));
                        mTokenProbabilities.Add(probability);
                        splitStart  = position;
                        probability = 1.0;
                    }
                }

                // Emit the trailing (or only) segment of this token.
                mNewTokens.Add(new Util.Span(splitStart, span.End));
                mTokenProbabilities.Add(probability);
            }

            return mNewTokens.ToArray();
        }
        /// <summary>
        /// Detect the position of the first words of sentences in a string.
        /// </summary>
        /// <param name="input">
        /// The string to be processed.
        /// </param>
        /// <returns>
        /// An integer array containing, for each detected sentence break, the
        /// position of the first non-whitespace character that follows it
        /// (i.e. the start of the next sentence).
        /// </returns>
        public virtual int[] SentencePositionDetect(string input)
        {
            if (mUnicodeMapping)
            {
                input = Utils.MapUnicodeChars(input);
            }

            // Running product of the chosen outcome probabilities.
            // NOTE(review): accumulated but not read again in this method — per-break
            // probabilities are stored in mSentenceProbs instead; verify intent.
            double sentenceProbability = 1;

            mSentenceProbs.Clear();
            System.Text.StringBuilder buffer = new System.Text.StringBuilder(input);
            // Candidate end-of-sentence character positions found by the scanner.
            List <int> endersList            = mScanner.GetPositions(input);
            List <int> positions             = new List <int>(endersList.Count);

            // 'index' tracks the start position of the sentence currently being scanned.
            for (int currentEnder = 0, enderCount = endersList.Count, index = 0; currentEnder < enderCount; currentEnder++)
            {
                int candidate = endersList[currentEnder];
                int cInt      = candidate;

                // skip over the leading parts of non-token final delimiters
                // (e.g. the first dots of "...": only the last ender before the
                // next whitespace is evaluated by the model).
                int firstWhiteSpace = GetFirstWhitespace(input, cInt + 1);
                if (((currentEnder + 1) < enderCount) && ((endersList[currentEnder + 1]) < firstWhiteSpace))
                {
                    continue;
                }

                // Let the model decide whether this candidate actually ends a sentence.
                Util.Pair <System.Text.StringBuilder, int> pair = new Util.Pair <System.Text.StringBuilder, int>(buffer, candidate);
                double[] probabilities = mModel.Evaluate(mContextGenerator.GetContext(pair));
                string   bestOutcome   = mModel.GetBestOutcome(probabilities);
                sentenceProbability *= probabilities[mModel.GetOutcomeIndex(bestOutcome)];
                // "T" = the model voted for a sentence break at this candidate.
                if (bestOutcome.Equals("T") && IsAcceptableBreak(input, index, cInt))
                {
                    if (index != cInt)
                    {
                        // Record the start of the following sentence and the model's
                        // confidence in this break.
                        positions.Add(GetFirstNonWhitespace(input, GetFirstWhitespace(input, cInt + 1)));                        //moIntegerPool.GetInteger(GetFirstNonWhitespace(input, GetFirstWhitespace(input, cInt + 1))));
                        mSentenceProbs.Add(probabilities[mModel.GetOutcomeIndex(bestOutcome)]);
                    }
                    index = cInt + 1;
                }
            }

            return(positions.ToArray());
        }
Example #4
        // Methods --------------------

        /// <summary>
        /// Turns one training line into tokenizer training events. The line is
        /// whitespace-split; each chunk may contain separator tokens marking where
        /// splits occur. Every character position of the separator-free word yields
        /// one event: outcome "T" at a separator position, "F" otherwise.
        /// </summary>
        /// <param name="line">A single line of separator-annotated training text.</param>
        private void AddEvents(string line)
        {
            string[] wordsWithSeparatorToken = line.Split(' ');
            foreach (string wordWithSeparatorToken in wordsWithSeparatorToken)
            {
                var parts = wordWithSeparatorToken.Split(_tokenSeparator);

                // A separator sat between consecutive parts; its position in the
                // joined word is the cumulative length of the parts before it.
                // Running sum replaces the original per-index LINQ prefix sum (O(n^2)),
                // and a HashSet gives O(1) membership below.
                var indicesOfSeparators = new HashSet<int>();
                var offset = 0;
                for (var i = 0; i < parts.Length - 1; i++)
                {
                    offset += parts[i].Length;
                    indicesOfSeparators.Add(offset);
                }

                var word = string.Join("", parts);
                for (int index = 0; index < word.Length; index++)
                {
                    string[] context = ContextGenerator.GetContext(new Tuple <string, int>(word, index));

                    var outcome       = indicesOfSeparators.Contains(index) ? "T" : "F";
                    var trainingEvent = new SharpEntropy.TrainingEvent(outcome, context);
                    _eventList.Add(trainingEvent);
                }
            }
        }
Example #5
 /// <summary>
 /// Turns one training line into tokenizer training events: each whitespace-
 /// delimited alphanumeric chunk contributes one event per character position —
 /// outcome "T" (split) at the final index, "F" (no split) everywhere else.
 /// </summary>
 /// <param name="line">A single line of training text.</param>
 private void AddEvents(string line)
 {
     foreach (string candidate in line.Split(' '))
     {
         // Only alphanumeric chunks are used as training material.
         if (!MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidate))
         {
             continue;
         }

         for (int position = 0; position < candidate.Length; position++)
         {
             string[] context = mContextGenerator.GetContext(new Util.Pair <string, int>(candidate, position));
             string outcome = (position == candidate.Length - 1) ? "T" : "F";
             mEventList.Add(new SharpEntropy.TrainingEvent(outcome, context));
         }
     }
 }
        /// <summary>
        /// Builds sentence-detection training events for one sentence. The sentence
        /// is appended to the shared buffer (plus the first word of the following
        /// sentence, if any, for right-hand context); every candidate end-of-sentence
        /// position found by the scanner becomes an event — "T" when it coincides
        /// with the true sentence end, "F" otherwise. Events are appended to the
        /// mHead/mTail linked list. The buffer is cleared before returning.
        /// </summary>
        /// <param name="token">The sentence text to generate events from.</param>
        private void AddNewEvents(string token)
        {
            System.Text.StringBuilder buffer = mBuffer;
            buffer.Append(token.Trim());
            // After trimming, the last character of the sentence is the true break.
            int sentenceEndPosition = buffer.Length - 1;

            //add following word to stringbuilder
            if (mNext != null && token.Length > 0)
            {
                int positionAfterFirstWordInNext = mNext.IndexOf(" ");
                if (positionAfterFirstWordInNext != -1)
                {
                    // should maybe changes this so that it usually adds a space
                    // before the next sentence, but sometimes leaves no space.
                    buffer.Append(" ");
                    buffer.Append(mNext.Substring(0, (positionAfterFirstWordInNext) - (0)));
                }
            }

            for (System.Collections.IEnumerator iterator = mScanner.GetPositions(buffer).GetEnumerator(); iterator.MoveNext();)
            {
                int candidate = (int)iterator.Current;
                Util.Pair <System.Text.StringBuilder, int> pair = new Util.Pair <System.Text.StringBuilder, int>(buffer, candidate);
                // "T" only for the candidate that is the actual sentence terminator.
                string type = (candidate == sentenceEndPosition) ? "T" : "F";
                SentenceDetectionEvent sentenceEvent = new SentenceDetectionEvent(type, mContextGenerator.GetContext(pair));

                // Append to the singly linked event list.
                // NOTE(review): when mTail is null but mHead is set with a non-null
                // NextEvent, no branch fires and the event is silently dropped —
                // looks like a latent bug; confirm against how mHead/mTail are reset.
                if (null != mTail)
                {
                    mTail.NextEvent = sentenceEvent;
                    mTail           = sentenceEvent;
                }
                else if (null == mHead)
                {
                    mHead = sentenceEvent;
                }
                else if (null == mHead.NextEvent)
                {
                    mHead.NextEvent = mTail = sentenceEvent;
                }
            }

            // Reset the shared buffer for the next sentence.
            buffer.Length = 0;
        }
        /// <summary>
        /// Generates tokenizer training events from gold-standard token spans.
        /// The covered sentence text is re-split on whitespace into candidate
        /// tokens; for each candidate that should be considered (length > 1 and,
        /// when mSkipAlphanumerics is set, not purely alphanumeric), the annotated
        /// tokens it contains determine split/no-split outcomes at each interior
        /// character position.
        /// </summary>
        /// <param name="tokens">Gold-standard token spans, in order, with offsets into <paramref name="input"/>.</param>
        /// <param name="input">The text the spans refer to.</param>
        /// <exception cref="ApplicationException">
        /// Thrown when an annotated token straddles a candidate token boundary.
        /// </exception>
        public virtual void AddEvents(Util.Span[] tokens, string input)
        {
            if (tokens.Length > 0)
            {
                int         startPosition      = tokens[0].Start;
                int         endPosition        = tokens[tokens.Length - 1].End;
                string      sentence           = input.Substring(startPosition, (endPosition) - (startPosition));
                Util.Span[] candidateTokens    = MaximumEntropyTokenizer.SplitOnWhitespaces(sentence);
                int         firstTrainingToken = -1;
                int         lastTrainingToken  = -1;

                for (int currentCandidate = 0; currentCandidate < candidateTokens.Length; currentCandidate++)
                {
                    Util.Span candidateSpan  = candidateTokens[currentCandidate];
                    string    candidateToken = sentence.Substring(candidateSpan.Start, (candidateSpan.End) - (candidateSpan.Start));
                    //adjust candidateSpan to text offsets
                    candidateSpan = new Util.Span(candidateSpan.Start + startPosition, candidateSpan.End + startPosition);
                    //should we skip this token
                    if (candidateToken.Length > 1 && (!mSkipAlphanumerics || !MaximumEntropyTokenizer.AlphaNumeric.IsMatch(candidateToken)))
                    {
                        //find offsets of annotated tokens inside candidate tokens
                        // (tokens are ordered, so the scan resumes after the last match)
                        bool foundTrainingTokens = false;
                        for (int currentToken = lastTrainingToken + 1; currentToken < tokens.Length; currentToken++)
                        {
                            if (candidateSpan.Contains(tokens[currentToken]))
                            {
                                if (!foundTrainingTokens)
                                {
                                    firstTrainingToken  = currentToken;
                                    foundTrainingTokens = true;
                                }
                                lastTrainingToken = currentToken;
                            }
                            else if (candidateSpan.End < tokens[currentToken].End)
                            {
                                // token lies past this candidate — later candidates will handle it
                                break;
                            }
                            else if (tokens[currentToken].End < candidateSpan.Start)
                            {
                                //keep looking
                            }
                            else
                            {
                                // token overlaps the candidate boundary: inconsistent annotation
                                throw new ApplicationException("Bad training token: " + tokens[currentToken] + " cand: " + candidateSpan);
                            }
                        }
                        // create training data
                        if (foundTrainingTokens)
                        {
                            for (int currentToken = firstTrainingToken; currentToken <= lastTrainingToken; currentToken++)
                            {
                                Util.Span trainingTokenSpan = tokens[currentToken];

                                int candidateStart = candidateSpan.Start;
                                // interior positions of an annotated token: no split
                                for (int currentPosition = trainingTokenSpan.Start + 1; currentPosition < trainingTokenSpan.End; currentPosition++)
                                {
                                    string[] context = mContextGenerator.GetContext(new Tuple <string, int>(candidateToken, currentPosition - candidateStart));
                                    mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.NoSplitIndicator, context));
                                }
                                // token ends before the candidate does: split at its end
                                if (trainingTokenSpan.End != candidateSpan.End)
                                {
                                    string[] context = mContextGenerator.GetContext(new Tuple <string, int>(candidateToken, trainingTokenSpan.End - candidateStart));
                                    mEvents.Add(new SharpEntropy.TrainingEvent(TokenContextGenerator.SplitIndicator, context));
                                }
                            }
                        }
                    }
                }
            }
        }