/// <summary>Splits the input into tokens and reports the character span of each one.</summary>
        /// <param name="input">The string to be tokenized</param>
        /// <returns>
        /// An array with one span per token, in the order the tokens were produced;
        /// an empty array when <paramref name="input"/> is null or empty.
        /// </returns>
        public override Span[] TokenizePositions(string input)
        {
            if (string.IsNullOrEmpty(input))
            {
                return new Span[0];
            }

            var whitespaceSpans = SplitOnWhitespaces(input);
            var result          = new List<Span>();
            // NOTE(review): this list is filled below but never read inside this method.
            var tokenProbabilities = new List<double>();

            foreach (var chunk in whitespaceSpans)
            {
                string text = input.Substring(chunk.Start, chunk.End - chunk.Start);

                // Single characters cannot be split any further, and purely alphanumeric
                // chunks are kept whole when the optimization flag is switched on.
                bool keepWhole = text.Length < 2
                                 || (AlphaNumericOptimization && AlphaNumeric.IsMatch(text));
                if (keepWhole)
                {
                    result.Add(chunk);
                    tokenProbabilities.Add(1.0);
                    continue;
                }

                int    chunkStart  = chunk.Start;
                int    chunkEnd    = chunk.End;
                int    pieceStart  = chunkStart;
                double probability = 1.0;

                // Ask the model about every interior boundary of the chunk
                // (offset is the boundary's position relative to the chunk start).
                for (int offset = 1; chunkStart + offset < chunkEnd; offset++)
                {
                    var      context      = _contextGenerator.GetContext(Tuple.Create(text, offset));
                    double[] distribution = _model.Evaluate(context);
                    string   outcome      = _model.GetBestOutcome(distribution);

                    probability *= distribution[_model.GetOutcomeIndex(outcome)];

                    if (outcome == TokenContextGenerator.SplitIndicator)
                    {
                        int boundary = chunkStart + offset;
                        result.Add(new Span(pieceStart, boundary));
                        tokenProbabilities.Add(probability);

                        // The next piece begins at the boundary with a fresh probability.
                        pieceStart  = boundary;
                        probability = 1.0;
                    }
                }

                // Whatever remains after the last split (or the whole chunk if none occurred).
                result.Add(new Span(pieceStart, chunkEnd));
                tokenProbabilities.Add(probability);
            }

            return result.ToArray();
        }
Exemplo n.º 2
0
        /// <summary>
        /// Builds an event from an observation by cutting it at the last occurrence of the separator:
        /// the text after the separator is passed as the first constructor argument of the event,
        /// and the text before it is fed to the context generator.
        /// </summary>
        /// <param name="obs">The observation to convert.</param>
        /// <returns>The new event, or null when the observation contains no separator.</returns>
        private Event CreateEvent(string obs)
        {
            int cut = obs.LastIndexOf(_separator);
            if (cut < 0)
            {
                return null;
            }

            string trailing = obs.Substring(cut + 1);
            var    context  = _contextGenerator.GetContext(obs.Substring(0, cut));

            return new Event(trailing, context);
        }
        /// <summary>
        /// Reads the next token from the data reader and converts it into a training event.
        /// The token is split on tabs: the first field is used as the email and the last
        /// field is the validity flag ("1" yields the outcome "INV", anything else "OK").
        /// </summary>
        /// <returns>The training event built from the next token.</returns>
        public virtual TrainingEvent ReadNextEvent()
        {
            var line = _dataReader.NextToken();

            // Tab-separated record; Split always yields at least one field.
            var fields = line.Split('\t');

            string outcome;
            if (fields[fields.Length - 1] == "1")
            {
                outcome = "INV";
            }
            else
            {
                outcome = "OK";
            }

            return new TrainingEvent(outcome, _contextGenerator.GetContext(fields[0]));
        }
Exemplo n.º 4
0
        /// <summary>
        /// Builds a training event from an observation by cutting it at the last space:
        /// the text after the space is passed as the first constructor argument of the event,
        /// and the text before it is fed to the context generator.
        /// </summary>
        /// <param name="observation">The observation to convert.</param>
        /// <returns>The new training event, or null when the observation contains no space.</returns>
        private TrainingEvent CreateEvent(string observation)
        {
            int cut = observation.LastIndexOf(' ');
            if (cut == -1)
            {
                return null;
            }

            string trailing = observation.Substring(cut + 1);
            string leading  = observation.Substring(0, cut);

            return new TrainingEvent(trailing, mContext.GetContext(leading));
        }
Exemplo n.º 5
0
        // Methods ----------------

        /// <summary>
        /// Tests the probability of an email to be invalid.
        /// </summary>
        /// <param name="email">
        /// The email to be processed.
        /// </param>
        /// <returns>
        /// The last entry of the probability array the model returns for the email's context
        /// (presumably the entry for the "invalid" outcome; confirm the model's outcome ordering),
        /// or 0.5 when evaluation throws a <see cref="KeyNotFoundException"/>.
        /// </returns>
        public double GetInvalidProbability(string email)
        {
            try
            {
                var      context       = _contextGenerator.GetContext(email);
                double[] probabilities = _model.Evaluate(context);
                return probabilities.Last();
            }
            catch (KeyNotFoundException)
            {
                // This case should not happen if we trained the model with enough data;
                // fall back to a neutral probability instead of failing.
                return 0.5;
            }
        }
        /// <summary>
        /// Builds a training event from the entry at the current index and then advances the index.
        /// The outcome is "INV" when the entry is flagged invalid and "OK" otherwise; the context
        /// is generated from the entry's email.
        /// </summary>
        /// <returns>The training event for the entry that was current on entry to this method.</returns>
        public TrainingEvent ReadNextEvent()
        {
            var entry = _emailsAndValidities[_currentIndex];

            string outcome;
            if (entry.IsInvalid)
            {
                outcome = "INV";
            }
            else
            {
                outcome = "OK";
            }

            var trainingEvent = new TrainingEvent(outcome, _contextGenerator.GetContext(entry.Email));

            // Advance only once the event has been created successfully.
            _currentIndex++;

            return trainingEvent;
        }
        /// <summary>
        /// Generates sentence detection events for a token and links them into the event list
        /// held by <c>_head</c> / <c>_tail</c>. The shared buffer is filled with the trimmed token
        /// (plus the first word of <c>_next</c>, when available) and emptied again before returning.
        /// </summary>
        /// <param name="token">The text to generate events for.</param>
        private void AddNewEvents(string token)
        {
            StringBuilder text = _buffer;

            text.Append(token.Trim());

            // Each token holds exactly one sentence, so the sentence ends on the token's last character.
            int sentenceEnd = text.Length - 1;

            // Give the context generator a look at the first word of the following text.
            if (_next != null && token.Length > 0)
            {
                int firstWordEnd = _next.IndexOf(" ");
                if (firstWordEnd != -1)
                {
                    // Possible refinement: usually insert a space before the next sentence,
                    // but occasionally leave it out.
                    text.Append(" ").Append(_next.Substring(0, firstWordEnd));
                }
            }

            foreach (var position in _scanner.GetPositions(text))
            {
                // Only the candidate sitting on the real sentence end is labelled "T".
                string outcome  = position == sentenceEnd ? "T" : "F";
                var    context  = _contextGenerator.GetContext(new Tuple<StringBuilder, int>(text, position));
                var    newEvent = new SentenceDetectionEvent(outcome, context);

                if (_tail != null)
                {
                    // Append after the current tail.
                    _tail.NextEvent = newEvent;
                    _tail           = newEvent;
                }
                else if (_head == null)
                {
                    // First event of an empty list.
                    _head = newEvent;
                }
                else if (_head.NextEvent == null)
                {
                    // Second event: it becomes both the head's successor and the tail.
                    _head.NextEvent = _tail = newEvent;
                }
            }

            // Empty the shared buffer for the next call.
            text.Length = 0;
        }
Exemplo n.º 8
0
        /// <summary>
        /// Detects sentence boundaries in a string.
        /// </summary>
        /// <param name="input">
        /// The string to be processed.
        /// </param>
        /// <returns>
        /// An integer array with one position per accepted sentence break. Each position is
        /// the first non-whitespace index found after the first whitespace that follows the
        /// break candidate (presumably the start of the following sentence; confirm against
        /// the whitespace helpers).
        /// </returns>
        public virtual int[] SentencePositionDetect(string input)
        {
            // NOTE(review): this running product is updated below but never read in this method.
            double sentenceProbability = 1;

            _sentenceProbs.Clear();
            var        text       = new StringBuilder(input);
            List <int> candidates = _scanner.GetPositions(input);
            var        breaks     = new List <int>(candidates.Count);

            int total         = candidates.Count;
            int sentenceStart = 0;

            for (int i = 0; i < total; i++)
            {
                int candidate = candidates[i];

                // Skip a candidate when another one follows before the next whitespace,
                // i.e. it is only the leading part of a longer run of delimiters.
                int  nextWhitespace = GetFirstWhitespace(input, candidate + 1);
                bool hasSuccessor   = (i + 1) < total;
                if (hasSuccessor && candidates[i + 1] < nextWhitespace)
                {
                    continue;
                }

                var      context      = _contextGenerator.GetContext(new Tuple <StringBuilder, int>(text, candidate));
                double[] distribution = _model.Evaluate(context);
                string   outcome      = _model.GetBestOutcome(distribution);
                sentenceProbability *= distribution[_model.GetOutcomeIndex(outcome)];

                if (!outcome.Equals("T") || !IsAcceptableBreak(input, sentenceStart, candidate))
                {
                    continue;
                }

                // A break that coincides with the current start is not recorded.
                if (sentenceStart != candidate)
                {
                    breaks.Add(GetFirstNonWhitespace(input, GetFirstWhitespace(input, candidate + 1)));
                    _sentenceProbs.Add(distribution[_model.GetOutcomeIndex(outcome)]);
                }
                sentenceStart = candidate + 1;
            }

            return breaks.ToArray();
        }