/// <summary>
/// Tokenizes the string and returns the character span of every token.
/// </summary>
/// <param name="input">The string to be tokenized.</param>
/// <returns>
/// A span array containing individual tokens as elements, with offsets
/// relative to <paramref name="input"/>; an empty array when
/// <paramref name="input"/> is null or empty.
/// </returns>
/// <remarks>
/// The input is first split with <c>SplitOnWhitespaces</c>. A chunk shorter
/// than two characters, or one matching <c>AlphaNumeric</c> while
/// <c>AlphaNumericOptimization</c> is enabled, is kept as a single token.
/// Every other chunk is scanned position by position and split wherever the
/// model's best outcome equals <c>TokenContextGenerator.SplitIndicator</c>.
/// </remarks>
public override Span[] TokenizePositions(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return new Span[0];
    }

    var tokens = SplitOnWhitespaces(input);
    var newTokens = new List<Span>();

    for (int currentToken = 0, tokenCount = tokens.Length; currentToken < tokenCount; currentToken++)
    {
        var tokenSpan = tokens[currentToken];
        string token = input.Substring(tokenSpan.Start, tokenSpan.End - tokenSpan.Start);

        // Single characters can't be tokenized any further, and purely
        // alphanumeric chunks are accepted without consulting the model
        // when the optimization is switched on.
        if (token.Length < 2 || (AlphaNumericOptimization && AlphaNumeric.IsMatch(token)))
        {
            newTokens.Add(tokenSpan);
            continue;
        }

        int tokenStart = tokenSpan.Start;
        int tokenEnd = tokenSpan.End;

        // Start (in input coordinates) of the piece currently being built.
        int pieceStart = tokenStart;

        // Ask the model about every interior boundary of the chunk; the
        // context generator receives the offset relative to the chunk itself.
        for (int currentPosition = tokenStart + 1; currentPosition < tokenEnd; currentPosition++)
        {
            var context = _contextGenerator.GetContext(
                new Tuple<string, int>(token, currentPosition - tokenStart));
            double[] probabilities = _model.Evaluate(context);
            string bestOutcome = _model.GetBestOutcome(probabilities);

            if (bestOutcome == TokenContextGenerator.SplitIndicator)
            {
                newTokens.Add(new Span(pieceStart, currentPosition));
                pieceStart = currentPosition;
            }
        }

        // Whatever remains after the last split is the final piece.
        newTokens.Add(new Span(pieceStart, tokenEnd));
    }

    return newTokens.ToArray();
}
// Methods ----------------

/// <summary>
/// Tests the probability of an email to be invalid.
/// </summary>
/// <param name="email">
/// The email to be processed.
/// </param>
/// <returns>
/// The last element of the probability array the model produces for the
/// email's context, which this method reports as the "invalid" probability;
/// or 0.5 when a <see cref="KeyNotFoundException"/> is raised while building
/// the context or evaluating the model.
/// </returns>
public double GetInvalidProbability(string email)
{
    try
    {
        var context = _contextGenerator.GetContext(email);
        double[] probabilities = _model.Evaluate(context);

        // NOTE(review): relies on the "invalid" outcome being the last one
        // in the model's outcome ordering - confirm against the trained model.
        return probabilities.Last();
    }
    catch (KeyNotFoundException)
    {
        // This case should not happen if we trained the model with enough data
        return 0.5;
    }
}
/// <summary>
/// Detects sentence breaks in a string.
/// </summary>
/// <param name="input">
/// The string to be processed.
/// </param>
/// <returns>
/// An integer array with one entry per accepted sentence break. Each entry
/// is the value of
/// <c>GetFirstNonWhitespace(input, GetFirstWhitespace(input, candidate + 1))</c>
/// for the accepted candidate - presumably the index at which the following
/// sentence starts (verify against those helpers).
/// </returns>
/// <remarks>
/// Clears <c>_sentenceProbs</c> on entry and appends to it the model
/// probability of the best outcome for every break that is added to the result.
/// </remarks>
public virtual int[] SentencePositionDetect(string input)
{
    _sentenceProbs.Clear();

    var buffer = new StringBuilder(input);
    List<int> endersList = _scanner.GetPositions(input);
    var positions = new List<int>(endersList.Count);
    int enderCount = endersList.Count;

    // Position just past the most recently accepted break; handed to
    // IsAcceptableBreak as the start of the current stretch of text.
    int index = 0;

    for (int currentEnder = 0; currentEnder < enderCount; currentEnder++)
    {
        int candidate = endersList[currentEnder];

        // skip over the leading parts of non-token final delimiters:
        // when the next candidate comes before the first whitespace after
        // this one, leave the decision to that later candidate.
        int firstWhiteSpace = GetFirstWhitespace(input, candidate + 1);
        if ((currentEnder + 1) < enderCount && endersList[currentEnder + 1] < firstWhiteSpace)
        {
            continue;
        }

        var pair = new Tuple<StringBuilder, int>(buffer, candidate);
        var context = _contextGenerator.GetContext(pair);
        double[] probabilities = _model.Evaluate(context);
        string bestOutcome = _model.GetBestOutcome(probabilities);

        if (bestOutcome.Equals("T") && IsAcceptableBreak(input, index, candidate))
        {
            // A candidate sitting exactly at the start of the current
            // stretch is not recorded, but the stretch start still advances.
            if (index != candidate)
            {
                positions.Add(GetFirstNonWhitespace(input, GetFirstWhitespace(input, candidate + 1)));
                _sentenceProbs.Add(probabilities[_model.GetOutcomeIndex(bestOutcome)]);
            }

            index = candidate + 1;
        }
    }

    return positions.ToArray();
}