/// <summary>
/// Splits the input into tokens and reports where each token sits in the input.
/// </summary>
/// <param name="input">The text to tokenize.</param>
/// <returns>
/// One span per token, in order of appearance; an empty array when
/// <paramref name="input"/> is null or empty.
/// </returns>
public override Span[] TokenizePositions(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return new Span[0];
    }

    var spans = new List<Span>();
    // Probabilities are recorded in step with the spans; nothing in this method reads them back.
    var spanProbabilities = new List<double>();

    foreach (var chunk in SplitOnWhitespaces(input))
    {
        string text = input.Substring(chunk.Start, chunk.End - chunk.Start);

        // A single character cannot be split any further, and when the alphanumeric
        // optimization is enabled a chunk accepted by AlphaNumeric.IsMatch is kept
        // whole without asking the model.
        bool keepWhole = text.Length < 2 || (AlphaNumericOptimization && AlphaNumeric.IsMatch(text));
        if (keepWhole)
        {
            spans.Add(chunk);
            spanProbabilities.Add(1.0);
            continue;
        }

        int chunkStart = chunk.Start;
        int chunkEnd = chunk.End;
        int pieceStart = chunkStart;
        double pieceProbability = 1.0;

        // Ask the model about every interior position of the chunk; the context
        // generator receives the position relative to the start of the chunk.
        for (int offset = 1; chunkStart + offset < chunkEnd; offset++)
        {
            int position = chunkStart + offset;

            var context = _contextGenerator.GetContext(new Tuple<string, int>(text, offset));
            double[] outcomeProbabilities = _model.Evaluate(context);
            string best = _model.GetBestOutcome(outcomeProbabilities);
            pieceProbability *= outcomeProbabilities[_model.GetOutcomeIndex(best)];

            if (best != TokenContextGenerator.SplitIndicator)
            {
                continue;
            }

            // Close the piece that ends here and start a new one at this position.
            spans.Add(new Span(pieceStart, position));
            spanProbabilities.Add(pieceProbability);
            pieceStart = position;
            pieceProbability = 1.0;
        }

        // Whatever remains after the last split (or the whole chunk) is the final piece.
        spans.Add(new Span(pieceStart, chunkEnd));
        spanProbabilities.Add(pieceProbability);
    }

    return spans.ToArray();
}
/// <summary>
/// Builds an event from one observation string by cutting it at the last
/// occurrence of the separator.
/// </summary>
/// <param name="obs">The observation text.</param>
/// <returns>
/// An event constructed from the text after the last separator and the context
/// generated from the text before it; null when no separator is found.
/// </returns>
private Event CreateEvent(string obs)
{
    int cut = obs.LastIndexOf(_separator);
    if (cut >= 0)
    {
        string afterSeparator = obs.Substring(cut + 1);
        var context = _contextGenerator.GetContext(obs.Substring(0, cut));
        return new Event(afterSeparator, context);
    }

    return null;
}
/// <summary>
/// Reads the next token from the data reader and converts it into a training event.
/// </summary>
/// <returns>
/// A training event whose outcome is "INV" when the last tab-separated field is "1"
/// and "OK" otherwise, with a context generated from the first field.
/// </returns>
public virtual TrainingEvent ReadNextEvent()
{
    // Tab-separated record: the first field is the email, the last field is the validity flag.
    var fields = _dataReader.NextToken().Split('\t');

    string outcome = fields.Last() == "1" ? "INV" : "OK";
    var context = _contextGenerator.GetContext(fields.First());

    return new TrainingEvent(outcome, context);
}
/// <summary>
/// Builds a training event from one observation by cutting it at the last space.
/// </summary>
/// <param name="observation">The observation text.</param>
/// <returns>
/// A training event constructed from the text after the last space and the context
/// generated from the text before it; null when the observation contains no space.
/// </returns>
private TrainingEvent CreateEvent(string observation)
{
    int cut = observation.LastIndexOf(' ');
    if (cut == -1)
    {
        return null;
    }

    string afterSpace = observation.Substring(cut + 1);
    string beforeSpace = observation.Substring(0, cut);
    return new TrainingEvent(afterSpace, mContext.GetContext(beforeSpace));
}
// Methods ----------------

/// <summary>
/// Estimates the probability that an email is invalid.
/// </summary>
/// <param name="email">
/// The email to be processed.
/// </param>
/// <returns>
/// The last entry of the probability array the model produces for the email's
/// context, or 0.5 when the evaluation throws a <see cref="KeyNotFoundException"/>.
/// </returns>
public double GetInvalidProbability(string email)
{
    try
    {
        var context = _contextGenerator.GetContext(email);
        double[] probabilities = _model.Evaluate(context);

        // NOTE(review): this relies on the last outcome slot being the "invalid"
        // outcome - confirm against the outcome order used when training the model.
        return probabilities.Last();
    }
    catch (KeyNotFoundException)
    {
        // This case should not happen if we trained the model with enough data;
        // report an undecided 50/50 probability instead of failing.
        return 0.5;
    }
}
/// <summary>
/// Produces the training event for the entry at the current index and then
/// advances the index by one.
/// </summary>
/// <returns>
/// A training event whose outcome is "INV" when the entry is flagged invalid and
/// "OK" otherwise, with a context generated from the entry's email.
/// </returns>
public TrainingEvent ReadNextEvent()
{
    var entry = _emailsAndValidities[_currentIndex];

    string outcome;
    if (entry.IsInvalid)
    {
        outcome = "INV";
    }
    else
    {
        outcome = "OK";
    }

    var result = new TrainingEvent(outcome, _contextGenerator.GetContext(entry.Email));

    // Advance only after the event has been built successfully.
    _currentIndex++;
    return result;
}
/// <summary>
/// Generates sentence-detection events for one token and links them onto the
/// pending event list.
/// </summary>
/// <param name="token">The text of the current token; it is trimmed before use.</param>
/// <remarks>
/// The shared buffer is filled with the trimmed token, optionally followed by a
/// space and the first word of the next token, and is emptied again before returning.
/// </remarks>
private void AddNewEvents(string token)
{
    StringBuilder text = _buffer;
    text.Append(token.Trim());

    // Only one sentence per token, so the sentence ends at the very end of the token.
    int endOfSentence = text.Length - 1;

    // Add the first word of the following token to give the context generator
    // something to look at after the sentence end.
    if (_next != null && token.Length > 0)
    {
        int firstSpaceInNext = _next.IndexOf(" ");
        if (firstSpaceInNext != -1)
        {
            // Should maybe change this so that it usually adds a space before the
            // next sentence, but sometimes leaves no space.
            text.Append(" ").Append(_next.Substring(0, firstSpaceInNext));
        }
    }

    foreach (var position in _scanner.GetPositions(text))
    {
        // Only the candidate sitting at the real end of the sentence is a true break.
        string outcome = position == endOfSentence ? "T" : "F";
        var context = _contextGenerator.GetContext(new Tuple<StringBuilder, int>(text, position));
        var created = new SentenceDetectionEvent(outcome, context);

        if (_tail != null)
        {
            // Append after the current tail.
            _tail.NextEvent = created;
            _tail = created;
        }
        else if (_head == null)
        {
            // First event of the list; the tail stays unset until a second one arrives.
            _head = created;
        }
        else if (_head.NextEvent == null)
        {
            // Second event: hang it off the head and make it the tail.
            _tail = created;
            _head.NextEvent = created;
        }
    }

    text.Length = 0;
}
/// <summary>
/// Detects the sentence breaks in a string.
/// </summary>
/// <param name="input">
/// The string to be processed.
/// </param>
/// <returns>
/// One integer per accepted sentence break: the index returned by
/// GetFirstNonWhitespace when started from the first whitespace found after
/// the break character.
/// </returns>
/// <remarks>
/// Clears the stored sentence probabilities and then records, for every returned
/// position, the probability of the model's best outcome at that break.
/// </remarks>
public virtual int[] SentencePositionDetect(string input)
{
    _sentenceProbs.Clear();

    var buffer = new StringBuilder(input);
    List<int> enders = _scanner.GetPositions(input);
    int enderCount = enders.Count;
    var positions = new List<int>(enderCount);

    // Position just after the last accepted break (0 before any break is accepted).
    int fromIndex = 0;

    for (int current = 0; current < enderCount; current++)
    {
        int candidate = enders[current];

        // Skip over the leading parts of non-token-final delimiters: when the next
        // candidate comes before the next whitespace, only that later one is evaluated.
        int whitespaceAfter = GetFirstWhitespace(input, candidate + 1);
        if (current + 1 < enderCount && enders[current + 1] < whitespaceAfter)
        {
            continue;
        }

        var context = _contextGenerator.GetContext(new Tuple<StringBuilder, int>(buffer, candidate));
        double[] probabilities = _model.Evaluate(context);
        string bestOutcome = _model.GetBestOutcome(probabilities);
        double bestProbability = probabilities[_model.GetOutcomeIndex(bestOutcome)];

        if (bestOutcome.Equals("T") && IsAcceptableBreak(input, fromIndex, candidate))
        {
            // A break right at the start of the current stretch yields no new position.
            if (fromIndex != candidate)
            {
                positions.Add(GetFirstNonWhitespace(input, whitespaceAfter));
                _sentenceProbs.Add(bestProbability);
            }

            fromIndex = candidate + 1;
        }
    }

    return positions.ToArray();
}