/// <summary>
/// Finds the boundaries of atomic parts in a string.
/// </summary>
/// <param name="value">The string to be tokenized.</param>
/// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="value"/>) for each token as the individual array elements.</returns>
public override Span[] TokenizePos(string value) {
    var whitespaceSpans = WhitespaceTokenizer.Instance.TokenizePos(value);

    // The instance buffers are rebuilt on every call; probabilities for the
    // most recent call remain available through tokProbs afterwards.
    newTokens.Clear();
    tokProbs.Clear();

    foreach (var span in whitespaceSpans) {
        var covered = span.GetCoveredText(value);

        // A single character can never be split further; purely alphanumeric
        // tokens may optionally be accepted as-is, skipping model evaluation.
        if (covered.Length < 2 || (useAlphaNumericOptimization && alphanumeric.IsMatch(covered))) {
            newTokens.Add(span);
            tokProbs.Add(1d);
            continue;
        }

        // Ask the model at every interior character position whether the
        // token should be split there, accumulating the probability of the
        // current piece as the product of the best-outcome probabilities.
        var pieceStart = span.Start;
        var pieceProb = 1.0;
        for (var pos = span.Start + 1; pos < span.End; pos++) {
            var outcomes = model.Eval(cg.GetContext(covered, pos - span.Start));
            var best = model.GetBestOutcome(outcomes);
            pieceProb *= outcomes[model.GetIndex(best)];
            if (best == Split) {
                newTokens.Add(new Span(pieceStart, pos));
                tokProbs.Add(pieceProb);
                pieceStart = pos;
                pieceProb = 1.0;
            }
        }

        // Remainder of the whitespace token after the last split (or the
        // whole token when the model never chose to split).
        newTokens.Add(new Span(pieceStart, span.End));
        tokProbs.Add(pieceProb);
    }

    return newTokens.ToArray();
}
/// <summary>
/// Creates events for the provided sample.
/// </summary>
/// <param name="sample">The sample for which training <see cref="T:Event"/>s are to be created.</param>
/// <returns>The events enumerator.</returns>
protected override IEnumerator<Event> CreateEvents(TokenSample sample) {
    var events = new List<Event>(50);
    var tokens = sample.TokenSpans;
    var text = sample.Text;
    if (tokens.Length > 0) {
        // Work only on the substring covered by the annotated tokens; all
        // candidate-token offsets are therefore relative to 'sent' and must
        // be shifted by 'start' to compare against the annotated spans.
        var start = tokens[0].Start;
        var end = tokens[tokens.Length - 1].End;
        var sent = text.Substring(start, end - start);
        var candTokens = WhitespaceTokenizer.Instance.TokenizePos(sent);

        // Indices into 'tokens' of the first/last annotated token contained
        // in the current candidate; lastTrainingToken persists across
        // candidates so each inner scan resumes where the previous stopped.
        var firstTrainingToken = -1;
        var lastTrainingToken = -1;
        foreach (var candToken in candTokens) {
            var cSpan = candToken;
            var ctok = cSpan.GetCoveredText(sent);
            //adjust cSpan to text offsets
            cSpan = new Span(cSpan.Start + start, cSpan.End + start);
            //should we skip this token
            if (ctok.Length > 1 && (!skipAlphaNumerics || !alphaNumeric.IsMatch(ctok))) {
                //find offsets of annotated tokens inside of candidate tokens
                var foundTrainingTokens = false;
                for (var ti = lastTrainingToken + 1; ti < tokens.Length; ti++) {
                    if (cSpan.Contains(tokens[ti])) {
                        // Annotated token lies fully inside the candidate.
                        if (!foundTrainingTokens) {
                            firstTrainingToken = ti;
                            foundTrainingTokens = true;
                        }
                        lastTrainingToken = ti;
                    } else if (cSpan.End < tokens[ti].End) {
                        // Annotated token extends past the candidate; later
                        // tokens can only start further right, so stop.
                        break;
                    } else if (tokens[ti].End < cSpan.Start) {
                        //keep looking
                    } else {
                        // Partial overlap: annotated token straddles the
                        // candidate boundary — cannot be used for training.
                        // TODO: Add a logging mechanic
                        // warning
                        Debug.Print("Bad training token: " + tokens[ti] + " cand: " + cSpan + " token=" + tokens[ti].GetCoveredText(text));
                    }
                }
                // create training data
                if (foundTrainingTokens) {
                    for (var ti = firstTrainingToken; ti <= lastTrainingToken; ti++) {
                        var tSpan = tokens[ti];
                        var cStart = cSpan.Start;
                        // Every interior position of the annotated token is a
                        // NoSplit event (positions are relative to 'ctok').
                        for (var i = tSpan.Start + 1; i < tSpan.End; i++) {
                            var context = cg.GetContext(ctok, i - cStart);
                            events.Add(new Event(TokenizerME.NoSplit, context));
                        }
                        // The annotated token's right edge is a Split event,
                        // unless it coincides with the candidate's own end.
                        if (tSpan.End != cSpan.End) {
                            var context = cg.GetContext(ctok, tSpan.End - cStart);
                            events.Add(new Event(TokenizerME.Split, context));
                        }
                    }
                }
            }
        }
    }
    return (events.GetEnumerator());
}