Example #1
0
        /// <summary>
        /// Finds the boundaries of atomic parts in a string.
        /// </summary>
        /// <param name="value">The string to be tokenized.</param>
        /// <returns>The <see cref="T:Span[]"/> with the spans (offsets into s) for each token as the individuals array elements.</returns>
        /// <summary>
        /// Finds the boundaries of atomic parts in a string.
        /// </summary>
        /// <param name="value">The string to be tokenized.</param>
        /// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="value"/>) for each token as the individual array elements.</returns>
        public override Span[] TokenizePos(string value)
        {
            // Start from a coarse whitespace tokenization and refine each piece with the model.
            var whitespaceSpans = WhitespaceTokenizer.Instance.TokenizePos(value);

            newTokens.Clear();
            tokProbs.Clear();

            foreach (var span in whitespaceSpans)
            {
                var covered = span.GetCoveredText(value);

                // Single characters can never be split; optionally, purely alphanumeric
                // tokens are accepted as-is too (with probability 1).
                if (covered.Length < 2 ||
                    (useAlphaNumericOptimization && alphanumeric.IsMatch(covered)))
                {
                    newTokens.Add(span);
                    tokProbs.Add(1d);
                    continue;
                }

                // Walk the interior character positions, asking the model at each one
                // whether a token boundary should be placed there.
                var pieceStart = span.Start;
                var spanStart  = span.Start;
                var spanEnd    = span.End;
                var pieceProb  = 1.0;

                for (var pos = spanStart + 1; pos < spanEnd; pos++)
                {
                    var outcomes    = model.Eval(cg.GetContext(covered, pos - spanStart));
                    var bestOutcome = model.GetBestOutcome(outcomes);

                    // Accumulate the probability of the decisions made for the current piece.
                    pieceProb *= outcomes[model.GetIndex(bestOutcome)];

                    if (bestOutcome == Split)
                    {
                        newTokens.Add(new Span(pieceStart, pos));
                        tokProbs.Add(pieceProb);
                        pieceStart = pos;
                        pieceProb  = 1.0;
                    }
                }

                // Emit the trailing piece (or the whole span when no split occurred).
                newTokens.Add(new Span(pieceStart, spanEnd));
                tokProbs.Add(pieceProb);
            }

            return newTokens.ToArray();
        }
Example #2
0
        /// <summary>
        /// Creates events for the provided sample.
        /// </summary>
        /// <param name="sample">The sample the sample for which training <see cref="T:Event"/>s are be created.</param>
        /// <returns>The events enumerator.</returns>
        /// <summary>
        /// Creates the maximum-entropy training events for the provided sample by
        /// aligning the sample's annotated token spans with a coarse whitespace
        /// tokenization of the same text.
        /// </summary>
        /// <param name="sample">The sample for which training <see cref="T:Event"/>s are created.</param>
        /// <returns>The events enumerator.</returns>
        protected override IEnumerator <Event> CreateEvents(TokenSample sample)
        {
            var events = new List <Event>(50);

            var tokens = sample.TokenSpans;
            var text   = sample.Text;

            if (tokens.Length > 0)
            {
                // Work only on the substring covered by the annotated tokens;
                // candidate-token offsets below are relative to this substring.
                var start = tokens[0].Start;
                var end   = tokens[tokens.Length - 1].End;

                var sent = text.Substring(start, end - start);

                var candTokens = WhitespaceTokenizer.Instance.TokenizePos(sent);

                // Indices into `tokens` of the first/last annotated token found inside
                // the current candidate token. lastTrainingToken persists across
                // candidates so the inner scan resumes where the previous one stopped.
                var firstTrainingToken = -1;
                var lastTrainingToken  = -1;
                foreach (var candToken in candTokens)
                {
                    var cSpan = candToken;
                    var ctok  = cSpan.GetCoveredText(sent);
                    //adjust cSpan to text offsets
                    cSpan = new Span(cSpan.Start + start, cSpan.End + start);
                    //should we skip this token
                    // Single characters produce no split decisions; purely alphanumeric
                    // candidates are optionally skipped (mirrors the tokenizer's
                    // alpha-numeric optimization at decoding time).
                    if (ctok.Length > 1 &&
                        (!skipAlphaNumerics || !alphaNumeric.IsMatch(ctok)))
                    {
                        //find offsets of annotated tokens inside of candidate tokens
                        var foundTrainingTokens = false;
                        for (var ti = lastTrainingToken + 1; ti < tokens.Length; ti++)
                        {
                            if (cSpan.Contains(tokens[ti]))
                            {
                                // Annotated token fully inside the candidate: record the range.
                                if (!foundTrainingTokens)
                                {
                                    firstTrainingToken  = ti;
                                    foundTrainingTokens = true;
                                }
                                lastTrainingToken = ti;
                            }
                            else if (cSpan.End < tokens[ti].End)
                            {
                                // Annotated token extends past the candidate: nothing further
                                // in this candidate, stop scanning.
                                break;
                            }
                            else if (tokens[ti].End < cSpan.Start)
                            {
                                //keep looking
                                // Annotated token lies entirely before the candidate.
                            }
                            else
                            {
                                // Partial overlap: annotation and whitespace tokenization
                                // disagree; this token cannot be used for training.
                                // TODO: Add a logging mechanic
                                // warning
                                Debug.Print("Bad training token: " + tokens[ti] + " cand: " + cSpan + " token=" +
                                            tokens[ti].GetCoveredText(text));
                            }
                        }

                        // create training data
                        if (foundTrainingTokens)
                        {
                            // For each annotated token inside the candidate: every interior
                            // character position is a NoSplit event; the position right after
                            // the token is a Split event unless it coincides with the
                            // candidate's own end (which needs no decision).
                            for (var ti = firstTrainingToken; ti <= lastTrainingToken; ti++)
                            {
                                var tSpan  = tokens[ti];
                                // Context positions are relative to the candidate token text.
                                var cStart = cSpan.Start;
                                for (var i = tSpan.Start + 1; i < tSpan.End; i++)
                                {
                                    var context = cg.GetContext(ctok, i - cStart);
                                    events.Add(new Event(TokenizerME.NoSplit, context));
                                }

                                if (tSpan.End != cSpan.End)
                                {
                                    var context = cg.GetContext(ctok, tSpan.End - cStart);
                                    events.Add(new Event(TokenizerME.Split, context));
                                }
                            }
                        }
                    }
                }
            }

            return(events.GetEnumerator());
        }