Ejemplo n.º 1
0
        /// <summary>
        /// Checks that a two-outcome model predicts the outcome of the given event and that the
        /// probability distribution it returns matches the expected value.
        /// </summary>
        /// <param name="model">The model under test.</param>
        /// <param name="ev">The event whose context is evaluated and whose outcome is expected to win.</param>
        /// <param name="higherProbability">
        /// The probability expected for the event's outcome; the other outcome is expected to
        /// receive <c>1.0 - higherProbability</c>.
        /// </param>
        internal static void TestModel(IMaxentModel model, Event ev, double higherProbability)
        {
            const double tolerance = 0.0001;

            var distribution = model.Eval(ev.Context);
            var predicted    = model.GetBestOutcome(distribution);

            Assert.AreEqual(2, distribution.Length);
            Assert.AreEqual(ev.Outcome, predicted);

            // Each slot of the distribution carries the higher probability when it belongs to the
            // event's outcome, and the complement otherwise.
            for (var slot = 0; slot < 2; slot++)
            {
                var expected = ev.Outcome.Equals(model.GetOutcome(slot))
                    ? higherProbability
                    : 1.0 - higherProbability;

                Assert.AreEqual(expected, distribution[slot], tolerance);
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Finds the boundaries of the atomic parts (tokens) of a string.
        /// </summary>
        /// <remarks>
        /// The input is first split on whitespace. A whitespace-delimited chunk is kept whole when it
        /// is shorter than two characters, or when the alphanumeric optimization is enabled and the
        /// chunk matches the alphanumeric pattern. Every other chunk is scanned position by position
        /// and cut wherever the model's best outcome is a split. The probability recorded for a token
        /// is the product of the best-outcome probabilities evaluated while building it; chunks kept
        /// whole are recorded with probability 1.
        /// </remarks>
        /// <param name="value">The string to be tokenized.</param>
        /// <returns>
        /// The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="value"/>) of each
        /// token as the individual array elements.
        /// </returns>
        public override Span[] TokenizePos(string value)
        {
            var chunks = WhitespaceTokenizer.Instance.TokenizePos(value);

            newTokens.Clear();
            tokProbs.Clear();

            foreach (var chunk in chunks)
            {
                var text = chunk.GetCoveredText(value);

                // Single characters cannot be split, and purely alphanumeric chunks are taken
                // as-is when the optimization is switched on.
                var keepWhole = text.Length < 2
                                || (useAlphaNumericOptimization && alphanumeric.IsMatch(text));
                if (keepWhole)
                {
                    newTokens.Add(chunk);
                    tokProbs.Add(1d);
                    continue;
                }

                var chunkStart  = chunk.Start;
                var chunkEnd    = chunk.End;
                var tokenStart  = chunkStart;
                var probability = 1.0;

                // Ask the model about every boundary strictly inside the chunk.
                for (var pos = chunkStart + 1; pos < chunkEnd; pos++)
                {
                    var distribution = model.Eval(cg.GetContext(text, pos - chunkStart));
                    var decision     = model.GetBestOutcome(distribution);

                    probability *= distribution[model.GetIndex(decision)];

                    if (decision == Split)
                    {
                        newTokens.Add(new Span(tokenStart, pos));
                        tokProbs.Add(probability);

                        // Start accumulating the next token from the split point.
                        tokenStart  = pos;
                        probability = 1.0;
                    }
                }

                // Whatever follows the last split (or the whole chunk if none) is the final token.
                newTokens.Add(new Span(tokenStart, chunkEnd));
                tokProbs.Add(probability);
            }

            return newTokens.ToArray();
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Detects the position of the sentences in the specified string.
        /// </summary>
        /// <remarks>
        /// Candidate end-of-sentence positions are obtained from the scanner and each one is
        /// classified by the model. Accepted positions are turned into whitespace-trimmed spans;
        /// spans that are empty after trimming are dropped. On return, the recorded sentence
        /// probabilities contain exactly one entry per returned span, in the same order.
        /// </remarks>
        /// <param name="text">The string to be sentence detected.</param>
        /// <returns>
        /// The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="text"/>) for each
        /// detected sentence as the individual array elements. The array is empty when the text
        /// contains nothing but whitespace.
        /// </returns>
        public Span[] SentPosDetect(string text)
        {
            sentProbs.Clear();

            var enders    = scanner.GetPositions(text);
            var positions = new List<int>(enders.Count);

            for (int i = 0, end = enders.Count, index = 0; i < end; i++)
            {
                var cint = enders[i]; // candidate position

                // skip over the leading parts of non-token final delimiters
                var fws = GetFirstWS(text, cint + 1);

                if (i + 1 < end && enders[i + 1] < fws)
                {
                    continue;
                }

                // skip candidates that lie before the last accepted sentence start
                if (positions.Count > 0 && cint < positions[positions.Count - 1])
                {
                    continue;
                }

                var probs       = model.Eval(cgen.GetContext(text, cint));
                var bestOutcome = model.GetBestOutcome(probs);

                if (bestOutcome == null) // beamSearch can theoretically return a null value.
                {
                    continue;
                }

                if (bestOutcome.Equals(Split) && IsAcceptableBreak(text, index, cint))
                {
                    if (index != cint)
                    {
                        positions.Add(useTokenEnd
                            ? GetFirstNonWS(text, GetFirstWS(text, cint + 1))
                            : GetFirstNonWS(text, cint + 1));

                        // Kept in lock-step with positions: sentProbs[k] belongs to positions[k].
                        sentProbs.Add(probs[model.GetIndex(bestOutcome)]);
                    }
                    index = cint + 1;
                }
            }

            // string does not contain sentence end positions
            if (positions.Count == 0)
            {
                // remove leading and trailing whitespace
                var start = 0;
                var end   = text.Length;

                while (start < text.Length && char.IsWhiteSpace(text[start]))
                {
                    start++;
                }

                while (end > 0 && char.IsWhiteSpace(text[end - 1]))
                {
                    end--;
                }

                if ((end - start) > 0)
                {
                    sentProbs.Add(1d);
                    return new[] { new Span(start, end) };
                }

                return new Span[0];
            }

            // Convert the sentence end indexes to spans. Spans and their probabilities are
            // collected side by side so that dropping an empty span can never leave a hole in the
            // result or shift the probabilities out of alignment.
            var spans     = new List<Span>(positions.Count + 1);
            var spanProbs = new List<double>(positions.Count + 1);

            for (var si = 0; si < positions.Count; si++)
            {
                var start = si == 0 ? 0 : positions[si - 1];

                // A span might contain only white spaces, in this case the length of
                // the span will be zero after trimming and it is ignored.
                var span = new Span(start, positions[si]).Trim(text);
                if (span.Length > 0)
                {
                    var prob = sentProbs[si];
                    spans.Add(new Span(span, prob));
                    spanProbs.Add(prob);
                }
            }

            // Text after the last detected boundary forms a final sentence with probability 1.
            var lastStart = positions[positions.Count - 1];
            if (lastStart != text.Length)
            {
                var span = new Span(lastStart, text.Length).Trim(text);
                if (span.Length > 0)
                {
                    spans.Add(new Span(span, 1d));
                    spanProbs.Add(1d);
                }
            }

            // Publish only the probabilities of the spans that are actually returned.
            sentProbs.Clear();
            foreach (var prob in spanProbs)
            {
                sentProbs.Add(prob);
            }

            return spans.ToArray();
        }