Esempio n. 1
0
        /// <summary>
        /// Builds the training events for the provided sample.
        /// </summary>
        /// <param name="sample">The sample for which training <see cref="T:Event"/>s are created.</param>
        /// <returns>An enumerator over the created events.</returns>
        protected override IEnumerator<Event> CreateEvents(SentenceSample sample)
        {
            var result = new List<Event>();

            foreach (var sentence in sample.Sentences)
            {
                var covered    = sentence.GetCoveredText(sample.Document);
                var candidates = new IteratorAdapter<int>(scanner.GetPositions(covered));

                while (candidates.HasNext())
                {
                    int offset = candidates.Next();

                    // Only the last candidate position reported for a sentence is labelled
                    // as a split; every earlier candidate is labelled as a non-split.
                    string outcome = candidates.HasNext()
                        ? SentenceDetectorME.NO_SPLIT
                        : SentenceDetectorME.SPLIT;

                    // Candidate offsets are relative to the sentence text, so shift them by
                    // the sentence start to address the same character in the document.
                    var context = cg.GetContext(sample.Document, sentence.Start + offset);

                    result.Add(new Event(outcome, context));
                }
            }

            return result.GetEnumerator();
        }
Esempio n. 2
0
        /// <summary>
        /// Detects the position of the sentences in the specified string.
        /// </summary>
        /// <remarks>
        /// As a side effect the <c>sentProbs</c> field is cleared and refilled so that, on return,
        /// it holds exactly one probability per returned span, in the same order. Spans that
        /// consist only of whitespace are dropped together with their probability, so the
        /// returned array never contains unassigned entries.
        /// </remarks>
        /// <param name="text">The string to be sentence detected.</param>
        /// <returns>The <see cref="T:Span[]"/> with the spans (offsets into <paramref name="text"/>) for each detected sentence as the individuals array elements.</returns>
        public Span[] SentPosDetect(string text)
        {
            sentProbs.Clear();

            var enders    = scanner.GetPositions(text);
            var positions = new List<int>(enders.Count);

            // "index" tracks the position right after the last accepted sentence break.
            for (int i = 0, end = enders.Count, index = 0; i < end; i++)
            {
                var cint = enders[i]; // candidate position

                // skip over the leading parts of non-token final delimiters
                var fws = GetFirstWS(text, cint + 1);

                if (i + 1 < end && enders[i + 1] < fws)
                {
                    continue;
                }

                // skip candidates located before the start of the sentence already recorded
                if (positions.Count > 0 && cint < positions[positions.Count - 1])
                {
                    continue;
                }

                var probs       = model.Eval(cgen.GetContext(text, cint));
                var bestOutcome = model.GetBestOutcome(probs);

                if (bestOutcome == null) // beamSearch can theoretically return a null value.
                {
                    continue;
                }

                if (bestOutcome.Equals(Split) && IsAcceptableBreak(text, index, cint))
                {
                    if (index != cint)
                    {
                        positions.Add(useTokenEnd
                            ? GetFirstNonWS(text, GetFirstWS(text, cint + 1))
                            : GetFirstNonWS(text, cint + 1));

                        // Kept in lock-step with "positions": one probability per boundary.
                        sentProbs.Add(probs[model.GetIndex(bestOutcome)]);
                    }
                    index = cint + 1;
                }
            }

            // string does not contain sentence end positions
            if (positions.Count == 0)
            {
                // remove leading and trailing whitespace
                var start = 0;
                var end   = text.Length;

                while (start < text.Length && char.IsWhiteSpace(text[start]))
                {
                    start++;
                }

                while (end > 0 && char.IsWhiteSpace(text[end - 1]))
                {
                    end--;
                }

                if ((end - start) > 0)
                {
                    sentProbs.Add(1d);
                    return new[] { new Span(start, end) };
                }

                return new Span[0];
            }

            // Convert the sentence end indexes to spans. Spans and probabilities are gathered
            // in parallel lists so that dropping a whitespace-only span can neither leave an
            // unassigned slot in the result nor shift the probabilities of the following spans.
            var lastBoundary  = positions[positions.Count - 1];
            var leftover      = lastBoundary != text.Length;
            var detected      = new List<Span>(positions.Count + 1);
            var detectedProbs = new List<double>(positions.Count + 1);

            for (var si = 0; si < positions.Count; si++)
            {
                var start = si == 0 ? 0 : positions[si - 1];

                // A span might contain only white spaces, in this case the length of
                // the span will be zero after trimming and should be ignored.
                var span = new Span(start, positions[si]).Trim(text);
                if (span.Length > 0)
                {
                    var prob = sentProbs[si];
                    detected.Add(new Span(span, prob));
                    detectedProbs.Add(prob);
                }
            }

            if (leftover)
            {
                // Text after the last detected boundary forms a final sentence that was not
                // scored by the model, so it is reported with a probability of 1.
                var span = new Span(lastBoundary, text.Length).Trim(text);
                if (span.Length > 0)
                {
                    detected.Add(new Span(span, 1d));
                    detectedProbs.Add(1d);
                }
            }

            // Publish only the probabilities of the spans that are actually returned.
            sentProbs.Clear();
            foreach (var kept in detectedProbs)
            {
                sentProbs.Add(kept);
            }

            return detected.ToArray();
        }