/// <summary>
        /// Runs every triggered pattern over the sequence and returns a set of
        /// matches in which no two matches overlap.
        /// </summary>
        /// <remarks>
        /// All matches of all triggered patterns are collected first. When matches
        /// overlap, the supplied comparator determines the order in which they are
        /// selected. The selected matches are finally sorted with
        /// <c>SequenceMatchResultConstants.OffsetComparator</c>.
        /// </remarks>
        /// <param name="elements">input sequence to match against</param>
        /// <param name="cmp">comparator indicating the order in which overlapping matches should be selected</param>
        /// <returns>list of match results that are non-overlapping</returns>
        public virtual IList <ISequenceMatchResult <T> > FindNonOverlapping <_T0, _T1>(IList <_T0> elements, IComparator <_T1> cmp)
            where _T0 : T
        {
            ICollection<SequencePattern<T> > activePatterns = GetTriggeredPatterns(elements);
            IList<ISequenceMatchResult<T> > candidates = new List<ISequenceMatchResult<T> >();
            int order = 0;

            foreach (SequencePattern<T> seqPattern in activePatterns)
            {
                // Allow interrupting: checked once per pattern, before any matching work.
                if (Thread.Interrupted())
                {
                    throw new RuntimeInterruptedException();
                }

                SequenceMatcher<T> seqMatcher = seqPattern.GetMatcher(elements);
                seqMatcher.SetMatchWithResult(matchWithResult);

                // Each matcher is tagged with the iteration index of its pattern.
                seqMatcher.SetOrder(order);
                order++;

                // Gather every match this pattern produces over the sequence.
                while (seqMatcher.Find())
                {
                    candidates.Add(seqMatcher.ToBasicSequenceMatchResult());
                }
            }

            IList<ISequenceMatchResult<T> > selected = IntervalTree.GetNonOverlapping(candidates, SequenceMatchResultConstants.ToInterval, cmp);
            selected.Sort(SequenceMatchResultConstants.OffsetComparator);
            return selected;
        }
Exemplo n.º 2
0
            /// <summary>
            /// Applies <c>pattern</c> to the elements of the given match result and
            /// returns the matcher positioned on its first match.
            /// </summary>
            /// <param name="seqMatchResult">match result whose elements are matched against <c>pattern</c></param>
            /// <param name="groups">not read by this implementation</param>
            /// <returns>the matcher if a match was found; otherwise <c>null</c></returns>
            public ISequenceMatchResult <T> Apply(ISequenceMatchResult <T> seqMatchResult, params int[] groups)
            {
                SequenceMatcher<T> nested = pattern.GetMatcher(seqMatchResult.Elements());

                // Only hand the matcher back when Find() actually located a match.
                return nested.Find() ? nested : null;
            }
Exemplo n.º 3
0
 /// <summary>
 /// Advances the given match result to its next match, provided the result
 /// is itself a <c>SequenceMatcher</c>.
 /// </summary>
 /// <param name="seqMatchResult">match result to advance; only a <c>SequenceMatcher</c> instance can be advanced</param>
 /// <param name="groups">not read by this implementation</param>
 /// <returns>
 /// the same matcher, positioned on its next match; <c>null</c> if the
 /// argument is not a <c>SequenceMatcher</c> or no further match exists
 /// </returns>
 public ISequenceMatchResult <T> Apply(ISequenceMatchResult <T> seqMatchResult, params int[] groups)
 {
     // A single 'as' conversion replaces the former "is SequenceMatcher" test plus cast:
     // the bare generic name without type arguments is not a valid type in C#, and the
     // test has to agree with the SequenceMatcher<T> the method actually works with.
     SequenceMatcher<T> matcher = seqMatchResult as SequenceMatcher<T>;
     if (matcher == null)
     {
         // Not a matcher (or null): nothing to advance.
         return null;
     }

     return matcher.Find() ? matcher : null;
 }
        /// <summary>
        /// Runs every triggered pattern over the sequence and returns the
        /// non-overlapping matches chosen for the best overall score.
        /// </summary>
        /// <remarks>
        /// All matches of all triggered patterns are collected first. When matches
        /// overlap, the ones kept are selected to give the overall maximum score
        /// according to the supplied scorer. The selected matches are finally sorted
        /// with <c>SequenceMatchResultConstants.OffsetComparator</c>.
        /// </remarks>
        /// <param name="elements">input sequence to match against</param>
        /// <param name="scorer">scorer for scoring each match</param>
        /// <returns>list of match results that are non-overlapping</returns>
        public virtual IList <ISequenceMatchResult <T> > FindNonOverlappingMaxScore <_T0, _T1>(IList <_T0> elements, IToDoubleFunction <_T1> scorer)
            where _T0 : T
        {
            IList<ISequenceMatchResult<T> > candidates = new List<ISequenceMatchResult<T> >();
            int order = 0;

            foreach (SequencePattern<T> seqPattern in GetTriggeredPatterns(elements))
            {
                SequenceMatcher<T> seqMatcher = seqPattern.GetMatcher(elements);
                seqMatcher.SetMatchWithResult(matchWithResult);

                // Each matcher is tagged with the iteration index of its pattern.
                seqMatcher.SetOrder(order);
                order++;

                // Gather every match this pattern produces over the sequence.
                while (seqMatcher.Find())
                {
                    candidates.Add(seqMatcher.ToBasicSequenceMatchResult());
                }
            }

            IList<ISequenceMatchResult<T> > best = IntervalTree.GetNonOverlappingMaxScore(candidates, SequenceMatchResultConstants.ToInterval, scorer);
            best.Sort(SequenceMatchResultConstants.OffsetComparator);
            return best;
        }
        /// <summary>
        /// Splits a list of tokens into sentences, returning one inner list of
        /// tokens per sentence.
        /// </summary>
        /// <remarks>
        /// Reads through each word in the input and breaks off a sentence after
        /// finding a valid sentence boundary token or reaching the end of the input.
        /// Note that for this to work, the words in the
        /// input document must have been tokenized with a tokenizer that makes
        /// sentence boundary tokens their own tokens (e.g.,
        /// <see cref="PTBTokenizer{T}"/>
        /// ).
        /// <para>
        /// If <c>sentenceBoundaryMultiTokenPattern</c> is set, a first pass marks the
        /// last token of every multi-token match as a sentence boundary. The main
        /// pass then walks the tokens once, keeping four pieces of state: whether we
        /// are inside a sentence region, whether we are waiting for a forced sentence
        /// end, whether the previous token was treated as a newline, and whether the
        /// previous sentence break was forced.
        /// </para>
        /// <para>
        /// Tokens flagged for discarding are never added to a sentence, but they can
        /// still trigger a sentence break.
        /// </para>
        /// </remarks>
        /// <param name="words">A list of already tokenized words (must implement HasWord or be a String).</param>
        /// <returns>A list of sentences.</returns>
        /// <seealso cref="WordToSentenceProcessor{IN}.WordToSentenceProcessor(string, string, Java.Util.ISet{E}, Java.Util.ISet{E}, string, NewlineIsSentenceBreak, Edu.Stanford.Nlp.Ling.Tokensregex.SequencePattern{T}, Java.Util.ISet{E}, bool, bool)"/>
        private IList <IList <In> > WordsToSentences <_T0>(IList <_T0> words)
            where _T0 : IN
        {
            // Identity-keyed table of tokens that end a multi-token boundary match.
            IdentityHashMap <object, bool> isSentenceBoundary = null;

            // is null unless used by sentenceBoundaryMultiTokenPattern
            if (sentenceBoundaryMultiTokenPattern != null)
            {
                // Do initial pass using TokensRegex to identify multi token patterns that need to be matched
                // and add the last token of a match to our table of sentence boundary tokens.
                isSentenceBoundary = new IdentityHashMap <object, bool>();
                SequenceMatcher <In> matcher = sentenceBoundaryMultiTokenPattern.GetMatcher(words);
                while (matcher.Find())
                {
                    IList <In> nodes = matcher.GroupNodes();
                    if (nodes != null && !nodes.IsEmpty())
                    {
                        isSentenceBoundary[nodes[nodes.Count - 1]] = true;
                    }
                }
            }
            // Split tokens into sentences!!!
            IList <IList <In> > sentences       = Generics.NewArrayList();
            IList <In>          currentSentence = new List <In>();
            // Most recently completed sentence; boundary-follower tokens are appended to it.
            IList <In>          lastSentence    = null;
            // True once a token matching sentenceRegionBeginPattern has been seen and
            // until a token matching sentenceRegionEndPattern is seen.
            bool insideRegion          = false;
            // True while a token annotated "forced sentence until end" is still waiting for its forced end token.
            bool inWaitForForcedEnd    = false;
            bool lastTokenWasNewline   = false;
            // True when the previous break was forced; suppresses attaching boundary followers to lastSentence.
            bool lastSentenceEndForced = false;

            foreach (IN o in words)
            {
                string word      = GetString(o);
                bool   forcedEnd = IsForcedEndToken(o);
                // if (DEBUG) { if (forcedEnd) { log.info("Word is " + word + "; marks forced end of sentence [cont.]"); } }
                bool inMultiTokenExpr = false;
                bool discardToken     = false;
                if (o is ICoreMap)
                {
                    // Hacky stuff to ensure sentence breaks do not happen in certain cases
                    ICoreMap cm = (ICoreMap)o;
                    if (!forcedEnd)
                    {
                        bool forcedUntilEndValue = cm.Get(typeof(CoreAnnotations.ForcedSentenceUntilEndAnnotation));
                        if (forcedUntilEndValue != null && forcedUntilEndValue)
                        {
                            // if (DEBUG) { log.info("Word is " + word + "; starting wait for forced end of sentence [cont.]"); }
                            inWaitForForcedEnd = true;
                        }
                        else
                        {
                            MultiTokenTag mt = cm.Get(typeof(CoreAnnotations.MentionTokenAnnotation));
                            if (mt != null && !mt.IsEnd())
                            {
                                // In the middle of a multi token mention, make sure sentence is not ended here
                                // if (DEBUG) { log.info("Word is " + word + "; inside multi-token mention [cont.]"); }
                                inMultiTokenExpr = true;
                            }
                        }
                    }
                }
                if (tokenPatternsToDiscard != null)
                {
                    discardToken = MatchesTokenPatternsToDiscard(word);
                }
                // Region handling: while a begin pattern is configured and we are outside a region,
                // every token is skipped. The token that opens the region is skipped as well.
                if (sentenceRegionBeginPattern != null && !insideRegion)
                {
                    if (sentenceRegionBeginPattern.Matcher(word).Matches())
                    {
                        insideRegion = true;
                    }
                    lastTokenWasNewline = false;
                    continue;
                }
                // Boundary followers: a matching token seen right after a non-forced break (nothing yet in
                // the current sentence, previous token not a newline) is attached to the previous sentence.
                // NOTE(review): sentenceBoundaryFollowersPattern is dereferenced without a null check here;
                // presumably it is always set - confirm against the constructor.
                if (!lastSentenceEndForced && lastSentence != null && currentSentence.IsEmpty() && !lastTokenWasNewline && sentenceBoundaryFollowersPattern.Matcher(word).Matches())
                {
                    if (!discardToken)
                    {
                        lastSentence.Add(o);
                    }
                    lastTokenWasNewline = false;
                    continue;
                }
                // newSentForced: break requested by a forced end, discarded boundary, XML break element or region end.
                // newSent: break requested by an ordinary sentence boundary token.
                bool   newSentForced = false;
                bool   newSent       = false;
                // NOTE(review): debugText is never read in this method; presumably a leftover from
                // debug logging that is no longer present.
                string debugText     = (discardToken) ? "discarded" : "added to current";
                if (inWaitForForcedEnd && !forcedEnd)
                {
                    // Waiting for a forced end: keep accumulating tokens and never break here.
                    if (sentenceBoundaryToDiscard.Contains(word))
                    {
                        // there can be newlines even in something to keep together
                        discardToken = true;
                    }
                    if (!discardToken)
                    {
                        currentSentence.Add(o);
                    }
                }
                else
                {
                    if (inMultiTokenExpr && !forcedEnd)
                    {
                        // Inside a multi-token mention: add the token but do not break.
                        if (!discardToken)
                        {
                            currentSentence.Add(o);
                        }
                    }
                    else
                    {
                        // Tokens in sentenceBoundaryToDiscard are never added to a sentence; they can only
                        // force a break, and they set lastTokenWasNewline.
                        if (sentenceBoundaryToDiscard.Contains(word))
                        {
                            if (forcedEnd)
                            {
                                // sentence boundary can easily be forced end
                                inWaitForForcedEnd = false;
                                newSentForced      = true;
                            }
                            else
                            {
                                // Break policy: Always breaks on every such token; TwoConsecutive breaks only
                                // when the previous token also set lastTokenWasNewline.
                                if (newlineIsSentenceBreak == WordToSentenceProcessor.NewlineIsSentenceBreak.Always)
                                {
                                    newSentForced = true;
                                }
                                else
                                {
                                    if (newlineIsSentenceBreak == WordToSentenceProcessor.NewlineIsSentenceBreak.TwoConsecutive && lastTokenWasNewline)
                                    {
                                        newSentForced = true;
                                    }
                                }
                            }
                            lastTokenWasNewline = true;
                        }
                        else
                        {
                            lastTokenWasNewline = false;
                            bool isb;
                            // XML break elements force a break and are not added to any sentence.
                            if (xmlBreakElementsToDiscard != null && MatchesXmlBreakElementToDiscard(word))
                            {
                                newSentForced = true;
                            }
                            else
                            {
                                // A region end token closes the region, forces a break and is not added.
                                if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.Matcher(word).Matches())
                                {
                                    insideRegion  = false;
                                    newSentForced = true;
                                }
                                else
                                {
                                    // Marked sentence boundaries
                                    // (tokens recorded by the multi-token pre-pass; the token stays in the sentence it ends)
                                    if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary[o]) != null) && isb)
                                    {
                                        if (!discardToken)
                                        {
                                            currentSentence.Add(o);
                                        }
                                        newSent = true;
                                    }
                                    else
                                    {
                                        // Ordinary single-token sentence boundary: keep the token, then break.
                                        if (sentenceBoundaryTokenPattern.Matcher(word).Matches())
                                        {
                                            if (!discardToken)
                                            {
                                                currentSentence.Add(o);
                                            }
                                            newSent = true;
                                        }
                                        else
                                        {
                                            if (forcedEnd)
                                            {
                                                // Forced end token: keep it, stop waiting, and force a break.
                                                if (!discardToken)
                                                {
                                                    currentSentence.Add(o);
                                                }
                                                inWaitForForcedEnd = false;
                                                newSentForced      = true;
                                            }
                                            else
                                            {
                                                // Plain token: just append it to the current sentence.
                                                if (!discardToken)
                                                {
                                                    currentSentence.Add(o);
                                                }
                                                // chris added this next test in 2017; a bit weird, but KBP setup doesn't have newline in sentenceBoundary patterns, just in toDiscard
                                                if (AbstractTokenizer.NewlineToken.Equals(word))
                                                {
                                                    lastTokenWasNewline = true;
                                                }
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                // Emit the current sentence when a break was requested and there is something to emit
                // (or empty sentences are allowed).
                if ((newSentForced || newSent) && (!currentSentence.IsEmpty() || allowEmptySentences))
                {
                    sentences.Add(currentSentence);
                    // adds this sentence now that it's complete
                    // The new break counts as forced if it was forced itself, or if an earlier forced flag
                    // is still set and there is no non-empty previous sentence.
                    lastSentenceEndForced = ((lastSentence == null || lastSentence.IsEmpty()) && lastSentenceEndForced) || newSentForced;
                    lastSentence          = currentSentence;
                    // start a fresh list for the next sentence
                    currentSentence       = new List <In>();
                }
                else
                {
                    // No sentence was emitted. Still remember a forced break so that a following
                    // boundary-follower token is not attached to the previous sentence.
                    if (newSentForced)
                    {
                        lastSentenceEndForced = true;
                    }
                }
            }
            // add any words at the end, even if there isn't a sentence
            // terminator at the end of file
            if (!currentSentence.IsEmpty())
            {
                sentences.Add(currentSentence);
            }
            return(sentences);
        }