/// <summary>
/// Applies every triggered pattern over the input sequence and returns the
/// set of non-overlapping matches.
/// </summary>
/// <remarks>
/// All matches from all triggered patterns are gathered first; when matches
/// overlap, the supplied comparator decides which one wins. The surviving
/// matches are returned sorted by offset.
/// </remarks>
/// <param name="elements">input sequence to match against</param>
/// <param name="cmp">comparator indicating order that overlapped sequences should be selected.</param>
/// <returns>list of match results that are non-overlapping</returns>
public virtual IList<ISequenceMatchResult<T>> FindNonOverlapping<_T0, _T1>(IList<_T0> elements, IComparator<_T1> cmp)
    where _T0 : T
{
    ICollection<SequencePattern<T>> triggered = GetTriggeredPatterns(elements);
    IList<ISequenceMatchResult<T>> candidates = new List<ISequenceMatchResult<T>>();
    int patternOrder = 0;
    foreach (SequencePattern<T> pattern in triggered)
    {
        // Honor thread interruption between patterns — matching can be slow.
        if (Thread.Interrupted())
        {
            throw new RuntimeInterruptedException();
        }
        SequenceMatcher<T> matcher = pattern.GetMatcher(elements);
        matcher.SetMatchWithResult(matchWithResult);
        matcher.SetOrder(patternOrder);
        patternOrder++;
        // Collect every match this pattern produces; overlap resolution happens below.
        while (matcher.Find())
        {
            candidates.Add(matcher.ToBasicSequenceMatchResult());
        }
    }
    IList<ISequenceMatchResult<T>> nonOverlapping =
        IntervalTree.GetNonOverlapping(candidates, SequenceMatchResultConstants.ToInterval, cmp);
    nonOverlapping.Sort(SequenceMatchResultConstants.OffsetComparator);
    return nonOverlapping;
}
/// <summary>
/// Runs this instance's pattern over the matched elements of the given result
/// and returns the first match found, or <c>null</c> when there is none.
/// </summary>
/// <param name="seqMatchResult">previous match result supplying the element sequence</param>
/// <param name="groups">group indices (unused by this implementation)</param>
/// <returns>the matcher positioned at the first match, or <c>null</c> if no match</returns>
public ISequenceMatchResult<T> Apply(ISequenceMatchResult<T> seqMatchResult, params int[] groups)
{
    SequenceMatcher<T> matcher = pattern.GetMatcher(seqMatchResult.Elements());
    return matcher.Find() ? matcher : null;
}
/// <summary>
/// If the given result is itself a <see cref="SequenceMatcher{T}"/>, advances it
/// to its next match and returns it; otherwise returns <c>null</c>.
/// </summary>
/// <param name="seqMatchResult">previous match result, expected to be a <see cref="SequenceMatcher{T}"/></param>
/// <param name="groups">group indices (unused by this implementation)</param>
/// <returns>the matcher positioned at the next match, or <c>null</c> when the input
/// is not a <see cref="SequenceMatcher{T}"/> or no further match exists</returns>
public ISequenceMatchResult<T> Apply(ISequenceMatchResult<T> seqMatchResult, params int[] groups)
{
    // Type pattern replaces the old non-generic `is SequenceMatcher` test followed
    // by a cast to SequenceMatcher<T>, which could throw InvalidCastException for a
    // SequenceMatcher instantiated over a different type argument.
    if (seqMatchResult is SequenceMatcher<T> matcher && matcher.Find())
    {
        return matcher;
    }
    return null;
}
/// <summary>
/// Given a sequence, applies our patterns over the sequence and returns
/// all non overlapping matches.
/// </summary>
/// <remarks>
/// Given a sequence, applies our patterns over the sequence and returns
/// all non overlapping matches. When multiple patterns overlaps,
/// matched patterns are selected to give the overall maximum score.
/// </remarks>
/// <param name="elements">input sequence to match against</param>
/// <param name="scorer">scorer for scoring each match</param>
/// <returns>list of match results that are non-overlapping</returns>
public virtual IList<ISequenceMatchResult<T>> FindNonOverlappingMaxScore<_T0, _T1>(IList<_T0> elements, IToDoubleFunction<_T1> scorer)
    where _T0 : T
{
    ICollection<SequencePattern<T>> triggered = GetTriggeredPatterns(elements);
    IList<ISequenceMatchResult<T>> all = new List<ISequenceMatchResult<T>>();
    int i = 0;
    foreach (SequencePattern<T> p in triggered)
    {
        // Allow interrupting — added for consistency with FindNonOverlapping,
        // which performs the same check inside its pattern loop.
        if (Thread.Interrupted())
        {
            throw new RuntimeInterruptedException();
        }
        SequenceMatcher<T> m = p.GetMatcher(elements);
        m.SetMatchWithResult(matchWithResult);
        m.SetOrder(i);
        // Gather every match; overlaps are resolved by maximum total score below.
        while (m.Find())
        {
            all.Add(m.ToBasicSequenceMatchResult());
        }
        i++;
    }
    IList<ISequenceMatchResult<T>> res =
        IntervalTree.GetNonOverlappingMaxScore(all, SequenceMatchResultConstants.ToInterval, scorer);
    res.Sort(SequenceMatchResultConstants.OffsetComparator);
    return res;
}
/// <summary>
/// Returns a List of Lists where each element is built from a run
/// of Words in the input Document.
/// </summary>
/// <remarks>
/// Returns a List of Lists where each element is built from a run
/// of Words in the input Document. Specifically, reads through each word in
/// the input document and breaks off a sentence after finding a valid
/// sentence boundary token or end of file.
/// Note that for this to work, the words in the
/// input document must have been tokenized with a tokenizer that makes
/// sentence boundary tokens their own tokens (e.g.,
/// <see cref="PTBTokenizer{T}"/>
/// ).
/// </remarks>
/// <param name="words">A list of already tokenized words (must implement HasWord or be a String).</param>
/// <returns>A list of sentences.</returns>
/// <seealso cref="WordToSentenceProcessor{IN}.WordToSentenceProcessor(string, string, Java.Util.ISet{E}, Java.Util.ISet{E}, string, NewlineIsSentenceBreak, Edu.Stanford.Nlp.Ling.Tokensregex.SequencePattern{T}, Java.Util.ISet{E}, bool, bool)"/>
private IList<IList<In>> WordsToSentences<_T0>(IList<_T0> words)
    where _T0 : IN
{
    // Identity-keyed table of tokens that end a multi-token boundary match.
    // Stays null unless sentenceBoundaryMultiTokenPattern is configured.
    IdentityHashMap<object, bool> isSentenceBoundary = null;
    if (sentenceBoundaryMultiTokenPattern != null)
    {
        // Do initial pass using TokensRegex to identify multi token patterns that need to be matched
        // and add the last token of a match to our table of sentence boundary tokens.
        isSentenceBoundary = new IdentityHashMap<object, bool>();
        SequenceMatcher<In> matcher = sentenceBoundaryMultiTokenPattern.GetMatcher(words);
        while (matcher.Find())
        {
            IList<In> nodes = matcher.GroupNodes();
            if (nodes != null && !nodes.IsEmpty())
            {
                // Only the final token of the matched span marks the boundary.
                isSentenceBoundary[nodes[nodes.Count - 1]] = true;
            }
        }
    }
    // Split tokens into sentences!!!
    IList<IList<In>> sentences = Generics.NewArrayList();
    IList<In> currentSentence = new List<In>();
    IList<In> lastSentence = null;
    bool insideRegion = false;          // inside a sentenceRegionBeginPattern..EndPattern region
    bool inWaitForForcedEnd = false;    // suppressing breaks until a forced-end token arrives
    bool lastTokenWasNewline = false;
    bool lastSentenceEndForced = false;
    foreach (IN o in words)
    {
        string word = GetString(o);
        bool forcedEnd = IsForcedEndToken(o);
        // if (DEBUG) { if (forcedEnd) { log.info("Word is " + word + "; marks forced end of sentence [cont.]"); } }
        bool inMultiTokenExpr = false;
        bool discardToken = false;
        if (o is ICoreMap)
        {
            // Hacky stuff to ensure sentence breaks do not happen in certain cases
            ICoreMap cm = (ICoreMap)o;
            if (!forcedEnd)
            {
                // NOTE(review): Sharpen-converted Java Boolean — the null comparison on a
                // C# bool is an artifact of the conversion; preserved as-is.
                bool forcedUntilEndValue = cm.Get(typeof(CoreAnnotations.ForcedSentenceUntilEndAnnotation));
                if (forcedUntilEndValue != null && forcedUntilEndValue)
                {
                    // if (DEBUG) { log.info("Word is " + word + "; starting wait for forced end of sentence [cont.]"); }
                    inWaitForForcedEnd = true;
                }
                else
                {
                    MultiTokenTag mt = cm.Get(typeof(CoreAnnotations.MentionTokenAnnotation));
                    if (mt != null && !mt.IsEnd())
                    {
                        // In the middle of a multi token mention, make sure sentence is not ended here
                        // if (DEBUG) { log.info("Word is " + word + "; inside multi-token mention [cont.]"); }
                        inMultiTokenExpr = true;
                    }
                }
            }
        }
        if (tokenPatternsToDiscard != null)
        {
            discardToken = MatchesTokenPatternsToDiscard(word);
        }
        // Tokens before the region-begin marker are skipped entirely.
        if (sentenceRegionBeginPattern != null && !insideRegion)
        {
            if (sentenceRegionBeginPattern.Matcher(word).Matches())
            {
                insideRegion = true;
            }
            lastTokenWasNewline = false;
            continue;
        }
        // A "follower" (e.g. closing quote/bracket) right after a non-forced break
        // is appended to the previous sentence rather than starting a new one.
        if (!lastSentenceEndForced && lastSentence != null && currentSentence.IsEmpty() && !lastTokenWasNewline && sentenceBoundaryFollowersPattern.Matcher(word).Matches())
        {
            if (!discardToken)
            {
                lastSentence.Add(o);
            }
            lastTokenWasNewline = false;
            continue;
        }
        bool newSentForced = false;
        bool newSent = false;
        // Leftover debug scaffolding — assigned but never read.
        string debugText = (discardToken) ? "discarded" : "added to current";
        if (inWaitForForcedEnd && !forcedEnd)
        {
            if (sentenceBoundaryToDiscard.Contains(word))
            {
                // there can be newlines even in something to keep together
                discardToken = true;
            }
            if (!discardToken)
            {
                currentSentence.Add(o);
            }
        }
        else
        {
            if (inMultiTokenExpr && !forcedEnd)
            {
                // Never break inside a multi-token mention; just accumulate.
                if (!discardToken)
                {
                    currentSentence.Add(o);
                }
            }
            else
            {
                if (sentenceBoundaryToDiscard.Contains(word))
                {
                    // Discardable boundary (typically a newline token).
                    if (forcedEnd)
                    {
                        // sentence boundary can easily be forced end
                        inWaitForForcedEnd = false;
                        newSentForced = true;
                    }
                    else
                    {
                        if (newlineIsSentenceBreak == WordToSentenceProcessor.NewlineIsSentenceBreak.Always)
                        {
                            newSentForced = true;
                        }
                        else
                        {
                            if (newlineIsSentenceBreak == WordToSentenceProcessor.NewlineIsSentenceBreak.TwoConsecutive && lastTokenWasNewline)
                            {
                                newSentForced = true;
                            }
                        }
                    }
                    lastTokenWasNewline = true;
                }
                else
                {
                    lastTokenWasNewline = false;
                    // NOTE(review): converted-Java Boolean artifact; the != null test on a
                    // C# bool is preserved from the original.
                    bool isb;
                    if (xmlBreakElementsToDiscard != null && MatchesXmlBreakElementToDiscard(word))
                    {
                        newSentForced = true;
                    }
                    else
                    {
                        if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.Matcher(word).Matches())
                        {
                            insideRegion = false;
                            newSentForced = true;
                        }
                        else
                        {
                            // Marked sentence boundaries
                            if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary[o]) != null) && isb)
                            {
                                // Last token of a multi-token boundary match: keep it, then break.
                                if (!discardToken)
                                {
                                    currentSentence.Add(o);
                                }
                                newSent = true;
                            }
                            else
                            {
                                if (sentenceBoundaryTokenPattern.Matcher(word).Matches())
                                {
                                    // Ordinary boundary token (e.g. "."): keep it, then break.
                                    if (!discardToken)
                                    {
                                        currentSentence.Add(o);
                                    }
                                    newSent = true;
                                }
                                else
                                {
                                    if (forcedEnd)
                                    {
                                        if (!discardToken)
                                        {
                                            currentSentence.Add(o);
                                        }
                                        inWaitForForcedEnd = false;
                                        newSentForced = true;
                                    }
                                    else
                                    {
                                        if (!discardToken)
                                        {
                                            currentSentence.Add(o);
                                        }
                                        // chris added this next test in 2017; a bit weird, but KBP setup doesn't have newline in sentenceBoundary patterns, just in toDiscard
                                        if (AbstractTokenizer.NewlineToken.Equals(word))
                                        {
                                            lastTokenWasNewline = true;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        if ((newSentForced || newSent) && (!currentSentence.IsEmpty() || allowEmptySentences))
        {
            sentences.Add(currentSentence);
            // adds this sentence now that it's complete
            lastSentenceEndForced = ((lastSentence == null || lastSentence.IsEmpty()) && lastSentenceEndForced) || newSentForced;
            lastSentence = currentSentence;
            currentSentence = new List<In>();
            // clears the current sentence
        }
        else
        {
            if (newSentForced)
            {
                lastSentenceEndForced = true;
            }
        }
    }
    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if (!currentSentence.IsEmpty())
    {
        sentences.Add(currentSentence);
    }
    // adds last sentence
    return(sentences);
}