/// <summary>
/// Collects mentions that were pre-marked on the tokens of a sentence (via
/// <c>CoreAnnotations.MentionTokenAnnotation</c>) into the given mention list,
/// recording each mention's token span in <paramref name="mentionSpanSet"/>.
/// </summary>
/// <param name="s">The sentence to scan for pre-marked mentions.</param>
/// <param name="mentions">Output list that found mentions are appended to.</param>
/// <param name="mentionSpanSet">Output set of (begin, end) token spans for found mentions.</param>
/// <param name="namedEntitySpanSet">Unused here; kept for signature parity with sibling extractors.</param>
protected internal static void ExtractPremarkedEntityMentions(ICoreMap s, IList<Mention> mentions, ICollection<IntPair> mentionSpanSet, ICollection<IntPair> namedEntitySpanSet)
{
    IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    SemanticGraph basicDependency = s.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    SemanticGraph enhancedDependency = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
    if (enhancedDependency == null)
    {
        // No enhanced parse available; fall back to the basic dependencies.
        enhancedDependency = s.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    }
    // 0-based start index of the mention currently being scanned; -1 means no mention is open.
    int mentionStart = -1;
    foreach (CoreLabel token in tokens)
    {
        MultiTokenTag tag = token.Get(typeof(CoreAnnotations.MentionTokenAnnotation));
        if (tag == null)
        {
            // Token is not part of any pre-marked mention.
            continue;
        }
        if (tag.IsStart())
        {
            // IndexAnnotation is 1-based; convert to a 0-based token offset.
            mentionStart = token.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
        }
        if (tag.IsEnd())
        {
            // End is exclusive: the 1-based index of the last token equals the 0-based end bound.
            int mentionEnd = token.Get(typeof(CoreAnnotations.IndexAnnotation));
            if (mentionStart >= 0)
            {
                int dummyMentionId = -1;
                Mention found = new Mention(dummyMentionId, mentionStart, mentionEnd, tokens, basicDependency, enhancedDependency, new List<CoreLabel>(tokens.SubList(mentionStart, mentionEnd)));
                mentions.Add(found);
                mentionSpanSet.Add(new IntPair(mentionStart, mentionEnd));
                mentionStart = -1;
            }
            else
            {
                // Saw an end marker without a matching start; log and skip.
                Redwood.Log("Start of marked mention not found in sentence: " + tag + " at tokenIndex=" + (token.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1) + " for " + s.Get(typeof(CoreAnnotations.TextAnnotation)));
            }
        }
    }
}
/// <summary>
/// Two MultiTokenTag instances are equal iff they are of exactly the same
/// runtime type and have the same <c>index</c> and an equal <c>tag</c>.
/// </summary>
/// <param name="o">Object to compare against; may be null.</param>
/// <returns>True when the tags are equal as defined above.</returns>
public override bool Equals(object o)
{
    if (ReferenceEquals(this, o))
    {
        return true;
    }
    // Exact runtime-type match (not "is"), so subclasses never compare equal.
    if (o == null || GetType() != o.GetType())
    {
        return false;
    }
    MultiTokenTag other = (MultiTokenTag)o;
    return index == other.index && tag.Equals(other.tag);
}
/// <summary>
/// Returns a List of Lists where each element is built from a run
/// of Words in the input Document.
/// </summary>
/// <remarks>
/// Returns a List of Lists where each element is built from a run
/// of Words in the input Document. Specifically, reads through each word in
/// the input document and breaks off a sentence after finding a valid
/// sentence boundary token or end of file.
/// Note that for this to work, the words in the
/// input document must have been tokenized with a tokenizer that makes
/// sentence boundary tokens their own tokens (e.g.,
/// <see cref="PTBTokenizer{T}"/>
/// ).
/// </remarks>
/// <param name="words">A list of already tokenized words (must implement HasWord or be a String).</param>
/// <returns>A list of sentences.</returns>
/// <seealso cref="WordToSentenceProcessor{IN}.WordToSentenceProcessor(string, string, Java.Util.ISet{E}, Java.Util.ISet{E}, string, NewlineIsSentenceBreak, Edu.Stanford.Nlp.Ling.Tokensregex.SequencePattern{T}, Java.Util.ISet{E}, bool, bool)"/>
private IList<IList<In>> WordsToSentences<_T0>(IList<_T0> words)
    where _T0 : IN
{
    // Identity map marking tokens that END a multi-token sentence boundary.
    // Stays null unless sentenceBoundaryMultiTokenPattern is configured.
    IdentityHashMap<object, bool> isSentenceBoundary = null;
    // is null unless used by sentenceBoundaryMultiTokenPattern
    if (sentenceBoundaryMultiTokenPattern != null)
    {
        // Do initial pass using TokensRegex to identify multi token patterns that need to be matched
        // and add the last token of a match to our table of sentence boundary tokens.
        isSentenceBoundary = new IdentityHashMap<object, bool>();
        SequenceMatcher<In> matcher = sentenceBoundaryMultiTokenPattern.GetMatcher(words);
        while (matcher.Find())
        {
            IList<In> nodes = matcher.GroupNodes();
            if (nodes != null && !nodes.IsEmpty())
            {
                // Only the LAST token of the multi-token match acts as the boundary.
                isSentenceBoundary[nodes[nodes.Count - 1]] = true;
            }
        }
    }
    // Split tokens into sentences!!!
    IList<IList<In>> sentences = Generics.NewArrayList();
    IList<In> currentSentence = new List<In>();
    IList<In> lastSentence = null;
    // State carried across tokens:
    //   insideRegion        - between region begin/end patterns (only relevant if begin pattern set)
    //   inWaitForForcedEnd  - a ForcedSentenceUntilEndAnnotation suppresses breaks until a forced end
    //   lastTokenWasNewline - for the TwoConsecutive newline policy
    //   lastSentenceEndForced - whether the previous sentence ended by force (blocks follower-attachment)
    bool insideRegion = false;
    bool inWaitForForcedEnd = false;
    bool lastTokenWasNewline = false;
    bool lastSentenceEndForced = false;
    foreach (IN o in words)
    {
        string word = GetString(o);
        bool forcedEnd = IsForcedEndToken(o);
        // if (DEBUG) { if (forcedEnd) { log.info("Word is " + word + "; marks forced end of sentence [cont.]"); } }
        bool inMultiTokenExpr = false;
        bool discardToken = false;
        if (o is ICoreMap)
        {
            // Hacky stuff to ensure sentence breaks do not happen in certain cases
            ICoreMap cm = (ICoreMap)o;
            if (!forcedEnd)
            {
                // NOTE(review): ported from Java — comparing a bool to null here is
                // vacuous in C# (CS0472); presumably cm.Get originally returned Boolean.
                bool forcedUntilEndValue = cm.Get(typeof(CoreAnnotations.ForcedSentenceUntilEndAnnotation));
                if (forcedUntilEndValue != null && forcedUntilEndValue)
                {
                    // if (DEBUG) { log.info("Word is " + word + "; starting wait for forced end of sentence [cont.]"); }
                    inWaitForForcedEnd = true;
                }
                else
                {
                    MultiTokenTag mt = cm.Get(typeof(CoreAnnotations.MentionTokenAnnotation));
                    if (mt != null && !mt.IsEnd())
                    {
                        // In the middle of a multi token mention, make sure sentence is not ended here
                        // if (DEBUG) { log.info("Word is " + word + "; inside multi-token mention [cont.]"); }
                        inMultiTokenExpr = true;
                    }
                }
            }
        }
        if (tokenPatternsToDiscard != null)
        {
            discardToken = MatchesTokenPatternsToDiscard(word);
        }
        // Before the region begins, tokens are skipped entirely (except for the
        // region-begin token itself, which flips insideRegion on).
        if (sentenceRegionBeginPattern != null && !insideRegion)
        {
            if (sentenceRegionBeginPattern.Matcher(word).Matches())
            {
                insideRegion = true;
            }
            lastTokenWasNewline = false;
            continue;
        }
        // Boundary followers (e.g. closing quotes/brackets) attach to the PREVIOUS
        // sentence, but only if that sentence wasn't force-ended and nothing has
        // started the current sentence yet.
        if (!lastSentenceEndForced && lastSentence != null && currentSentence.IsEmpty() && !lastTokenWasNewline && sentenceBoundaryFollowersPattern.Matcher(word).Matches())
        {
            if (!discardToken)
            {
                lastSentence.Add(o);
            }
            lastTokenWasNewline = false;
            continue;
        }
        bool newSentForced = false;
        bool newSent = false;
        string debugText = (discardToken) ? "discarded" : "added to current";  // used only by the commented-out debug logging
        if (inWaitForForcedEnd && !forcedEnd)
        {
            // Everything up to the forced end belongs to the current sentence.
            if (sentenceBoundaryToDiscard.Contains(word))
            {
                // there can be newlines even in something to keep together
                discardToken = true;
            }
            if (!discardToken)
            {
                currentSentence.Add(o);
            }
        }
        else
        {
            if (inMultiTokenExpr && !forcedEnd)
            {
                // Mid-mention: never break the sentence here.
                if (!discardToken)
                {
                    currentSentence.Add(o);
                }
            }
            else
            {
                if (sentenceBoundaryToDiscard.Contains(word))
                {
                    // Discardable boundary (typically a newline): whether it forces a
                    // break depends on the configured newline policy.
                    if (forcedEnd)
                    {
                        // sentence boundary can easily be forced end
                        inWaitForForcedEnd = false;
                        newSentForced = true;
                    }
                    else
                    {
                        if (newlineIsSentenceBreak == WordToSentenceProcessor.NewlineIsSentenceBreak.Always)
                        {
                            newSentForced = true;
                        }
                        else
                        {
                            if (newlineIsSentenceBreak == WordToSentenceProcessor.NewlineIsSentenceBreak.TwoConsecutive && lastTokenWasNewline)
                            {
                                newSentForced = true;
                            }
                        }
                    }
                    lastTokenWasNewline = true;
                }
                else
                {
                    lastTokenWasNewline = false;
                    bool isb;
                    if (xmlBreakElementsToDiscard != null && MatchesXmlBreakElementToDiscard(word))
                    {
                        // XML break element: force a break and drop the token.
                        newSentForced = true;
                    }
                    else
                    {
                        if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.Matcher(word).Matches())
                        {
                            // Leaving the region also ends the sentence.
                            insideRegion = false;
                            newSentForced = true;
                        }
                        else
                        {
                            // Marked sentence boundaries
                            // NOTE(review): "(isb = ...) != null" is another vacuous
                            // null check on a bool, ported from Java's Boolean lookup.
                            if ((isSentenceBoundary != null) && ((isb = isSentenceBoundary[o]) != null) && isb)
                            {
                                // Token ends a multi-token boundary match: keep it, break after.
                                if (!discardToken)
                                {
                                    currentSentence.Add(o);
                                }
                                newSent = true;
                            }
                            else
                            {
                                if (sentenceBoundaryTokenPattern.Matcher(word).Matches())
                                {
                                    // Ordinary boundary token (e.g. "."): keep it, break after.
                                    if (!discardToken)
                                    {
                                        currentSentence.Add(o);
                                    }
                                    newSent = true;
                                }
                                else
                                {
                                    if (forcedEnd)
                                    {
                                        if (!discardToken)
                                        {
                                            currentSentence.Add(o);
                                        }
                                        inWaitForForcedEnd = false;
                                        newSentForced = true;
                                    }
                                    else
                                    {
                                        // Plain token: just accumulate it.
                                        if (!discardToken)
                                        {
                                            currentSentence.Add(o);
                                        }
                                        // chris added this next test in 2017; a bit weird, but KBP setup doesn't have newline in sentenceBoundary patterns, just in toDiscard
                                        if (AbstractTokenizer.NewlineToken.Equals(word))
                                        {
                                            lastTokenWasNewline = true;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        if ((newSentForced || newSent) && (!currentSentence.IsEmpty() || allowEmptySentences))
        {
            sentences.Add(currentSentence);
            // adds this sentence now that it's complete
            // A forced end propagates backwards through empty sentences so that
            // follower tokens don't attach across a forced break.
            lastSentenceEndForced = ((lastSentence == null || lastSentence.IsEmpty()) && lastSentenceEndForced) || newSentForced;
            lastSentence = currentSentence;
            currentSentence = new List<In>();
        }
        else
        {
            // clears the current sentence
            if (newSentForced)
            {
                lastSentenceEndForced = true;
            }
        }
    }
    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if (!currentSentence.IsEmpty())
    {
        sentences.Add(currentSentence);
    }
    // adds last sentence
    return(sentences);
}