/// <summary>Copy constructor.</summary>
/// <param name="other">The ArrayCoreMap to copy. It may not be null.</param>
public ArrayCoreMap(ICoreMap other)
{
    /*Set<Class<?>>*/
    var sourceKeys = other.KeySet();
    psize = sourceKeys.Count;
    keys = new Type[psize];
    values = new Object[psize];
    // Copy every key/value pair of the source map into the parallel arrays.
    int idx = 0;
    foreach (var k in sourceKeys)
    {
        keys[idx] = k;
        values[idx] = other.Get(k);
        idx++;
    }
}
// Regression test: the KBP Spanish ingestion pipeline (tokenize, cleanxml, ssplit)
// must split the kbpSpanishDocument fixture into the expected sentences.
public virtual void TestKbpSpanishWorks()
{
    // Pipeline configuration: Spanish tokenization with NL tokens preserved,
    // two-newline sentence breaks, a multi-token boundary regex
    // (NOTE(review): presumably a dateline pattern like "Ciudad, 12 mayo (Agencia) --"
    // — confirm against the ingester), and XML cleanup for forum-post structure.
    Properties props = PropertiesUtils.AsProperties(
        "annotators", "tokenize, cleanxml, ssplit",
        "tokenize.language", "es",
        "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
        "ssplit.newlineIsSentenceBreak", "two",
        "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
        "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
        "clean.xmltags", "headline|text|post",
        "clean.singlesentencetags", "HEADLINE|AUTHOR",
        "clean.sentenceendingtags", "TEXT|POST|QUOTE",
        "clean.turntags", "POST|QUOTE",
        "clean.speakertags", "AUTHOR",
        "clean.datetags", "DATE_TIME",
        "clean.doctypetags", "DOC",
        "clean.docAnnotations", "docID=doc[id]",
        "clean.sectiontags", "HEADLINE|POST",
        "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
        "clean.quotetags", "quote",
        "clean.quoteauthorattributes", "orig_author",
        "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document1 = new Annotation(kbpSpanishDocument);
    pipeline.Annotate(document1);
    IList<ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));
    // Compare each produced sentence against the expected gold sentence text.
    for (int i = 0; i < Math.Min(kbpSpanishSentences.Length, sentences.Count); i++)
    {
        ICoreMap sentence = sentences[i];
        string sentenceText = SentenceUtils.ListToString(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        NUnit.Framework.Assert.AreEqual(kbpSpanishSentences[i], sentenceText, "Bad sentence #" + i);
    }
    // Checked last so mismatched sentence contents are reported before a count mismatch.
    NUnit.Framework.Assert.AreEqual(kbpSpanishSentences.Length, sentences.Count, "Bad total number of sentences");
}
/// <summary>
/// Resolves a pronoun (keyed by its character offset into the coreference map)
/// to a Person, rejecting candidates that are themselves named inside the quote.
/// Returns null when no map is available or no acceptable candidate is found.
/// </summary>
public virtual Person DoCoreference(int corefMapKey, ICoreMap quote)
{
    if (pronounCorefMap == null)
    {
        return null;
    }
    // Names mentioned within the quote cannot be its speaker.
    ICollection<Person> quoteNames = quote == null
        ? new HashSet<Person>()
        : GetNamesInParagraph(quote);
    string referent = pronounCorefMap[corefMapKey];
    Person candidate = ResolveAmbiguities(referent);
    if (candidate == null || quoteNames.Contains(candidate))
    {
        return null;
    }
    return candidate;
}
/// <summary>
/// Scans the first sentence's basic dependency graph for noun/pronoun heads and
/// collects the maximal NP chunks rooted at them as candidate subject spans,
/// ensuring no token is claimed by more than one subject span.
/// </summary>
/// <param name="id">Document id (unused in the body; part of the callback signature).</param>
/// <param name="doc">Annotation whose first sentence is processed.</param>
public virtual void Process(long id, Annotation doc)
{
    ICoreMap sentence = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[0];
    SemanticGraph depparse = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));
    log.Info("| " + sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
    // Get all valid subject spans
    BitSet consumedAsSubjects = new BitSet();
    IList<Span> subjectSpans = new List<Span>();
    foreach (IndexedWord head in depparse.TopologicalSort())
    {
        // Only noun/pronoun heads can anchor a subject NP chunk.
        if (!head.Tag().StartsWith("N") && !head.Tag().Equals("PRP"))
        {
            continue;
        }
        // Try to get the NP chunk
        Optional<IList<IndexedWord>> subjectChunk = segmenter.GetValidChunk(depparse, head, segmenter.ValidSubjectArcs, Optional.Empty(), true);
        if (!subjectChunk.IsPresent())
        {
            continue;
        }
        // Make sure it's not already a member of a larger NP.
        // BUGFIX: the original jumped to "NEXTNODE_continue", a label that was never
        // declared (only "NEXTNODE_break" existed), so the skip never compiled/worked;
        // a boolean flag expresses the intended "continue outer loop".
        bool alreadyConsumed = false;
        foreach (IndexedWord tok in subjectChunk.Get())
        {
            if (consumedAsSubjects.Get(tok.Index()))
            {
                alreadyConsumed = true;
                break;
            }
        }
        if (alreadyConsumed)
        {
            // Already considered. Continue to the next node.
            continue;
        }
        // Register it as an NP
        foreach (IndexedWord tok in subjectChunk.Get())
        {
            consumedAsSubjects.Set(tok.Index());
        }
        // Add it as a subject
        subjectSpans.Add(ToSpan(subjectChunk.Get()));
    }
}
/// <summary>
/// Converts NamedEntityTagAnnotation tags into
/// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>
/// s. This
/// finds the longest sequence of NamedEntityTagAnnotation tags of the matching
/// type.
/// </summary>
/// <param name="sentence">A sentence annotated with NamedEntityTagAnnotation</param>
public virtual void MakeAnnotationFromAllNERTags(ICoreMap sentence)
{
    IList<CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    IList<EntityMention> mentions = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
    System.Diagnostics.Debug.Assert(words != null);
    if (mentions == null)
    {
        this.logger.Info("mentions are null");
        mentions = new List<EntityMention>();
    }
    for (int start = 0; start < words.Count; start++)
    {
        int end;
        // find the first token after start that isn't of nerType
        string lastneTag = null;
        string ne = null;
        for (end = start; end < words.Count; end++)
        {
            ne = words[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
            // Stop at the background symbol or at a change of NER type.
            if (ne.Equals(SeqClassifierFlags.DefaultBackgroundSymbol) || (lastneTag != null && !ne.Equals(lastneTag)))
            {
                break;
            }
            lastneTag = ne;
        }
        // end > start means at least one non-background token was consumed,
        // so [start, end) is a maximal run of a single NER type.
        if (end > start)
        {
            // found a match!
            string entityType = this.GetEntityTypeForTag(lastneTag);
            EntityMention m = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
            //TODO: changed entityType in the above sentence to nerTag - Sonal
            logger.Info("Created " + entityType + " entity mention: " + m);
            // Resume scanning at "end" (the loop's start++ brings start from end-1 to end).
            start = end - 1;
            mentions.Add(m);
        }
    }
    sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), mentions);
}
/// <summary>
/// Runs relation extraction over the sentence, logs any positive (non-unrelated)
/// relations, and stores the full prediction list on the sentence.
/// </summary>
public virtual void AnnotateSentence(ICoreMap sentence)
{
    // Collect every relation mention predicted by the extractor.
    // extractAllRelations creates new objects for every predicted relation;
    // all of them are kept — potentially useful for a joint model.
    // if (! RelationMention.isUnrelatedLabel(rel.getType()))
    IList<RelationMention> predicted = new List<RelationMention>();
    foreach (RelationMention mention in ExtractAllRelations(sentence))
    {
        predicted.Add(mention);
    }
    foreach (RelationMention mention in predicted)
    {
        if (!mention.GetType().Equals(RelationMention.Unrelated))
        {
            logger.Fine("Found positive relation in annotateSentence: " + mention);
        }
    }
    // caution: this removes the old list of relation mentions!
    sentence.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), predicted);
}
/// <summary>
/// Runs the machine-reading annotator on the document and transfers the
/// extracted entity and relation mentions back onto the original sentences.
/// </summary>
public virtual void Annotate(Annotation annotation)
{
    // extract entities and relations
    Annotation output = mr.Annotate(annotation);
    // transfer entities/relations back to the original annotation
    IList<ICoreMap> outputSentences = output.Get(typeof(CoreAnnotations.SentencesAnnotation));
    IList<ICoreMap> origSentences = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
    for (int sentIdx = 0; sentIdx < outputSentences.Count; sentIdx++)
    {
        ICoreMap src = outputSentences[sentIdx];
        ICoreMap dst = origSentences[sentIdx];
        // copy entity mentions
        IList<EntityMention> entities = src.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
        dst.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), entities);
        if (verbose && entities != null)
        {
            log.Info("Extracted the following entities:");
            foreach (EntityMention e in entities)
            {
                log.Info("\t" + e);
            }
        }
        // copy relation mentions (only positive ones are logged)
        IList<RelationMention> relations = src.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
        dst.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), relations);
        if (verbose && relations != null)
        {
            log.Info("Extracted the following relations:");
            foreach (RelationMention r in relations)
            {
                if (!r.GetType().Equals(RelationMention.Unrelated))
                {
                    log.Info(r);
                }
            }
        }
    }
}
/// <summary>
/// For each ordered pair of distinct entity mentions of the same type whose head
/// spans are directly adjacent (first ends where second starts), increments the
/// per-type adjacency counter.
/// </summary>
private void CountAdjacentMentions(ICoreMap sent)
{
    IList<EntityMention> mentions = sent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
    if (mentions == null)
    {
        return;
    }
    foreach (EntityMention left in mentions)
    {
        foreach (EntityMention right in mentions)
        {
            if (left == right)
            {
                continue; // skip self-pairs
            }
            bool adjacent = left.GetHeadTokenEnd() == right.GetHeadTokenStart();
            if (adjacent && left.GetType().Equals(right.GetType()))
            {
                adjacentEntityMentions.IncrementCount(left.GetType());
            }
        }
    }
}
/// <summary>
/// Annotates a document with time expressions by shelling out to an external
/// HeidelTime jar: writes the document to a temp file, invokes HeidelTime with
/// the document's publication date (when available), parses its output back into
/// timex core maps, and stores them on the document. Errors are logged, not thrown.
/// </summary>
/// <exception cref="System.IO.IOException"/>
public virtual void Annotate(ICoreMap document)
{
    try
    {
        //--Create Input File
        //(create file)
        File inputFile = File.CreateTempFile("heideltime", ".input");
        //(write to file)
        PrintWriter inputWriter = new PrintWriter(inputFile);
        PrepareHeidelTimeInput(inputWriter, document);
        inputWriter.Close();
        Optional<string> pubDate = GetPubDate(document);
        //--Build Command
        IList<string> args = new List<string>(Arrays.AsList("java", "-jar", this.heideltimePath.GetPath() + "/heideltime.jar", "-c", this.heideltimePath.GetPath() + "/config.props", "-l", this.language, "-t", "NEWS"));
        if (pubDate.IsPresent())
        {
            // Pass the document creation time so relative dates can be resolved.
            args.Add("-dct");
            args.Add(pubDate.Get());
        }
        args.Add(inputFile.GetPath());
        // run HeidelTime on the input file
        ProcessBuilder process = new ProcessBuilder(args);
        StringWriter outputWriter = new StringWriter();
        SystemUtils.Run(process, outputWriter, null);
        string output = outputWriter.GetBuffer().ToString();
        // Convert HeidelTime's textual output into timex annotations.
        IList<ICoreMap> timexAnns = outputReader.Process(document, output);
        document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
        if (outputResults)
        {
            System.Console.Out.WriteLine(timexAnns);
        }
    }
    catch (Exception e)
    {
        // Best-effort: a HeidelTime failure must not abort the pipeline.
        Sharpen.Runtime.PrintStackTrace(e, System.Console.Error);
        System.Console.Error.WriteLine("error running HeidelTime on this doc: " + document.Get(typeof(CoreAnnotations.DocIDAnnotation)));
    }
}
/// <summary>Print a description of this triple, formatted like the ReVerb outputs.</summary>
/// <param name="docid">Document id, or null (rendered as "no_doc_id").</param>
/// <param name="sentence">The source sentence whose tokens are included in the output.</param>
/// <returns>A tab-separated ReVerb-style line for this triple.</returns>
public virtual string ToReverbString(string docid, ICoreMap sentence)
{
    int sentIndex = -1;
    int subjIndex = -1;
    int relationIndex = -1;
    int objIndex = -1;
    int subjIndexEnd = -1;
    int relationIndexEnd = -1;
    int objIndexEnd = -1;
    if (!relation.IsEmpty())
    {
        sentIndex = relation[0].SentIndex();
        relationIndex = relation[0].Index() - 1;
        relationIndexEnd = relation[relation.Count - 1].Index();
    }
    if (!subject.IsEmpty())
    {
        if (sentIndex < 0)
        {
            sentIndex = subject[0].SentIndex();
        }
        subjIndex = subject[0].Index() - 1;
        subjIndexEnd = subject[subject.Count - 1].Index();
    }
    // BUGFIX: this condition had been corrupted to "[email protected]()" and the
    // fallback sentence index was taken from subject instead of @object.
    if (!@object.IsEmpty())
    {
        if (sentIndex < 0)
        {
            sentIndex = @object[0].SentIndex();
        }
        objIndex = @object[0].Index() - 1;
        objIndexEnd = @object[@object.Count - 1].Index();
    }
    // NOTE(review): the ".Map(null)" calls are artifacts of the Java->C# port
    // (the original passed token-extraction lambdas) — confirm against the Java source.
    return (docid == null ? "no_doc_id" : docid) + '\t' + sentIndex + '\t' + SubjectGloss().Replace('\t', ' ') + '\t' + RelationGloss().Replace('\t', ' ') + '\t' + ObjectGloss().Replace('\t', ' ') + '\t' + subjIndex + '\t' + subjIndexEnd + '\t' + relationIndex + '\t' + relationIndexEnd + '\t' + objIndex + '\t' + objIndexEnd + '\t' + ConfidenceGloss() + '\t' + StringUtils.Join(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null), " ") + '\t' + StringUtils.Join(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null), " ") + '\t' + SubjectLemmaGloss().Replace('\t', ' ') + '\t' + RelationLemmaGloss().Replace('\t', ' ') + '\t' + ObjectLemmaGloss().Replace('\t', ' ');
}
/// <summary>
/// Creates Mention objects for spans pre-marked via MentionTokenAnnotation
/// (multi-token tags with explicit start/end markers) and records their spans.
/// </summary>
/// <param name="s">The sentence to scan.</param>
/// <param name="mentions">Output list; newly built mentions are appended.</param>
/// <param name="mentionSpanSet">Output set; spans of added mentions are recorded.</param>
/// <param name="namedEntitySpanSet">Unused in this body (kept for signature parity with sibling extractors).</param>
protected internal static void ExtractPremarkedEntityMentions(ICoreMap s, IList<Mention> mentions, ICollection<IntPair> mentionSpanSet, ICollection<IntPair> namedEntitySpanSet)
{
    IList<CoreLabel> sent = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    SemanticGraph dependency = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));
    // beginIndex < 0 means "not currently inside a marked mention".
    int beginIndex = -1;
    foreach (CoreLabel w in sent)
    {
        MultiTokenTag t = w.Get(typeof(CoreAnnotations.MentionTokenAnnotation));
        if (t != null)
        {
            // Part of a mention
            if (t.IsStart())
            {
                // Start of mention: token IndexAnnotation is 1-based, spans are 0-based.
                beginIndex = w.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
            }
            if (t.IsEnd())
            {
                // end of mention
                int endIndex = w.Get(typeof(CoreAnnotations.IndexAnnotation));
                if (beginIndex >= 0)
                {
                    IntPair mSpan = new IntPair(beginIndex, endIndex);
                    int dummyMentionId = -1;
                    Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency, new List<CoreLabel>(sent.SubList(beginIndex, endIndex)));
                    mentions.Add(m);
                    mentionSpanSet.Add(mSpan);
                    // Reset so a stray end marker without a start is detected below.
                    beginIndex = -1;
                }
                else
                {
                    SieveCoreferenceSystem.logger.Warning("Start of marked mention not found in sentence: " + t + " at tokenIndex=" + (w.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1) + " for " + s.Get(typeof(CoreAnnotations.TextAnnotation)));
                }
            }
        }
    }
}
/// <summary>
/// Filters out spurious English mentions from each sentence's predicted list:
/// non-words (e.g. "hmm"), adjectival demonyms used adjectivally, and stop-list
/// entries. Removal is done in place on the predictedMentions lists.
/// </summary>
protected internal virtual void RemoveSpuriousMentionsEn(Annotation doc, IList<IList<Mention>> predictedMentions, Dictionaries dict)
{
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    for (int i = 0; i < predictedMentions.Count; i++)
    {
        ICoreMap s = sentences[i];
        IList<Mention> mentions = predictedMentions[i];
        IList<CoreLabel> sent = s.Get(typeof(CoreAnnotations.TokensAnnotation));
        // Collect removals first; mutating "mentions" inside the loop would break iteration.
        ICollection<Mention> remove = Generics.NewHashSet();
        foreach (Mention m in mentions)
        {
            string headPOS = m.headWord.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));
            // non word such as 'hmm'
            if (dict.nonWords.Contains(m.headString))
            {
                remove.Add(m);
            }
            // adjective form of nations
            // the [American] policy -> not mention
            // speak in [Japanese] -> mention
            // check if the mention is noun and the next word is not noun
            if (dict.IsAdjectivalDemonym(m.SpanToString()))
            {
                if (!headPOS.StartsWith("N") || (m.endIndex < sent.Count && sent[m.endIndex].Tag().StartsWith("N")))
                {
                    remove.Add(m);
                }
            }
            // stop list (e.g., U.S., there)
            if (InStopList(m))
            {
                remove.Add(m);
            }
        }
        mentions.RemoveAll(remove);
    }
}
/// <summary>
/// Tests whether two core maps agree on every key this matcher cares about:
/// for each key, both values must be null or be Equals() to one another.
/// </summary>
public virtual bool Matches(ICoreMap o1, ICoreMap o2)
{
    foreach (Type key in keys)
    {
        object left = o1.Get(key);
        object right = o2.Get(key);
        // Null-safe equality: both null, or non-null left equals right.
        bool equal = (left == null) ? (right == null) : left.Equals(right);
        if (!equal)
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Determines resolution flags for a time expression: an explicit numeric
/// "resolveTo" tag on the expression's value wins; otherwise a partial time
/// defaults to resolving to the closest match.
/// </summary>
/// <param name="annotation">Unused in this body (kept for the interface signature).</param>
/// <param name="te">The time expression whose tags/temporal are inspected.</param>
/// <returns>The resolution flag bits, or 0 when no rule applies.</returns>
public virtual int DetermineRelFlags(ICoreMap annotation, TimeExpression te)
{
    int flags = 0;
    bool flagsSet = false;
    if (te.value.GetTags() != null)
    {
        IValue v = te.value.GetTags().GetTag("resolveTo");
        if (v != null && v.Get() is Number)
        {
            // NOTE(review): assigning a Number to int relies on a conversion supplied
            // by the ported runtime types — confirm it truncates as the Java original did.
            flags = ((Number)v.Get());
            flagsSet = true;
        }
    }
    if (!flagsSet)
    {
        if (te.GetTemporal() is SUTime.PartialTime)
        {
            flags = SUTime.ResolveToClosest;
        }
    }
    return (flags);
}
// temporary for debug
/// <summary>
/// Adds gold mentions missed by mention detection into the predicted-mention
/// lists (and records their spans), so downstream debugging can inspect them.
/// Prefers enhanced dependencies when present, falling back to basic ones.
/// </summary>
protected internal static void AddGoldMentions(IList<ICoreMap> sentences, IList<ICollection<IntPair>> mentionSpanSetList, IList<IList<Mention>> predictedMentions, IList<IList<Mention>> allGoldMentions)
{
    // BUGFIX: "sz" was referenced without ever being defined (lost in the Java->C#
    // translation); it is the number of sentences.
    int sz = sentences.Count;
    for (int i = 0; i < sz; i++)
    {
        IList<Mention> mentions = predictedMentions[i];
        ICoreMap sent = sentences[i];
        IList<CoreLabel> tokens = sent.Get(typeof(CoreAnnotations.TokensAnnotation));
        ICollection<IntPair> mentionSpanSet = mentionSpanSetList[i];
        IList<Mention> golds = allGoldMentions[i];
        foreach (Mention g in golds)
        {
            IntPair pair = new IntPair(g.startIndex, g.endIndex);
            // Only add gold mentions whose span was not already predicted.
            if (!mentionSpanSet.Contains(pair))
            {
                int dummyMentionId = -1;
                Mention m = new Mention(dummyMentionId, g.startIndex, g.endIndex, tokens, sent.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)), sent.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)) != null ? sent
                    .Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)) : sent.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)), new List<CoreLabel>(tokens.SubList(g.startIndex, g.endIndex)));
                mentions.Add(m);
                mentionSpanSet.Add(pair);
            }
        }
    }
}
/// <summary>
/// Runs the number-sequence classifier on a copy of the tokens and merges its
/// answers back: the AnswerAnnotation is copied only where the original token had
/// no (or a background/MISC) answer, and all SUTime/NumberNormalizer annotations
/// are transferred.
/// </summary>
private void RecognizeNumberSequences(IList<CoreLabel> words, ICoreMap document, ICoreMap sentence)
{
    // we need to copy here because NumberSequenceClassifier overwrites the AnswerAnnotation
    IList<CoreLabel> newWords = NumberSequenceClassifier.CopyTokens(words, sentence);
    nsc.ClassifyWithGlobalInformation(newWords, document, sentence);
    // copy AnswerAnnotation back. Do not overwrite!
    // also, copy all the additional annotations generated by SUTime and NumberNormalizer
    // BUGFIX: "sz" was referenced without ever being defined; it is the token count.
    // (assumes CopyTokens returns a same-length list — TODO confirm)
    int sz = words.Count;
    for (int i = 0; i < sz; i++)
    {
        CoreLabel origWord = words[i];
        CoreLabel newWord = newWords[i];
        // log.info(newWord.word() + " => " + newWord.get(CoreAnnotations.AnswerAnnotation.class) + " " + origWord.ner());
        string before = origWord.Get(typeof(CoreAnnotations.AnswerAnnotation));
        string newGuess = newWord.Get(typeof(CoreAnnotations.AnswerAnnotation));
        if ((before == null || before.Equals(nsc.flags.backgroundSymbol) || before.Equals("MISC")) && !newGuess.Equals(nsc.flags.backgroundSymbol))
        {
            origWord.Set(typeof(CoreAnnotations.AnswerAnnotation), newGuess);
        }
        // transfer other annotations generated by SUTime or NumberNormalizer
        NumberSequenceClassifier.TransferAnnotations(newWord, origWord);
    }
}
/// <summary>Parse a string with SUTime.</summary>
/// <exception cref="SUTimeParsingError">if anything goes wrong</exception>
/// <exception cref="Edu.Stanford.Nlp.Time.SUTimeSimpleParser.SUTimeParsingError"/>
public static SUTime.Temporal Parse(string str)
{
    try
    {
        Annotation annotated = new Annotation(str);
        pipeline.Annotate(annotated);
        // Sanity-check that the pipeline produced at least one sentence.
        System.Diagnostics.Debug.Assert(annotated.Get(typeof(CoreAnnotations.SentencesAnnotation)) != null);
        System.Diagnostics.Debug.Assert(!annotated.Get(typeof(CoreAnnotations.SentencesAnnotation)).IsEmpty());
        IList<ICoreMap> timexes = annotated.Get(typeof(TimeAnnotations.TimexAnnotations));
        if (timexes.Count > 1)
        {
            throw new Exception("Too many timexes for '" + str + '\'');
        }
        // Any failure here (including an empty timex list) is wrapped below.
        ICoreMap firstTimex = timexes[0];
        return firstTimex.Get(typeof(TimeExpression.Annotation)).GetTemporal();
    }
    catch (Exception e)
    {
        SUTimeSimpleParser.SUTimeParsingError parsingError = new SUTimeSimpleParser.SUTimeParsingError(str);
        parsingError.InitCause(e);
        throw parsingError;
    }
}
/// <summary>
/// Finds and records the syntactic head for each mention in the sentence,
/// setting headIndex, headWord, and headString. Falls back to the first word of
/// the mention span when the head index lies outside the original span.
/// </summary>
protected internal virtual void FindHead(ICoreMap s, IList<Mention> mentions)
{
    Tree tree = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
    IList<CoreLabel> sent = s.Get(typeof(CoreAnnotations.TokensAnnotation));
    // Index the tree spans so FindSyntacticHead can map nodes to token positions.
    tree.IndexSpans(0);
    foreach (Mention m in mentions)
    {
        Tree head = FindSyntacticHead(m, tree, sent);
        // Tree label IndexAnnotation is 1-based; headIndex is 0-based.
        m.headIndex = ((CoreLabel)head.Label()).Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
        m.headWord = sent[m.headIndex];
        m.headString = m.headWord.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower(Locale.English);
        // Position of the head relative to the mention's own span.
        int start = m.headIndex - m.startIndex;
        if (start < 0 || start >= m.originalSpan.Count)
        {
            SieveCoreferenceSystem.logger.Warning("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex + ": originalSpan=[" + StringUtils.JoinWords(m.originalSpan, " ") + "], head=" + m.headWord);
            SieveCoreferenceSystem.logger.Warning("Setting head string to entire mention");
            m.headIndex = m.startIndex;
            m.headWord = m.originalSpan.Count > 0 ? m.originalSpan[0] : sent[m.startIndex];
            m.headString = m.originalSpan.ToString();
        }
    }
}
/// <summary>
/// Logs every gold mention span as a potential recall error, along with its
/// sentence, token text, and parse tree. (Debugging aid.)
/// </summary>
/// <exception cref="System.IO.IOException"/>
private static void RecallErrors(IList<IList<Mention>> goldMentions, IList<IList<Mention>> predictedMentions, Annotation doc)
{
    IList<ICoreMap> coreMaps = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    int numSentences = goldMentions.Count;
    for (int sentIdx = 0; sentIdx < numSentences; sentIdx++)
    {
        ICoreMap coreMap = coreMaps[sentIdx];
        IList<CoreLabel> words = coreMap.Get(typeof(CoreAnnotations.TokensAnnotation));
        Tree tree = coreMap.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
        IList<Pair<int, int>> goldSpans = ExtractSpans(goldMentions[sentIdx]);
        foreach (Pair<int, int> span in goldSpans)
        {
            logger.Finer("RECALL ERROR\n");
            logger.Finer(coreMap + "\n");
            for (int tokenIdx = span.first; tokenIdx < span.second; tokenIdx++)
            {
                logger.Finer(words[tokenIdx].Value() + " ");
            }
            logger.Finer("\n" + tree + "\n");
        }
    }
}
/// <summary>
/// Extracts an annotation for every expression that does not already have one;
/// expressions whose extraction fails (returns false) are removed from the list.
/// Exceptions during extraction are logged (when verbose) but do not remove the
/// expression.
/// </summary>
private void AnnotateExpressions(ICoreMap annotation, IList<T> expressions)
{
    // TODO: Logging can be excessive
    IList<T> toDiscard = new List<T>();
    foreach (T expr in expressions)
    {
        if (expr.annotation != null)
        {
            continue; // already annotated
        }
        try
        {
            bool extractOkay = expr.ExtractAnnotation(env, annotation);
            if (verbose && extractOkay)
            {
                log.Info("annotateExpressions() matched " + expr + " from " + annotation);
            }
            if (!extractOkay)
            {
                // Things didn't turn out so well
                toDiscard.Add(expr);
                log.Warn("Error extracting annotation from " + expr);
            }
        }
        catch (Exception ex)
        {
            /*+ ", " + te.getExtractErrorMessage() */
            if (verbose)
            {
                log.Warn("Error extracting annotation from " + expr);
                log.Warn(ex);
            }
        }
    }
    expressions.RemoveAll(toDiscard);
}
/// <summary>
/// Parses a string with SUTime, returning null instead of throwing when the
/// pipeline yields no sentences, no single timex, or no temporal value.
/// </summary>
public static SUTime.Temporal ParseOrNull(string str)
{
    Annotation doc = new Annotation(str);
    pipeline.Annotate(doc);
    // Guard clauses: bail out on anything other than exactly one timex.
    if (doc.Get(typeof(CoreAnnotations.SentencesAnnotation)) == null)
    {
        return null;
    }
    if (doc.Get(typeof(CoreAnnotations.SentencesAnnotation)).IsEmpty())
    {
        return null;
    }
    IList<ICoreMap> timexAnnotations = doc.Get(typeof(TimeAnnotations.TimexAnnotations));
    if (timexAnnotations.Count > 1 || timexAnnotations.IsEmpty())
    {
        return null;
    }
    ICoreMap timex = timexAnnotations[0];
    var expression = timex.Get(typeof(TimeExpression.Annotation));
    return expression == null ? null : expression.GetTemporal();
}
/// <summary>
/// Writes sentences and their dependency trees to a file in CoNLL-X format
/// (columns: ID FORM LEMMA CPOS POS FEATS HEAD DEPREL PHEAD PDEPREL, 1-based ids).
/// </summary>
/// <exception cref="RuntimeIOException">wraps any I/O failure.</exception>
public static void WriteConllFile(string outFile, IList<ICoreMap> sentences, IList<DependencyTree> trees)
{
    try
    {
        PrintWriter output = IOUtils.GetPrintWriter(outFile);
        for (int i = 0; i < sentences.Count; i++)
        {
            ICoreMap sentence = sentences[i];
            DependencyTree tree = trees[i];
            IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            // BUGFIX: "size" was referenced without ever being defined; it is the token count.
            int size = tokens.Count;
            for (int j = 1; j <= size; ++j)
            {
                CoreLabel token = tokens[j - 1];
                output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
            }
            // Blank line terminates each sentence in CoNLL format.
            output.Println();
        }
        output.Close();
    }
    catch (Exception e)
    {
        throw new RuntimeIOException(e);
    }
}
/// <summary>
/// Recursively removes TagsAnnotation from this core map and from every core map
/// or collection reachable through its values. The "cleaned" dictionary tracks
/// visited objects (false while in progress, true when done) so shared or cyclic
/// structures are processed at most once.
/// </summary>
private void CleanupTags(ICoreMap cm, IDictionary<object, bool> cleaned)
{
    cm.Remove(typeof(Tags.TagsAnnotation));
    foreach (Type key in cm.KeySet())
    {
        object obj = cm.Get(key);
        // IDictionary.Contains tests for the key, i.e. "already visited?".
        if (!cleaned.Contains(obj))
        {
            // Mark as in-progress before recursing to break cycles.
            cleaned[obj] = false;
            if (obj is ICoreMap)
            {
                CleanupTags((ICoreMap)obj, cleaned);
            }
            else
            {
                if (obj is ICollection)
                {
                    // Overload elsewhere handles each element of the collection.
                    CleanupTags((ICollection)obj, cleaned);
                }
            }
            cleaned[obj] = true;
        }
    }
}
/// <summary>
/// Builds a coreference Document from an input document: uses gold mentions
/// (with heads recomputed) when configured, otherwise the mentions produced by
/// the coref mention annotator; then runs standard preprocessing.
/// </summary>
/// <exception cref="System.Exception"/>
public virtual Document MakeDocument(InputDoc input)
{
    IList<IList<Mention>> mentions = new List<IList<Mention>>();
    if (CorefProperties.UseGoldMentions(props))
    {
        // Rebuild Mention objects from the gold spans, one list per sentence.
        IList<ICoreMap> sentences = input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
        for (int i = 0; i < sentences.Count; i++)
        {
            ICoreMap sentence = sentences[i];
            IList<CoreLabel> sentenceWords = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<Mention> sentenceMentions = new List<Mention>();
            mentions.Add(sentenceMentions);
            foreach (Mention g in input.goldMentions[i])
            {
                // -1 = dummy mention id; dependencies are left null here.
                sentenceMentions.Add(new Mention(-1, g.startIndex, g.endIndex, sentenceWords, null, null, new List<CoreLabel>(sentenceWords.SubList(g.startIndex, g.endIndex))));
            }
            // Gold spans carry no head info, so compute heads now.
            md.FindHead(sentence, sentenceMentions);
        }
    }
    else
    {
        foreach (ICoreMap sentence in input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
        {
            mentions.Add(sentence.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation)));
        }
    }
    Document doc = new Document(input, mentions);
    if (input.goldMentions != null)
    {
        FindGoldMentionHeads(doc);
    }
    DocumentPreprocessor.Preprocess(doc, dict, null, headFinder);
    return (doc);
}
// Test: with ssplit.newlineIsSentenceBreak=always, every newline forces a
// sentence break, so the fixture text yields three sentences.
public virtual void TestAlwaysNewlineIsSentenceBreakSettings()
{
    string text = "This is \none sentence\n\nThis is not another.";
    // Expected sentences after splitting on every newline (note PTB-style final " .").
    string[] sents = new string[] { "This is", "one sentence", "This is not another ." };
    Properties props = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "always");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    Annotation document1 = new Annotation(text);
    pipeline.Annotate(document1);
    IList<ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));
    NUnit.Framework.Assert.AreEqual(3, sentences.Count);
    // make sure that there are the correct # of tokens (count does contain NL tokens)
    IList<CoreLabel> tokens = document1.Get(typeof(CoreAnnotations.TokensAnnotation));
    NUnit.Framework.Assert.AreEqual(9, tokens.Count);
    // Compare each produced sentence against the expected text.
    for (int i = 0; i < Math.Min(sents.Length, sentences.Count); i++)
    {
        ICoreMap sentence = sentences[i];
        string sentenceText = SentenceUtils.ListToString(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        NUnit.Framework.Assert.AreEqual(sents[i], sentenceText, "Bad sentence #" + i);
    }
}
// for filling in the text of a mention
/// <summary>
/// Renders a token range as text. When the range exactly matches an entity
/// mention's character offsets, the mention's (cleaned) text is returned;
/// otherwise the raw document substring covering the tokens is returned.
/// </summary>
public virtual string TokenRangeToString(Pair<int, int> tokenRange)
{
    IList<CoreLabel> tokens = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
    // see if the token range matches an entity mention
    IList<ICoreMap> entityMentionsInDoc = doc.Get(typeof(CoreAnnotations.MentionsAnnotation));
    int potentialMatchingEntityMentionIndex = tokens[tokenRange.first].Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
    ICoreMap potentialMatchingEntityMention = null;
    if (entityMentionsInDoc != null && potentialMatchingEntityMentionIndex != null)
    {
        potentialMatchingEntityMention = entityMentionsInDoc[potentialMatchingEntityMentionIndex];
    }
    // if there is a matching entity mention, return it's text (which has been processed to remove
    // things like newlines and xml)...if there isn't return the full substring of the document text
    // Match requires the mention's begin/end character offsets to line up exactly
    // with the first and last token of the range.
    if (potentialMatchingEntityMention != null && potentialMatchingEntityMention.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) == tokens[tokenRange.first].BeginPosition() && potentialMatchingEntityMention.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)) == tokens[tokenRange.second].EndPosition())
    {
        return (potentialMatchingEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
    }
    else
    {
        return (Sharpen.Runtime.Substring(doc.Get(typeof(CoreAnnotations.TextAnnotation)), tokens[tokenRange.first].BeginPosition(), tokens[tokenRange.second].EndPosition()));
    }
}
/// <summary>
/// For each adjacent sentence pair, re-runs sentence splitting (never breaking
/// on newline) over the concatenated tokens; when the splitter would have kept
/// them as a single sentence, stores that merged sentence on the later sentence
/// as an EnhancedSentenceAnnotation for sieves that use augmented context.
/// </summary>
public static void AddEnhancedSentences(Annotation doc)
{
    //for every sentence that begins a paragraph: append this sentence and the previous one and see if sentence splitter would make a single sentence out of it. If so, add as extra sentence.
    //for each sieve that potentially uses augmentedSentences in original:
    IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
    //create SentenceSplitter that never splits on newline
    WordToSentenceProcessor wsp = new WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak.Never);
    // (removed unused local "prevParagraph" — it was never read or written after init)
    for (int i = 1; i < sentences.Count; i++)
    {
        ICoreMap sentence = sentences[i];
        ICoreMap prevSentence = sentences[i - 1];
        // Concatenate the tokens of the previous and current sentence.
        IList<CoreLabel> tokensConcat = new List<CoreLabel>();
        Sharpen.Collections.AddAll(tokensConcat, prevSentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        Sharpen.Collections.AddAll(tokensConcat, sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
        IList<IList<CoreLabel>> sentenceTokens = wsp.Process(tokensConcat);
        if (sentenceTokens.Count == 1)
        {
            //wsp would have put them into a single sentence --> add enhanced sentence.
            sentence.Set(typeof(QuoteAttributionUtils.EnhancedSentenceAnnotation), ConstructSentence(sentenceTokens[0], prevSentence, sentence));
        }
    }
}
/// <summary>
/// Renders a sentence's tokens as a space-separated string, appending "/NER" to
/// any token whose named-entity label is set and not "O". Returns an empty
/// string when the sentence has no tokens.
/// </summary>
public static string TokensAndNELabelsToString(ICoreMap sentence)
{
    StringBuilder sb = new StringBuilder();
    IList<CoreLabel> tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
    if (tokens != null)
    {
        string separator = "";
        foreach (CoreLabel tok in tokens)
        {
            sb.Append(separator);
            sb.Append(tok.Word());
            string ner = tok.Ner();
            if (ner != null && !ner.Equals("O"))
            {
                sb.Append("/" + ner);
            }
            separator = " ";
        }
    }
    return sb.ToString();
}
/// <summary>Process an entity given the NER tag, extracted modifier and the next word in the document.</summary>
/// <remarks>
/// Process an entity given the NER tag, extracted modifier and the next word in the document.
/// The normalized quantity will be written in place.
/// </remarks>
/// <param name="l">A collector that collects annotations for the entity.</param>
/// <param name="entityType">Quantifiable NER tag.</param>
/// <param name="compModifier">
/// The extracted modifier around the entity of interest. Different NER tags should
/// have different extraction rules.
/// </param>
/// <param name="nextWord">Next word in the document.</param>
/// <param name="document">Reference to the document.</param>
/// <?/>
/// <returns/>
private static IList<E> ProcessEntity<E>(IList<E> l, string entityType, string compModifier, string nextWord, ICoreMap document)
    where E : ICoreMap
{
    // convert the entity annotations into a string
    string s = SingleEntityToString(l);
    StringBuilder sb = new StringBuilder();
    // convert all full digits to half digits
    // BUGFIX: "sz" was referenced without ever being defined; it is the entity
    // string's length.
    int sz = s.Length;
    for (int i = 0; i < sz; i++)
    {
        string ch = Sharpen.Runtime.Substring(s, i, i + 1);
        if (fullDigitToHalfDigit.Contains(ch))
        {
            ch = fullDigitToHalfDigit[ch];
        }
        sb.Append(ch);
    }
    s = sb.ToString();
    string p = null;
    switch (entityType)
    {
        case NumberTag:
        {
            p = string.Empty;
            if (compModifier != null)
            {
                p = compModifier;
            }
            string q = NormalizedNumberString(s, nextWord, 1.0);
            // BUGFIX: "p.Concat(q)" resolved to the LINQ char-sequence Concat, not
            // string concatenation; plain "+" is the intended operation.
            p = (q != null) ? p + q : null;
            break;
        }
        case OrdinalTag:
        {
            // ordinal won't have modifier
            p = NormalizedOrdinalString(s, nextWord);
            break;
        }
        case PercentTag:
        {
            p = NormalizedPercentString(s, nextWord);
            break;
        }
        case MoneyTag:
        {
            p = string.Empty;
            if (compModifier != null)
            {
                p = compModifier;
            }
            // BUGFIX: "q" was declared inside the NUMBER case's braces and is not in
            // scope here; declare it locally.
            string q = NormalizedMoneyString(s, nextWord);
            p = (q != null) ? p + q : null;
            break;
        }
        case DateTag:
        {
            // Only normalize dates that match one of the recognized surface patterns.
            if (s.Matches(BasicYyyymmddPattern) || s.Matches(BasicMmddPattern) || s.Matches(EnglishMmddyyyyPattern) || s.Matches(BasicDdPattern) || s.Matches(RelativeTimePattern) || s.Matches(BirthDecadePattern))
            {
                string docdate = document.Get(typeof(CoreAnnotations.DocDateAnnotation));
                p = NormalizeDateString(s, docdate);
            }
            break;
        }
        case TimeTag:
        {
            break;
        }
    }
    // Write the normalized NER values in place
    foreach (E wi in l)
    {
        if (p != null)
        {
            wi.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), p);
        }
    }
    // This return value is not necessarily useful as the labelling is done in place.
    return (l);
}
// Patterns used by DATE and TIME (must be after the static initializers to make use of the modifiers)
// static methods
/// <summary>
/// Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
/// and tags each of their constituents with a "normalizedQuantity"
/// label which contains the appropriate normalized string corresponding to
/// the full quantity.
/// </summary>
/// <remarks>
/// Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
/// and tags each of their constituents with a "normalizedQuantity"
/// label which contains the appropriate normalized string corresponding to
/// the full quantity.
/// Unlike the English normalizer, this method currently does not support
/// concatenation or SUTime.
/// </remarks>
/// <param name="list">
/// A list of
/// <see cref="Edu.Stanford.Nlp.Util.ICoreMap"/>
/// s representing a single document.
/// Note: We assume the NERs has been labelled and the labels
/// will be updated in place.
/// </param>
/// <param name="document"/>
/// <param name="sentence"/>
/// <?/>
public static void AddNormalizedQuantitiesToEntities<E>(IList<E> list, ICoreMap document, ICoreMap sentence)
    where E : ICoreMap
{
    // Fix the NER sequence if necessary
    FixupNerBeforeNormalization(list);
    // Now that NER tags has been fixed up, we do another pass to add the normalization
    // BUGFIX: "sz" was referenced without ever being defined; it is the token count.
    int sz = list.Count;
    string prevNerTag = BackgroundSymbol;
    int beforeIndex = -1;
    List<E> collector = new List<E>();
    // The loop runs one past the end (i == sz) so a quantity that reaches the end
    // of the list is still flushed.
    for (int i = 0; i <= sz; i++)
    {
        // we should always keep list.size() unchanged inside the loop
        E wi = null;
        string currNerTag = null;
        string nextWord = string.Empty;
        if (i < sz)
        {
            wi = list[i];
            if (i + 1 < sz)
            {
                nextWord = list[i + 1].Get(typeof(CoreAnnotations.TextAnnotation));
                if (nextWord == null)
                {
                    nextWord = string.Empty;
                }
            }
            // We assume NERs have been set by previous NER taggers
            currNerTag = wi.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
        }
        // TODO: may need to detect TIME modifier here? (wprev kept for that purpose)
        E wprev = (i > 0) ? list[i - 1] : null;
        // if the current wi is a non-continuation and the last one was a
        // quantity, we close and process the last segment.
        // TODO: also need to check compatibility as the English normalizer does
        if ((currNerTag == null || !currNerTag.Equals(prevNerTag)) && quantifiable.Contains(prevNerTag))
        {
            string modifier = null;
            switch (prevNerTag)
            {
                case TimeTag:
                {
                    // Need different handling for different tags
                    // TODO: add TIME
                    break;
                }
                case DateTag:
                {
                    ProcessEntity(collector, prevNerTag, modifier, nextWord, document);
                    break;
                }
                default:
                {
                    if (prevNerTag.Equals(NumberTag) || prevNerTag.Equals(PercentTag) || prevNerTag.Equals(MoneyTag))
                    {
                        // we are doing for prev tag so afterIndex should really be i
                        modifier = DetectQuantityModifier(list, beforeIndex, i);
                    }
                    ProcessEntity(collector, prevNerTag, modifier, nextWord);
                    break;
                }
            }
            collector = new List<E>();
        }
        // If currNerTag is quantifiable, we add it into collector
        if (quantifiable.Contains(currNerTag))
        {
            if (collector.IsEmpty())
            {
                beforeIndex = i - 1;
            }
            collector.Add(wi);
        }
        // move on and update prev pointer
        prevNerTag = currNerTag;
    }
}
/// <summary>
/// Scores candidate speakers from nearby mentions: name mentions are looked up
/// in the character map, pronoun mentions are resolved via coreference; backward
/// mentions and forward mentions carry different weights. Returns gender-filtered
/// scores when any exist (or when gender filtering is mandated), otherwise the
/// gender-agnostic scores.
/// </summary>
public virtual ICounter<string> GetTopSpeakers(IList<Sieve.MentionData> closestMentions, IList<Sieve.MentionData> closestMentionsBackward, Person.Gender gender, ICoreMap quote, bool overrideGender)
{
    ICounter<string> topSpeakerInRange = new ClassicCounter<string>();
    ICounter<string> topSpeakerInRangeIgnoreGender = new ClassicCounter<string>();
    // Membership in the backward set decides which weight a mention receives.
    ICollection<Sieve.MentionData> backwardsMentions = new HashSet<Sieve.MentionData>(closestMentionsBackward);
    foreach (Sieve.MentionData mention in closestMentions)
    {
        double weight = backwardsMentions.Contains(mention) ? BackwardWeight : ForwardWeight;
        if (mention.type.Equals(Name))
        {
            if (!characterMap.Keys.Contains(mention.text))
            {
                continue;
            }
            Person p = characterMap[mention.text][0];
            // Count toward the gendered tally only when genders agree (or gender is unknown).
            if ((gender == Person.Gender.Male && p.gender == Person.Gender.Male) || (gender == Person.Gender.Female && p.gender == Person.Gender.Female) || (gender == Person.Gender.Unk))
            {
                topSpeakerInRange.IncrementCount(p.name, weight);
            }
            topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
            // NOTE(review): leftover debug trigger keyed on magic counts (128/94) —
            // consider removing.
            if (closestMentions.Count == 128 && closestMentionsBackward.Count == 94)
            {
                System.Console.Out.WriteLine(p.name + " " + weight + " name");
            }
        }
        else
        {
            if (mention.type.Equals(Pronoun))
            {
                // Pronouns are resolved through the coreference map, keyed by the
                // pronoun's character offset in the document.
                int charBeginKey = doc.Get(typeof(CoreAnnotations.TokensAnnotation))[mention.begin].BeginPosition();
                Person p = DoCoreference(charBeginKey, quote);
                if (p != null)
                {
                    if ((gender == Person.Gender.Male && p.gender == Person.Gender.Male) || (gender == Person.Gender.Female && p.gender == Person.Gender.Female) || (gender == Person.Gender.Unk))
                    {
                        topSpeakerInRange.IncrementCount(p.name, weight);
                    }
                    topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
                    // NOTE(review): same leftover debug trigger as above.
                    if (closestMentions.Count == 128 && closestMentionsBackward.Count == 94)
                    {
                        System.Console.Out.WriteLine(p.name + " " + weight + " pronoun");
                    }
                }
            }
        }
    }
    if (topSpeakerInRange.Size() > 0)
    {
        return (topSpeakerInRange);
    }
    else
    {
        // Gender filtering is mandatory: return the (empty) gendered counter.
        if (gender != Person.Gender.Unk && !overrideGender)
        {
            return (topSpeakerInRange);
        }
    }
    return (topSpeakerInRangeIgnoreGender);
}