Exemple #1
0
        /// <summary>
        /// Copy constructor: sizes the backing arrays to the number of keys in
        /// <paramref name="other"/> and copies every key together with its value.
        /// </summary>
        /// <param name="other">The ArrayCoreMap to copy. It may not be null.</param>
        public ArrayCoreMap(ICoreMap other)
        {
            var sourceKeys = other.KeySet();

            // Allocate exactly as many slots as the source map has keys.
            psize  = sourceKeys.Count;
            keys   = new Type[psize];
            values = new Object[psize];

            int slot = 0;
            foreach (var sourceKey in sourceKeys)
            {
                keys[slot]   = sourceKey;
                values[slot] = other.Get(sourceKey);
                slot++;
            }
        }
        /// <summary>
        /// Runs a tokenize/cleanxml/ssplit pipeline (tokenizer language "es") over
        /// <c>kbpSpanishDocument</c> and checks the produced sentences, one by one and in total
        /// number, against <c>kbpSpanishSentences</c>.
        /// </summary>
        public virtual void TestKbpSpanishWorks()
        {
            // One property name/value pair per line.
            Properties props = PropertiesUtils.AsProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
                "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? " + "/[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
                "clean.xmltags", "headline|text|post",
                "clean.singlesentencetags", "HEADLINE|AUTHOR",
                "clean.sentenceendingtags", "TEXT|POST|QUOTE",
                "clean.turntags", "POST|QUOTE",
                "clean.speakertags", "AUTHOR",
                "clean.datetags", "DATE_TIME",
                "clean.doctypetags", "DOC",
                "clean.docAnnotations", "docID=doc[id]",
                "clean.sectiontags", "HEADLINE|POST",
                "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
                "clean.quotetags", "quote",
                "clean.quoteauthorattributes", "orig_author",
                "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
            StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
            Annotation document1 = new Annotation(kbpSpanishDocument);

            pipeline.Annotate(document1);
            IList<ICoreMap> sentences = document1.Get(typeof(CoreAnnotations.SentencesAnnotation));

            // Compare only the sentences both sides have; the count mismatch is reported last.
            int comparable = Math.Min(kbpSpanishSentences.Length, sentences.Count);
            for (int index = 0; index < comparable; index++)
            {
                string actualText = SentenceUtils.ListToString(sentences[index].Get(typeof(CoreAnnotations.TokensAnnotation)));
                NUnit.Framework.Assert.AreEqual(kbpSpanishSentences[index], actualText, "Bad sentence #" + index);
            }
            NUnit.Framework.Assert.AreEqual(kbpSpanishSentences.Length, sentences.Count, "Bad total number of sentences");
        }
Exemple #3
0
        /// <summary>
        /// Looks up the referent stored under <paramref name="corefMapKey"/> in the pronoun
        /// coreference map and resolves it to a person.
        /// </summary>
        /// <param name="corefMapKey">Key into <c>pronounCorefMap</c>.</param>
        /// <param name="quote">Quote whose paragraph names are excluded as candidates; may be null.</param>
        /// <returns>
        /// The resolved person, or null when there is no coreference map, nothing could be
        /// resolved, or the resolved person is one of the names found for <paramref name="quote"/>.
        /// </returns>
        public virtual Person DoCoreference(int corefMapKey, ICoreMap quote)
        {
            if (pronounCorefMap == null)
            {
                return null;
            }

            ICollection<Person> namesInQuote;
            if (quote == null)
            {
                namesInQuote = new HashSet<Person>();
            }
            else
            {
                namesInQuote = GetNamesInParagraph(quote);
            }

            Person resolved = ResolveAmbiguities(pronounCorefMap[corefMapKey]);
            if (resolved == null || namesInQuote.Contains(resolved))
            {
                return null;
            }
            return resolved;
        }
        /// <summary>
        /// Logs the text of the first sentence of <paramref name="doc"/> and collects the spans of
        /// its noun/pronoun subject chunks, skipping any chunk that shares a token with a chunk
        /// that was already registered.
        /// </summary>
        /// <param name="id">Not read by this method body.</param>
        /// <param name="doc">Annotation whose first sentence provides the basic dependency graph.</param>
        public virtual void Process(long id, Annotation doc)
        {
            ICoreMap      sentence = doc.Get(typeof(CoreAnnotations.SentencesAnnotation))[0];
            SemanticGraph depparse = sentence.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation));

            log.Info("| " + sentence.Get(typeof(CoreAnnotations.TextAnnotation)));
            // Get all valid subject spans
            BitSet       consumedAsSubjects = new BitSet();
            IList <Span> subjectSpans       = new List <Span>();

            foreach (IndexedWord head in depparse.TopologicalSort())
            {
                // Check if the node is a noun/pronoun
                if (head.Tag().StartsWith("N") || head.Tag().Equals("PRP"))
                {
                    // Try to get the NP chunk
                    Optional <IList <IndexedWord> > subjectChunk = segmenter.GetValidChunk(depparse, head, segmenter.ValidSubjectArcs, Optional.Empty(), true);
                    if (subjectChunk.IsPresent())
                    {
                        // Make sure it's not already a member of a larger NP.
                        // The translated code jumped to a label that was never declared; a flag
                        // plus `continue` expresses the intended "continue with the next node".
                        bool alreadyConsumed = false;
                        foreach (IndexedWord tok in subjectChunk.Get())
                        {
                            if (consumedAsSubjects.Get(tok.Index()))
                            {
                                alreadyConsumed = true;
                                break;
                            }
                        }
                        if (alreadyConsumed)
                        {
                            // Already considered. Continue to the next node.
                            continue;
                        }
                        // Register it as an NP
                        foreach (IndexedWord tok_1 in subjectChunk.Get())
                        {
                            consumedAsSubjects.Set(tok_1.Index());
                        }
                        // Add it as a subject
                        subjectSpans.Add(ToSpan(subjectChunk.Get()));
                    }
                }
            }
            // NOTE(review): subjectSpans is not consumed after this point in the visible code —
            // confirm whether the remainder of the original method was lost in translation.
        }
        /// <summary>
        /// Converts NamedEntityTagAnnotation tags into
        /// <see cref="Edu.Stanford.Nlp.IE.Machinereading.Structure.EntityMention"/>s.
        /// Each maximal run of consecutive tokens carrying the same non-background tag becomes one
        /// mention, which is appended to the sentence's entity mention list.
        /// </summary>
        /// <param name="sentence">A sentence annotated with NamedEntityTagAnnotation</param>
        public virtual void MakeAnnotationFromAllNERTags(ICoreMap sentence)
        {
            IList<CoreLabel>     tokens = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            IList<EntityMention> found  = sentence.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));

            System.Diagnostics.Debug.Assert(tokens != null);
            if (found == null)
            {
                this.logger.Info("mentions are null");
                found = new List<EntityMention>();
            }

            int start = 0;
            while (start < tokens.Count)
            {
                // Extend [start, end) while the tag stays the same and is not the background symbol.
                string runTag = null;
                int    end    = start;
                while (end < tokens.Count)
                {
                    string tag = tokens[end].Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                    if (tag.Equals(SeqClassifierFlags.DefaultBackgroundSymbol) || (runTag != null && !tag.Equals(runTag)))
                    {
                        break;
                    }
                    runTag = tag;
                    end++;
                }

                if (end == start)
                {
                    // No entity starts here; try the next token.
                    start++;
                    continue;
                }

                // found a match!
                string        entityType = this.GetEntityTypeForTag(runTag);
                EntityMention mention    = entityMentionFactory.ConstructEntityMention(EntityMention.MakeUniqueId(), sentence, new Span(start, end), new Span(start, end), entityType, null, null);
                //TODO: changed entityType in the above sentence to nerTag - Sonal
                logger.Info("Created " + entityType + " entity mention: " + mention);
                found.Add(mention);
                // Resume scanning right after the run that was just consumed.
                start = end;
            }
            sentence.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), found);
        }
Exemple #6
0
        /// <summary>
        /// Extracts all relation mentions for <paramref name="sentence"/> and stores them on the
        /// sentence, replacing whatever relation mention list was there before.
        /// </summary>
        /// <param name="sentence">The sentence to annotate.</param>
        public virtual void AnnotateSentence(ICoreMap sentence)
        {
            // Every predicted relation is kept, including unrelated ones
            // (potentially useful for a joint model).
            IList<RelationMention> extracted = new List<RelationMention>();
            foreach (RelationMention predicted in ExtractAllRelations(sentence))
            {
                extracted.Add(predicted);
            }

            // Only relations with a type other than Unrelated are logged.
            foreach (RelationMention mention in extracted)
            {
                if (mention.GetType().Equals(RelationMention.Unrelated))
                {
                    continue;
                }
                logger.Fine("Found positive relation in annotateSentence: " + mention);
            }

            // caution: this removes the old list of relation mentions!
            sentence.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), extracted);
        }
        /// <summary>
        /// Runs the machine reader on <paramref name="annotation"/> and copies the entity and
        /// relation mentions of each output sentence onto the sentence at the same position in
        /// the original annotation. When <c>verbose</c> is set, the copied mentions are logged.
        /// </summary>
        /// <param name="annotation">The annotation to extract from and to update.</param>
        public virtual void Annotate(Annotation annotation)
        {
            Annotation extracted = mr.Annotate(annotation);

            IList<ICoreMap> extractedSentences = extracted.Get(typeof(CoreAnnotations.SentencesAnnotation));
            IList<ICoreMap> targetSentences    = annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));

            for (int index = 0; index < extractedSentences.Count; index++)
            {
                ICoreMap source = extractedSentences[index];
                ICoreMap target = targetSentences[index];

                // Entities first.
                IList<EntityMention> entityMentions = source.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
                target.Set(typeof(MachineReadingAnnotations.EntityMentionsAnnotation), entityMentions);
                if (verbose && entityMentions != null)
                {
                    log.Info("Extracted the following entities:");
                    foreach (EntityMention entity in entityMentions)
                    {
                        log.Info("\t" + entity);
                    }
                }

                // Then relations; unrelated ones are copied but not logged.
                IList<RelationMention> relationMentions = source.Get(typeof(MachineReadingAnnotations.RelationMentionsAnnotation));
                target.Set(typeof(MachineReadingAnnotations.RelationMentionsAnnotation), relationMentions);
                if (verbose && relationMentions != null)
                {
                    log.Info("Extracted the following relations:");
                    foreach (RelationMention relation in relationMentions)
                    {
                        if (relation.GetType().Equals(RelationMention.Unrelated))
                        {
                            continue;
                        }
                        log.Info(relation);
                    }
                }
            }
        }
Exemple #8
0
        /// <summary>
        /// For every ordered pair of distinct entity mentions in <paramref name="sent"/> where the
        /// first mention's head ends at the second mention's head start and both have the same
        /// type, increments the <c>adjacentEntityMentions</c> count for that type.
        /// </summary>
        /// <param name="sent">Sentence whose entity mentions are inspected; nothing happens when it has none.</param>
        private void CountAdjacentMentions(ICoreMap sent)
        {
            IList<EntityMention> all = sent.Get(typeof(MachineReadingAnnotations.EntityMentionsAnnotation));
            if (all == null)
            {
                return;
            }

            foreach (EntityMention left in all)
            {
                foreach (EntityMention right in all)
                {
                    if (left == right)
                    {
                        continue;
                    }
                    bool headsTouch = left.GetHeadTokenEnd() == right.GetHeadTokenStart();
                    if (headsTouch && left.GetType().Equals(right.GetType()))
                    {
                        adjacentEntityMentions.IncrementCount(left.GetType());
                    }
                }
            }
        }
 /// <summary>
 /// Writes <paramref name="document"/> to a temporary input file, runs the HeidelTime jar on it
 /// as an external <c>java</c> process, and stores the timexes parsed from the process output
 /// under <c>TimeAnnotations.TimexAnnotations</c>.
 /// </summary>
 /// <remarks>
 /// Failures are reported on standard error and are not rethrown.
 /// </remarks>
 /// <param name="document">The document to annotate.</param>
 /// <exception cref="System.IO.IOException"/>
 public virtual void Annotate(ICoreMap document)
 {
     try
     {
         //--Create Input File
         //(create file)
         File inputFile = File.CreateTempFile("heideltime", ".input");
         //(write to file)
         PrintWriter inputWriter = new PrintWriter(inputFile);
         try
         {
             PrepareHeidelTimeInput(inputWriter, document);
         }
         finally
         {
             // Close the writer even when preparing the input fails, so the file handle is not leaked.
             inputWriter.Close();
         }
         Optional <string> pubDate = GetPubDate(document);
         //--Build Command
         IList <string> args = new List <string>(Arrays.AsList("java", "-jar", this.heideltimePath.GetPath() + "/heideltime.jar", "-c", this.heideltimePath.GetPath() + "/config.props", "-l", this.language, "-t", "NEWS"));
         if (pubDate.IsPresent())
         {
             // Pass the publication date to HeidelTime when one is available.
             args.Add("-dct");
             args.Add(pubDate.Get());
         }
         args.Add(inputFile.GetPath());
         // run HeidelTime on the input file
         ProcessBuilder process      = new ProcessBuilder(args);
         StringWriter   outputWriter = new StringWriter();
         SystemUtils.Run(process, outputWriter, null);
         string           output    = outputWriter.GetBuffer().ToString();
         IList <ICoreMap> timexAnns = outputReader.Process(document, output);
         document.Set(typeof(TimeAnnotations.TimexAnnotations), timexAnns);
         if (outputResults)
         {
             System.Console.Out.WriteLine(timexAnns);
         }
     }
     catch (Exception e)
     {
         // Best effort: report the failure and leave the document as it is.
         Sharpen.Runtime.PrintStackTrace(e, System.Console.Error);
         System.Console.Error.WriteLine("error running HeidelTime on this doc: " + document.Get(typeof(CoreAnnotations.DocIDAnnotation)));
     }
 }
        /// <summary>Print a description of this triple, formatted like the ReVerb outputs.</summary>
        /// <remarks>
        /// The result is a single tab-separated line. Start indices are the first token's
        /// <c>Index()</c> minus one and end indices are the last token's <c>Index()</c>; all
        /// indices stay at -1 for an empty subject, relation or object.
        /// </remarks>
        /// <param name="docid">Document id to print; "no_doc_id" is printed when it is null.</param>
        /// <param name="sentence">The sentence whose tokens are appended to the line.</param>
        /// <returns>The tab-separated description of this triple.</returns>
        public virtual string ToReverbString(string docid, ICoreMap sentence)
        {
            int sentIndex        = -1;
            int subjIndex        = -1;
            int relationIndex    = -1;
            int objIndex         = -1;
            int subjIndexEnd     = -1;
            int relationIndexEnd = -1;
            int objIndexEnd      = -1;

            if (!relation.IsEmpty())
            {
                sentIndex        = relation[0].SentIndex();
                relationIndex    = relation[0].Index() - 1;
                relationIndexEnd = relation[relation.Count - 1].Index();
            }
            if (!subject.IsEmpty())
            {
                if (sentIndex < 0)
                {
                    sentIndex = subject[0].SentIndex();
                }
                subjIndex    = subject[0].Index() - 1;
                subjIndexEnd = subject[subject.Count - 1].Index();
            }
            if (!@object.IsEmpty())
            {
                if (sentIndex < 0)
                {
                    // Fall back to the object's own sentence index; reading subject[0] here
                    // would fail whenever the subject is empty.
                    sentIndex = @object[0].SentIndex();
                }
                objIndex    = @object[0].Index() - 1;
                objIndexEnd = @object[@object.Count - 1].Index();
            }
            // NOTE(review): the Map(null) calls below look like lambdas lost in the Java-to-C#
            // translation — confirm the intended per-token projection against the original source.
            return((docid == null ? "no_doc_id" : docid) + '\t' + sentIndex + '\t' + SubjectGloss().Replace('\t', ' ') + '\t' + RelationGloss().Replace('\t', ' ') + '\t' + ObjectGloss().Replace('\t', ' ') + '\t' + subjIndex + '\t' + subjIndexEnd + '\t'
                   + relationIndex + '\t' + relationIndexEnd + '\t' + objIndex + '\t' + objIndexEnd + '\t' + ConfidenceGloss() + '\t' + StringUtils.Join(sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null), " ") + '\t' + StringUtils.Join(
                       sentence.Get(typeof(CoreAnnotations.TokensAnnotation)).Stream().Map(null), " ") + '\t' + SubjectLemmaGloss().Replace('\t', ' ') + '\t' + RelationLemmaGloss().Replace('\t', ' ') + '\t' + ObjectLemmaGloss().Replace('\t', ' '));
        }
Exemple #11
0
        /// <summary>
        /// Scans the tokens of <paramref name="s"/> for MentionTokenAnnotation start/end tags and
        /// turns every start..end pair into a <c>Mention</c>.
        /// </summary>
        /// <param name="s">Sentence providing tokens and enhanced dependencies.</param>
        /// <param name="mentions">Receives the mentions that are created.</param>
        /// <param name="mentionSpanSet">Receives the (begin, end) span of every created mention.</param>
        /// <param name="namedEntitySpanSet">Not read by this method.</param>
        protected internal static void ExtractPremarkedEntityMentions(ICoreMap s, IList <Mention> mentions, ICollection <IntPair> mentionSpanSet, ICollection <IntPair> namedEntitySpanSet)
        {
            IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));
            SemanticGraph    graph  = s.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation));

            // Begin index of the mention currently open, or -1 when none is open.
            int openBegin = -1;

            foreach (CoreLabel token in tokens)
            {
                MultiTokenTag tag = token.Get(typeof(CoreAnnotations.MentionTokenAnnotation));
                if (tag == null)
                {
                    // Not part of a mention
                    continue;
                }
                if (tag.IsStart())
                {
                    // Start of mention
                    openBegin = token.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                }
                if (!tag.IsEnd())
                {
                    continue;
                }

                // end of mention
                int closeEnd = token.Get(typeof(CoreAnnotations.IndexAnnotation));
                if (openBegin < 0)
                {
                    SieveCoreferenceSystem.logger.Warning("Start of marked mention not found in sentence: " + tag + " at tokenIndex=" + (token.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1) + " for " + s.Get(typeof(CoreAnnotations.TextAnnotation)));
                    continue;
                }

                IntPair span           = new IntPair(openBegin, closeEnd);
                int     dummyMentionId = -1;
                Mention created        = new Mention(dummyMentionId, openBegin, closeEnd, graph, new List<CoreLabel>(tokens.SubList(openBegin, closeEnd)));
                mentions.Add(created);
                mentionSpanSet.Add(span);
                openBegin = -1;
            }
        }
Exemple #12
0
        /// <summary>
        /// Removes, sentence by sentence, predicted mentions that are non-words, adjectival
        /// demonyms used as modifiers, or stop-list entries.
        /// </summary>
        /// <param name="doc">Annotation providing the sentences, indexed like <paramref name="predictedMentions"/>.</param>
        /// <param name="predictedMentions">Per-sentence mention lists; filtered in place.</param>
        /// <param name="dict">Dictionaries providing the non-word set and the demonym check.</param>
        protected internal virtual void RemoveSpuriousMentionsEn(Annotation doc, IList <IList <Mention> > predictedMentions, Dictionaries dict)
        {
            IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));

            for (int sentIdx = 0; sentIdx < predictedMentions.Count; sentIdx++)
            {
                ICoreMap             sentence   = sentences[sentIdx];
                IList<Mention>       candidates = predictedMentions[sentIdx];
                IList<CoreLabel>     tokens     = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                ICollection<Mention> spurious   = Generics.NewHashSet();

                foreach (Mention mention in candidates)
                {
                    string headTag = mention.headWord.Get(typeof(CoreAnnotations.PartOfSpeechAnnotation));

                    // non word such as 'hmm'
                    if (dict.nonWords.Contains(mention.headString))
                    {
                        spurious.Add(mention);
                    }

                    // adjective form of nations
                    // the [American] policy -> not mention
                    // speak in [Japanese] -> mention
                    // dropped unless the head is a noun and the following token is not a noun
                    if (dict.IsAdjectivalDemonym(mention.SpanToString())
                        && (!headTag.StartsWith("N") || (mention.endIndex < tokens.Count && tokens[mention.endIndex].Tag().StartsWith("N"))))
                    {
                        spurious.Add(mention);
                    }

                    // stop list (e.g., U.S., there)
                    if (InStopList(mention))
                    {
                        spurious.Add(mention);
                    }
                }
                candidates.RemoveAll(spurious);
            }
        }
 /// <summary>
 /// Checks whether two maps agree on every key in <c>keys</c>.
 /// </summary>
 /// <param name="o1">First map.</param>
 /// <param name="o2">Second map.</param>
 /// <returns>
 /// True when, for each key, both values are null or the first value <c>Equals</c> the second;
 /// false as soon as one key disagrees.
 /// </returns>
 public virtual bool Matches(ICoreMap o1, ICoreMap o2)
 {
     foreach (Type key in keys)
     {
         object left  = o1.Get(key);
         object right = o2.Get(key);

         bool agree = left == null ? right == null : left.Equals(right);
         if (!agree)
         {
             return false;
         }
     }
     return true;
 }
        /// <summary>
        /// Determines the resolution flags for a time expression.
        /// </summary>
        /// <param name="annotation">Not read by this method.</param>
        /// <param name="te">The time expression to inspect.</param>
        /// <returns>
        /// The numeric "resolveTo" tag of the expression's value when present; otherwise
        /// <c>SUTime.ResolveToClosest</c> for a partial time, and 0 in all other cases.
        /// </returns>
        public virtual int DetermineRelFlags(ICoreMap annotation, TimeExpression te)
        {
            // An explicit numeric "resolveTo" tag takes precedence.
            if (te.value.GetTags() != null)
            {
                IValue resolveTo = te.value.GetTags().GetTag("resolveTo");
                if (resolveTo != null && resolveTo.Get() is Number)
                {
                    int tagged = ((Number)resolveTo.Get());
                    return tagged;
                }
            }

            if (te.GetTemporal() is SUTime.PartialTime)
            {
                return SUTime.ResolveToClosest;
            }
            return 0;
        }
Exemple #15
0
 // temporary for debug
 /// <summary>
 /// Adds every gold mention whose (start, end) span is not yet in the sentence's span set to
 /// the predicted mentions of that sentence, and records the span.
 /// </summary>
 /// <param name="sentences">The sentences; all four lists are indexed by sentence.</param>
 /// <param name="mentionSpanSetList">Per-sentence sets of spans already covered; updated in place.</param>
 /// <param name="predictedMentions">Per-sentence predicted mentions; updated in place.</param>
 /// <param name="allGoldMentions">Per-sentence gold mentions to merge in.</param>
 protected internal static void AddGoldMentions(IList <ICoreMap> sentences, IList <ICollection <IntPair> > mentionSpanSetList, IList <IList <Mention> > predictedMentions, IList <IList <Mention> > allGoldMentions)
 {
     // The translated loop compared against an undeclared bound; iterate over all sentences.
     for (int i = 0; i < sentences.Count; i++)
     {
         IList <Mention>       mentions       = predictedMentions[i];
         ICoreMap              sent           = sentences[i];
         IList <CoreLabel>     tokens         = sent.Get(typeof(CoreAnnotations.TokensAnnotation));
         ICollection <IntPair> mentionSpanSet = mentionSpanSetList[i];
         IList <Mention>       golds          = allGoldMentions[i];
         foreach (Mention g in golds)
         {
             IntPair pair = new IntPair(g.startIndex, g.endIndex);
             if (!mentionSpanSet.Contains(pair))
             {
                 int     dummyMentionId = -1;
                 // Enhanced dependencies are used when available, basic dependencies otherwise.
                 Mention m = new Mention(dummyMentionId, g.startIndex, g.endIndex, tokens, sent.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)), sent.Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)) != null ? sent
                                         .Get(typeof(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation)) : sent.Get(typeof(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation)), new List <CoreLabel>(tokens.SubList(g.startIndex, g.endIndex)));
                 mentions.Add(m);
                 mentionSpanSet.Add(pair);
             }
         }
     }
 }
Exemple #16
0
        /// <summary>
        /// Runs the number sequence classifier on a copy of <paramref name="words"/> and merges
        /// its results back into the original tokens.
        /// </summary>
        /// <param name="words">The tokens to label; updated in place.</param>
        /// <param name="document">Passed through to the classifier.</param>
        /// <param name="sentence">Passed through to the classifier and the token copy.</param>
        private void RecognizeNumberSequences(IList <CoreLabel> words, ICoreMap document, ICoreMap sentence)
        {
            // we need to copy here because NumberSequenceClassifier overwrites the AnswerAnnotation
            IList <CoreLabel> newWords = NumberSequenceClassifier.CopyTokens(words, sentence);

            nsc.ClassifyWithGlobalInformation(newWords, document, sentence);
            // copy AnswerAnnotation back. Do not overwrite!
            // also, copy all the additional annotations generated by SUTime and NumberNormalizer
            // The translated loop compared against an undeclared bound; iterate over all words.
            for (int i = 0; i < words.Count; i++)
            {
                CoreLabel origWord = words[i];
                CoreLabel newWord  = newWords[i];
                // log.info(newWord.word() + " => " + newWord.get(CoreAnnotations.AnswerAnnotation.class) + " " + origWord.ner());
                string before   = origWord.Get(typeof(CoreAnnotations.AnswerAnnotation));
                string newGuess = newWord.Get(typeof(CoreAnnotations.AnswerAnnotation));
                // Only a missing, background or "MISC" answer may be replaced, and only by a non-background guess.
                if ((before == null || before.Equals(nsc.flags.backgroundSymbol) || before.Equals("MISC")) && !newGuess.Equals(nsc.flags.backgroundSymbol))
                {
                    origWord.Set(typeof(CoreAnnotations.AnswerAnnotation), newGuess);
                }
                // transfer other annotations generated by SUTime or NumberNormalizer
                NumberSequenceClassifier.TransferAnnotations(newWord, origWord);
            }
        }
Exemple #17
0
 /// <summary>Parse a string with SUTime.</summary>
 /// <param name="str">The text to parse.</param>
 /// <returns>The temporal of the first timex annotation found for <paramref name="str"/>.</returns>
 /// <exception cref="Edu.Stanford.Nlp.Time.SUTimeSimpleParser.SUTimeParsingError">
 /// if anything goes wrong, including more than one timex being found; the underlying
 /// exception is attached as the cause.
 /// </exception>
 public static SUTime.Temporal Parse(string str)
 {
     try
     {
         Annotation annotated = new Annotation(str);
         pipeline.Annotate(annotated);
         System.Diagnostics.Debug.Assert(annotated.Get(typeof(CoreAnnotations.SentencesAnnotation)) != null);
         System.Diagnostics.Debug.Assert(!annotated.Get(typeof(CoreAnnotations.SentencesAnnotation)).IsEmpty());

         IList<ICoreMap> timexes = annotated.Get(typeof(TimeAnnotations.TimexAnnotations));
         if (timexes.Count <= 1)
         {
             // An empty list fails on the index below and is wrapped by the catch block.
             ICoreMap only = timexes[0];
             return only.Get(typeof(TimeExpression.Annotation)).GetTemporal();
         }
         throw new Exception("Too many timexes for '" + str + '\'');
     }
     catch (Exception cause)
     {
         SUTimeSimpleParser.SUTimeParsingError failure = new SUTimeSimpleParser.SUTimeParsingError(str);
         failure.InitCause(cause);
         throw failure;
     }
 }
Exemple #18
0
        /// <summary>
        /// Sets <c>headIndex</c>, <c>headWord</c> and <c>headString</c> on every mention from the
        /// syntactic head found in the sentence's parse tree. When the head falls outside the
        /// mention's original span, the mention start is used as the head instead and a warning
        /// is logged.
        /// </summary>
        /// <param name="s">Sentence providing the parse tree and the tokens.</param>
        /// <param name="mentions">The mentions to update in place.</param>
        protected internal virtual void FindHead(ICoreMap s, IList <Mention> mentions)
        {
            Tree             parse  = s.Get(typeof(TreeCoreAnnotations.TreeAnnotation));
            IList<CoreLabel> tokens = s.Get(typeof(CoreAnnotations.TokensAnnotation));

            parse.IndexSpans(0);
            foreach (Mention mention in mentions)
            {
                Tree      headNode  = FindSyntacticHead(mention, parse, tokens);
                CoreLabel headLabel = (CoreLabel)headNode.Label();

                // IndexAnnotation minus one gives the position in the token list.
                mention.headIndex  = headLabel.Get(typeof(CoreAnnotations.IndexAnnotation)) - 1;
                mention.headWord   = tokens[mention.headIndex];
                mention.headString = mention.headWord.Get(typeof(CoreAnnotations.TextAnnotation)).ToLower(Locale.English);

                int  offset         = mention.headIndex - mention.startIndex;
                bool headInsideSpan = offset >= 0 && offset < mention.originalSpan.Count;
                if (headInsideSpan)
                {
                    continue;
                }

                SieveCoreferenceSystem.logger.Warning("Invalid index for head " + offset + "=" + mention.headIndex + "-" + mention.startIndex + ": originalSpan=[" + StringUtils.JoinWords(mention.originalSpan, " ") + "], head=" + mention.headWord);
                SieveCoreferenceSystem.logger.Warning("Setting head string to entire mention");
                mention.headIndex  = mention.startIndex;
                mention.headWord   = mention.originalSpan.Count > 0 ? mention.originalSpan[0] : tokens[mention.startIndex];
                mention.headString = mention.originalSpan.ToString();
            }
        }
 /// <summary>
 /// Logs, at the finer level, every gold mention span of every sentence together with the
 /// sentence, the words inside the span and the parse tree.
 /// </summary>
 /// <param name="goldMentions">Per-sentence gold mentions; its size determines how many sentences are visited.</param>
 /// <param name="predictedMentions">Not read by this method.</param>
 /// <param name="doc">Annotation providing the sentences.</param>
 /// <exception cref="System.IO.IOException"/>
 private static void RecallErrors(IList<IList<Mention>> goldMentions, IList<IList<Mention>> predictedMentions, Annotation doc)
 {
     IList<ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
     int sentenceCount = goldMentions.Count;

     for (int sentIdx = 0; sentIdx < sentenceCount; sentIdx++)
     {
         ICoreMap         sentence = sentences[sentIdx];
         IList<CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
         Tree             parse    = sentence.Get(typeof(TreeCoreAnnotations.TreeAnnotation));

         foreach (Pair<int, int> span in ExtractSpans(goldMentions[sentIdx]))
         {
             logger.Finer("RECALL ERROR\n");
             logger.Finer(sentence + "\n");
             // One log call per word inside the span [first, second).
             for (int pos = span.first; pos < span.second; pos++)
             {
                 logger.Finer(tokens[pos].Value() + " ");
             }
             logger.Finer("\n" + parse + "\n");
         }
     }
 }
        /// <summary>
        /// Extracts the annotation of every expression that does not have one yet and removes
        /// from <paramref name="expressions"/> those whose extraction reported failure.
        /// </summary>
        /// <remarks>
        /// An expression whose extraction throws stays in the list; the exception is only logged
        /// when <c>verbose</c> is set.
        /// </remarks>
        /// <param name="annotation">The annotation the expressions were matched in.</param>
        /// <param name="expressions">The expressions to annotate; filtered in place.</param>
        private void AnnotateExpressions(ICoreMap annotation, IList <T> expressions)
        {
            // TODO: Logging can be excessive
            IList<T> failed = new List<T>();

            foreach (T expression in expressions)
            {
                if (expression.annotation != null)
                {
                    // Already annotated.
                    continue;
                }
                try
                {
                    // Add attributes and all
                    bool extracted = expression.ExtractAnnotation(env, annotation);
                    if (extracted)
                    {
                        if (verbose)
                        {
                            log.Info("annotateExpressions() matched " + expression + " from " + annotation);
                        }
                    }
                    else
                    {
                        // Things didn't turn out so well
                        failed.Add(expression);
                        log.Warn("Error extracting annotation from " + expression);
                    }
                }
                catch (Exception error)
                {
                    /*+ ", " + te.getExtractErrorMessage() */
                    if (verbose)
                    {
                        log.Warn("Error extracting annotation from " + expression);
                        log.Warn(error);
                    }
                }
            }
            expressions.RemoveAll(failed);
        }
Exemple #21
0
        /// <summary>
        /// Parses a string with SUTime, returning null instead of failing when the text does not
        /// yield exactly one usable time expression.
        /// </summary>
        /// <param name="str">The text to parse.</param>
        /// <returns>
        /// The temporal of the single timex found, or null when there are no sentences, the
        /// number of timexes is not exactly one, or the timex carries no time expression.
        /// </returns>
        public static SUTime.Temporal ParseOrNull(string str)
        {
            Annotation annotated = new Annotation(str);

            pipeline.Annotate(annotated);
            if (annotated.Get(typeof(CoreAnnotations.SentencesAnnotation)) == null
                || annotated.Get(typeof(CoreAnnotations.SentencesAnnotation)).IsEmpty())
            {
                return null;
            }

            IList<ICoreMap> timexes = annotated.Get(typeof(TimeAnnotations.TimexAnnotations));
            if (timexes.Count > 1 || timexes.IsEmpty())
            {
                return null;
            }

            ICoreMap only = timexes[0];
            if (only.Get(typeof(TimeExpression.Annotation)) == null)
            {
                return null;
            }
            return only.Get(typeof(TimeExpression.Annotation)).GetTemporal();
        }
Exemple #22
0
 /// <summary>
 /// Writes the sentences and their dependency trees to <paramref name="outFile"/>, one
 /// tab-separated line per token and an empty line after each sentence.
 /// </summary>
 /// <param name="outFile">Path of the file to write.</param>
 /// <param name="sentences">The sentences to write.</param>
 /// <param name="trees">The dependency tree of each sentence, at the same index.</param>
 /// <exception cref="RuntimeIOException">Wraps any exception raised while writing.</exception>
 public static void WriteConllFile(string outFile, IList <ICoreMap> sentences, IList <DependencyTree> trees)
 {
     try
     {
         PrintWriter output = IOUtils.GetPrintWriter(outFile);
         try
         {
             for (int i = 0; i < sentences.Count; i++)
             {
                 ICoreMap          sentence = sentences[i];
                 DependencyTree    tree     = trees[i];
                 IList <CoreLabel> tokens   = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
                 // Token ids are 1-based; the translated loop compared against an undeclared bound.
                 for (int j = 1; j <= tokens.Count; ++j)
                 {
                     CoreLabel token = tokens[j - 1];
                     output.Printf("%d\t%s\t_\t%s\t%s\t_\t%d\t%s\t_\t_%n", j, token.Word(), token.Tag(), token.Tag(), tree.GetHead(j), tree.GetLabel(j));
                 }
                 output.Println();
             }
         }
         finally
         {
             // Close the writer even when writing fails, so the file handle is not leaked.
             output.Close();
         }
     }
     catch (Exception e)
     {
         throw new RuntimeIOException(e);
     }
 }
 /// <summary>
 /// Removes the <c>Tags.TagsAnnotation</c> entry from <paramref name="cm"/> and recurses into
 /// every value that is itself a map or a collection.
 /// </summary>
 /// <param name="cm">The map to clean.</param>
 /// <param name="cleaned">
 /// Objects already visited: false while an object is being processed, true once it is done.
 /// An object that is already present is not visited again.
 /// </param>
 private void CleanupTags(ICoreMap cm, IDictionary <object, bool> cleaned)
 {
     cm.Remove(typeof(Tags.TagsAnnotation));
     foreach (Type key in cm.KeySet())
     {
         object value = cm.Get(key);
         if (cleaned.Contains(value))
         {
             continue;
         }

         // Mark as in progress before recursing.
         cleaned[value] = false;
         if (value is ICoreMap)
         {
             CleanupTags((ICoreMap)value, cleaned);
         }
         else if (value is ICollection)
         {
             CleanupTags((ICollection)value, cleaned);
         }
         cleaned[value] = true;
     }
 }
Exemple #24
0
        /// <summary>
        /// Builds a <c>Document</c> from <paramref name="input"/>, using gold mentions when the
        /// properties ask for them and each sentence's CorefMentionsAnnotation otherwise, and
        /// then preprocesses it.
        /// </summary>
        /// <param name="input">The input document with its annotation and optional gold mentions.</param>
        /// <returns>The preprocessed document.</returns>
        /// <exception cref="System.Exception"/>
        public virtual Document MakeDocument(InputDoc input)
        {
            IList<IList<Mention>> mentionsPerSentence = new List<IList<Mention>>();

            if (!CorefProperties.UseGoldMentions(props))
            {
                // Use the mentions already attached to each sentence.
                foreach (ICoreMap annotated in input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
                {
                    mentionsPerSentence.Add(annotated.Get(typeof(CorefCoreAnnotations.CorefMentionsAnnotation)));
                }
            }
            else
            {
                // Rebuild mentions from the gold spans and find a head for each of them.
                IList<ICoreMap> allSentences = input.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation));
                for (int sentIdx = 0; sentIdx < allSentences.Count; sentIdx++)
                {
                    ICoreMap         current = allSentences[sentIdx];
                    IList<CoreLabel> tokens  = current.Get(typeof(CoreAnnotations.TokensAnnotation));
                    IList<Mention>   rebuilt = new List<Mention>();
                    mentionsPerSentence.Add(rebuilt);
                    foreach (Mention gold in input.goldMentions[sentIdx])
                    {
                        List<CoreLabel> span = new List<CoreLabel>(tokens.SubList(gold.startIndex, gold.endIndex));
                        rebuilt.Add(new Mention(-1, gold.startIndex, gold.endIndex, tokens, null, null, span));
                    }
                    md.FindHead(current, rebuilt);
                }
            }

            Document doc = new Document(input, mentionsPerSentence);
            if (input.goldMentions != null)
            {
                FindGoldMentionHeads(doc);
            }
            DocumentPreprocessor.Preprocess(doc, dict, null, headFinder);
            return doc;
        }
Exemple #25
0
        /// <summary>
        /// Checks sentence splitting with <c>ssplit.newlineIsSentenceBreak</c> set to <c>always</c>:
        /// the sample text must yield three sentences and nine document tokens.
        /// </summary>
        public virtual void TestAlwaysNewlineIsSentenceBreakSettings()
        {
            string[] expected = new string[] { "This is", "one sentence", "This is not another ." };
            string   input    = "This is \none sentence\n\nThis is not another.";

            Properties      config    = PropertiesUtils.AsProperties("annotators", "tokenize, ssplit", "ssplit.newlineIsSentenceBreak", "always");
            StanfordCoreNLP annotator = new StanfordCoreNLP(config);
            Annotation      annotated = new Annotation(input);
            annotator.Annotate(annotated);

            IList <ICoreMap> split = annotated.Get(typeof(CoreAnnotations.SentencesAnnotation));
            NUnit.Framework.Assert.AreEqual(3, split.Count);

            // make sure that there are the correct # of tokens (count does contain NL tokens)
            IList <CoreLabel> allTokens = annotated.Get(typeof(CoreAnnotations.TokensAnnotation));
            NUnit.Framework.Assert.AreEqual(9, allTokens.Count);

            // Compare sentence by sentence, over as many sentences as both sides have.
            int comparable = Math.Min(expected.Length, split.Count);
            for (int index = 0; index < comparable; index++)
            {
                IList <CoreLabel> words  = split[index].Get(typeof(CoreAnnotations.TokensAnnotation));
                string            joined = SentenceUtils.ListToString(words);
                NUnit.Framework.Assert.AreEqual(expected[index], joined, "Bad sentence #" + index);
            }
        }
Exemple #26
0
        /// <summary>Returns the text covered by a token range; used for filling in the text of a mention.</summary>
        /// <remarks>
        /// If the first token of the range refers to an entity mention whose character offsets coincide
        /// with the range, the text of that entity mention is returned (it has been processed to remove
        /// things like newlines and xml). Otherwise the corresponding substring of the document text is
        /// returned.
        /// </remarks>
        /// <param name="tokenRange">Indices of the first and the last token of the range.</param>
        /// <returns>The text of the matching entity mention, or the raw document substring.</returns>
        public virtual string TokenRangeToString(Pair <int, int> tokenRange)
        {
            IList <CoreLabel> tokens     = doc.Get(typeof(CoreAnnotations.TokensAnnotation));
            CoreLabel         firstToken = tokens[tokenRange.first];
            CoreLabel         lastToken  = tokens[tokenRange.second];
            int               rangeBegin = firstToken.BeginPosition();
            int               rangeEnd   = lastToken.EndPosition();

            // see if the token range matches an entity mention
            IList <ICoreMap> entityMentionsInDoc = doc.Get(typeof(CoreAnnotations.MentionsAnnotation));
            // The index must be nullable: a plain int can never equal null, which would make the
            // null check below always succeed even when the token carries no such annotation.
            int?     potentialMatchingEntityMentionIndex = firstToken.Get(typeof(CoreAnnotations.EntityMentionIndexAnnotation));
            ICoreMap potentialMatchingEntityMention      = null;

            if (entityMentionsInDoc != null && potentialMatchingEntityMentionIndex.HasValue)
            {
                potentialMatchingEntityMention = entityMentionsInDoc[potentialMatchingEntityMentionIndex.Value];
            }
            // if there is a matching entity mention, return its text (which has been processed to remove
            // things like newlines and xml)...if there isn't return the full substring of the document text
            if (potentialMatchingEntityMention != null
                && potentialMatchingEntityMention.Get(typeof(CoreAnnotations.CharacterOffsetBeginAnnotation)) == rangeBegin
                && potentialMatchingEntityMention.Get(typeof(CoreAnnotations.CharacterOffsetEndAnnotation)) == rangeEnd)
            {
                return(potentialMatchingEntityMention.Get(typeof(CoreAnnotations.TextAnnotation)));
            }
            return(Sharpen.Runtime.Substring(doc.Get(typeof(CoreAnnotations.TextAnnotation)), rangeBegin, rangeEnd));
        }
        /// <summary>
        /// Attaches an enhanced sentence to every sentence that a newline-insensitive sentence splitter
        /// would have merged with its predecessor.
        /// </summary>
        /// <remarks>
        /// For each sentence after the first one, the tokens of the previous and the current sentence
        /// are concatenated and re-split by a splitter that never breaks on newlines. If the result is a
        /// single sentence, the merged sentence is stored on the current sentence under
        /// <c>QuoteAttributionUtils.EnhancedSentenceAnnotation</c>.
        /// </remarks>
        /// <param name="doc">Annotation whose sentences are inspected and updated in place.</param>
        public static void AddEnhancedSentences(Annotation doc)
        {
            IList <ICoreMap> sentences = doc.Get(typeof(CoreAnnotations.SentencesAnnotation));
            // create SentenceSplitter that never splits on newline
            WordToSentenceProcessor wsp = new WordToSentenceProcessor(WordToSentenceProcessor.NewlineIsSentenceBreak.Never);

            for (int i = 1; i < sentences.Count; i++)
            {
                ICoreMap          sentence     = sentences[i];
                ICoreMap          prevSentence = sentences[i - 1];
                IList <CoreLabel> tokensConcat = new List <CoreLabel>();
                Sharpen.Collections.AddAll(tokensConcat, prevSentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
                Sharpen.Collections.AddAll(tokensConcat, sentence.Get(typeof(CoreAnnotations.TokensAnnotation)));
                IList <IList <CoreLabel> > sentenceTokens = wsp.Process(tokensConcat);
                if (sentenceTokens.Count == 1)
                {
                    // wsp would have put them into a single sentence --> add enhanced sentence.
                    sentence.Set(typeof(QuoteAttributionUtils.EnhancedSentenceAnnotation), ConstructSentence(sentenceTokens[0], prevSentence, sentence));
                }
            }
        }
        /// <summary>
        /// Renders the tokens of a sentence as one space-separated string, appending <c>/TAG</c> to every
        /// word whose NER tag is set and differs from <c>O</c>.
        /// </summary>
        /// <param name="sentence">Sentence whose tokens annotation is rendered.</param>
        /// <returns>The rendered string; empty when the sentence has no tokens annotation.</returns>
        public static string TokensAndNELabelsToString(ICoreMap sentence)
        {
            IList <CoreLabel> words = sentence.Get(typeof(CoreAnnotations.TokensAnnotation));
            if (words == null)
            {
                return(string.Empty);
            }

            StringBuilder buffer    = new StringBuilder();
            string        separator = string.Empty;
            foreach (CoreLabel word in words)
            {
                // The separator is empty for the first word and a single space afterwards.
                buffer.Append(separator).Append(word.Word());
                string nerTag = word.Ner();
                if (nerTag != null && !nerTag.Equals("O"))
                {
                    buffer.Append("/").Append(nerTag);
                }
                separator = " ";
            }
            return(buffer.ToString());
        }
        /// <summary>Process an entity given the NER tag, extracted modifier and the next word in the document.</summary>
        /// <remarks>
        /// The entity text is built from <paramref name="l"/>, its full digits are converted to half
        /// digits, and the result is normalized according to <paramref name="entityType"/>.
        /// A non-null normalized value is written in place on every element of <paramref name="l"/>.
        /// </remarks>
        /// <typeparam name="E">Type of the annotation maps that make up the entity.</typeparam>
        /// <param name="l">A collector that collects annotations for the entity.</param>
        /// <param name="entityType">Quantifiable NER tag.</param>
        /// <param name="compModifier">
        /// The extracted modifier around the entity of interest. Different NER tags should
        /// have different extraction rules.
        /// </param>
        /// <param name="nextWord">Next word in the document.</param>
        /// <param name="document">Reference to the document; only read for DATE entities.</param>
        /// <returns>
        /// The list <paramref name="l"/> itself. The return value is not necessarily useful as the
        /// labelling is done in place.
        /// </returns>
        private static IList <E> ProcessEntity <E>(IList <E> l, string entityType, string compModifier, string nextWord, ICoreMap document)
            where E : ICoreMap
        {
            // convert the entity annotations into a string
            string        s  = SingleEntityToString(l);
            StringBuilder sb = new StringBuilder();

            // convert all full digits to half digits
            for (int i = 0, sz = s.Length; i < sz; i++)
            {
                string ch = Sharpen.Runtime.Substring(s, i, i + 1);
                // NOTE(review): the declaration of fullDigitToHalfDigit is not visible here; if it is a
                // generic dictionary this membership test should be ContainsKey/TryGetValue - confirm.
                if (fullDigitToHalfDigit.Contains(ch))
                {
                    ch = fullDigitToHalfDigit[ch];
                }
                sb.Append(ch);
            }
            s = sb.ToString();
            string p = null;

            switch (entityType)
            {
            case NumberTag:
            {
                string number = NormalizedNumberString(s, nextWord, 1.0);
                // The modifier is only kept when the number itself could be normalized.
                p = (number == null) ? null : (compModifier ?? string.Empty) + number;
                break;
            }

            case OrdinalTag:
            {
                // ordinal won't have modifier
                p = NormalizedOrdinalString(s, nextWord);
                break;
            }

            case PercentTag:
            {
                p = NormalizedPercentString(s, nextWord);
                break;
            }

            case MoneyTag:
            {
                string money = NormalizedMoneyString(s, nextWord);
                // The modifier is only kept when the amount itself could be normalized.
                p = (money == null) ? null : (compModifier ?? string.Empty) + money;
                break;
            }

            case DateTag:
            {
                if (s.Matches(BasicYyyymmddPattern) || s.Matches(BasicMmddPattern) || s.Matches(EnglishMmddyyyyPattern) || s.Matches(BasicDdPattern) || s.Matches(RelativeTimePattern) || s.Matches(BirthDecadePattern))
                {
                    string docdate = document.Get(typeof(CoreAnnotations.DocDateAnnotation));
                    p = NormalizeDateString(s, docdate);
                }
                break;
            }

            case TimeTag:
            {
                // TIME entities are left without a normalized value.
                break;
            }
            }
            // Write the normalized NER values in place
            if (p != null)
            {
                foreach (E wi in l)
                {
                    wi.Set(typeof(CoreAnnotations.NormalizedNamedEntityTagAnnotation), p);
                }
            }
            return(l);
        }
        // Patterns used by DATE and TIME (must be after the static initializers to make use of the modifiers)
        // static methods
        /// <summary>
        /// Identifies contiguous MONEY, TIME, DATE, or PERCENT entities
        /// and tags each of their constituents with a "normalizedQuantity"
        /// label which contains the appropriate normalized string corresponding to
        /// the full quantity.
        /// </summary>
        /// <remarks>
        /// Unlike the English normalizer, this method currently does not support
        /// concatenation or SUTime.
        /// </remarks>
        /// <typeparam name="E">Type of the annotation maps in <paramref name="list"/>.</typeparam>
        /// <param name="list">
        /// A list of
        /// <see cref="Edu.Stanford.Nlp.Util.ICoreMap"/>
        /// s representing a single document.
        /// Note: We assume the NER tags have been labelled and the labels
        /// will be updated in place.
        /// </param>
        /// <param name="document">Document annotation handed to the normalization of DATE entities.</param>
        /// <param name="sentence">Not read by this method.</param>
        public static void AddNormalizedQuantitiesToEntities <E>(IList <E> list, ICoreMap document, ICoreMap sentence)
            where E : ICoreMap
        {
            // Fix the NER sequence if necessary
            FixupNerBeforeNormalization(list);
            // Now that NER tags have been fixed up, we do another pass to add the normalization
            string   prevNerTag  = BackgroundSymbol;
            int      beforeIndex = -1;
            List <E> collector   = new List <E>();
            // we should always keep the list size unchanged inside the loop, so it is read once
            int sz = list.Count;

            // The extra iteration at i == sz closes an entity that ends with the last element.
            for (int i = 0; i <= sz; i++)
            {
                // default(E) rather than null: E is only constrained to an interface
                E      wi         = default(E);
                string currNerTag = null;
                string nextWord   = string.Empty;
                if (i < sz)
                {
                    wi = list[i];
                    if (i + 1 < sz)
                    {
                        nextWord = list[i + 1].Get(typeof(CoreAnnotations.TextAnnotation));
                        if (nextWord == null)
                        {
                            nextWord = string.Empty;
                        }
                    }
                    // We assume NERs have been set by previous NER taggers
                    currNerTag = wi.Get(typeof(CoreAnnotations.NamedEntityTagAnnotation));
                }
                // TODO: may need to detect TIME modifier here?
                // if the current wi is a non-continuation and the last one was a
                // quantity, we close and process the last segment.
                // TODO: also need to check compatibility as the English normalizer does
                if ((currNerTag == null || !currNerTag.Equals(prevNerTag)) && quantifiable.Contains(prevNerTag))
                {
                    string modifier = null;
                    switch (prevNerTag)
                    {
                    case TimeTag:
                    {
                        // Need different handling for different tags
                        // TODO: add TIME
                        break;
                    }

                    case DateTag:
                    {
                        ProcessEntity(collector, prevNerTag, modifier, nextWord, document);
                        break;
                    }

                    default:
                    {
                        if (prevNerTag.Equals(NumberTag) || prevNerTag.Equals(PercentTag) || prevNerTag.Equals(MoneyTag))
                        {
                            // we are doing for prev tag so afterIndex should really be i
                            modifier = DetectQuantityModifier(list, beforeIndex, i);
                        }
                        // NOTE(review): relies on a four-argument ProcessEntity overload that is not
                        // visible in this part of the file - confirm it exists.
                        ProcessEntity(collector, prevNerTag, modifier, nextWord);
                        break;
                    }
                    }
                    collector = new List <E>();
                }
                // If currNerTag is quantifiable, we add it into collector
                if (quantifiable.Contains(currNerTag))
                {
                    if (collector.Count == 0)
                    {
                        // remember the position just before the entity starts
                        beforeIndex = i - 1;
                    }
                    collector.Add(wi);
                }
                // move on and update prev pointer
                prevNerTag = currNerTag;
            }
        }
Exemple #31
0
        /// <summary>Scores the candidate speakers of a quote from the mentions closest to it.</summary>
        /// <remarks>
        /// Name mentions are resolved through <c>characterMap</c> (names missing from it are skipped) and
        /// pronoun mentions through <c>DoCoreference</c>; mentions of any other type are ignored.
        /// Each resolved person is credited with <c>BackwardWeight</c> when the mention also occurs in
        /// <paramref name="closestMentionsBackward"/> and with <c>ForwardWeight</c> otherwise.
        /// </remarks>
        /// <param name="closestMentions">Mentions to score.</param>
        /// <param name="closestMentionsBackward">Mentions that are credited with the backward weight.</param>
        /// <param name="gender">Gender the speaker is expected to have; <c>Unk</c> accepts every person.</param>
        /// <param name="quote">Quote handed to the coreference resolution of pronoun mentions.</param>
        /// <param name="overrideGender">
        /// When true, the gender-agnostic counts are returned if no person of the expected gender was found.
        /// </param>
        /// <returns>
        /// The gender-filtered counts when they are non-empty, or when the gender is known and
        /// <paramref name="overrideGender"/> is false (the counter may then be empty); otherwise the
        /// counts that ignore gender.
        /// </returns>
        public virtual ICounter <string> GetTopSpeakers(IList <Sieve.MentionData> closestMentions, IList <Sieve.MentionData> closestMentionsBackward, Person.Gender gender, ICoreMap quote, bool overrideGender)
        {
            ICounter <string> topSpeakerInRange               = new ClassicCounter <string>();
            ICounter <string> topSpeakerInRangeIgnoreGender   = new ClassicCounter <string>();
            ICollection <Sieve.MentionData> backwardsMentions = new HashSet <Sieve.MentionData>(closestMentionsBackward);

            foreach (Sieve.MentionData mention in closestMentions)
            {
                double weight = backwardsMentions.Contains(mention) ? BackwardWeight : ForwardWeight;
                Person p      = null;
                if (mention.type.Equals(Name))
                {
                    if (!characterMap.Keys.Contains(mention.text))
                    {
                        continue;
                    }
                    p = characterMap[mention.text][0];
                }
                else if (mention.type.Equals(Pronoun))
                {
                    int charBeginKey = doc.Get(typeof(CoreAnnotations.TokensAnnotation))[mention.begin].BeginPosition();
                    p = DoCoreference(charBeginKey, quote);
                }
                if (p == null)
                {
                    // Unsupported mention type, or the mention could not be resolved to a person.
                    continue;
                }
                if (IsGenderCompatible(gender, p))
                {
                    topSpeakerInRange.IncrementCount(p.name, weight);
                }
                topSpeakerInRangeIgnoreGender.IncrementCount(p.name, weight);
            }
            // Fall back to the gender-agnostic counts only when nothing matched and either the gender
            // is unknown or the caller allowed the gender to be overridden.
            if (topSpeakerInRange.Size() > 0 || (gender != Person.Gender.Unk && !overrideGender))
            {
                return(topSpeakerInRange);
            }
            return(topSpeakerInRangeIgnoreGender);
        }

        /// <summary>Tells whether a person satisfies the expected speaker gender.</summary>
        /// <param name="expected">Expected gender; <c>Unk</c> accepts every person.</param>
        /// <param name="candidate">Person to test.</param>
        /// <returns>True when the expected gender is unknown or both genders are the same known gender.</returns>
        private static bool IsGenderCompatible(Person.Gender expected, Person candidate)
        {
            if (expected == Person.Gender.Unk)
            {
                return(true);
            }
            return((expected == Person.Gender.Male && candidate.gender == Person.Gender.Male) || (expected == Person.Gender.Female && candidate.gender == Person.Gender.Female));
        }