コード例 #1
0
        /// <summary>
        /// Scores every unlabeled mention pair of the document with the classification, ranking and
        /// anaphoricity models, hands the scores to the clusterer, and applies each merge it returns.
        /// </summary>
        /// <param name="document">Document that is passed to the extractor and to
        /// <c>CorefUtils.MergeCoreferenceClusters</c>.</param>
        public virtual void RunCoref(Document document)
        {
            IDictionary <Pair <int, int>, bool> candidatePairs = CorefUtils.GetUnlabeledMentionPairs(document);
            if (candidatePairs.Count == 0)
            {
                // No candidate pairs: nothing to score and nothing to merge.
                return;
            }

            Compressor <string> featureCompressor = new Compressor <string>();
            DocumentExamples    extracted         = extractor.Extract(0, document, candidatePairs, featureCompressor);

            ICounter <Pair <int, int> > classifierScores = new ClassicCounter <Pair <int, int> >();
            ICounter <Pair <int, int> > rankerScores     = new ClassicCounter <Pair <int, int> >();
            ICounter <int>              anaphorScores    = new ClassicCounter <int>();

            foreach (Example current in extracted.examples)
            {
                CorefUtils.CheckForInterrupt();

                Pair <int, int> pairKey = new Pair <int, int>(current.mentionId1, current.mentionId2);
                classifierScores.IncrementCount(pairKey, classificationModel.Predict(current, extracted.mentionFeatures, featureCompressor));
                rankerScores.IncrementCount(pairKey, rankingModel.Predict(current, extracted.mentionFeatures, featureCompressor));

                // The anaphoricity score is computed only once per second mention of a pair.
                if (anaphorScores.ContainsKey(current.mentionId2))
                {
                    continue;
                }
                anaphorScores.IncrementCount(current.mentionId2, anaphoricityModel.Predict(new Example(current, false), extracted.mentionFeatures, featureCompressor));
            }

            // NOTE(review): both ToMap selectors are null here; this looks like lambdas lost in an
            // automated conversion. Confirm which key/value mapping ClustererDoc expects.
            ClustererDataLoader.ClustererDoc clustererInput = new ClustererDataLoader.ClustererDoc(
                0,
                classifierScores,
                rankerScores,
                anaphorScores,
                candidatePairs,
                null,
                document.predictedMentionsByID.Stream().Collect(Collectors.ToMap(null, null)));

            foreach (Pair <int, int> merge in clusterer.GetClusterMerges(clustererInput))
            {
                CorefUtils.MergeCoreferenceClusters(merge, document);
            }
        }
コード例 #2
0
 /// <summary>
 /// Main entry point of the coreference system: runs every sieve over the document,
 /// optionally post-processes it, and builds the resulting coreference chains.
 /// </summary>
 /// <param name="document">Input document for coref format (Annotation and optional information).</param>
 /// <param name="output">
 /// Receives the output of the coref system; the array must have four slots.
 /// Slots 0-2 are only written when scoring is enabled: [0] gold CoNLL output,
 /// [1] CoNLL output before coref, [2] CoNLL output after coref. Slot [3] always
 /// receives the log produced by the sieves.
 /// </param>
 /// <returns>Map of coref chain ID and corresponding chain.</returns>
 /// <exception cref="System.Exception"/>
 public virtual IDictionary <int, CorefChain> Coref(Document document, StringBuilder[] output)
 {
     if (HybridCorefProperties.PrintMDLog(props))
     {
         Redwood.Log(HybridCorefPrinter.PrintMentionDetectionLog(document));
     }

     if (HybridCorefProperties.DoScore(props))
     {
         // gold
         StringBuilder goldConll = new StringBuilder();
         goldConll.Append(CorefPrinter.PrintConllOutput(document, true));
         output[0] = goldConll;

         // before coref
         StringBuilder beforeCoref = new StringBuilder();
         beforeCoref.Append(CorefPrinter.PrintConllOutput(document, false));
         output[1] = beforeCoref;
     }

     // log from sieves
     StringBuilder sieveLog = new StringBuilder();
     output[3] = sieveLog;
     foreach (Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve sieve in sieves)
     {
         CorefUtils.CheckForInterrupt();
         sieveLog.Append(sieve.ResolveMention(document, dictionaries, props));
     }

     // post processing
     if (HybridCorefProperties.DoPostProcessing(props))
     {
         PostProcessing(document);
     }

     if (HybridCorefProperties.DoScore(props))
     {
         // after coref
         StringBuilder afterCoref = new StringBuilder();
         afterCoref.Append(CorefPrinter.PrintConllOutput(document, false, true));
         output[2] = afterCoref;
     }

     return MakeCorefOutput(document);
 }
コード例 #3
0
        /// <summary>
        /// Scores the heuristically filtered mention pairs of the document with the pairwise
        /// classifier and, for every anaphor, merges it with its best-scoring candidate antecedent
        /// when that score exceeds the threshold for the pair's pronominal/non-pronominal combination.
        /// </summary>
        /// <param name="document">Document whose mentions are scored and passed to
        /// <c>CorefUtils.MergeCoreferenceClusters</c>.</param>
        public virtual void RunCoref(Document document)
        {
            Compressor <string> compressor = new Compressor <string>();

            if (Thread.Interrupted())
            {
                // Allow interrupting
                throw new RuntimeInterruptedException();
            }
            IDictionary <Pair <int, int>, bool> pairs = new Dictionary <Pair <int, int>, bool>();

            // Each entry maps a mention (key) to its candidate antecedents (value).
            foreach (KeyValuePair <int, IList <int> > e in CorefUtils.HeuristicFilter(CorefUtils.GetSortedMentions(document), maxMentionDistance, maxMentionDistanceWithStringMatch))
            {
                foreach (int m1 in e.Value)
                {
                    pairs[new Pair <int, int>(m1, e.Key)] = true;
                }
            }
            DocumentExamples            examples       = extractor.Extract(0, document, pairs, compressor);
            ICounter <Pair <int, int> > pairwiseScores = new ClassicCounter <Pair <int, int> >();

            foreach (Example mentionPair in examples.examples)
            {
                if (Thread.Interrupted())
                {
                    // Allow interrupting
                    throw new RuntimeInterruptedException();
                }
                pairwiseScores.IncrementCount(new Pair <int, int>(mentionPair.mentionId1, mentionPair.mentionId2), classifier.Predict(mentionPair, examples.mentionFeatures, compressor));
            }

            // Declared as List<T> (not IList<T>) so that List<T>.Sort(Comparison<T>) is available.
            List <Pair <int, int> > mentionPairs = new List <Pair <int, int> >(pairwiseScores.KeySet());

            // Highest score first. The loop below keeps only the first pair it meets for each
            // anaphor, so this ordering is what makes that pair the best-scoring one. The previous
            // Sort(null) call supplied no comparer at all.
            mentionPairs.Sort((left, right) => pairwiseScores.GetCount(right).CompareTo(pairwiseScores.GetCount(left)));
            ICollection <int> seenAnaphors = new HashSet <int>();

            foreach (Pair <int, int> pair in mentionPairs)
            {
                if (seenAnaphors.Contains(pair.second))
                {
                    // A better-scoring pair for this anaphor has already been handled.
                    continue;
                }
                if (Thread.Interrupted())
                {
                    // Allow interrupting
                    throw new RuntimeInterruptedException();
                }
                seenAnaphors.Add(pair.second);
                Dictionaries.MentionType mt1 = document.predictedMentionsByID[pair.first].mentionType;
                Dictionaries.MentionType mt2 = document.predictedMentionsByID[pair.second].mentionType;
                // The threshold is looked up by (first mention is pronominal, second mention is pronominal).
                if (pairwiseScores.GetCount(pair) > thresholds[new Pair <bool, bool>(mt1 == Dictionaries.MentionType.Pronominal, mt2 == Dictionaries.MentionType.Pronominal)])
                {
                    CorefUtils.MergeCoreferenceClusters(pair, document);
                }
            }
        }
コード例 #4
0
 /// <summary>
 /// Applies the merges recorded in <c>toMerge</c> for the current document id, if any,
 /// and then advances <c>currentDocId</c> by one.
 /// </summary>
 /// <param name="document">Document passed to <c>CorefUtils.MergeCoreferenceClusters</c>.</param>
 public virtual void RunCoref(Document document)
 {
     bool hasRecordedMerges = toMerge.Contains(currentDocId);
     if (hasRecordedMerges)
     {
         foreach (Pair <int, int> recordedMerge in toMerge[currentDocId])
         {
             CorefUtils.MergeCoreferenceClusters(recordedMerge, document);
         }
     }

     // The id advances on every call, whether or not merges were recorded for it.
     currentDocId++;
 }
コード例 #5
0
        /// <summary>
        /// Embeds every mention of the document, scores each mention against its candidate
        /// antecedents, and merges the mention with the best-scoring antecedent when that score
        /// beats the mention's adjusted anaphoricity score.
        /// </summary>
        /// <param name="document">Document whose mentions are scored and passed to
        /// <c>CorefUtils.MergeCoreferenceClusters</c>.</param>
        public virtual void RunCoref(Document document)
        {
            IList <Mention> sortedMentions = CorefUtils.GetSortedMentions(document);
            IDictionary <int, IList <Mention> > mentionsByHeadIndex = new Dictionary <int, IList <Mention> >();

            foreach (Mention m in sortedMentions)
            {
                // Create the per-head-index list on first use; the previous ComputeIfAbsent call
                // was given a null factory and therefore never created one.
                IList <Mention> withIndex;
                if (!mentionsByHeadIndex.TryGetValue(m.headIndex, out withIndex))
                {
                    withIndex = new List <Mention>();
                    mentionsByHeadIndex[m.headIndex] = withIndex;
                }
                withIndex.Add(m);
            }
            SimpleMatrix documentEmbedding = embeddingExtractor.GetDocumentEmbedding(document);
            IDictionary <int, SimpleMatrix> antecedentEmbeddings = new Dictionary <int, SimpleMatrix>();
            IDictionary <int, SimpleMatrix> anaphorEmbeddings    = new Dictionary <int, SimpleMatrix>();
            ICounter <int> anaphoricityScores = new ClassicCounter <int>();

            foreach (Mention m_1 in sortedMentions)
            {
                SimpleMatrix mentionEmbedding = embeddingExtractor.GetMentionEmbeddings(m_1, documentEmbedding);
                antecedentEmbeddings[m_1.mentionID] = model.GetAntecedentEmbedding(mentionEmbedding);
                anaphorEmbeddings[m_1.mentionID]    = model.GetAnaphorEmbedding(mentionEmbedding);
                anaphoricityScores.IncrementCount(m_1.mentionID, model.GetAnaphoricityScore(mentionEmbedding, featureExtractor.GetAnaphoricityFeatures(m_1, document, mentionsByHeadIndex)));
            }
            IDictionary <int, IList <int> > mentionToCandidateAntecedents = CorefUtils.HeuristicFilter(sortedMentions, maxMentionDistance, maxMentionDistanceWithStringMatch);

            foreach (KeyValuePair <int, IList <int> > e in mentionToCandidateAntecedents)
            {
                int m_2 = e.Key;
                // Baseline a candidate must beat: the anaphoricity score shifted by the greedyness setting.
                double bestScore = anaphoricityScores.GetCount(m_2) - 50 * (greedyness - 0.5);
                // Nullable so that "no antecedent beat the baseline" can be represented.
                int? antecedent = null;
                foreach (int ca in e.Value)
                {
                    double score = model.GetPairwiseScore(antecedentEmbeddings[ca], anaphorEmbeddings[m_2], featureExtractor.GetPairFeatures(new Pair <int, int>(ca, m_2), document, mentionsByHeadIndex));
                    if (score > bestScore)
                    {
                        bestScore  = score;
                        antecedent = ca;
                    }
                }
                if (antecedent.HasValue)
                {
                    CorefUtils.MergeCoreferenceClusters(new Pair <int, int>(antecedent.Value, m_2), document);
                }
            }
        }
コード例 #6
0
        /// <summary>
        /// Collects the labeled mention pairs of a document, downsamples negative pairs when the
        /// share of positive pairs is below <c>minClassImbalancedPerDocument</c>, drops all pairs
        /// of randomly chosen mentions until at most <c>maxExamplesPerDocument</c> pairs remain,
        /// and stores the result in <c>mentionPairs</c> under <paramref name="id"/>.
        /// </summary>
        /// <param name="id">Key under which the surviving pairs are stored.</param>
        /// <param name="document">Document whose labeled mention pairs are sampled.</param>
        public virtual void Process(int id, Document document)
        {
            IDictionary <Pair <int, int>, bool> labeledPairs = CorefUtils.GetLabeledMentionPairs(document);

            // Split the pairs by label: count the positives, keep the negatives for downsampling.
            // (The previous stream filters were given null predicates.)
            long numP = 0;
            IList <Pair <int, int> > negative = new List <Pair <int, int> >();
            foreach (KeyValuePair <Pair <int, int>, bool> labeled in labeledPairs)
            {
                if (labeled.Value)
                {
                    numP++;
                }
                else
                {
                    negative.Add(labeled.Key);
                }
            }
            int numN = negative.Count;

            if (numP / (float)(numP + numN) < minClassImbalancedPerDocument)
            {
                // Number of negatives to keep so that the positive share reaches the minimum.
                numN = (int)(numP / minClassImbalancedPerDocument - numP);
                Java.Util.Collections.Shuffle(negative);
                for (int i = numN; i < negative.Count; i++)
                {
                    Sharpen.Collections.Remove(labeledPairs, negative[i]);
                }
            }
            IDictionary <int, IList <int> > mentionToCandidateAntecedents = new Dictionary <int, IList <int> >();

            foreach (Pair <int, int> pair in labeledPairs.Keys)
            {
                // TryGetValue instead of the indexer: the Dictionary indexer throws
                // KeyNotFoundException for a key that has not been added yet.
                IList <int> candidateAntecedents;
                if (!mentionToCandidateAntecedents.TryGetValue(pair.second, out candidateAntecedents))
                {
                    candidateAntecedents = new List <int>();
                    mentionToCandidateAntecedents[pair.second] = candidateAntecedents;
                }
                candidateAntecedents.Add(pair.first);
            }
            IList <int> mentions = new List <int>(mentionToCandidateAntecedents.Keys);

            // Stop as well once no mention is left, so a random index is never requested
            // for an empty list.
            while (labeledPairs.Count > maxExamplesPerDocument && mentions.Count > 0)
            {
                // Pick a mention by position and remove it by position; IList<int>.Remove(int)
                // would remove by value and return a bool.
                int index   = random.NextInt(mentions.Count);
                int mention = mentions[index];
                mentions.RemoveAt(index);
                foreach (int candidateAntecedent in mentionToCandidateAntecedents[mention])
                {
                    Sharpen.Collections.Remove(labeledPairs, new Pair <int, int>(candidateAntecedent, mention));
                }
            }
            mentionPairs[id] = labeledPairs;
        }
コード例 #7
0
        /// <summary>
        /// Builds one <c>Example</c> per labeled mention pair, plus a compressed feature vector
        /// for every mention that occurs in at least one pair.
        /// </summary>
        /// <param name="id">Identifier passed to each example and to the returned container.</param>
        /// <param name="document">Document that supplies the mentions.</param>
        /// <param name="labeledPairs">Mention-id pairs with their label; <c>true</c> becomes 1.0
        /// and <c>false</c> becomes 0.0 in the example.</param>
        /// <param name="compressor">Compressor applied to every extracted feature set.</param>
        /// <returns>The examples together with the per-mention feature vectors.</returns>
        public virtual DocumentExamples Extract(int id, Document document, IDictionary <Pair <int, int>, bool> labeledPairs, Compressor <string> compressor)
        {
            IList <Mention> mentionsList = CorefUtils.GetSortedMentions(document);
            IDictionary <int, IList <Mention> > mentionsByHeadIndex = new Dictionary <int, IList <Mention> >();

            foreach (Mention m in mentionsList)
            {
                // TryGetValue instead of the indexer: the Dictionary indexer throws
                // KeyNotFoundException for a head index that has not been added yet.
                IList <Mention> withIndex;
                if (!mentionsByHeadIndex.TryGetValue(m.headIndex, out withIndex))
                {
                    withIndex = new List <Mention>();
                    mentionsByHeadIndex[m.headIndex] = withIndex;
                }
                withIndex.Add(m);
            }
            IDictionary <int, Mention> mentions          = document.predictedMentionsByID;
            IList <Example>            examples          = new List <Example>();
            ICollection <int>          mentionsToExtract = new HashSet <int>();

            foreach (KeyValuePair <Pair <int, int>, bool> pair in labeledPairs)
            {
                Mention m1 = mentions[pair.Key.first];
                Mention m2 = mentions[pair.Key.second];
                // Remember both mentions so their individual features are extracted below.
                mentionsToExtract.Add(m1.mentionID);
                mentionsToExtract.Add(m2.mentionID);
                CompressedFeatureVector features = compressor.Compress(GetFeatures(document, m1, m2));
                examples.Add(new Example(id, m1, m2, pair.Value ? 1.0 : 0.0, features));
            }
            IDictionary <int, CompressedFeatureVector> mentionFeatures = new Dictionary <int, CompressedFeatureVector>();

            foreach (int mentionID in mentionsToExtract)
            {
                mentionFeatures[mentionID] = compressor.Compress(GetFeatures(document, document.predictedMentionsByID[mentionID], mentionsByHeadIndex));
            }
            return new DocumentExamples(id, examples, mentionFeatures);
        }
コード例 #8
0
        /// <summary>
        /// Writes the gold clusters of a document to <c>goldClusterWriter</c> and a JSON object
        /// with its sentences, mentions, pair labels, pair features and document features to
        /// <c>dataWriter</c>.
        /// </summary>
        /// <param name="id">Document id used as the gold-cluster key and as "doc_id".</param>
        /// <param name="document">Document to export.</param>
        public virtual void Process(int id, Document document)
        {
            // Gold clusters: one array of mention ids per cluster.
            IJsonArrayBuilder clusters = Javax.Json.Json.CreateArrayBuilder();

            foreach (CorefCluster gold in document.goldCorefClusters.Values)
            {
                IJsonArrayBuilder c = Javax.Json.Json.CreateArrayBuilder();
                foreach (Mention m in gold.corefMentions)
                {
                    c.Add(m.mentionID);
                }
                clusters.Add(c.Build());
            }
            goldClusterWriter.Println(Javax.Json.Json.CreateObjectBuilder().Add(id.ToString(), clusters.Build()).Build());
            IDictionary <Pair <int, int>, bool> mentionPairs = CorefUtils.GetLabeledMentionPairs(document);
            IList <Mention> mentionsList = CorefUtils.GetSortedMentions(document);
            IDictionary <int, IList <Mention> > mentionsByHeadIndex = new Dictionary <int, IList <Mention> >();

            foreach (Mention m_1 in mentionsList)
            {
                // Create the per-head-index list on first use; the previous ComputeIfAbsent call
                // was given a null factory and therefore never created one.
                IList <Mention> withIndex;
                if (!mentionsByHeadIndex.TryGetValue(m_1.headIndex, out withIndex))
                {
                    withIndex = new List <Mention>();
                    mentionsByHeadIndex[m_1.headIndex] = withIndex;
                }
                withIndex.Add(m_1);
            }

            // Document-level features.
            IJsonObjectBuilder docFeatures = Javax.Json.Json.CreateObjectBuilder();

            docFeatures.Add("doc_id", id);
            docFeatures.Add("type", document.docType == Document.DocType.Article ? 1 : 0);
            // The source is the part of DOC_ID before the first "/".
            docFeatures.Add("source", document.docInfo["DOC_ID"].Split("/")[0]);
            IJsonArrayBuilder sentences = Javax.Json.Json.CreateArrayBuilder();

            foreach (ICoreMap sentence in document.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
            {
                sentences.Add(GetSentenceArray(sentence.Get(typeof(CoreAnnotations.TokensAnnotation))));
            }

            // Per-mention records, keyed by mention number.
            IJsonObjectBuilder mentions = Javax.Json.Json.CreateObjectBuilder();

            foreach (Mention m_2 in document.predictedMentionsByID.Values)
            {
                // Use the first incoming dependency edge of the head word, if there is one.
                IEnumerator <SemanticGraphEdge> iterator = m_2.enhancedDependency.IncomingEdgeIterator(m_2.headIndexedWord);
                SemanticGraphEdge relation    = iterator.MoveNext() ? iterator.Current : null;
                string            depRelation = relation == null ? "no-parent" : relation.GetRelation().ToString();
                string            depParent   = relation == null ? "<missing>" : relation.GetSource().Word();
                // NOTE(review): AnyMatch receives a null predicate below; this looks like a lambda
                // lost in an automated conversion. Confirm the intended containment test.
                mentions.Add(m_2.mentionNum.ToString(), Javax.Json.Json.CreateObjectBuilder()
                    .Add("doc_id", id)
                    .Add("mention_id", m_2.mentionID)
                    .Add("mention_num", m_2.mentionNum)
                    .Add("sent_num", m_2.sentNum)
                    .Add("start_index", m_2.startIndex)
                    .Add("end_index", m_2.endIndex)
                    .Add("head_index", m_2.headIndex)
                    .Add("mention_type", m_2.mentionType.ToString())
                    .Add("dep_relation", depRelation)
                    .Add("dep_parent", depParent)
                    .Add("sentence", GetSentenceArray(m_2.sentenceWords))
                    .Add("contained-in-other-mention", mentionsByHeadIndex[m_2.headIndex].Stream().AnyMatch(null) ? 1 : 0)
                    .Build());
            }
            IJsonArrayBuilder featureNames = Javax.Json.Json.CreateArrayBuilder()
                .Add("same-speaker")
                .Add("antecedent-is-mention-speaker")
                .Add("mention-is-antecedent-speaker")
                .Add("relaxed-head-match")
                .Add("exact-string-match")
                .Add("relaxed-string-match");
            IJsonObjectBuilder features = Javax.Json.Json.CreateObjectBuilder();
            IJsonObjectBuilder labels   = Javax.Json.Json.CreateObjectBuilder();

            // Pair features and labels, keyed by "<mentionNum1> <mentionNum2>".
            foreach (KeyValuePair <Pair <int, int>, bool> e in mentionPairs)
            {
                Mention           m1      = document.predictedMentionsByID[e.Key.first];
                Mention           m2      = document.predictedMentionsByID[e.Key.second];
                string            key     = m1.mentionNum + " " + m2.mentionNum;
                IJsonArrayBuilder builder = Javax.Json.Json.CreateArrayBuilder();
                foreach (int val in CategoricalFeatureExtractor.PairwiseFeatures(document, m1, m2, dictionaries, conll))
                {
                    builder.Add(val);
                }
                features.Add(key, builder.Build());
                labels.Add(key, e.Value ? 1 : 0);
            }
            IJsonObject docData = Javax.Json.Json.CreateObjectBuilder()
                .Add("sentences", sentences.Build())
                .Add("mentions", mentions.Build())
                .Add("labels", labels.Build())
                .Add("pair_feature_names", featureNames.Build())
                .Add("pair_features", features.Build())
                .Add("document_features", docFeatures.Build())
                .Build();

            dataWriter.Println(docData);
        }