/// <summary>
/// Runs cluster-based coreference over one document: scores every unlabeled mention
/// pair with the classification, ranking, and anaphoricity models, wraps the scores
/// in a ClustererDoc, and merges the cluster pairs proposed by the clusterer.
/// </summary>
/// <param name="document">Document whose predicted mentions are to be linked (mutated in place).</param>
public virtual void RunCoref(Document document)
{
    IDictionary<Pair<int, int>, bool> mentionPairs = CorefUtils.GetUnlabeledMentionPairs(document);
    // Nothing to link — leave the document untouched.
    if (mentionPairs.Count == 0)
    {
        return;
    }
    Compressor<string> compressor = new Compressor<string>();
    // id 0: a single in-memory document; the id is only used to tag the examples.
    DocumentExamples examples = extractor.Extract(0, document, mentionPairs, compressor);
    ICounter<Pair<int, int>> classificationScores = new ClassicCounter<Pair<int, int>>();
    ICounter<Pair<int, int>> rankingScores = new ClassicCounter<Pair<int, int>>();
    ICounter<int> anaphoricityScores = new ClassicCounter<int>();
    foreach (Example example in examples.examples)
    {
        CorefUtils.CheckForInterrupt();
        Pair<int, int> mentionPair = new Pair<int, int>(example.mentionId1, example.mentionId2);
        classificationScores.IncrementCount(mentionPair, classificationModel.Predict(example, examples.mentionFeatures, compressor));
        rankingScores.IncrementCount(mentionPair, rankingModel.Predict(example, examples.mentionFeatures, compressor));
        // Anaphoricity is a per-mention score; compute it only once per anaphor.
        // new Example(example, false) presumably builds the single-mention (anaphoricity) view — TODO confirm.
        if (!anaphoricityScores.ContainsKey(example.mentionId2))
        {
            anaphoricityScores.IncrementCount(example.mentionId2, anaphoricityModel.Predict(new Example(example, false), examples.mentionFeatures, compressor));
        }
    }
    // NOTE(review): Collectors.ToMap(null, null) has null key/value mappers — this looks like a
    // Java-to-C# conversion artifact (presumably mentionID -> mentionType in the original) and
    // would fail at runtime; confirm against the original Java implementation.
    ClustererDataLoader.ClustererDoc doc = new ClustererDataLoader.ClustererDoc(0, classificationScores, rankingScores, anaphoricityScores, mentionPairs, null, document.predictedMentionsByID.Stream().Collect(Collectors.ToMap(null, null)));
    // Apply the clusterer's proposed merges in the order it returns them.
    foreach (Pair<int, int> mentionPair_1 in clusterer.GetClusterMerges(doc))
    {
        CorefUtils.MergeCoreferenceClusters(mentionPair_1, document);
    }
}
/// <summary>main entry of coreference system.</summary>
/// <param name="document">Input document for coref format (Annotation and optional information)</param>
/// <param name="output">For output of coref system (conll format and log. list size should be 4.)</param>
/// <returns>Map of coref chain ID and corresponding chain</returns>
/// <exception cref="System.Exception"/>
public virtual IDictionary<int, CorefChain> Coref(Document document, StringBuilder[] output)
{
    // Optionally dump mention-detection diagnostics before any sieve runs.
    if (HybridCorefProperties.PrintMDLog(props))
    {
        Redwood.Log(HybridCorefPrinter.PrintMentionDetectionLog(document));
    }
    // When scoring, capture gold (slot 0) and pre-coref system output (slot 1) in CoNLL format.
    if (HybridCorefProperties.DoScore(props))
    {
        output[0] = new StringBuilder(CorefPrinter.PrintConllOutput(document, true));
        // before coref
        output[1] = new StringBuilder(CorefPrinter.PrintConllOutput(document, false));
    }
    // Slot 3 accumulates the log emitted by each sieve as it resolves mentions.
    output[3] = new StringBuilder();
    foreach (Edu.Stanford.Nlp.Coref.Hybrid.Sieve.Sieve s in sieves)
    {
        CorefUtils.CheckForInterrupt();
        output[3].Append(s.ResolveMention(document, dictionaries, props));
    }
    // Optional cleanup of the resolved clusters (e.g. removing singletons).
    if (HybridCorefProperties.DoPostProcessing(props))
    {
        PostProcessing(document);
    }
    // When scoring, capture the post-coref system output (slot 2).
    if (HybridCorefProperties.DoScore(props))
    {
        // after coref
        output[2] = new StringBuilder(CorefPrinter.PrintConllOutput(document, false, true));
    }
    return MakeCorefOutput(document);
}
/// <summary>
/// Runs statistical mention-pair coreference: scores every heuristically-filtered
/// candidate pair with the pairwise classifier, then greedily links each anaphor to
/// its single best-scoring antecedent when the score clears a threshold that depends
/// on whether each mention is pronominal.
/// </summary>
/// <param name="document">Document whose predicted mentions are to be linked (mutated in place).</param>
public virtual void RunCoref(Document document)
{
    Compressor<string> compressor = new Compressor<string>();
    if (Thread.Interrupted())
    {
        // Allow interrupting
        throw new RuntimeInterruptedException();
    }
    // Candidate pairs: HeuristicFilter maps each anaphor (e.Key) to its candidate
    // antecedents (e.Value), pruned by mention distance; flatten to (antecedent, anaphor) pairs.
    IDictionary<Pair<int, int>, bool> pairs = new Dictionary<Pair<int, int>, bool>();
    foreach (KeyValuePair<int, IList<int>> e in CorefUtils.HeuristicFilter(CorefUtils.GetSortedMentions(document), maxMentionDistance, maxMentionDistanceWithStringMatch))
    {
        foreach (int m1 in e.Value)
        {
            pairs[new Pair<int, int>(m1, e.Key)] = true;
        }
    }
    DocumentExamples examples = extractor.Extract(0, document, pairs, compressor);
    ICounter<Pair<int, int>> pairwiseScores = new ClassicCounter<Pair<int, int>>();
    // Score every candidate pair with the pairwise classifier.
    foreach (Example mentionPair in examples.examples)
    {
        if (Thread.Interrupted())
        {
            // Allow interrupting
            throw new RuntimeInterruptedException();
        }
        pairwiseScores.IncrementCount(new Pair<int, int>(mentionPair.mentionId1, mentionPair.mentionId2), classifier.Predict(mentionPair, examples.mentionFeatures, compressor));
    }
    IList<Pair<int, int>> mentionPairs = new List<Pair<int, int>>(pairwiseScores.KeySet());
    // NOTE(review): Sort(null) falls back to the default comparer. The original Java
    // presumably sorted by descending score (best-first greedy linking) with an explicit
    // comparator that the conversion dropped — TODO confirm the intended ordering.
    mentionPairs.Sort(null);
    ICollection<int> seenAnaphors = new HashSet<int>();
    foreach (Pair<int, int> pair in mentionPairs)
    {
        // Best-first: only the first (highest-ranked) pair for each anaphor is considered.
        if (seenAnaphors.Contains(pair.second))
        {
            continue;
        }
        if (Thread.Interrupted())
        {
            // Allow interrupting
            throw new RuntimeInterruptedException();
        }
        seenAnaphors.Add(pair.second);
        Dictionaries.MentionType mt1 = document.predictedMentionsByID[pair.first].mentionType;
        Dictionaries.MentionType mt2 = document.predictedMentionsByID[pair.second].mentionType;
        // Thresholds are keyed by (antecedent-is-pronoun, anaphor-is-pronoun).
        if (pairwiseScores.GetCount(pair) > thresholds[new Pair<bool, bool>(mt1 == Dictionaries.MentionType.Pronominal, mt2 == Dictionaries.MentionType.Pronominal)])
        {
            CorefUtils.MergeCoreferenceClusters(pair, document);
        }
    }
}
/// <summary>
/// Replays precomputed cluster merges for the current document (if any were
/// recorded under its id), then advances the document cursor.
/// </summary>
public virtual void RunCoref(Document document)
{
    if (toMerge.Contains(currentDocId))
    {
        foreach (Pair<int, int> merge in toMerge[currentDocId])
        {
            CorefUtils.MergeCoreferenceClusters(merge, document);
        }
    }
    // Documents are processed in order; move to the next id regardless of merges.
    currentDocId++;
}
/// <summary>
/// Runs neural coreference: embeds every mention, scores anaphoricity, then for each
/// anaphor greedily picks the candidate antecedent whose pairwise score beats the
/// anaphoricity-based "no antecedent" baseline, merging the winning pair's clusters.
/// </summary>
/// <param name="document">Document whose predicted mentions are to be linked (mutated in place).</param>
public virtual void RunCoref(Document document)
{
    IList<Mention> sortedMentions = CorefUtils.GetSortedMentions(document);
    // Group mentions by head-token index; the feature extractors need this grouping.
    // FIX: the original called ComputeIfAbsent(m.headIndex, null) — a null mapping
    // function can never create the missing list; build it explicitly (same pattern
    // used by the feature extractor's Extract method).
    IDictionary<int, IList<Mention>> mentionsByHeadIndex = new Dictionary<int, IList<Mention>>();
    foreach (Mention m in sortedMentions)
    {
        IList<Mention> withIndex;
        if (!mentionsByHeadIndex.TryGetValue(m.headIndex, out withIndex))
        {
            withIndex = new List<Mention>();
            mentionsByHeadIndex[m.headIndex] = withIndex;
        }
        withIndex.Add(m);
    }
    // Precompute per-mention embeddings and anaphoricity scores.
    SimpleMatrix documentEmbedding = embeddingExtractor.GetDocumentEmbedding(document);
    IDictionary<int, SimpleMatrix> antecedentEmbeddings = new Dictionary<int, SimpleMatrix>();
    IDictionary<int, SimpleMatrix> anaphorEmbeddings = new Dictionary<int, SimpleMatrix>();
    ICounter<int> anaphoricityScores = new ClassicCounter<int>();
    foreach (Mention m_1 in sortedMentions)
    {
        SimpleMatrix mentionEmbedding = embeddingExtractor.GetMentionEmbeddings(m_1, documentEmbedding);
        antecedentEmbeddings[m_1.mentionID] = model.GetAntecedentEmbedding(mentionEmbedding);
        anaphorEmbeddings[m_1.mentionID] = model.GetAnaphorEmbedding(mentionEmbedding);
        anaphoricityScores.IncrementCount(m_1.mentionID, model.GetAnaphoricityScore(mentionEmbedding, featureExtractor.GetAnaphoricityFeatures(m_1, document, mentionsByHeadIndex)));
    }
    // Greedy linking: an antecedent must beat the anaphoricity-derived baseline;
    // greedyness shifts that baseline (higher greedyness => more links made).
    IDictionary<int, IList<int>> mentionToCandidateAntecedents = CorefUtils.HeuristicFilter(sortedMentions, maxMentionDistance, maxMentionDistanceWithStringMatch);
    foreach (KeyValuePair<int, IList<int>> e in mentionToCandidateAntecedents)
    {
        int m_2 = e.Key;
        double bestScore = anaphoricityScores.GetCount(m_2) - 50 * (greedyness - 0.5);
        // FIX: the original declared `int antecedent = null;`, which is not valid C#
        // (a non-nullable int cannot hold null); use int? to represent "no antecedent".
        int? antecedent = null;
        foreach (int ca in e.Value)
        {
            double score = model.GetPairwiseScore(antecedentEmbeddings[ca], anaphorEmbeddings[m_2], featureExtractor.GetPairFeatures(new Pair<int, int>(ca, m_2), document, mentionsByHeadIndex));
            if (score > bestScore)
            {
                bestScore = score;
                antecedent = ca;
            }
        }
        if (antecedent != null)
        {
            CorefUtils.MergeCoreferenceClusters(new Pair<int, int>(antecedent.Value, m_2), document);
        }
    }
}
/// <summary>
/// Subsamples the labeled mention pairs of one document — first rebalancing the
/// positive/negative class ratio, then capping the total number of examples — and
/// stores the surviving pairs in mentionPairs under the given id.
/// </summary>
/// <param name="id">Document id used as the key into mentionPairs.</param>
/// <param name="document">Document providing the gold-labeled mention pairs.</param>
public virtual void Process(int id, Document document)
{
    IDictionary<Pair<int, int>, bool> labeledPairs = CorefUtils.GetLabeledMentionPairs(document);
    // NOTE(review): the stream filters below were left with null predicates — a
    // conversion artifact (presumably selecting positive vs. negative pairs by their
    // label); confirm against the original implementation before relying on this path.
    long numP = labeledPairs.Keys.Stream().Filter(null).Count();
    IList<Pair<int, int>> negative = labeledPairs.Keys.Stream().Filter(null).Collect(Collectors.ToList());
    int numN = negative.Count;
    // If positives are too rare, drop random negatives until the ratio is acceptable.
    if (numP / (float)(numP + numN) < minClassImbalancedPerDocument)
    {
        numN = (int)(numP / minClassImbalancedPerDocument - numP);
        Java.Util.Collections.Shuffle(negative);
        for (int i = numN; i < negative.Count; i++)
        {
            Sharpen.Collections.Remove(labeledPairs, negative[i]);
        }
    }
    // Group candidate antecedents (pair.first) by anaphor (pair.second).
    // FIX: the original read `mentionToCandidateAntecedents[pair.second]` and then
    // null-checked — but the C# Dictionary indexer throws KeyNotFoundException on a
    // missing key (Java map.get() semantics were intended); use TryGetValue instead.
    IDictionary<int, IList<int>> mentionToCandidateAntecedents = new Dictionary<int, IList<int>>();
    foreach (Pair<int, int> pair in labeledPairs.Keys)
    {
        IList<int> candidateAntecedents;
        if (!mentionToCandidateAntecedents.TryGetValue(pair.second, out candidateAntecedents))
        {
            candidateAntecedents = new List<int>();
            mentionToCandidateAntecedents[pair.second] = candidateAntecedents;
        }
        candidateAntecedents.Add(pair.first);
    }
    // Cap the example count by evicting whole anaphors (all their pairs) at random.
    IList<int> mentions = new List<int>(mentionToCandidateAntecedents.Keys);
    while (labeledPairs.Count > maxExamplesPerDocument)
    {
        // NOTE(review): IList.Remove normally returns bool, not the element — this relies
        // on a Sharpen remove-by-index extension returning the removed value; confirm.
        int mention = mentions.Remove(random.NextInt(mentions.Count));
        foreach (int candidateAntecedent in mentionToCandidateAntecedents[mention])
        {
            Sharpen.Collections.Remove(labeledPairs, new Pair<int, int>(candidateAntecedent, mention));
        }
    }
    mentionPairs[id] = labeledPairs;
}
/// <summary>
/// Builds pairwise examples for the given labeled mention pairs, plus compressed
/// single-mention feature vectors for every mention that appears in some pair.
/// </summary>
/// <param name="id">Document id stamped onto each example.</param>
/// <param name="document">Source document with predicted mentions.</param>
/// <param name="labeledPairs">Mention-id pairs mapped to their coreference label.</param>
/// <param name="compressor">Shared compressor for the produced feature vectors.</param>
/// <returns>The examples and per-mention features bundled as a DocumentExamples.</returns>
public virtual DocumentExamples Extract(int id, Document document, IDictionary<Pair<int, int>, bool> labeledPairs, Compressor<string> compressor)
{
    IList<Mention> mentionsList = CorefUtils.GetSortedMentions(document);
    // Group mentions by head-token index; the per-mention feature extractor needs this.
    // FIX: the original read `mentionsByHeadIndex[m.headIndex]` and then null-checked —
    // but the C# Dictionary indexer throws KeyNotFoundException on a missing key
    // (Java map.get() semantics were intended); use TryGetValue instead.
    IDictionary<int, IList<Mention>> mentionsByHeadIndex = new Dictionary<int, IList<Mention>>();
    foreach (Mention m in mentionsList)
    {
        IList<Mention> withIndex;
        if (!mentionsByHeadIndex.TryGetValue(m.headIndex, out withIndex))
        {
            withIndex = new List<Mention>();
            mentionsByHeadIndex[m.headIndex] = withIndex;
        }
        withIndex.Add(m);
    }
    IDictionary<int, Mention> mentions = document.predictedMentionsByID;
    // One Example per labeled pair; remember every mention id referenced by a pair.
    IList<Example> examples = new List<Example>();
    ICollection<int> mentionsToExtract = new HashSet<int>();
    foreach (KeyValuePair<Pair<int, int>, bool> pair in labeledPairs)
    {
        Mention m1 = mentions[pair.Key.first];
        Mention m2 = mentions[pair.Key.second];
        mentionsToExtract.Add(m1.mentionID);
        mentionsToExtract.Add(m2.mentionID);
        CompressedFeatureVector features = compressor.Compress(GetFeatures(document, m1, m2));
        // Label is encoded as 1.0 (coreferent) / 0.0 (not coreferent).
        examples.Add(new Example(id, m1, m2, pair.Value ? 1.0 : 0.0, features));
    }
    // Compress single-mention features only for mentions that occur in some pair.
    IDictionary<int, CompressedFeatureVector> mentionFeatures = new Dictionary<int, CompressedFeatureVector>();
    foreach (int mentionID in mentionsToExtract)
    {
        mentionFeatures[mentionID] = compressor.Compress(GetFeatures(document, document.predictedMentionsByID[mentionID], mentionsByHeadIndex));
    }
    return new DocumentExamples(id, examples, mentionFeatures);
}
/// <summary>
/// Serializes one document to JSON for external (neural) training: writes the gold
/// clusters to goldClusterWriter, and a document record (sentences, mention metadata,
/// pairwise features, and labels) to dataWriter.
/// </summary>
/// <param name="id">Document id embedded in both output records.</param>
/// <param name="document">Document with gold clusters and predicted mentions.</param>
public virtual void Process(int id, Document document)
{
    // Gold clusters: a JSON array of arrays of mention ids, keyed by document id.
    IJsonArrayBuilder clusters = Javax.Json.Json.CreateArrayBuilder();
    foreach (CorefCluster gold in document.goldCorefClusters.Values)
    {
        IJsonArrayBuilder c = Javax.Json.Json.CreateArrayBuilder();
        foreach (Mention m in gold.corefMentions)
        {
            c.Add(m.mentionID);
        }
        clusters.Add(c.Build());
    }
    goldClusterWriter.Println(Javax.Json.Json.CreateObjectBuilder().Add(id.ToString(), clusters.Build()).Build());
    IDictionary<Pair<int, int>, bool> mentionPairs = CorefUtils.GetLabeledMentionPairs(document);
    IList<Mention> mentionsList = CorefUtils.GetSortedMentions(document);
    // Group mentions by head-token index (used for the contained-in-other-mention flag).
    // NOTE(review): ComputeIfAbsent is called with a null mapping function — a conversion
    // artifact; the missing list can never be created this way. Confirm against the original.
    IDictionary<int, IList<Mention>> mentionsByHeadIndex = new Dictionary<int, IList<Mention>>();
    foreach (Mention m_1 in mentionsList)
    {
        IList<Mention> withIndex = mentionsByHeadIndex.ComputeIfAbsent(m_1.headIndex, null);
        withIndex.Add(m_1);
    }
    // Document-level features: id, article-vs-other type flag, and corpus source prefix.
    IJsonObjectBuilder docFeatures = Javax.Json.Json.CreateObjectBuilder();
    docFeatures.Add("doc_id", id);
    docFeatures.Add("type", document.docType == Document.DocType.Article ? 1 : 0);
    docFeatures.Add("source", document.docInfo["DOC_ID"].Split("/")[0]);
    // Sentences as arrays of tokens.
    IJsonArrayBuilder sentences = Javax.Json.Json.CreateArrayBuilder();
    foreach (ICoreMap sentence in document.annotation.Get(typeof(CoreAnnotations.SentencesAnnotation)))
    {
        sentences.Add(GetSentenceArray(sentence.Get(typeof(CoreAnnotations.TokensAnnotation))));
    }
    // Per-mention metadata, keyed by mention number.
    IJsonObjectBuilder mentions = Javax.Json.Json.CreateObjectBuilder();
    foreach (Mention m_2 in document.predictedMentionsByID.Values)
    {
        // Dependency parent of the mention head (or the "no-parent"/"<missing>" sentinels).
        IEnumerator<SemanticGraphEdge> iterator = m_2.enhancedDependency.IncomingEdgeIterator(m_2.headIndexedWord);
        SemanticGraphEdge relation = iterator.MoveNext() ? iterator.Current : null;
        string depRelation = relation == null ? "no-parent" : relation.GetRelation().ToString();
        string depParent = relation == null ? 
"<missing>" : relation.GetSource().Word();
        // NOTE(review): AnyMatch(null) below is a conversion artifact — presumably the original
        // predicate tested whether another mention with the same head contains this one; confirm.
        mentions.Add(m_2.mentionNum.ToString(), Javax.Json.Json.CreateObjectBuilder().Add("doc_id", id).Add("mention_id", m_2.mentionID).Add("mention_num", m_2.mentionNum).Add("sent_num", m_2.sentNum).Add("start_index", m_2.startIndex).Add("end_index"
            , m_2.endIndex).Add("head_index", m_2.headIndex).Add("mention_type", m_2.mentionType.ToString()).Add("dep_relation", depRelation).Add("dep_parent", depParent).Add("sentence", GetSentenceArray(m_2.sentenceWords)).Add("contained-in-other-mention"
            , mentionsByHeadIndex[m_2.headIndex].Stream().AnyMatch(null) ? 1 : 0).Build());
    }
    // Fixed ordering of the categorical pairwise feature names.
    IJsonArrayBuilder featureNames = Javax.Json.Json.CreateArrayBuilder().Add("same-speaker").Add("antecedent-is-mention-speaker").Add("mention-is-antecedent-speaker").Add("relaxed-head-match").Add("exact-string-match").Add("relaxed-string-match"
        );
    // Pairwise features and labels, both keyed by "<antecedentNum> <anaphorNum>".
    IJsonObjectBuilder features = Javax.Json.Json.CreateObjectBuilder();
    IJsonObjectBuilder labels = Javax.Json.Json.CreateObjectBuilder();
    foreach (KeyValuePair<Pair<int, int>, bool> e in mentionPairs)
    {
        Mention m1 = document.predictedMentionsByID[e.Key.first];
        Mention m2 = document.predictedMentionsByID[e.Key.second];
        string key = m1.mentionNum + " " + m2.mentionNum;
        IJsonArrayBuilder builder = Javax.Json.Json.CreateArrayBuilder();
        foreach (int val in CategoricalFeatureExtractor.PairwiseFeatures(document, m1, m2, dictionaries, conll))
        {
            builder.Add(val);
        }
        features.Add(key, builder.Build());
        labels.Add(key, e.Value ? 1 : 0);
    }
    // Assemble and emit the full document record.
    IJsonObject docData = Javax.Json.Json.CreateObjectBuilder().Add("sentences", sentences.Build()).Add("mentions", mentions.Build()).Add("labels", labels.Build()).Add("pair_feature_names", featureNames.Build()).Add("pair_features", features.Build
        ()).Add("document_features", docFeatures.Build()).Build();
    dataWriter.Println(docData);
}