/// <summary>
/// Applies one search action: optionally merges the current cluster pair
/// (c1, c2), then advances to the next candidate pair whose mentions are not
/// already in the same cluster.
/// </summary>
/// <param name="isMerge">If true, c2 is merged into c1 before advancing.</param>
public virtual void DoAction(bool isMerge)
{
    if (isMerge)
    {
        // Always merge the smaller cluster into the larger one.
        if (c2.Size() > c1.Size())
        {
            Clusterer.Cluster tmp = c1;
            c1 = c2;
            c2 = tmp;
        }
        // Remove both clusters' contributions from the state hash before the
        // merge mutates c1.hash...
        hash ^= 7 * c1.hash;
        hash ^= 7 * c2.hash;
        c1.Merge(c2);
        // Re-point every absorbed mention at the surviving cluster.
        foreach (int m in c2.mentions)
        {
            mentionToCluster[m] = c1;
        }
        clusters.Remove(c2);
        // ...then fold the merged cluster's updated hash back in.
        hash ^= 7 * c1.hash;
    }
    currentIndex++;
    if (!IsComplete())
    {
        SetClusters();
    }
    // Skip candidate pairs whose two mentions already share a cluster.
    while (c1 == c2)
    {
        currentIndex++;
        if (IsComplete())
        {
            break;
        }
        SetClusters();
    }
}
/// <summary>
/// Refreshes c1 and c2 to the clusters containing the two mentions of the
/// pair at <c>currentIndex</c>.
/// </summary>
public virtual void SetClusters()
{
    Pair <int, int> pair = mentionPairs[currentIndex];
    c1 = mentionToCluster[pair.first];
    c2 = mentionToCluster[pair.second];
}
/// <summary>
/// Builds the initial search state for a document: one singleton cluster per
/// mention, a score-pruned list of candidate mention pairs, and precomputed
/// per-pair global features.
/// </summary>
/// <param name="doc">The document whose mentions and pairwise scores drive the search.</param>
public State(ClustererDataLoader.ClustererDoc doc)
{
    currentDocId = doc.id;
    this.doc = doc;
    this.hashedScores = new Dictionary <Clusterer.MergeKey, bool>();
    this.hashedCosts = new Dictionary <long, double>();
    this.clusters = new List <Clusterer.Cluster>();
    this.hash = 0;
    mentionToCluster = new Dictionary <int, Clusterer.Cluster>();
    // Start with one singleton cluster per mention; the state hash is the
    // XOR over all clusters of (cluster hash * 7).
    foreach (int m in doc.mentions)
    {
        Clusterer.Cluster c = new Clusterer.Cluster(m);
        clusters.Add(c);
        mentionToCluster[m] = c;
        hash ^= c.hash * 7;
    }
    IList <Pair <int, int> > allPairs = new List <Pair <int, int> >(doc.classificationScores.KeySet());
    ICounter <Pair <int, int> > scores = UseRanking ? doc.rankingScores : doc.classificationScores;
    // NOTE(review): Sort(null) uses the elements' default comparer; the
    // cutoffs below presume best-scoring pairs come first — confirm Pair's
    // ordering matches that.
    allPairs.Sort(null);
    // Prune the pair list: stop once the score drops below MinPairwiseScore
    // (after at least MinPairs pairs), or once index/score exceeds
    // EarlyStopVal past EarlyStopThreshold.
    int i = 0;
    for (i = 0; i < allPairs.Count; i++)
    {
        double score = scores.GetCount(allPairs[i]);
        if (score < MinPairwiseScore && i > MinPairs)
        {
            break;
        }
        if (i >= EarlyStopThreshold && i / score > EarlyStopVal)
        {
            break;
        }
    }
    mentionPairs = allPairs.SubList(0, i);
    // Precompute global features for every pair (whether the anaphor has
    // been seen before, position in the list, normalized document size).
    ICounter <int> seenAnaphors = new ClassicCounter <int>();
    ICounter <int> seenAntecedents = new ClassicCounter <int>();
    globalFeatures = new List <Clusterer.GlobalFeatures>();
    for (int j = 0; j < allPairs.Count; j++)
    {
        Pair <int, int> mentionPair = allPairs[j];
        Clusterer.GlobalFeatures gf = new Clusterer.GlobalFeatures();
        gf.currentIndex = j;
        gf.anaphorSeen = seenAnaphors.ContainsKey(mentionPair.second);
        gf.size = mentionPairs.Count;
        // 300.0 appears to be a document-size normalization constant — TODO confirm.
        gf.docSize = doc.mentions.Count / 300.0;
        globalFeatures.Add(gf);
        seenAnaphors.IncrementCount(mentionPair.second);
        seenAntecedents.IncrementCount(mentionPair.first);
    }
    currentIndex = 0;
    SetClusters();
}
/// <summary>
/// Returns the mention in the cluster with the smallest document position
/// according to <c>doc.mentionIndices</c>, or -1 for an empty cluster.
/// </summary>
private static int EarliestMention(Clusterer.Cluster c, ClustererDataLoader.ClustererDoc doc)
{
    int best = -1;
    int bestPosition = -1;
    foreach (int mention in c.mentions)
    {
        int position = doc.mentionIndices[mention];
        // First candidate always wins; afterwards keep the smaller position.
        if (best == -1 || position < bestPosition)
        {
            best = mention;
            bestPosition = position;
        }
    }
    return best;
}
/// <summary>
/// Copy constructor: shares the caches and the (read-only) pair/feature
/// lists with the source state, but deep-copies the clusters and rebuilds
/// the mention-to-cluster index so the copy can be mutated independently.
/// </summary>
public State(Clusterer.State state)
{
    // Shared, effectively read-only data is aliased rather than copied.
    this.doc = state.doc;
    this.hashedScores = state.hashedScores;
    this.hashedCosts = state.hashedCosts;
    this.mentionPairs = state.mentionPairs;
    this.globalFeatures = state.globalFeatures;
    this.hash = state.hash;
    this.currentIndex = state.currentIndex;
    // Mutable cluster structure gets a deep copy.
    this.clusters = new List <Clusterer.Cluster>();
    this.mentionToCluster = new Dictionary <int, Clusterer.Cluster>();
    foreach (Clusterer.Cluster original in state.clusters)
    {
        Clusterer.Cluster copy = new Clusterer.Cluster(original);
        clusters.Add(copy);
        foreach (int mention in copy.mentions)
        {
            mentionToCluster[mention] = copy;
        }
    }
    SetClusters();
}
/// <summary>
/// Computes the merge-decision feature vector for clusters c1 and c2, using
/// a compressed per-(c1, c2, index) cache to avoid recomputation.
/// </summary>
/// <returns>A counter of named features describing the candidate merge.</returns>
private static ICounter <string> GetFeatures(ClustererDataLoader.ClustererDoc doc, Clusterer.Cluster c1, Clusterer.Cluster c2, Clusterer.GlobalFeatures gf)
{
    Clusterer.MergeKey key = new Clusterer.MergeKey(c1, c2, gf.currentIndex);
    CompressedFeatureVector cfv = featuresCache[key];
    ICounter <string> features = cfv == null ? null : compressor.Uncompress(cfv);
    if (features != null)
    {
        // Cache hit. Hit/miss counters only advance during training
        // (isTraining is presumably 0 or 1 — confirm).
        featuresCacheHits += isTraining;
        return(features);
    }
    featuresCacheMisses += isTraining;
    features = new ClassicCounter <string>();
    // Global (pair-list-level) features.
    if (gf.anaphorSeen)
    {
        features.IncrementCount("anaphorSeen");
    }
    features.IncrementCount("docSize", gf.docSize);
    features.IncrementCount("percentComplete", gf.currentIndex / (double)gf.size);
    features.IncrementCount("bias", 1.0);
    // Order the two clusters' earliest mentions by document position so that
    // earliest2 is the later-occurring one (the anaphor side).
    int earliest1 = EarliestMention(c1, doc);
    int earliest2 = EarliestMention(c2, doc);
    if (doc.mentionIndices[earliest1] > doc.mentionIndices[earliest2])
    {
        int tmp = earliest1;
        earliest1 = earliest2;
        earliest2 = tmp;
    }
    features.IncrementCount("anaphoricity", doc.anaphoricityScores.GetCount(earliest2));
    if (c1.mentions.Count == 1 && c2.mentions.Count == 1)
    {
        // Singleton-singleton merge: score the single mention pair directly
        // and tag the whole vector with "-single".
        Pair <int, int> mentionPair = new Pair <int, int>(c1.mentions[0], c2.mentions[0]);
        features.AddAll(AddSuffix(GetFeatures(doc, mentionPair, doc.classificationScores), "-classification"));
        features.AddAll(AddSuffix(GetFeatures(doc, mentionPair, doc.rankingScores), "-ranking"));
        features = AddSuffix(features, "-single");
    }
    else
    {
        // Otherwise aggregate scores over every cross-cluster mention pair.
        IList <Pair <int, int> > between = new List <Pair <int, int> >();
        foreach (int m1 in c1.mentions)
        {
            foreach (int m2 in c2.mentions)
            {
                between.Add(new Pair <int, int>(m1, m2));
            }
        }
        features.AddAll(AddSuffix(GetFeatures(doc, between, doc.classificationScores), "-classification"));
        features.AddAll(AddSuffix(GetFeatures(doc, between, doc.rankingScores), "-ranking"));
    }
    // Store compressed so the cache stays small.
    featuresCache[key] = compressor.Compress(features);
    return(features);
}
/// <summary>
/// Absorbs another cluster into this one: its mentions are appended and its
/// hash is XOR-folded into this cluster's hash.
/// </summary>
/// <param name="c">The cluster being absorbed; it is not modified.</param>
public virtual void Merge(Clusterer.Cluster c)
{
    // The two updates are independent, so order does not matter.
    hash ^= c.hash;
    foreach (int mention in c.mentions)
    {
        mentions.Add(mention);
    }
}
/// <summary>
/// Copy constructor: duplicates the mention list (so the copy can diverge)
/// and carries over the precomputed hash.
/// </summary>
public Cluster(Clusterer.Cluster c)
{
    hash = c.hash;
    mentions = new List <int>(c.mentions);
}
/// <summary>
/// Cache key for a candidate merge of two clusters at a given pair index.
/// The XOR of the cluster hashes makes the key symmetric in (c1, c2) —
/// presumably intentional, since merge order is normalized by size elsewhere.
/// </summary>
public MergeKey(Clusterer.Cluster c1, Clusterer.Cluster c2, int ind)
{
    long clusterBits = c1.hash ^ c2.hash;
    hash = (int)clusterBits + 2003 * ind + currentDocId;
}