/// <summary>
/// Builds the initial clustering state for a document: one singleton cluster per
/// mention, plus the score-ordered list of candidate mention pairs to consider
/// for merging and the per-pair global features used by the scorer.
/// </summary>
/// <param name="doc">Document providing mentions and pairwise coreference scores.</param>
public State(ClustererDataLoader.ClustererDoc doc)
{
    currentDocId = doc.id;
    this.doc = doc;
    this.hashedScores = new Dictionary<Clusterer.MergeKey, bool>();
    this.hashedCosts = new Dictionary<long, double>();
    this.clusters = new List<Clusterer.Cluster>();
    this.hash = 0;
    mentionToCluster = new Dictionary<int, Clusterer.Cluster>();
    // Start with every mention in its own singleton cluster; the state hash is
    // the XOR of the (scaled) cluster hashes so it can be updated incrementally.
    foreach (int m in doc.mentions)
    {
        Clusterer.Cluster c = new Clusterer.Cluster(m);
        clusters.Add(c);
        mentionToCluster[m] = c;
        hash ^= c.hash * 7;
    }
    IList<Pair<int, int>> allPairs = new List<Pair<int, int>>(doc.classificationScores.KeySet());
    ICounter<Pair<int, int>> scores = UseRanking ? doc.rankingScores : doc.classificationScores;
    allPairs.Sort(null);
    // Prune the sorted pair list: stop once scores drop below MinPairwiseScore
    // (after at least MinPairs pairs) or the early-stopping index/score ratio
    // is exceeded. The loop variable i is used after the loop as the cutoff.
    int i = 0;
    for (i = 0; i < allPairs.Count; i++)
    {
        double score = scores.GetCount(allPairs[i]);
        if (score < MinPairwiseScore && i > MinPairs)
        {
            break;
        }
        if (i >= EarlyStopThreshold && i / score > EarlyStopVal)
        {
            break;
        }
    }
    mentionPairs = allPairs.SubList(0, i);
    // Precompute global features over the full (unpruned) pair list.
    // NOTE(review): the original also kept a seenAntecedents counter here, but it
    // was only ever incremented and never read, so it has been removed.
    ICounter<int> seenAnaphors = new ClassicCounter<int>();
    globalFeatures = new List<Clusterer.GlobalFeatures>();
    for (int j = 0; j < allPairs.Count; j++)
    {
        Pair<int, int> mentionPair = allPairs[j];
        Clusterer.GlobalFeatures gf = new Clusterer.GlobalFeatures();
        gf.currentIndex = j;
        gf.anaphorSeen = seenAnaphors.ContainsKey(mentionPair.second);
        gf.size = mentionPairs.Count;
        gf.docSize = doc.mentions.Count / 300.0;
        globalFeatures.Add(gf);
        seenAnaphors.IncrementCount(mentionPair.second);
    }
    currentIndex = 0;
    SetClusters();
}
/// <summary>
/// Computes (or retrieves from the feature cache) the feature vector describing a
/// candidate merge of clusters <paramref name="c1"/> and <paramref name="c2"/>.
/// </summary>
/// <param name="doc">Document providing mention indices and pairwise scores.</param>
/// <param name="c1">First cluster of the candidate merge.</param>
/// <param name="c2">Second cluster of the candidate merge.</param>
/// <param name="gf">Global features for the current pair position in the document.</param>
/// <returns>A counter of named feature values for this merge.</returns>
private static ICounter<string> GetFeatures(ClustererDataLoader.ClustererDoc doc, Clusterer.Cluster c1, Clusterer.Cluster c2, Clusterer.GlobalFeatures gf)
{
    Clusterer.MergeKey key = new Clusterer.MergeKey(c1, c2, gf.currentIndex);
    // Cache lookup: use TryGetValue rather than the indexer — on a Dictionary a
    // missing key would throw KeyNotFoundException instead of yielding null,
    // and a cache miss is the expected case here (Java Map.get conversion bug).
    CompressedFeatureVector cfv;
    featuresCache.TryGetValue(key, out cfv);
    ICounter<string> features = cfv == null ? null : compressor.Uncompress(cfv);
    if (features != null)
    {
        featuresCacheHits += isTraining;
        return features;
    }
    featuresCacheMisses += isTraining;
    features = new ClassicCounter<string>();
    if (gf.anaphorSeen)
    {
        features.IncrementCount("anaphorSeen");
    }
    features.IncrementCount("docSize", gf.docSize);
    features.IncrementCount("percentComplete", gf.currentIndex / (double)gf.size);
    features.IncrementCount("bias", 1.0);
    // Order the two clusters' earliest mentions by position in the document so
    // earliest2 is the later (anaphoric) one.
    int earliest1 = EarliestMention(c1, doc);
    int earliest2 = EarliestMention(c2, doc);
    if (doc.mentionIndices[earliest1] > doc.mentionIndices[earliest2])
    {
        int tmp = earliest1;
        earliest1 = earliest2;
        earliest2 = tmp;
    }
    features.IncrementCount("anaphoricity", doc.anaphoricityScores.GetCount(earliest2));
    if (c1.mentions.Count == 1 && c2.mentions.Count == 1)
    {
        // Singleton-singleton merge: feature the single mention pair directly.
        Pair<int, int> mentionPair = new Pair<int, int>(c1.mentions[0], c2.mentions[0]);
        features.AddAll(AddSuffix(GetFeatures(doc, mentionPair, doc.classificationScores), "-classification"));
        features.AddAll(AddSuffix(GetFeatures(doc, mentionPair, doc.rankingScores), "-ranking"));
        features = AddSuffix(features, "-single");
    }
    else
    {
        // Larger clusters: aggregate scores over every cross-cluster mention pair.
        IList<Pair<int, int>> between = new List<Pair<int, int>>();
        foreach (int m1 in c1.mentions)
        {
            foreach (int m2 in c2.mentions)
            {
                between.Add(new Pair<int, int>(m1, m2));
            }
        }
        features.AddAll(AddSuffix(GetFeatures(doc, between, doc.classificationScores), "-classification"));
        features.AddAll(AddSuffix(GetFeatures(doc, between, doc.rankingScores), "-ranking"));
    }
    featuresCache[key] = compressor.Compress(features);
    return features;
}