/// <summary>Calculates the KL divergence KL(from || to) between two distributions, in bits.</summary>
/// <remarks>
/// For every key in the union of keys of both distributions (as returned by
/// <c>GetSetOfAllKeys</c>) whose probability under <paramref name="from"/> is at least 1e-10,
/// the term <c>p1 * log2(p1 / p2)</c> is added to the result.
/// A counter of remaining keys starts at <c>from.GetNumberOfKeys()</c> and is decremented once per
/// key visited. If it is non-zero afterwards, the probability mass not yet assigned by each
/// distribution is spread uniformly over those remaining keys and their contribution is added too.
/// If a key (or the remaining mass) has probability under <paramref name="from"/> but none under
/// <paramref name="to"/>, a diagnostic line is written to standard output and positive infinity
/// is returned.
/// </remarks>
/// <typeparam name="K">Type of the keys of the distributions.</typeparam>
/// <param name="from">The distribution whose probabilities weight each term (p1).</param>
/// <param name="to">The distribution used as the denominator of the ratio (p2).</param>
/// <returns>The KL divergence between the distributions, or positive infinity.</returns>
public static double KlDivergence<K>(Distribution<K> from, Distribution<K> to)
{
    // Probabilities below this threshold are treated as zero and contribute nothing.
    const double epsilon = 1e-10;

    ICollection<K> allKeys = GetSetOfAllKeys(from, to);
    int numKeysRemaining = from.GetNumberOfKeys();
    double result = 0.0;
    double assignedMass1 = 0.0;
    double assignedMass2 = 0.0;
    double log2 = Math.Log(2.0);

    foreach (K key in allKeys)
    {
        double p1 = from.ProbabilityOf(key);
        double p2 = to.ProbabilityOf(key);
        numKeysRemaining--;
        assignedMass1 += p1;
        assignedMass2 += p2;
        if (p1 < epsilon)
        {
            continue;
        }

        double logFract = Math.Log(p1 / p2);
        // A non-positive p2 means "no probability"; without the explicit check a negative
        // or negative-zero p2 would turn the logarithm into NaN instead of +infinity.
        if (p2 <= 0.0 || double.IsPositiveInfinity(logFract))
        {
            System.Console.Out.WriteLine("Distributions.klDivergence returning +inf: p1=" + p1 + ", p2=" + p2);
            System.Console.Out.Flush();
            // can't recover
            return double.PositiveInfinity;
        }

        // express it in log base 2
        result += p1 * (logFract / log2);
    }

    if (numKeysRemaining != 0)
    {
        // Spread the mass not assigned to any visited key uniformly over the remaining keys.
        double restP1 = (1.0 - assignedMass1) / numKeysRemaining;
        if (restP1 > epsilon)
        {
            double restP2 = (1.0 - assignedMass2) / numKeysRemaining;
            double restLogFract = Math.Log(restP1 / restP2);
            // 1.0 - assignedMass2 can come out slightly negative (or as -0.0) through
            // floating-point rounding; that is zero remaining mass, i.e. infinite divergence.
            if (restP2 <= 0.0 || double.IsPositiveInfinity(restLogFract))
            {
                System.Console.Out.WriteLine("Distributions.klDivergence (remaining mass) returning +inf: p1=" + restP1 + ", p2=" + restP2);
                System.Console.Out.Flush();
                // can't recover
                return double.PositiveInfinity;
            }

            // express it in log base 2
            result += numKeysRemaining * restP1 * (restLogFract / log2);
        }
    }

    return result;
}
/// <summary>Returns a new distribution that is the weighted average of the two given distributions.</summary>
/// <remarks>
/// The keys considered are the union of keys of both distributions, as returned by
/// <c>GetSetOfAllKeys</c>. Each key is given the value
/// <c>d1.ProbabilityOf(key) * w1 + d2.ProbabilityOf(key) * (1.0 - w1)</c>, so a key missing from one
/// distribution is treated as having whatever probability that distribution's
/// <c>ProbabilityOf</c> reports for it. The resulting counter and the number of keys of
/// <paramref name="d1"/> are handed to <c>Distribution.GetDistributionFromPartiallySpecifiedCounter</c>.
/// </remarks>
/// <typeparam name="K">Type of the keys of the distributions.</typeparam>
/// <param name="d1">The first distribution; also supplies the number of keys of the result.</param>
/// <param name="w1">Weight applied to <paramref name="d1"/>; <paramref name="d2"/> is weighted by <c>1.0 - w1</c>.</param>
/// <param name="d2">The second distribution.</param>
/// <returns>A new distribution built from the weighted per-key probabilities.</returns>
public static Distribution<K> WeightedAverage<K>(Distribution<K> d1, double w1, Distribution<K> d2)
{
    ICollection<K> keyUnion = GetSetOfAllKeys(d1, d2);
    int keyCount = d1.GetNumberOfKeys();
    ICounter<K> blended = new ClassicCounter<K>();

    // The second distribution receives whatever weight the first one leaves over.
    double complementWeight = 1.0 - w1;

    foreach (K key in keyUnion)
    {
        double firstShare = d1.ProbabilityOf(key) * w1;
        double secondShare = d2.ProbabilityOf(key) * complementWeight;
        blended.SetCount(key, firstShare + secondShare);
    }

    return Distribution.GetDistributionFromPartiallySpecifiedCounter(blended, keyCount);
}
/// <summary>Returns a double representing the overlap of d1 and d2.</summary>
/// <remarks>
/// For every key in the union of keys of both distributions (as returned by
/// <c>GetSetOfAllKeys</c>) the smaller of the two probabilities is accumulated. The smaller of the
/// two masses left unassigned after visiting all keys (each starting from 1.0) is added at the end.
/// The intended range is 0 (no overlap) to 1 (identical distributions).
/// </remarks>
/// <typeparam name="K">Type of the keys of the distributions.</typeparam>
/// <param name="d1">The first distribution.</param>
/// <param name="d2">The second distribution.</param>
/// <returns>The accumulated overlap.</returns>
public static double Overlap<K>(Distribution<K> d1, Distribution<K> d2)
{
    double shared = 0.0;
    double unassigned1 = 1.0;
    double unassigned2 = 1.0;

    foreach (K key in GetSetOfAllKeys(d1, d2))
    {
        double first = d1.ProbabilityOf(key);
        double second = d2.ProbabilityOf(key);
        shared += Math.Min(first, second);
        unassigned1 -= first;
        unassigned2 -= second;
    }

    // Mass that neither distribution assigned to a visited key also counts as overlap.
    shared += Math.Min(unassigned1, unassigned2);
    return shared;
}
/// <summary>For internal testing purposes only.</summary>
/// <remarks>
/// Fills a counter with roughly Zipfian counts, builds several distributions from it and prints a
/// table comparing the probabilities they assign to a few keys, followed by each distribution's
/// reserved mass and total count.
/// </remarks>
/// <param name="args">Command-line arguments; not read.</param>
public static void Main(string[] args)
{
    // Small scratch counter: it is populated here but not read again in this method.
    ICounter<string> c2 = new ClassicCounter<string>();
    c2.IncrementCount("p", 13);
    c2.SetCount("q", 12);
    c2.SetCount("w", 5);
    c2.IncrementCount("x", 7.5);

    ClassicCounter<string> counts = new ClassicCounter<string>();
    double topCount = 1000;
    string unknownKey = "!*UNKNOWN*!";
    ICollection<string> vocabulary = Generics.NewHashSet();
    vocabulary.Add(unknownKey);

    // Roughly Zipfian counts: the key for rank r (1..1999) gets Math.Round(1000 / r)
    // and is added to the vocabulary.
    for (int rank = 1; rank < 2000; rank++)
    {
        string rankKey = rank.ToString();
        counts.SetCount(rankKey, Math.Round(topCount / rank));
        vocabulary.Add(rankKey);
    }

    // Ranks 2000..4000 are part of the vocabulary only; they receive no count.
    for (int unseenRank = 2000; unseenRank <= 4000; unseenRank++)
    {
        vocabulary.Add(unseenRank.ToString());
    }

    Distribution<string> normalized = GetDistribution(counts);
    Distribution<string> prior = GetUniformDistribution(vocabulary);
    Distribution<string> dirichlet = DistributionWithDirichletPrior(counts, prior, 4000);
    Distribution<string> dynamicDirichlet = DynamicCounterWithDirichletPrior(counts, prior, 4000);
    Distribution<string> laplace;
    Distribution<string> goodTuring;
    if (true)
    {
        laplace = LaplaceSmoothedDistribution(counts, 4000);
        goodTuring = GoodTuringSmoothedCounter(counts, 4000);
    }
    else
    {
        // Alternative set-up with an explicit unknown key; disabled by the constant condition.
        counts.SetCount(unknownKey, 45);
        laplace = LaplaceWithExplicitUnknown(counts, 0.5, unknownKey);
        goodTuring = GoodTuringWithExplicitUnknown(counts, unknownKey);
    }
    Distribution<string> simpleGoodTuring = SimpleGoodTuring(counts, 4000);

    // Table columns, in print order: Norm, Add1, Dir1, Dir2, GT, SGT.
    Distribution<string>[] columns = new Distribution<string>[]
    {
        normalized, laplace, dirichlet, dynamicDirichlet, goodTuring, simpleGoodTuring
    };
    string rowFormat = "%10s %10s %10s %10s %10s %10s %10s%n";
    string cellFormat = "%10.8f ";
    string rule = "----------";

    System.Console.Out.Printf(rowFormat, "Freq", "Norm", "Add1", "Dir1", "Dir2", "GT", "SGT");
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // One row for each of the four most frequent keys.
    for (int topRank = 1; topRank < 5; topRank++)
    {
        System.Console.Out.Printf("%10d ", Math.Round(topCount / topRank));
        string topKey = topRank.ToString();
        foreach (Distribution<string> column in columns)
        {
            System.Console.Out.Printf(cellFormat, column.ProbabilityOf(topKey));
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // One row for the key "1500", labelled with frequency 1.
    System.Console.Out.Printf("%10d ", 1);
    string last = 1500.ToString();
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.ProbabilityOf(last));
    }
    System.Console.Out.WriteLine();
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // One row for the unknown key.
    System.Console.Out.Printf("%10s ", "UNK");
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.ProbabilityOf(unknownKey));
    }
    System.Console.Out.WriteLine();
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // Reserved mass of every distribution.
    System.Console.Out.Printf("%10s ", "RESERVE");
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.GetReservedMass());
    }
    System.Console.Out.WriteLine();
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // Total count of every distribution.
    System.Console.Out.Printf("%10s ", "Total");
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.TotalCount());
    }
    System.Console.Out.WriteLine();
}
/// <summary>
/// Returns the counter's count for <paramref name="o"/> plus the prior's probability of
/// <paramref name="o"/> multiplied by <c>priorMultiplier</c>.
/// </summary>
/// <param name="o">The key to look up.</param>
/// <returns>The sum of the counted value and the scaled prior probability.</returns>
public override double ProbabilityOf(E o)
{
    double counted = this.counter.GetCount(o);
    double scaledPrior = prior.ProbabilityOf(o) * priorMultiplier;
    return counted + scaledPrior;
}