Example #1
0
        /// <summary>Calculates the KL divergence KL(from || to) between two distributions, in bits.</summary>
        /// <remarks>
        /// Measures how well <paramref name="from"/> can be represented by <paramref name="to"/>.
        /// Keys whose probability under <paramref name="from"/> is below 1e-10 contribute nothing.
        /// If a key with non-negligible probability under <paramref name="from"/> yields an
        /// infinite log ratio (e.g. zero probability under <paramref name="to"/>), a diagnostic
        /// line is written to standard output and positive infinity is returned.
        /// Probability mass not covered by the enumerated keys is treated as spread evenly over
        /// the keys that remain after subtracting the enumerated ones from
        /// <c>from.GetNumberOfKeys()</c>.
        /// </remarks>
        /// <typeparam name="K">Type of the keys of both distributions.</typeparam>
        /// <param name="from">The reference distribution (the first argument of KL).</param>
        /// <param name="to">The distribution used to approximate <paramref name="from"/>.</param>
        /// <returns>The KL divergence between the distributions, expressed in log base 2.</returns>
        public static double KlDivergence <K>(Distribution <K> from, Distribution <K> to)
        {
            ICollection <K> allKeys          = GetSetOfAllKeys(from, to);
            int             numKeysRemaining = from.GetNumberOfKeys();
            double          result           = 0.0;
            double          assignedMass1    = 0.0;
            double          assignedMass2    = 0.0;
            double          log2             = Math.Log(2.0);
            double          epsilon          = 1e-10;

            foreach (K key in allKeys)
            {
                double p1 = from.ProbabilityOf(key);
                double p2 = to.ProbabilityOf(key);
                numKeysRemaining--;
                assignedMass1 += p1;
                assignedMass2 += p2;
                if (p1 < epsilon)
                {
                    // A (near-)zero p1 contributes nothing to the sum.
                    continue;
                }
                double logFract = Math.Log(p1 / p2);
                if (double.IsPositiveInfinity(logFract))
                {
                    // can't recover: the divergence is infinite
                    System.Console.Out.WriteLine("Distributions.klDivergence returning +inf: p1=" + p1 + ", p2=" + p2);
                    System.Console.Out.Flush();
                    return(double.PositiveInfinity);
                }
                // express it in log base 2
                result += p1 * (logFract / log2);
            }
            // Account for the keys that were not enumerated above, sharing the leftover mass evenly.
            if (numKeysRemaining != 0)
            {
                double p1 = (1.0 - assignedMass1) / numKeysRemaining;
                if (p1 > epsilon)
                {
                    double p2       = (1.0 - assignedMass2) / numKeysRemaining;
                    double logFract = Math.Log(p1 / p2);
                    if (double.IsPositiveInfinity(logFract))
                    {
                        // can't recover: the divergence is infinite
                        System.Console.Out.WriteLine("Distributions.klDivergence (remaining mass) returning +inf: p1=" + p1 + ", p2=" + p2);
                        System.Console.Out.Flush();
                        return(double.PositiveInfinity);
                    }
                    // express it in log base 2, once per remaining key
                    result += numKeysRemaining * p1 * (logFract / log2);
                }
            }
            return(result);
        }
Example #2
0
        /// <summary>Builds a new distribution by mixing the probabilities of two distributions.</summary>
        /// <remarks>
        /// For every key in the collection returned by <c>GetSetOfAllKeys(d1, d2)</c>, the new
        /// count is <c>d1.ProbabilityOf(key) * w1 + d2.ProbabilityOf(key) * (1 - w1)</c>.
        /// A key missing from one of the sources therefore contributes whatever that source's
        /// <c>ProbabilityOf</c> reports for it. The resulting counter is handed to
        /// <c>Distribution.GetDistributionFromPartiallySpecifiedCounter</c> together with the
        /// number of keys of <paramref name="d1"/>.
        /// </remarks>
        /// <typeparam name="K">Type of the keys of the distributions.</typeparam>
        /// <param name="d1">First source distribution.</param>
        /// <param name="w1">Weight applied to <paramref name="d1"/>; <paramref name="d2"/> is weighted by <c>1 - w1</c>.</param>
        /// <param name="d2">Second source distribution.</param>
        /// <returns>
        /// A new distribution whose counts are the weighted mean of the respective probabilities
        /// in the given distributions.
        /// </returns>
        public static Distribution <K> WeightedAverage <K>(Distribution <K> d1, double w1, Distribution <K> d2)
        {
            double          complementWeight = 1.0 - w1;
            ICollection <K> keys             = GetSetOfAllKeys(d1, d2);
            int             keyCount         = d1.GetNumberOfKeys();
            ICounter <K>    mixed            = new ClassicCounter <K>();

            foreach (K key in keys)
            {
                double shareOfFirst  = d1.ProbabilityOf(key) * w1;
                double shareOfSecond = d2.ProbabilityOf(key) * complementWeight;
                mixed.SetCount(key, shareOfFirst + shareOfSecond);
            }

            return(Distribution.GetDistributionFromPartiallySpecifiedCounter(mixed, keyCount));
        }
Example #3
0
        /// <summary>Measures how much probability mass two distributions have in common.</summary>
        /// <remarks>
        /// Sums <c>Math.Min(p1, p2)</c> over every key returned by <c>GetSetOfAllKeys(d1, d2)</c>
        /// and then adds the smaller of the two masses left unassigned by those keys.
        /// The result is meant to lie between 0 and 1: 0 when there is no overlap and 1 iff the
        /// two distributions are equal.
        /// </remarks>
        /// <typeparam name="K">Type of the keys of the distributions.</typeparam>
        /// <param name="d1">First distribution.</param>
        /// <param name="d2">Second distribution.</param>
        /// <returns>The overlap of <paramref name="d1"/> and <paramref name="d2"/>.</returns>
        public static double Overlap <K>(Distribution <K> d1, Distribution <K> d2)
        {
            double sharedMass = 0.0;
            double leftover1  = 1.0;
            double leftover2  = 1.0;

            foreach (K key in GetSetOfAllKeys(d1, d2))
            {
                double prob1 = d1.ProbabilityOf(key);
                double prob2 = d2.ProbabilityOf(key);

                sharedMass += Math.Min(prob1, prob2);
                leftover1  -= prob1;
                leftover2  -= prob2;
            }

            // Whatever mass the enumerated keys did not cover is shared up to the smaller remainder.
            return(sharedMass + Math.Min(leftover1, leftover2));
        }
Example #4
0
        /// <summary>For internal testing purposes only.</summary>
        /// <remarks>
        /// Fills a counter with a roughly Zipfian set of counts, derives several smoothed
        /// distributions from it and prints a table comparing them to standard output.
        /// </remarks>
        /// <param name="args">Command-line arguments; not used.</param>
        public static void Main(string[] args)
        {
            // Small scratch counter; it is populated but not consulted by the report below.
            ICounter <string> c2 = new ClassicCounter <string>();

            c2.IncrementCount("p", 13);
            c2.SetCount("q", 12);
            c2.SetCount("w", 5);
            c2.IncrementCount("x", 7.5);

            ClassicCounter <string> counts      = new ClassicCounter <string>();
            double                  topCount    = 1000;
            string                  unknownKey  = "!*UNKNOWN*!";
            ICollection <string>    vocabulary  = Generics.NewHashSet();

            vocabulary.Add(unknownKey);
            // Keys "1".."1999" receive a count of round(1000 / rank), i.e. roughly Zipfian;
            // keys "2000".."4000" are only added to the vocabulary, without a count.
            for (int rank = 1; rank <= 4000; rank++)
            {
                string label = rank.ToString();
                if (rank < 2000)
                {
                    counts.SetCount(label, Math.Round(topCount / rank));
                }
                vocabulary.Add(label);
            }

            Distribution <string> n     = GetDistribution(counts);
            Distribution <string> prior = GetUniformDistribution(vocabulary);
            Distribution <string> dir1  = DistributionWithDirichletPrior(counts, prior, 4000);
            Distribution <string> dir2  = DynamicCounterWithDirichletPrior(counts, prior, 4000);
            Distribution <string> add1;
            Distribution <string> gt;

            // Manual toggle: the first branch smooths over 4000 keys, the second one uses an
            // explicit unknown-word key instead.
            if (true)
            {
                add1 = LaplaceSmoothedDistribution(counts, 4000);
                gt   = GoodTuringSmoothedCounter(counts, 4000);
            }
            else
            {
                counts.SetCount(unknownKey, 45);
                add1 = LaplaceWithExplicitUnknown(counts, 0.5, unknownKey);
                gt   = GoodTuringWithExplicitUnknown(counts, unknownKey);
            }
            Distribution <string> sgt = SimpleGoodTuring(counts, 4000);

            // Table columns, in print order (after the leading label column).
            Distribution <string>[] columns = new Distribution <string>[] { n, add1, dir1, dir2, gt, sgt };
            const string rowFormat = "%10s %10s %10s %10s %10s %10s %10s%n";
            const string rule      = "----------";

            System.Console.Out.Printf(rowFormat, "Freq", "Norm", "Add1", "Dir1", "Dir2", "GT", "SGT");
            System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

            // One row for each of the four most frequent keys.
            for (int topRank = 1; topRank < 5; topRank++)
            {
                System.Console.Out.Printf("%10d ", Math.Round(topCount / topRank));
                string topKey = topRank.ToString();
                foreach (Distribution <string> column in columns)
                {
                    System.Console.Out.Printf("%10.8f ", column.ProbabilityOf(topKey));
                }
                System.Console.Out.WriteLine();
            }

            // A key from the tail that was given a count of 1.
            System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);
            System.Console.Out.Printf("%10d ", 1);
            string last = 1500.ToString();

            foreach (Distribution <string> column in columns)
            {
                System.Console.Out.Printf("%10.8f ", column.ProbabilityOf(last));
            }
            System.Console.Out.WriteLine();

            // The unknown-word key.
            System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);
            System.Console.Out.Printf("%10s ", "UNK");
            foreach (Distribution <string> column in columns)
            {
                System.Console.Out.Printf("%10.8f ", column.ProbabilityOf(unknownKey));
            }
            System.Console.Out.WriteLine();

            // Reserved mass of each distribution.
            System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);
            System.Console.Out.Printf("%10s ", "RESERVE");
            foreach (Distribution <string> column in columns)
            {
                System.Console.Out.Printf("%10.8f ", column.GetReservedMass());
            }
            System.Console.Out.WriteLine();

            // Total count of each distribution.
            System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);
            System.Console.Out.Printf("%10s ", "Total");
            foreach (Distribution <string> column in columns)
            {
                System.Console.Out.Printf("%10.8f ", column.TotalCount());
            }
            System.Console.Out.WriteLine();
        }
Example #5
0
 /// <summary>Returns the probability this distribution assigns to the given object.</summary>
 /// <remarks>
 /// The value is this distribution's own counter entry for <paramref name="o"/> plus the
 /// prior's probability of <paramref name="o"/> scaled by <c>priorMultiplier</c>.
 /// NOTE(review): no normalization happens here, so the counter values are presumably
 /// already normalized -- confirm against the code that populates the counter.
 /// </remarks>
 /// <param name="o">The object whose probability is requested.</param>
 /// <returns>The counter value for <paramref name="o"/> plus the weighted prior probability.</returns>
 public override double ProbabilityOf(E o)
 {
     double observedPart = this.counter.GetCount(o);
     double priorPart    = prior.ProbabilityOf(o) * priorMultiplier;

     return(observedPart + priorPart);
 }