Example No. 1
        public static Edu.Stanford.Nlp.Stats.Distribution <E> AbsolutelyDiscountedDistribution <E>(ICounter <E> counter, int numberOfKeys, double discount)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter = new ClassicCounter <E>();
            double total        = counter.TotalCount();
            double reservedMass = 0.0;

            foreach (E key in counter.KeySet())
            {
                double count = counter.GetCount(key);
                if (count > discount)
                {
                    // a positive count is left over after discounting
                    double newCount = (count - discount) / total;
                    norm.counter.SetCount(key, newCount);
                    //        System.out.println("seen: " + newCount);
                    reservedMass += discount;
                }
                else
                {
                    // count <= discount: don't put the key in the counter; treat it as unseen
                    reservedMass += count;
                }
            }
            norm.numberOfKeys = numberOfKeys;
            norm.reservedMass = reservedMass / total;
            //    System.out.println("UNSEEN: " + reservedMass / total / (numberOfKeys - counter.size()));
            return(norm);
        }
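A minimal usage sketch for the absolute-discounting method above (not part of the original listing; it assumes a `using Edu.Stanford.Nlp.Stats;` directive and that the ported names, e.g. ClassicCounter.IncrementCount and a static Distribution class, mirror the Java originals):

        // Hypothetical usage sketch; assumed API names as noted above.
        ICounter<string> counts = new ClassicCounter<string>();
        counts.IncrementCount("a", 5.0); // total = 5 + 3 + 1 = 9
        counts.IncrementCount("b", 3.0);
        counts.IncrementCount("c", 1.0);
        var dist = Distribution.AbsolutelyDiscountedDistribution(counts, 10, 0.75);
        // P(a) = (5 - 0.75) / 9 ≈ 0.472, P(c) = (1 - 0.75) / 9 ≈ 0.028;
        // reservedMass = 3 * 0.75 / 9 = 0.25, shared among the 7 unseen keys.
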
Example No. 2
        // ----------------------------------------------------------------------------
        /// <summary>
        /// Returns a Distribution that uses prior as a Dirichlet prior
        /// weighted by weight.
        /// </summary>
        /// <remarks>
        /// Returns a Distribution that uses prior as a Dirichlet prior
        /// weighted by weight.  Essentially adds "pseudo-counts" for each Object
        /// in prior equal to that Object's mass in prior times weight,
        /// then normalizes.
        /// <p>
        /// WARNING: If an unseen item is encountered in c, the total may not be 1.
        /// NOTE: This will not work if prior is a DynamicDistribution;
        /// to fix this, you could add a CounterView to Distribution and use that
        /// in the linearCombination call below.
        /// </remarks>
        /// <param name="weight">multiplier of prior to get "pseudo-count"</param>
        /// <returns>new Distribution</returns>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> DistributionWithDirichletPrior <E>(ICounter <E> c, Edu.Stanford.Nlp.Stats.Distribution <E> prior, double weight)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            double totalWeight = c.TotalCount() + weight;

            if (prior is Distribution.DynamicDistribution)
            {
                throw new NotSupportedException("Cannot make normalized counter with Dynamic prior.");
            }
            norm.counter      = Counters.LinearCombination(c, 1 / totalWeight, prior.counter, weight / totalWeight);
            norm.numberOfKeys = prior.numberOfKeys;
            norm.reservedMass = prior.reservedMass * weight / totalWeight;
            //System.out.println("totalCount: " + norm.totalCount());
            return(norm);
        }
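A worked sketch of the pseudo-count arithmetic (hypothetical usage; GetUniformDistribution is taken from elsewhere in this listing, the remaining API names are assumed from the Java originals):

        // With c = {x:2, y:1} (total 3), a uniform prior over {x, y, z}, and
        // weight = 3: totalWeight = 3 + 3 = 6, and each probability is
        // count/totalWeight + prior(key) * weight/totalWeight.
        ICounter<string> c = new ClassicCounter<string>();
        c.IncrementCount("x", 2.0);
        c.IncrementCount("y", 1.0);
        var prior = Distribution.GetUniformDistribution(new[] { "x", "y", "z" });
        var posterior = Distribution.DistributionWithDirichletPrior(c, prior, 3.0);
        // P(x) = 2/6 + (1/3)(3/6) = 0.5; P(y) = 1/6 + 1/6 = 1/3; P(z) = 1/6.
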
Example No. 3
        //--- end JM added
        /// <param name="s">a Collection of keys.</param>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> GetUniformDistribution <E>(ICollection <E> s)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter      = new ClassicCounter <E>();
            norm.numberOfKeys = s.Count;
            norm.reservedMass = 0;
            double total = s.Count;
            double count = 1.0 / total;

            foreach (E key in s)
            {
                norm.counter.SetCount(key, count);
            }
            return(norm);
        }
Example No. 4
        /// <param name="s">a Collection of keys.</param>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> GetPerturbedUniformDistribution <E>(ICollection <E> s, Random r)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter      = new ClassicCounter <E>();
            norm.numberOfKeys = s.Count;
            norm.reservedMass = 0;
            double total = s.Count;
            double prob  = 1.0 / total;
            double stdev = prob / 1000.0;

            foreach (E key in s)
            {
                norm.counter.SetCount(key, prob + (r.NextGaussian() * stdev));
            }
            return(norm);
        }
Example No. 5
        /// <summary>
        /// Creates a smoothed Distribution using Lidstone's law, i.e., adds lambda (typically
        /// between 0 and 1) to every item, including unseen ones, and divides by the adjusted total count.
        /// </summary>
        /// <returns>a new Lidstone smoothed Distribution</returns>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> LaplaceSmoothedDistribution <E>(ICounter <E> counter, int numberOfKeys, double lambda)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter = new ClassicCounter <E>();
            double total        = counter.TotalCount();
            double newTotal     = total + (lambda * numberOfKeys);
            double reservedMass = ((double)numberOfKeys - counter.Size()) * lambda / newTotal;

            norm.numberOfKeys = numberOfKeys;
            norm.reservedMass = reservedMass;
            foreach (E key in counter.KeySet())
            {
                double count = counter.GetCount(key);
                norm.counter.SetCount(key, (count + lambda) / newTotal);
            }
            return(norm);
        }
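A short sketch of the Lidstone arithmetic (hypothetical usage, same assumed API as the earlier sketches):

        // Counts {a:4, b:2}, numberOfKeys = 5, lambda = 0.5:
        // newTotal = 6 + 0.5 * 5 = 8.5, P(a) = 4.5/8.5 ≈ 0.529, P(b) = 2.5/8.5 ≈ 0.294,
        // and the 3 unseen keys jointly reserve (5 - 2) * 0.5 / 8.5 ≈ 0.176.
        ICounter<string> counts = new ClassicCounter<string>();
        counts.IncrementCount("a", 4.0);
        counts.IncrementCount("b", 2.0);
        var smoothed = Distribution.LaplaceSmoothedDistribution(counts, 5, 0.5);
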
Example No. 6
        /// <summary>Creates a Good-Turing smoothed Distribution from the given counter.</summary>
        /// <returns>a new Good-Turing smoothed Distribution.</returns>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> GoodTuringSmoothedCounter <E>(ICounter <E> counter, int numberOfKeys)
        {
            // gather count-counts
            int[] countCounts = GetCountCounts(counter);
            // if count-counts are unreliable, we shouldn't be using G-T
            // revert to laplace
            for (int i = 1; i <= 10; i++)
            {
                if (countCounts[i] < 3)
                {
                    return(LaplaceSmoothedDistribution(counter, numberOfKeys, 0.5));
                }
            }
            double observedMass = counter.TotalCount();
            double reservedMass = countCounts[1] / observedMass;

            // calculate and cache adjusted frequencies
            // also adjusting total mass of observed items
            double[] adjustedFreq = new double[10];
            for (int freq = 1; freq < 10; freq++)
            {
                adjustedFreq[freq] = (double)(freq + 1) * (double)countCounts[freq + 1] / countCounts[freq];
                observedMass      -= (freq - adjustedFreq[freq]) * countCounts[freq];
            }
            double normFactor = (1.0 - reservedMass) / observedMass;

            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter = new ClassicCounter <E>();
            // fill in the new Distribution, renormalizing as we go
            foreach (E key in counter.KeySet())
            {
                int origFreq = (int)Math.Round(counter.GetCount(key));
                if (origFreq < 10)
                {
                    norm.counter.SetCount(key, adjustedFreq[origFreq] * normFactor);
                }
                else
                {
                    norm.counter.SetCount(key, origFreq * normFactor);
                }
            }
            norm.numberOfKeys = numberOfKeys;
            norm.reservedMass = reservedMass;
            return(norm);
        }
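The GetCountCounts helper is not shown in this listing. Below is a plausible reconstruction, not the original: the name GetCountCountsSketch, the cap of 10, and the array size are assumptions inferred from the index range the smoothing loop reads (1..10).

        // Hypothetical reconstruction of the count-counts helper:
        // countCounts[r] = number of distinct keys whose count rounds to r, r in 0..10.
        private static int[] GetCountCountsSketch<E>(ICounter<E> counter)
        {
            int[] countCounts = new int[11];
            foreach (E key in counter.KeySet())
            {
                int count = (int)Math.Round(counter.GetCount(key));
                if (count >= 0 && count <= 10)
                {
                    countCounts[count]++;
                }
            }
            return countCounts;
        }
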
Example No. 7
        //---- end cdm added
        //--- JM added for Distributions
        /// <summary>Assuming that c has a total count &lt; 1, returns a new Distribution using the counts in c as probabilities.</summary>
        /// <remarks>
        /// Assuming that c has a total count &lt; 1, returns a new Distribution using the counts in c as probabilities.
        /// If c has a total count &gt;= 1, returns a normalized distribution with no remaining mass.
        /// </remarks>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> GetDistributionFromPartiallySpecifiedCounter <E>(ICounter <E> c, int numKeys)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> d;
            double total = c.TotalCount();

            if (total >= 1.0)
            {
                d = GetDistribution(c);
                d.numberOfKeys = numKeys;
            }
            else
            {
                d = new Edu.Stanford.Nlp.Stats.Distribution <E>();
                d.numberOfKeys = numKeys;
                d.counter      = c;
                d.reservedMass = 1.0 - total;
            }
            return(d);
        }
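For example (a hypothetical call, same assumed API): a counter whose values sum to 0.7 keeps those values as probabilities and reserves the remaining 0.3 for unseen events.

        // Values sum to 0.7, so reservedMass = 1.0 - 0.7 = 0.3 is left for the
        // remaining 8 of the 10 declared keys.
        ICounter<string> partial = new ClassicCounter<string>();
        partial.SetCount("heads", 0.4);
        partial.SetCount("tails", 0.3);
        var d = Distribution.GetDistributionFromPartiallySpecifiedCounter(partial, 10);
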
Example No. 8
        public static Edu.Stanford.Nlp.Stats.Distribution <E> GetDistributionWithReservedMass <E>(ICounter <E> counter, double reservedMass)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter      = new ClassicCounter <E>();
            norm.numberOfKeys = counter.Size();
            norm.reservedMass = reservedMass;
            double total = counter.TotalCount() * (1 + reservedMass);

            if (total == 0.0)
            {
                total = 1.0;
            }
            foreach (E key in counter.KeySet())
            {
                double count = counter.GetCount(key) / total;
                //      if (Double.isNaN(count) || count < 0.0 || count> 1.0 ) throw new RuntimeException("count=" + counter.getCount(key) + " total=" + total);
                norm.counter.SetCount(key, count);
            }
            return(norm);
        }
Example No. 9
        /// <summary>
        /// Creates a smoothed Distribution with Laplace smoothing, but assumes an explicit
        /// count of "UNKNOWN" items.
        /// </summary>
        /// <remarks>
        /// Creates a smoothed Distribution with Laplace smoothing, but assumes an explicit
        /// count of "UNKNOWN" items.  Thus anything not in the original counter will have
        /// probability zero.
        /// </remarks>
        /// <param name="counter">the counter to normalize</param>
        /// <param name="lambda">the value to add to each count</param>
        /// <param name="Unk">the UNKNOWN symbol</param>
        /// <returns>a new Laplace-smoothed distribution</returns>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> LaplaceWithExplicitUnknown <E>(ICounter <E> counter, double lambda, E Unk)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter = new ClassicCounter <E>();
            double total = counter.TotalCount() + (lambda * (counter.Size() - 1));

            norm.numberOfKeys = counter.Size();
            norm.reservedMass = 0.0;
            foreach (E key in counter.KeySet())
            {
                if (key.Equals(Unk))
                {
                    norm.counter.SetCount(key, counter.GetCount(key) / total);
                }
                else
                {
                    norm.counter.SetCount(key, (counter.GetCount(key) + lambda) / total);
                }
            }
            return(norm);
        }
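A worked sketch of the explicit-UNKNOWN bookkeeping (hypothetical usage): every key except Unk receives lambda, so the denominator adds lambda once per non-UNK key.

        // Counts {a:3, b:1, UNK:2} with lambda = 1:
        // total = 6 + 1 * (3 - 1) = 8, so P(a) = 4/8, P(b) = 2/8, P(UNK) = 2/8.
        ICounter<string> counts = new ClassicCounter<string>();
        counts.SetCount("a", 3.0);
        counts.SetCount("b", 1.0);
        counts.SetCount("UNK", 2.0);
        var d = Distribution.LaplaceWithExplicitUnknown(counts, 1.0, "UNK");
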
Example No. 10
        public static Edu.Stanford.Nlp.Stats.Distribution <E> GetPerturbedDistribution <E>(ICounter <E> wordCounter, Random r)
        {
            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            norm.counter      = new ClassicCounter <E>();
            norm.numberOfKeys = wordCounter.Size();
            norm.reservedMass = 0;
            double totalCount = wordCounter.TotalCount();
            double stdev      = 1.0 / norm.numberOfKeys / 1000.0; // tiny relative to the average value

            foreach (E key in wordCounter.KeySet())
            {
                double prob          = wordCounter.GetCount(key) / totalCount;
                double perturbedProb = prob + (r.NextGaussian() * stdev);
                if (perturbedProb < 0.0)
                {
                    perturbedProb = 0.0;
                }
                norm.counter.SetCount(key, perturbedProb);
            }
            return(norm);
        }
Example No. 11
        // ----------------------------------------------------------------------------
        /// <summary>
        /// Creates a Distribution from the given counter using Gale &amp; Sampson's
        /// "simple Good-Turing" smoothing.
        /// </summary>
        /// <returns>a new simple Good-Turing smoothed Distribution.</returns>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> SimpleGoodTuring <E>(ICounter <E> counter, int numberOfKeys)
        {
            // check arguments
            ValidateCounter(counter);
            int numUnseen = numberOfKeys - counter.Size();

            if (numUnseen < 1)
            {
                throw new ArgumentException(string.Format("ERROR: numberOfKeys {0} must be > size of counter {1}!", numberOfKeys, counter.Size()));
            }
            // do smoothing
            int[][] cc = CountCounts2IntArrays(CollectCountCounts(counter));
            int[]   r  = cc[0]; // counts
            int[]   n  = cc[1]; // counts of counts
            Edu.Stanford.Nlp.Stats.SimpleGoodTuring sgt = new Edu.Stanford.Nlp.Stats.SimpleGoodTuring(r, n);
            // collate results
            ICounter <int> probsByCount = new ClassicCounter <int>();

            double[] probs = sgt.GetProbabilities();
            for (int i = 0; i < probs.Length; i++)
            {
                probsByCount.SetCount(r[i], probs[i]);
            }
            // make smoothed distribution
            Edu.Stanford.Nlp.Stats.Distribution <E> dist = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            dist.counter = new ClassicCounter <E>();
            foreach (KeyValuePair <E, double> entry in counter.EntrySet())
            {
                E   item  = entry.Key;
                int count = (int)Math.Round(entry.Value);
                dist.counter.SetCount(item, probsByCount.GetCount(count));
            }
            dist.numberOfKeys = numberOfKeys;
            dist.reservedMass = sgt.GetProbabilityForUnseen();
            return(dist);
        }
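A minimal call sketch (hypothetical; real corpora are needed for the count-counts regression to be meaningful, and ValidateCounter is assumed to require positive integer counts). numberOfKeys must exceed counter.Size(), and the unseen-event mass comes from sgt.GetProbabilityForUnseen().

        // Token counts from a toy sentence; numberOfKeys (50) exceeds the
        // 6 observed word types, so some mass can be reserved for unseen words.
        ICounter<string> tokenCounts = new ClassicCounter<string>();
        foreach (string token in "the cat sat on the mat the end".Split(' '))
        {
            tokenCounts.IncrementCount(token);
        }
        var dist = Distribution.SimpleGoodTuring(tokenCounts, 50);
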
Example No. 12
        /// <summary>
        /// Like normalizedCounterWithDirichletPrior except probabilities are
        /// computed dynamically from the counter and prior instead of all at once up front.
        /// </summary>
        /// <remarks>
        /// Like normalizedCounterWithDirichletPrior except probabilities are
        /// computed dynamically from the counter and prior instead of all at once up front.
        /// The main advantage of this is if you are making many distributions from relatively
        /// sparse counters using the same relatively dense prior, the prior is only represented
        /// once, for major memory savings.
        /// </remarks>
        /// <param name="weight">multiplier of prior to get "pseudo-count"</param>
        /// <returns>new Distribution</returns>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> DynamicCounterWithDirichletPrior <E>(ICounter <E> c, Edu.Stanford.Nlp.Stats.Distribution <E> prior, double weight)
        {
            double totalWeight = c.TotalCount() + weight;

            Edu.Stanford.Nlp.Stats.Distribution <E> norm = new Distribution.DynamicDistribution <E>(prior, weight / totalWeight);
            norm.counter = new ClassicCounter <E>();
            // this might be done more efficiently with entrySet but there isn't a way to get
            // the entrySet from a Counter now.  In most cases c will be small(-ish) anyway
            foreach (E key in c.KeySet())
            {
                double count = c.GetCount(key) / totalWeight;
                prior.AddToKeySet(key);
                norm.counter.SetCount(key, count);
            }
            norm.numberOfKeys = prior.numberOfKeys;
            return(norm);
        }
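The payoff of the dynamic variant is sharing: one dense prior, built once, serves many sparse counters. A hedged sketch of that pattern (hypothetical values, same assumed API):

        // Each derived distribution stores only its own observed keys and
        // consults the shared prior lazily for everything else.
        ICollection<string> vocabulary = new[] { "a", "b", "c", "d" };
        var prior = Distribution.GetUniformDistribution(vocabulary);
        ICounter<string> sparse = new ClassicCounter<string>();
        sparse.IncrementCount("a", 2.0);
        var d = Distribution.DynamicCounterWithDirichletPrior(sparse, prior, 2.0);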