Ejemplo n.º 1
0
        // ----------------------------------------------------------------------------
        /// <summary>
        /// Creates a Distribution from the given counter using Gale &amp; Sampson's
        /// "simple Good-Turing" smoothing.
        /// </summary>
        /// <param name="counter">the counter holding the observed counts to smooth.</param>
        /// <param name="numberOfKeys">
        /// the total number of possible keys; must be strictly greater than the
        /// size of <paramref name="counter"/> so that at least one key is unseen.
        /// </param>
        /// <returns>a new simple Good-Turing smoothed Distribution.</returns>
        /// <exception cref="System.ArgumentException">
        /// thrown when <paramref name="numberOfKeys"/> does not exceed the size of
        /// <paramref name="counter"/>.
        /// </exception>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> SimpleGoodTuring <E>(ICounter <E> counter, int numberOfKeys)
        {
            // check arguments
            ValidateCounter(counter);
            int counterSize = counter.Size();
            int numUnseen   = numberOfKeys - counterSize;

            if (numUnseen < 1)
            {
                // .NET composite formatting uses {0}/{1}; Java-style %d placeholders
                // are emitted literally and the arguments are silently dropped.
                throw new ArgumentException(string.Format("ERROR: numberOfKeys {0} must be > size of counter {1}!", numberOfKeys, counterSize));
            }

            // do smoothing
            int[][] cc = CountCounts2IntArrays(CollectCountCounts(counter));
            // counts
            int[] r = cc[0];
            // counts of counts
            int[] n = cc[1];
            Edu.Stanford.Nlp.Stats.SimpleGoodTuring sgt = new Edu.Stanford.Nlp.Stats.SimpleGoodTuring(r, n);

            // collate results: map each observed count to its smoothed probability
            ICounter <int> probsByCount = new ClassicCounter <int>();
            double[]       probs        = sgt.GetProbabilities();
            for (int i = 0; i < probs.Length; i++)
            {
                probsByCount.SetCount(r[i], probs[i]);
            }

            // make smoothed distribution
            Edu.Stanford.Nlp.Stats.Distribution <E> dist = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            dist.counter = new ClassicCounter <E>();
            foreach (KeyValuePair <E, double> entry in counter.EntrySet())
            {
                // the stored count is a double; round it to the nearest integer
                // to look up the smoothed probability for that count
                int count = (int)Math.Round(entry.Value);
                dist.counter.SetCount(entry.Key, probsByCount.GetCount(count));
            }
            dist.numberOfKeys = numberOfKeys;
            // probability mass set aside for the keys that were never observed
            dist.reservedMass = sgt.GetProbabilityForUnseen();
            return dist;
        }
Ejemplo n.º 2
0
 // main =======================================================================
 /// <summary>
 /// Command-line entry point: like Sampson's SGT program, reads data from
 /// STDIN and writes results to STDOUT.
 /// </summary>
 /// <remarks>
 /// The input should contain two integers on each line, separated by
 /// whitespace.  The first integer is a count; the second is a count for that
 /// count.  The input must be sorted in ascending order, and should not contain
 /// 0s.  For example, valid input is:
 /// <pre>
 /// 1 10
 /// 2 6
 /// 3 4
 /// 5 2
 /// 8 1
 /// </pre>
 /// This represents a collection in which 10 types occur once each, 6 types
 /// occur twice each, 4 types occur 3 times each, 2 types occur 5 times each,
 /// and one type occurs 8 times, for a total count of 52.  This input will
 /// produce the following output:
 /// <pre>
 /// r      n        p       p
 /// ----   ----     ----     ----
 /// 0      0    0.000   0.1923
 /// 1     10  0.01923  0.01203
 /// 2      6  0.03846  0.02951
 /// 3      4  0.05769  0.04814
 /// 5      2  0.09615  0.08647
 /// 8      1   0.1538   0.1448
 /// </pre>
 /// The last column represents the smoothed probabilities, and the first item
 /// in this column represents the probability assigned to unseen items.
 /// </remarks>
 /// <param name="args">command-line arguments; not used.</param>
 /// <exception cref="System.Exception"/>
 public static void Main(string[] args)
 {
     int[][] countTable = ReadInput();

     // Row 0 holds the counts, row 1 the counts of those counts.
     int[] counts         = countTable[0];
     int[] countsOfCounts = countTable[1];

     Edu.Stanford.Nlp.Stats.SimpleGoodTuring smoother = new Edu.Stanford.Nlp.Stats.SimpleGoodTuring(counts, countsOfCounts);
     smoother.Print();
 }