/// <summary>
/// Checks that <c>Distribution.GetDistributionFromLogValues</c> turns a counter that
/// has been passed through <c>Counters.LogInPlace</c> back into the normalized
/// distribution of the original counts (1, 2, 3, 4 =&gt; 0.1, 0.2, 0.3, 0.4).
/// </summary>
public virtual void TestGetDistributionFromLogValues()
{
    ICounter<string> c1 = new ClassicCounter<string>();
    c1.SetCount("p", 1.0);
    c1.SetCount("q", 2.0);
    c1.SetCount("r", 3.0);
    c1.SetCount("s", 4.0);

    // take log
    Counters.LogInPlace(c1);

    // now call distribution
    Distribution<string> distribution = Distribution.GetDistributionFromLogValues(c1);

    // size (expected value goes first so that failure messages read correctly)
    NUnit.Framework.Assert.AreEqual(4, distribution.KeySet().Count);

    // keys
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("p"));
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("q"));
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("r"));
    NUnit.Framework.Assert.IsTrue(distribution.ContainsKey("s"));

    // values
    NUnit.Framework.Assert.AreEqual(1.0E-1, distribution.GetCount("p"), 1E-10);
    NUnit.Framework.Assert.AreEqual(2.0E-1, distribution.GetCount("q"), 1E-10);
    NUnit.Framework.Assert.AreEqual(3.0E-1, distribution.GetCount("r"), 1E-10);
    NUnit.Framework.Assert.AreEqual(4.0E-1, distribution.GetCount("s"), 1E-10);
}
/// <summary>
/// Exercises <c>Counters.ToSortedString</c> (with and without a wrapper format),
/// compares it against <c>Counters.ToString</c>, <c>ToBiggestValuesFirstString</c>
/// and <c>ToVerticalString</c>, and finally checks <c>ToSortedByKeysString</c>.
/// </summary>
public virtual void TestToSortedString()
{
    ICounter<string> counter = new ClassicCounter<string>();
    counter.SetCount("b", 0.25);
    counter.SetCount("a", 0.5);
    counter.SetCount("c", 1.0);

    // full-argument overload: item format, joiner and wrapper
    string wrapped = Counters.ToSortedString(counter, 5, "%s%.1f", ":", "{%s}");
    NUnit.Framework.Assert.AreEqual("{c1.0:a0.5:b0.3}", wrapped);

    // overload without a wrapper
    string unwrapped = Counters.ToSortedString(counter, 2, "%2$f %1$s", "\n");
    NUnit.Framework.Assert.AreEqual("1.000000 c\n0.500000 a", unwrapped);

    // equivalences with the other Counters string helpers
    int top = 2;
    string bracketed = Counters.ToSortedString(counter, top, "%s=%s", ", ", "[%s]");
    NUnit.Framework.Assert.AreEqual(Counters.ToString(counter, top), bracketed);
    NUnit.Framework.Assert.AreEqual(Counters.ToBiggestValuesFirstString(counter, top), bracketed);

    string vertical = Counters.ToSortedString(counter, top, "%2$g\t%1$s", "\n", "%s\n");
    NUnit.Framework.Assert.AreEqual(Counters.ToVerticalString(counter, top), vertical);

    // sorting by key rather than by value
    string byKey = Counters.ToSortedByKeysString(counter, "%s=>%.2f", "; ", "<%s>");
    NUnit.Framework.Assert.AreEqual("<a=>0.50; b=>0.25; c=>1.00>", byKey);
}
/// <summary>
/// Draws one sample from a two-outcome Dirichlet whose parameter is <paramref name="a"/>
/// for <c>true</c> and <paramref name="b"/> for <c>false</c>, and returns the sampled
/// probability of <c>true</c>.
/// </summary>
/// <param name="a">Parameter attached to the <c>true</c> outcome.</param>
/// <param name="b">Parameter attached to the <c>false</c> outcome.</param>
/// <param name="random">Random source handed to the Dirichlet sampler.</param>
/// <returns>The probability of <c>true</c> in the drawn multinomial.</returns>
public static double SampleBeta(double a, double b, Random random)
{
    ICounter<bool> parameters = new ClassicCounter<bool>();
    parameters.SetCount(true, a);
    parameters.SetCount(false, b);

    Edu.Stanford.Nlp.Stats.Dirichlet<bool> dirichlet = new Edu.Stanford.Nlp.Stats.Dirichlet<bool>(parameters);
    Multinomial<bool> draw = dirichlet.DrawSample(random);
    return draw.ProbabilityOf(true);
}
/// <summary>
/// Writes a counter with a wide range of magnitudes through
/// <c>Counters.SerializeStringCounter</c>, reads it back with
/// <c>Counters.DeserializeStringCounter</c>, and checks every reread value against
/// the original to within a relative tolerance of 1e-5.
/// </summary>
public virtual void TestSerializeStringCounter()
{
    ICounter<string> counts = new ClassicCounter<string>();
    for (int @base = -10; @base < 10; ++@base)
    {
        if (@base == 0)
        {
            continue;
        }
        for (int exponent = -100; exponent < 100; ++exponent)
        {
            // System.Math exposes the constant as PI (C# is case-sensitive).
            double number = Math.Pow(Math.PI * @base, exponent);
            // The key is only a label for the value; format it culture-independently
            // so the test data does not vary with the machine's locale.
            string key = number.ToString("R", System.Globalization.CultureInfo.InvariantCulture);
            counts.SetCount(key, number);
        }
    }

    File tmp = File.CreateTempFile("counts", ".tab.gz");
    tmp.DeleteOnExit();
    Counters.SerializeStringCounter(counts, tmp.GetPath());

    ICounter<string> reread = Counters.DeserializeStringCounter(tmp.GetPath());
    foreach (KeyValuePair<string, double> entry in reread.EntrySet())
    {
        double old = counts.GetCount(entry.Key);
        NUnit.Framework.Assert.AreEqual(old, entry.Value, Math.Abs(old) / 1e5);
    }
}
/// <summary>
/// Draws a multinomial by sampling one Gamma variate per key of
/// <paramref name="parameters"/> and dividing each variate by their sum.
/// </summary>
/// <param name="random">Random source passed to <c>Gamma.DrawSample</c>.</param>
/// <param name="parameters">Counter whose count for each key is used as that key's Gamma parameter.</param>
/// <returns>A multinomial built from the normalized Gamma draws.</returns>
public static Multinomial<F> DrawSample<F>(Random random, ICounter<F> parameters)
{
    ICounter<F> multParameters = new ClassicCounter<F>();
    double sum = 0.0;
    foreach (F o in parameters.KeySet())
    {
        double parameter = Gamma.DrawSample(random, parameters.GetCount(o));
        sum += parameter;
        multParameters.SetCount(o, parameter);
    }

    // Normalize over a snapshot of the keys: calling SetCount on the counter while
    // enumerating its own key set can invalidate the enumerator.
    System.Collections.Generic.List<F> keys = new System.Collections.Generic.List<F>(multParameters.KeySet());
    foreach (F o_1 in keys)
    {
        multParameters.SetCount(o_1, multParameters.GetCount(o_1) / sum);
    }
    return new Multinomial<F>(multParameters);
}
/// <summary>
/// Stores <paramref name="count"/> for the pair (<paramref name="o1"/>,
/// <paramref name="o2"/>) and adjusts the running <c>total</c> by removing the value
/// previously reported for that pair and adding the new one.
/// </summary>
/// <param name="o1">First-level key.</param>
/// <param name="o2">Second-level key.</param>
/// <param name="count">New count for the pair.</param>
public virtual void SetCount(K1 o1, K2 o2, double count)
{
    ClassicCounter<K2> inner = GetCounter(o1);

    // The previous value must be read before the inner counter is overwritten.
    double previous = GetCount(o1, o2);
    total -= previous;

    inner.SetCount(o2, count);
    total += count;
}
/// <summary>
/// Collects <c>LastF1(key)</c> for every key that occurs in either
/// <c>previousGuessed</c> or <c>previousGold</c>.
/// </summary>
/// <returns>A new counter mapping each such key to its <c>LastF1</c> value.</returns>
public virtual ClassicCounter<OUT> LastF1()
{
    ICollection<OUT> labels = Sets.Union(previousGuessed.KeySet(), previousGold.KeySet());
    ClassicCounter<OUT> f1ByLabel = new ClassicCounter<OUT>();
    foreach (OUT label in labels)
    {
        double f1 = LastF1(label);
        f1ByLabel.SetCount(label, f1);
    }
    return f1ByLabel;
}
/// <summary>
/// Checks <c>Counters.HIndex</c> on an empty counter and on counters that are grown
/// step by step to the expected h-index values 2, 14 and 15.
/// </summary>
public virtual void TestHIndex()
{
    // an empty counter has h-index 0
    ICounter<string> citations = new ClassicCounter<string>();
    NUnit.Framework.Assert.AreEqual(0, Counters.HIndex(citations));

    // two items with 2 or more citations
    citations.SetCount("X", 3);
    citations.SetCount("Y", 2);
    citations.SetCount("Z", 1);
    NUnit.Framework.Assert.AreEqual(2, Counters.HIndex(citations));

    // 14 items with 14 or more citations
    int paper = 0;
    while (paper < 14)
    {
        citations.SetCount(paper.ToString(), 15);
        ++paper;
    }
    NUnit.Framework.Assert.AreEqual(14, Counters.HIndex(citations));

    // 15 items with 15 or more citations
    citations.SetCount("15", 15);
    NUnit.Framework.Assert.AreEqual(15, Counters.HIndex(citations));
}
/// <summary>
/// Checks <c>Counters.JensenShannonDivergence</c> on two overlapping counters and on
/// two counters with no keys in common (expected divergence 1.0).
/// </summary>
public virtual void TestJensenShannonDivergence()
{
    // figures borrowed from ArrayMathTest
    ICounter<string> left = new ClassicCounter<string>();
    left.SetCount("a", 1.0);
    left.SetCount("b", 1.0);
    left.SetCount("c", 7.0);
    left.SetCount("d", 1.0);

    ICounter<string> right = new ClassicCounter<string>();
    right.SetCount("b", 1.0);
    right.SetCount("c", 1.0);
    right.SetCount("d", 7.0);
    right.SetCount("e", 1.0);
    right.SetCount("f", 0.0);

    double overlapping = Counters.JensenShannonDivergence(left, right);
    NUnit.Framework.Assert.AreEqual(0.46514844544032313, overlapping, 1e-5);

    // counters built from collections that share no element
    ICounter<string> onlyA = new ClassicCounter<string>(Java.Util.Collections.SingletonList("A"));
    ICounter<string> onlyBC = new ClassicCounter<string>(Arrays.AsList("B", "C"));
    double disjoint = Counters.JensenShannonDivergence(onlyA, onlyBC);
    NUnit.Framework.Assert.AreEqual(1.0, disjoint, 1e-5);
}
/// <summary>
/// Checks that <c>Counters.Flatten</c> merges the counters stored in a dictionary:
/// the result has the six distinct keys and the count of "b" is the sum from both inputs.
/// </summary>
public virtual void TestFlatten()
{
    ICounter<string> first = new ClassicCounter<string>();
    first.SetCount("a", 1.0);
    first.SetCount("b", 1.0);
    first.SetCount("c", 7.0);
    first.SetCount("d", 1.0);

    ICounter<string> second = new ClassicCounter<string>();
    second.SetCount("b", 1.0);
    second.SetCount("c", 1.0);
    second.SetCount("d", 7.0);
    second.SetCount("e", 1.0);
    second.SetCount("f", 1.0);

    IDictionary<string, ICounter<string>> byName = new Dictionary<string, ICounter<string>>();
    byName["first"] = first;
    byName["second"] = second;

    ICounter<string> flattened = Counters.Flatten(byName);

    NUnit.Framework.Assert.AreEqual(6, flattened.Size());
    NUnit.Framework.Assert.AreEqual(2.0, flattened.GetCount("b"));
}
/// <summary>
/// Returns a new <c>Distribution&lt;K&gt;</c> whose probabilities are the weighted
/// average of the two given distributions.
/// </summary>
/// <remarks>
/// The result covers the keys returned by <c>GetSetOfAllKeys(d1, d2)</c>. For each key
/// the new probability is <c>d1.ProbabilityOf(key) * w1 + d2.ProbabilityOf(key) * (1 - w1)</c>,
/// so a key missing from one distribution contributes whatever that distribution's
/// <c>ProbabilityOf</c> reports for it. The number of keys of the result is taken
/// from <paramref name="d1"/>.
/// </remarks>
/// <param name="d1">First distribution.</param>
/// <param name="w1">Weight given to <paramref name="d1"/>; <paramref name="d2"/> receives <c>1 - w1</c>.</param>
/// <param name="d2">Second distribution.</param>
/// <returns>
/// A new distribution built from the averaged probabilities via
/// <c>GetDistributionFromPartiallySpecifiedCounter</c>.
/// </returns>
public static Distribution<K> WeightedAverage<K>(Distribution<K> d1, double w1, Distribution<K> d2)
{
    double w2 = 1.0 - w1;
    ICollection<K> keys = GetSetOfAllKeys(d1, d2);
    int keyCount = d1.GetNumberOfKeys();

    ICounter<K> mixed = new ClassicCounter<K>();
    foreach (K key in keys)
    {
        mixed.SetCount(key, d1.ProbabilityOf(key) * w1 + d2.ProbabilityOf(key) * w2);
    }

    return Distribution.GetDistributionFromPartiallySpecifiedCounter(mixed, keyCount);
}
/// <summary>
/// Produces a one-dimensional counter keyed by (first key, second key) pairs that
/// holds the count stored for each pair in this counter.
/// </summary>
/// <returns>
/// A new counter over pairs; its default return value is set to this counter's
/// <c>defaultValue</c>.
/// </returns>
public virtual ClassicCounter<Pair<K1, K2>> Flatten()
{
    ClassicCounter<Pair<K1, K2>> flat = new ClassicCounter<Pair<K1, K2>>();
    flat.SetDefaultReturnValue(defaultValue);

    foreach (K1 outerKey in FirstKeySet())
    {
        ClassicCounter<K2> row = GetCounter(outerKey);
        foreach (K2 innerKey in row.KeySet())
        {
            Pair<K1, K2> pair = new Pair<K1, K2>(outerKey, innerKey);
            flat.SetCount(pair, row.GetCount(innerKey));
        }
    }

    return flat;
}
/// <summary>
/// Creates a Distribution from a counter of log values: every count is shifted by
/// the maximum count, exponentiated into a fresh counter, and that counter is
/// passed to <c>GetDistribution</c>.
/// </summary>
/// <param name="counter">Counter whose counts are treated as log values; it is not modified.</param>
/// <returns>a new Distribution</returns>
public static Edu.Stanford.Nlp.Stats.Distribution<E> GetDistributionFromLogValues<E>(ICounter<E> counter)
{
    // Shift everything by the maximum so as to minimize the possibility of underflow.
    // The maximum is taken over the input counter, not over the still-empty working copy.
    double largest = Counters.Max(counter);

    ICounter<E> exponentiated = new ClassicCounter<E>();
    foreach (E key in counter.KeySet())
    {
        double shifted = counter.GetCount(key) - largest;
        exponentiated.SetCount(key, Math.Exp(shifted));
    }

    return GetDistribution(exponentiated);
}
/// <summary>
/// Converts from the format printed by the toString method back into
/// a Counter&lt;String&gt;.
/// </summary>
/// <remarks>
/// The input must be wrapped in braces and contain entries of the form
/// <c>key=value</c> separated by <c>", "</c>. The toString() doesn't escape, so this
/// only works providing the keys of the Counter do not have commas or equals signs
/// in them. The empty form <c>{}</c> yields an empty counter.
/// </remarks>
/// <param name="s">A String representation of a Counter</param>
/// <returns>The Counter</returns>
/// <exception cref="Exception">
/// If <paramref name="s"/> is not wrapped in braces, or an entry does not split
/// into exactly one key and one value around <c>=</c>.
/// </exception>
public static ClassicCounter<string> FromString(string s)
{
    ClassicCounter<string> result = new ClassicCounter<string>();
    if (!s.StartsWith("{") || !s.EndsWith("}"))
    {
        throw new Exception("invalid format: ||" + s + "||");
    }

    // Strip the surrounding braces.
    s = Sharpen.Runtime.Substring(s, 1, s.Length - 1);

    // "{}" is an empty counter; splitting "" would yield one empty entry that
    // cannot be parsed, so return early instead of throwing.
    if (s.Length == 0)
    {
        return result;
    }

    string[] lines = s.Split(", ");
    foreach (string line in lines)
    {
        string[] fields = line.Split("=");
        if (fields.Length != 2)
        {
            throw new Exception("Got unsplittable line: \"" + line + '\"');
        }
        result.SetCount(fields[0], double.Parse(fields[1]));
    }
    return result;
}
// ----------------------------------------------------------------------------
/// <summary>
/// Creates a Distribution from the given counter using Gale &amp; Sampson's
/// "simple Good-Turing" smoothing.
/// </summary>
/// <param name="counter">Counter holding the observed counts; checked with <c>ValidateCounter</c> first.</param>
/// <param name="numberOfKeys">
/// Total number of possible keys; must be larger than <c>counter.Size()</c> so that
/// at least one key is unseen.
/// </param>
/// <returns>a new simple Good-Turing smoothed Distribution.</returns>
/// <exception cref="ArgumentException">
/// If <paramref name="numberOfKeys"/> does not exceed the size of the counter.
/// </exception>
public static Edu.Stanford.Nlp.Stats.Distribution<E> SimpleGoodTuring<E>(ICounter<E> counter, int numberOfKeys)
{
    // check arguments
    ValidateCounter(counter);
    int numUnseen = numberOfKeys - counter.Size();
    if (numUnseen < 1)
    {
        // string.Format uses {n} placeholders; the Java-style %d specifiers were
        // emitted literally and the offending values never appeared in the message.
        throw new ArgumentException(string.Format("ERROR: numberOfKeys {0} must be > size of counter {1}!", numberOfKeys, counter.Size()));
    }

    // do smoothing
    int[][] cc = CountCounts2IntArrays(CollectCountCounts(counter));
    int[] r = cc[0]; // counts
    int[] n = cc[1]; // counts of counts
    Edu.Stanford.Nlp.Stats.SimpleGoodTuring sgt = new Edu.Stanford.Nlp.Stats.SimpleGoodTuring(r, n);

    // collate results: smoothed probability for each distinct observed count
    ICounter<int> probsByCount = new ClassicCounter<int>();
    double[] probs = sgt.GetProbabilities();
    for (int i = 0; i < probs.Length; i++)
    {
        probsByCount.SetCount(r[i], probs[i]);
    }

    // make smoothed distribution
    Edu.Stanford.Nlp.Stats.Distribution<E> dist = new Edu.Stanford.Nlp.Stats.Distribution<E>();
    dist.counter = new ClassicCounter<E>();
    foreach (KeyValuePair<E, double> entry in counter.EntrySet())
    {
        E item = entry.Key;
        int count = (int)Math.Round(entry.Value);
        dist.counter.SetCount(item, probsByCount.GetCount(count));
    }
    dist.numberOfKeys = numberOfKeys;
    dist.reservedMass = sgt.GetProbabilityForUnseen();
    return dist;
}
// EXTRA I/O METHODS
/// <summary>Returns the Counter over Strings specified by this String.</summary>
/// <remarks>
/// The String is often the whole contents of a file.
/// The file can include comments if each line of comment starts with
/// a hash (#) symbol, and does not contain any TAB characters.
/// Empty lines (for example the one produced by a trailing newline) are skipped.
/// Otherwise, the format is one entry per line. Each line must contain
/// precisely one tab separating a key and a value, giving a format of:
/// <blockquote>
/// StringKey\tdoubleValue\n
/// </blockquote>
/// </remarks>
/// <param name="s">
/// String representation of a Counter, where entries are one per
/// line such that each line is either a comment (begins with #)
/// or key \t value
/// </param>
/// <returns>The Counter with String keys</returns>
/// <exception cref="Exception">
/// If a non-empty, non-comment line does not split into exactly two tab-separated fields.
/// </exception>
public static ClassicCounter<string> ValueOfIgnoreComments(string s)
{
    ClassicCounter<string> result = new ClassicCounter<string>();
    string[] lines = s.Split("\n");
    foreach (string line in lines)
    {
        // File contents normally end with a newline, which leaves an empty last
        // element after splitting; skip it (and a bare carriage return) rather
        // than reporting it as a malformed entry.
        if (line.TrimEnd('\r').Length == 0)
        {
            continue;
        }

        string[] fields = line.Split("\t");
        if (fields.Length != 2)
        {
            if (line.StartsWith("#"))
            {
                continue;
            }
            throw new Exception("Got unsplittable line: \"" + line + '\"');
        }
        result.SetCount(fields[0], double.Parse(fields[1]));
    }
    return result;
}
/// <summary>For internal testing purposes only.</summary>
/// <remarks>
/// Fills a counter with a roughly Zipfian set of counts, derives several
/// distributions from it (normalized, Laplace, two Dirichlet-prior variants,
/// Good-Turing and simple Good-Turing) and prints a table comparing their
/// probabilities, reserved mass and total count to standard output.
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
public static void Main(string[] args)
{
    const string rowFormat = "%10s %10s %10s %10s %10s %10s %10s%n";
    const string cellFormat = "%10.8f ";
    const string rule = "----------";
    // Set to true to use the variants that model the unknown word as an explicit key.
    const bool useExplicitUnknown = false;

    ClassicCounter<string> c = new ClassicCounter<string>();
    double p = 1000;
    string Unk = "!*UNKNOWN*!";
    ICollection<string> s = Generics.NewHashSet();
    s.Add(Unk);

    // fill counter with roughly Zipfian distribution
    //   "1" : 1000
    //   "2" : 500
    //   "3" : 333
    //   ...
    //   "666" : 2
    //   "667" : 1
    //   ...
    //   "1999" : 1
    for (int rank = 1; rank < 2000; rank++)
    {
        string key = rank.ToString();
        c.SetCount(key, Math.Round(p / rank));
        s.Add(key);
    }
    // keys "2000".."4000" are known to the prior but never observed
    for (int unseenRank = 2000; unseenRank <= 4000; unseenRank++)
    {
        s.Add(unseenRank.ToString());
    }

    Distribution<string> n = GetDistribution(c);
    Distribution<string> prior = GetUniformDistribution(s);
    Distribution<string> dir1 = DistributionWithDirichletPrior(c, prior, 4000);
    Distribution<string> dir2 = DynamicCounterWithDirichletPrior(c, prior, 4000);
    Distribution<string> add1;
    Distribution<string> gt;
    if (useExplicitUnknown)
    {
        c.SetCount(Unk, 45);
        add1 = LaplaceWithExplicitUnknown(c, 0.5, Unk);
        gt = GoodTuringWithExplicitUnknown(c, Unk);
    }
    else
    {
        add1 = LaplaceSmoothedDistribution(c, 4000);
        gt = GoodTuringSmoothedCounter(c, 4000);
    }
    Distribution<string> sgt = SimpleGoodTuring(c, 4000);

    // Column order of the table, matching the header below.
    Distribution<string>[] columns = { n, add1, dir1, dir2, gt, sgt };

    System.Console.Out.Printf(rowFormat, "Freq", "Norm", "Add1", "Dir1", "Dir2", "GT", "SGT");
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // the four most frequent keys
    for (int top = 1; top < 5; top++)
    {
        // Math.Round returns a double in C#; %d needs an integral argument.
        System.Console.Out.Printf("%10d ", (long)Math.Round(p / top));
        string topKey = top.ToString();
        foreach (Distribution<string> column in columns)
        {
            System.Console.Out.Printf(cellFormat, column.ProbabilityOf(topKey));
        }
        System.Console.Out.WriteLine();
    }
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // a key from the tail with count 1
    System.Console.Out.Printf("%10d ", 1);
    string last = 1500.ToString();
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.ProbabilityOf(last));
    }
    System.Console.Out.WriteLine();
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // the unknown-word key
    System.Console.Out.Printf("%10s ", "UNK");
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.ProbabilityOf(Unk));
    }
    System.Console.Out.WriteLine();
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // reserved probability mass
    System.Console.Out.Printf("%10s ", "RESERVE");
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.GetReservedMass());
    }
    System.Console.Out.WriteLine();
    System.Console.Out.Printf(rowFormat, rule, rule, rule, rule, rule, rule, rule);

    // total count of each distribution
    System.Console.Out.Printf("%10s ", "Total");
    foreach (Distribution<string> column in columns)
    {
        System.Console.Out.Printf(cellFormat, column.TotalCount());
    }
    System.Console.Out.WriteLine();
}