/// <summary>
/// Checks that EntrySet() is a live view of the counter: calling SetValue on an entry
/// updates both the stored count and the total, and Values() reflects the change.
/// </summary>
public virtual void TestEntrySet()
{
    c.Clear();
    c.SetCount("r", 3.0);
    c.SetCount("p", 1.0);
    c.SetCount("q", 2.0);
    c.SetCount("s", 4.0);
    NUnit.Framework.Assert.AreEqual(10.0, c.TotalCount());
    NUnit.Framework.Assert.AreEqual(1.0, c.GetCount("p"));
    foreach (KeyValuePair<string, double> kv in c.EntrySet())
    {
        if (!kv.Key.Equals("p"))
        {
            continue;
        }
        // SetValue returns the previous value and must write through to the counter.
        NUnit.Framework.Assert.AreEqual(1.0, kv.SetValue(3.0));
        NUnit.Framework.Assert.AreEqual(3.0, kv.Value);
    }
    NUnit.Framework.Assert.AreEqual(3.0, c.GetCount("p"));
    NUnit.Framework.Assert.AreEqual(12.0, c.TotalCount());
    ICollection<double> allCounts = c.Values();
    double sum = 0.0;
    foreach (double count in allCounts)
    {
        sum += count;
    }
    NUnit.Framework.Assert.AreEqual("Testing values()", 12.0, sum);
}
/// <summary>
/// Scores a phrase by the (optionally weighted) number of patterns that extracted it,
/// normalized by the phrase's frequency in the processed corpus.
/// Side effect: when usePatternWeights is on, patterns with zero selected weight are
/// removed from patsThatExtractedThis.
/// </summary>
internal virtual double GetPatTFIDFScore(CandidatePhrase word, ICounter<E> patsThatExtractedThis, ICounter<E> allSelectedPatterns)
{
    if (Data.processedDataFreq.GetCount(word) == 0.0)
    {
        Redwood.Log(Redwood.Warn, "How come the processed corpus freq has count of " + word + " 0. The count in raw freq is " + Data.rawFreq.GetCount(word) + " and the Data.rawFreq size is " + Data.rawFreq.Size());
        return 0;
    }
    double weightSum = 0;
    ICollection<E> zeroWeightPats = new HashSet<E>();
    foreach (KeyValuePair<E, double> patEntry in patsThatExtractedThis.EntrySet())
    {
        double patWeight = 1.0;
        if (usePatternWeights)
        {
            patWeight = allSelectedPatterns.GetCount(patEntry.Key);
            if (patWeight == 0)
            {
                Redwood.Log(Redwood.Force, "Warning: Weight zero for " + patEntry.Key + ". May be pattern was removed when choosing other patterns (if subsumed by another pattern).");
                zeroWeightPats.Add(patEntry.Key);
            }
        }
        weightSum += patWeight;
    }
    Counters.RemoveKeys(patsThatExtractedThis, zeroWeightPats);
    return weightSum / Data.processedDataFreq.GetCount(word);
}
/// <summary>
/// Round-trips a counter spanning a wide range of magnitudes through
/// SerializeStringCounter/DeserializeStringCounter and checks every value survives
/// to within a relative tolerance of 1e-5.
/// </summary>
public virtual void TestSerializeStringCounter()
{
    ICounter<string> counts = new ClassicCounter<string>();
    for (int @base = -10; @base < 10; ++@base)
    {
        if (@base == 0)
        {
            continue;
        }
        for (int exponent = -100; exponent < 100; ++exponent)
        {
            double number = Math.Pow(Math.Pi * @base, exponent);
            // Bug fix: `double.ToString(number)` is invalid C# — ToString is an instance
            // method and cannot be called through the type name (Java Double.toString
            // translation artifact). Call it on the value instead.
            counts.SetCount(number.ToString(), number);
        }
    }
    File tmp = File.CreateTempFile("counts", ".tab.gz");
    tmp.DeleteOnExit();
    Counters.SerializeStringCounter(counts, tmp.GetPath());
    ICounter<string> reread = Counters.DeserializeStringCounter(tmp.GetPath());
    foreach (KeyValuePair<string, double> entry in reread.EntrySet())
    {
        double old = counts.GetCount(entry.Key);
        NUnit.Framework.Assert.AreEqual(old, entry.Value, Math.Abs(old) / 1e5);
    }
}
/// <summary>
/// Returns a new counter whose keys are the original keys with <paramref name="suffix"/>
/// appended; values are carried over unchanged.
/// </summary>
private static ICounter<string> GetConjunction(ICounter<string> original, string suffix)
{
    ICounter<string> suffixed = new ClassicCounter<string>();
    foreach (KeyValuePair<string, double> entry in original.EntrySet())
    {
        suffixed.IncrementCount(entry.Key + suffix, entry.Value);
    }
    return suffixed;
}
/// <summary>
/// Dot product of the given sparse feature vector with the model's weight vector.
/// Features absent from the weights contribute zero.
/// </summary>
public virtual double WeightFeatureProduct(ICounter<string> features)
{
    double dotProduct = 0;
    foreach (KeyValuePair<string, double> f in features.EntrySet())
    {
        dotProduct += f.Value * weights.GetCount(f.Key);
    }
    return dotProduct;
}
/// <summary>
/// Copies the feature counter, appending <paramref name="suffix"/> to every feature name.
/// </summary>
private static ICounter<string> AddSuffix(ICounter<string> features, string suffix)
{
    ICounter<string> renamed = new ClassicCounter<string>();
    foreach (KeyValuePair<string, double> entry in features.EntrySet())
    {
        renamed.IncrementCount(entry.Key + suffix, entry.Value);
    }
    return renamed;
}
/* Helper to simpleGoodTuringSmoothedCounter() */
/// <summary>
/// Builds a histogram of counts: for each (rounded) count r, how many items occur r times.
/// </summary>
private static ICounter<int> CollectCountCounts<E>(ICounter<E> counts)
{
    ICounter<int> countOfCounts = new ClassicCounter<int>();
    foreach (KeyValuePair<E, double> entry in counts.EntrySet())
    {
        int rounded = (int)Math.Round(entry.Value);
        countOfCounts.IncrementCount(rounded);
    }
    return countOfCounts;
}
/// <summary>
/// Samples datums from the supplied sentence batches (seeded Randoms, so sampling is
/// deterministic), trains a binary logistic classifier on them, flips the weight signs if
/// the classifier's internal positive class is not answerLabel, optionally drops features
/// whose |weight| is at or below thresholdWeight, and writes the sorted weights to
/// externalFeatureWeightsFileLabel.
/// NOTE(review): the method returns `features`, which is created empty and never populated —
/// callers receive an empty counter while the learned weights only reach the file. Looks
/// like it may have been intended to return `weights`; confirm against callers.
/// </summary>
/// <exception cref="System.IO.IOException"/> /// <exception cref="System.TypeLoadException"/> public virtual ICounter <string> GetTopFeatures(IEnumerator <Pair <IDictionary <string, DataInstance>, File> > sentsf, double perSelectRand, double perSelectNeg, string externalFeatureWeightsFileLabel) { ICounter <string> features = new ClassicCounter <string>(); RVFDataset <string, string> dataset = new RVFDataset <string, string>(); Random r = new Random(10); Random rneg = new Random(10); int numrand = 0; IList <Pair <string, int> > chosen = new List <Pair <string, int> >(); while (sentsf.MoveNext()) { Pair <IDictionary <string, DataInstance>, File> sents = sentsf.Current; numrand = this.Sample(sents.First(), r, rneg, perSelectNeg, perSelectRand, numrand, chosen, dataset); } /*if(batchProcessSents){ * for(File f: sentFiles){ * Map<String, List<CoreLabel>> sentsf = IOUtils.readObjectFromFile(f); * numrand = this.sample(sentsf, r, rneg, perSelectNeg, perSelectRand, numrand, chosen, dataset); * } * }else * numrand = this.sample(sents, r, rneg, perSelectNeg, perSelectRand, numrand, chosen, dataset); */ System.Console.Out.WriteLine("num random chosen: " + numrand); System.Console.Out.WriteLine("Number of datums per label: " + dataset.NumDatumsPerLabel()); LogisticClassifierFactory <string, string> logfactory = new LogisticClassifierFactory <string, string>(); LogisticClassifier <string, string> classifier = logfactory.TrainClassifier(dataset); ICounter <string> weights = classifier.WeightsAsCounter(); if (!classifier.GetLabelForInternalPositiveClass().Equals(answerLabel)) { weights = Counters.Scale(weights, -1); } if (thresholdWeight != null) { HashSet <string> removeKeys = new HashSet <string>(); foreach (KeyValuePair <string, double> en in weights.EntrySet()) { if (Math.Abs(en.Value) <= thresholdWeight) { removeKeys.Add(en.Key); } } Counters.RemoveKeys(weights, removeKeys); System.Console.Out.WriteLine("Removing " + removeKeys); } 
// NOTE(review): "%1$s:%2$f" is a Java-style format string — presumably Counters.ToSortedString
// in this port understands it; verify, since .NET string.Format would not substitute it.
IOUtils.WriteStringToFile(Counters.ToSortedString(weights, weights.Size(), "%1$s:%2$f", "\n"), externalFeatureWeightsFileLabel, "utf8"); // getDecisionTree(sents, chosen, weights, wekaOptions); return(features); }
/* Helper to simpleGoodTuringSmoothedCounter() */
/// <summary>
/// Sanity-checks a counter before smoothing: every count must be non-negative.
/// </summary>
/// <exception cref="ArgumentException">if any count is negative.</exception>
private static void ValidateCounter<E>(ICounter<E> counts)
{
    foreach (KeyValuePair<E, double> entry in counts.EntrySet())
    {
        E item = entry.Key;
        double dblCount = entry.Value;
        // Bug fix: the original also tested `dblCount == null`, but `double` is a
        // non-nullable value type in C# (CS0472: the comparison is always false — a
        // leftover from Java's boxed Double), so that dead branch has been removed.
        if (dblCount < 0)
        {
            throw new ArgumentException("ERROR: negative count " + dblCount + " for item " + item + "!");
        }
    }
}
/// <summary>
/// Returns a copy of the counter without any entry whose key starts with one of the
/// disallowed prefixes.
/// </summary>
public static ICounter<string> FilterOut(ICounter<string> c, IList<string> disallowedPrefixes)
{
    ICounter<string> kept = new ClassicCounter<string>();
    foreach (KeyValuePair<string, double> entry in c.EntrySet())
    {
        bool blocked = false;
        foreach (string prefix in disallowedPrefixes)
        {
            if (entry.Key.StartsWith(prefix))
            {
                blocked = true;
                break;
            }
        }
        if (!blocked)
        {
            kept.IncrementCount(entry.Key, entry.Value);
        }
    }
    return kept;
}
/// <summary>
/// Max-margin update: forms the feature difference (incorrect − correct) and passes it to
/// the underlying classifier, applying this error type's cost either multiplicatively
/// (as the example weight) or through its per-type loss.
/// </summary>
public virtual void Learn(Example correct, Example incorrect, IDictionary<int, CompressedFeatureVector> mentionFeatures, Compressor<string> compressor, MaxMarginMentionRanker.ErrorType errorType)
{
    ICounter<string> correctFeatures = meta.GetFeatures(correct, mentionFeatures, compressor);
    ICounter<string> diff = meta.GetFeatures(incorrect, mentionFeatures, compressor);
    foreach (KeyValuePair<string, double> entry in correctFeatures.EntrySet())
    {
        diff.DecrementCount(entry.Key, entry.Value);
    }
    if (multiplicativeCost)
    {
        classifier.Learn(diff, 1.0, costs[errorType.id], loss);
    }
    else
    {
        classifier.Learn(diff, 1.0, 1.0, losses[errorType.id]);
    }
}
/// <summary>
/// Encodes a counter as parallel id/value lists, interning previously unseen keys into the
/// shared index/inverse maps.
/// NOTE(review): `int id = index[key]; if (id == null)` — if `index` maps to a non-nullable
/// int, this comparison is always false (CS0472) and the interning branch never runs, while
/// the indexer itself would fault on a missing key. This reads like a Java Integer/HashMap
/// translation artifact; confirm the declared type of `index` — a TryGetValue-style lookup
/// is likely what was intended.
/// </summary>
public virtual CompressedFeatureVector Compress(ICounter <K> c) { IList <int> keys = new List <int>(c.Size()); IList <double> values = new List <double>(c.Size()); foreach (KeyValuePair <K, double> e in c.EntrySet()) { K key = e.Key; int id = index[key]; if (id == null) { id = index.Count; inverse[id] = key; index[key] = id; } keys.Add(id); values.Add(e.Value); } return(new CompressedFeatureVector(keys, values)); }
/// <summary>
/// One SGD step with lazily applied regularization: for each feature with a nonzero
/// gradient, first applies the regularization accumulated since that feature's last update
/// (clipping the weight to zero if the adjustment would flip its sign), then takes the
/// gradient step and records the access time.
/// </summary>
public virtual void Learn(ICounter<string> features, double label, double weight, SimpleLinearClassifier.ILoss loss)
{
    examplesSeen++;
    double lossGrad = loss.Derivative(label, WeightFeatureProduct(features));
    foreach (KeyValuePair<string, double> feature in features.EntrySet())
    {
        double grad = weight * (-lossGrad * feature.Value);
        if (grad == 0)
        {
            continue;
        }
        string name = feature.Key;
        learningRateSchedule.Update(name, grad);
        double rate = learningRateSchedule.GetLearningRate(name);
        double oldWeight = weights.GetCount(name);
        // Regularization owed since this feature was last touched (lazy-update trick).
        double owed = weight * regularizationStrength * (examplesSeen - accessTimes.GetCount(name));
        double regularized = oldWeight - Math.Signum(oldWeight) * owed * rate;
        double newWeight;
        if (Math.Signum(regularized) != Math.Signum(oldWeight))
        {
            // Regularization crossed zero: clip to zero before applying the step.
            newWeight = grad * rate;
        }
        else
        {
            newWeight = regularized + grad * rate;
        }
        weights.SetCount(name, newWeight);
        accessTimes.SetCount(name, examplesSeen);
    }
}
/// <summary>
/// Reads a serialized word-count counter from the given path and returns the set of words
/// whose count exceeds MinWordCount. Any failure is wrapped and rethrown.
/// </summary>
private static ICollection<string> LoadVocabulary(string wordCountsPath)
{
    ICollection<string> vocabulary = new HashSet<string>();
    try
    {
        ICounter<string> wordCounts = IOUtils.ReadObjectFromURLOrClasspathOrFileSystem(wordCountsPath);
        foreach (KeyValuePair<string, double> entry in wordCounts.EntrySet())
        {
            if (entry.Value > MinWordCount)
            {
                vocabulary.Add(entry.Key);
            }
        }
    }
    catch (Exception cause)
    {
        throw new Exception("Error loading word counts", cause);
    }
    return vocabulary;
}
// ----------------------------------------------------------------------------
/// <summary>
/// Creates a Distribution from the given counter using Gale & Sampsons'
/// "simple Good-Turing" smoothing.
/// </summary>
/// <param name="counter">observed counts per item; validated to be non-negative.</param>
/// <param name="numberOfKeys">total vocabulary size including unseen items; must exceed counter.Size().</param>
/// <returns>a new simple Good-Turing smoothed Distribution.</returns>
/// <exception cref="ArgumentException">if counts are invalid or no unseen mass remains.</exception>
public static Edu.Stanford.Nlp.Stats.Distribution<E> SimpleGoodTuring<E>(ICounter<E> counter, int numberOfKeys)
{
    // check arguments
    ValidateCounter(counter);
    int numUnseen = numberOfKeys - counter.Size();
    if (numUnseen < 1)
    {
        // Bug fix: the message used Java-style "%d" placeholders, which .NET's
        // string.Format leaves as literal text; use "{0}"/"{1}" so the values appear.
        throw new ArgumentException(string.Format("ERROR: numberOfKeys {0} must be > size of counter {1}!", numberOfKeys, counter.Size()));
    }
    // do smoothing
    int[][] cc = CountCounts2IntArrays(CollectCountCounts(counter));
    int[] r = cc[0];  // distinct observed counts
    int[] n = cc[1];  // how many items carry each count
    Edu.Stanford.Nlp.Stats.SimpleGoodTuring sgt = new Edu.Stanford.Nlp.Stats.SimpleGoodTuring(r, n);
    // collate results: smoothed probability for each observed count value
    ICounter<int> probsByCount = new ClassicCounter<int>();
    double[] probs = sgt.GetProbabilities();
    for (int i = 0; i < probs.Length; i++)
    {
        probsByCount.SetCount(r[i], probs[i]);
    }
    // make smoothed distribution: each item gets the probability of its (rounded) count
    Edu.Stanford.Nlp.Stats.Distribution<E> dist = new Edu.Stanford.Nlp.Stats.Distribution<E>();
    dist.counter = new ClassicCounter<E>();
    foreach (KeyValuePair<E, double> entry in counter.EntrySet())
    {
        E item = entry.Key;
        int count = (int)Math.Round(entry.Value);
        dist.counter.SetCount(item, probsByCount.GetCount(count));
    }
    dist.numberOfKeys = numberOfKeys;
    dist.reservedMass = sgt.GetProbabilityForUnseen();
    return dist;
}