public virtual void TestEntrySet()
        {
            // Verifies that EntrySet() exposes live entries: writing through an
            // entry via SetValue must update the underlying counter and its total.
            c.Clear();
            c.SetCount("r", 3.0);
            c.SetCount("p", 1.0);
            c.SetCount("q", 2.0);
            c.SetCount("s", 4.0);
            NUnit.Framework.Assert.AreEqual(10.0, c.TotalCount());
            NUnit.Framework.Assert.AreEqual(1.0, c.GetCount("p"));
            foreach (KeyValuePair <string, double> entry in c.EntrySet())
            {
                if (entry.Key.Equals("p"))
                {
                    // SetValue returns the previous value (1.0) and updates in place.
                    NUnit.Framework.Assert.AreEqual(1.0, entry.SetValue(3.0));
                    NUnit.Framework.Assert.AreEqual(3.0, entry.Value);
                }
            }
            NUnit.Framework.Assert.AreEqual(3.0, c.GetCount("p"));
            NUnit.Framework.Assert.AreEqual(12.0, c.TotalCount());
            ICollection <double> vals = c.Values();
            double tot = 0.0;

            foreach (double d in vals)
            {
                tot += d;
            }
            // BUG FIX: arguments were in JUnit order (message, expected, actual),
            // so NUnit compared the message string against 12.0 and the assert
            // could never pass. NUnit's order is (expected, actual, message).
            NUnit.Framework.Assert.AreEqual(12.0, tot, "Testing values()");
        }
 internal virtual double GetPatTFIDFScore(CandidatePhrase word, ICounter <E> patsThatExtractedThis, ICounter <E> allSelectedPatterns)
 {
     // TF-IDF-like score for a phrase: (sum of weights of the patterns that
     // extracted it) divided by its processed-corpus frequency.
     double processedFreq = Data.processedDataFreq.GetCount(word);
     if (processedFreq == 0.0)
     {
         Redwood.Log(Redwood.Warn, "How come the processed corpus freq has count of " + word + " 0. The count in raw freq is " + Data.rawFreq.GetCount(word) + " and the Data.rawFreq size is " + Data.rawFreq.Size());
         return 0;
     }
     double weightSum = 0;
     ICollection <E> zeroWeightPatterns = new HashSet <E>();
     foreach (KeyValuePair <E, double> pat in patsThatExtractedThis.EntrySet())
     {
         double patWeight = 1.0;
         if (usePatternWeights)
         {
             patWeight = allSelectedPatterns.GetCount(pat.Key);
             if (patWeight == 0)
             {
                 Redwood.Log(Redwood.Force, "Warning: Weight zero for " + pat.Key + ". May be pattern was removed when choosing other patterns (if subsumed by another pattern).");
                 zeroWeightPatterns.Add(pat.Key);
             }
         }
         weightSum += patWeight;
     }
     // Drop zero-weight patterns so they are not considered again.
     Counters.RemoveKeys(patsThatExtractedThis, zeroWeightPatterns);
     return weightSum / processedFreq;
 }
Beispiel #3
0
        public virtual void TestSerializeStringCounter()
        {
            // Round-trips a counter holding a wide range of magnitudes (including
            // values that overflow/underflow to Infinity/0) through the serializer.
            ICounter <string> counts = new ClassicCounter <string>();

            for (int @base = -10; @base < 10; ++@base)
            {
                if (@base == 0)
                {
                    continue;
                }
                for (int exponent = -100; exponent < 100; ++exponent)
                {
                    double number = Math.Pow(Math.Pi * @base, exponent);
                    // BUG FIX: double.ToString(number) is not valid C# (leftover
                    // from Java's Double.toString); call the instance method.
                    counts.SetCount(number.ToString(), number);
                }
            }
            File tmp = File.CreateTempFile("counts", ".tab.gz");

            tmp.DeleteOnExit();
            Counters.SerializeStringCounter(counts, tmp.GetPath());
            ICounter <string> reread = Counters.DeserializeStringCounter(tmp.GetPath());

            foreach (KeyValuePair <string, double> entry in reread.EntrySet())
            {
                // Allow a small relative error from the textual round-trip.
                double old = counts.GetCount(entry.Key);
                NUnit.Framework.Assert.AreEqual(old, entry.Value, Math.Abs(old) / 1e5);
            }
        }
        private static ICounter <string> GetConjunction(ICounter <string> original, string suffix)
        {
            // Returns a new counter whose keys are the original keys with the
            // given suffix appended; counts are carried over unchanged.
            ICounter <string> suffixed = new ClassicCounter <string>();

            foreach (KeyValuePair <string, double> entry in original.EntrySet())
            {
                suffixed.IncrementCount(entry.Key + suffix, entry.Value);
            }
            return suffixed;
        }
Beispiel #5
0
        public virtual double WeightFeatureProduct(ICounter <string> features)
        {
            // Dot product of the given feature vector with the current weights.
            double dot = 0.0;

            foreach (KeyValuePair <string, double> entry in features.EntrySet())
            {
                dot += weights.GetCount(entry.Key) * entry.Value;
            }
            return dot;
        }
        private static ICounter <string> AddSuffix(ICounter <string> features, string suffix)
        {
            // Copies the counter, renaming every key by appending the suffix.
            ICounter <string> renamed = new ClassicCounter <string>();

            foreach (KeyValuePair <string, double> entry in features.EntrySet())
            {
                renamed.IncrementCount(entry.Key + suffix, entry.Value);
            }
            return renamed;
        }
Beispiel #7
0
        /* Helper to simpleGoodTuringSmoothedCounter() */
        private static ICounter <int> CollectCountCounts <E>(ICounter <E> counts)
        {
            // Histogram of counts: maps each (rounded-to-int) count value to the
            // number of items that occur with that count.
            ICounter <int> countOfCounts = new ClassicCounter <int>();

            foreach (KeyValuePair <E, double> entry in counts.EntrySet())
            {
                countOfCounts.IncrementCount((int)Math.Round(entry.Value));
            }
            return countOfCounts;
        }
        /// <exception cref="System.IO.IOException"/>
        /// <exception cref="System.TypeLoadException"/>
        /// <summary>
        /// Samples datums from each batch of sentences, trains a logistic classifier
        /// on them, optionally prunes near-zero weights, and writes the sorted
        /// feature weights to <paramref name="externalFeatureWeightsFileLabel"/>.
        /// NOTE(review): the returned counter is created empty and never populated
        /// here — callers presumably read the weights from the written file; confirm.
        /// </summary>
        public virtual ICounter <string> GetTopFeatures(IEnumerator <Pair <IDictionary <string, DataInstance>, File> > sentsf, double perSelectRand, double perSelectNeg, string externalFeatureWeightsFileLabel)
        {
            ICounter <string>           features = new ClassicCounter <string>();
            RVFDataset <string, string> dataset  = new RVFDataset <string, string>();
            Random r       = new Random(10);
            Random rneg    = new Random(10);
            int    numrand = 0;
            IList <Pair <string, int> > chosen = new List <Pair <string, int> >();

            // Accumulate sampled (random/negative) datums from every sentence batch.
            while (sentsf.MoveNext())
            {
                Pair <IDictionary <string, DataInstance>, File> sents = sentsf.Current;
                numrand = this.Sample(sents.First(), r, rneg, perSelectNeg, perSelectRand, numrand, chosen, dataset);
            }

            /*if(batchProcessSents){
             * for(File f: sentFiles){
             * Map<String, List<CoreLabel>> sentsf = IOUtils.readObjectFromFile(f);
             * numrand = this.sample(sentsf, r, rneg, perSelectNeg, perSelectRand, numrand, chosen, dataset);
             * }
             * }else
             * numrand = this.sample(sents, r, rneg, perSelectNeg, perSelectRand, numrand, chosen, dataset);
             */
            System.Console.Out.WriteLine("num random chosen: " + numrand);
            System.Console.Out.WriteLine("Number of datums per label: " + dataset.NumDatumsPerLabel());
            // Train a binary logistic classifier and extract its weight vector.
            LogisticClassifierFactory <string, string> logfactory = new LogisticClassifierFactory <string, string>();
            LogisticClassifier <string, string>        classifier = logfactory.TrainClassifier(dataset);
            ICounter <string> weights = classifier.WeightsAsCounter();

            // Flip the sign so positive weights correspond to answerLabel.
            if (!classifier.GetLabelForInternalPositiveClass().Equals(answerLabel))
            {
                weights = Counters.Scale(weights, -1);
            }
            // Optionally drop features whose |weight| is at or below the threshold.
            if (thresholdWeight != null)
            {
                HashSet <string> removeKeys = new HashSet <string>();
                foreach (KeyValuePair <string, double> en in weights.EntrySet())
                {
                    if (Math.Abs(en.Value) <= thresholdWeight)
                    {
                        removeKeys.Add(en.Key);
                    }
                }
                Counters.RemoveKeys(weights, removeKeys);
                System.Console.Out.WriteLine("Removing " + removeKeys);
            }
            // Persist all weights, sorted, one "feature:weight" per line.
            IOUtils.WriteStringToFile(Counters.ToSortedString(weights, weights.Size(), "%1$s:%2$f", "\n"), externalFeatureWeightsFileLabel, "utf8");
            // getDecisionTree(sents, chosen, weights, wekaOptions);
            return(features);
        }
Beispiel #9
0
 /* Helper to simpleGoodTuringSmoothedCounter() */
 /// <summary>
 /// Rejects counters containing invalid (NaN or negative) counts before smoothing.
 /// </summary>
 /// <exception cref="ArgumentException">if any count is NaN or negative.</exception>
 private static void ValidateCounter <E>(ICounter <E> counts)
 {
     foreach (KeyValuePair <E, double> entry in counts.EntrySet())
     {
         E      item     = entry.Key;
         double dblCount = entry.Value;
         // BUG FIX: the original compared a non-nullable double to null, which is
         // always false in C# (a leftover from Java's boxed Double). NaN is the
         // only "missing value" a double can carry, so check for that instead.
         if (double.IsNaN(dblCount))
         {
             throw new ArgumentException("ERROR: NaN count for item " + item + "!");
         }
         if (dblCount < 0)
         {
             throw new ArgumentException("ERROR: negative count " + dblCount + " for item " + item + "!");
         }
     }
 }
        public static ICounter <string> FilterOut(ICounter <string> c, IList <string> disallowedPrefixes)
        {
            // Copies c, skipping every key that starts with any disallowed prefix.
            ICounter <string> filtered = new ClassicCounter <string>();

            foreach (KeyValuePair <string, double> entry in c.EntrySet())
            {
                bool blocked = false;
                foreach (string prefix in disallowedPrefixes)
                {
                    if (entry.Key.StartsWith(prefix))
                    {
                        blocked = true;
                        break;
                    }
                }
                if (!blocked)
                {
                    filtered.IncrementCount(entry.Key, entry.Value);
                }
            }
            return filtered;
        }
        public virtual void Learn(Example correct, Example incorrect, IDictionary <int, CompressedFeatureVector> mentionFeatures, Compressor <string> compressor, MaxMarginMentionRanker.ErrorType errorType)
        {
            // Trains on the feature difference (incorrect - correct), computed by
            // subtracting the correct example's features from the incorrect one's.
            ICounter <string> correctFeatures   = meta.GetFeatures(correct, mentionFeatures, compressor);
            ICounter <string> incorrectFeatures = meta.GetFeatures(incorrect, mentionFeatures, compressor);

            foreach (KeyValuePair <string, double> entry in correctFeatures.EntrySet())
            {
                incorrectFeatures.DecrementCount(entry.Key, entry.Value);
            }
            // The error-type cost scales either the example weight or the loss.
            if (multiplicativeCost)
            {
                classifier.Learn(incorrectFeatures, 1.0, costs[errorType.id], loss);
            }
            else
            {
                classifier.Learn(incorrectFeatures, 1.0, 1.0, losses[errorType.id]);
            }
        }
        public virtual CompressedFeatureVector Compress(ICounter <K> c)
        {
            // Maps each key to a stable integer id (assigning fresh ids on first
            // sight) and emits parallel id/value lists.
            IList <int>    keys   = new List <int>(c.Size());
            IList <double> values = new List <double>(c.Size());

            foreach (KeyValuePair <K, double> e in c.EntrySet())
            {
                K   key = e.Key;
                int id;
                // BUG FIX: the original read index[key] and compared the int result
                // to null — always false in C#, and the indexer throws for a missing
                // key — so unseen keys could never get fresh ids. Use TryGetValue.
                // NOTE(review): assumes index is IDictionary<K, int> — confirm.
                if (!index.TryGetValue(key, out id))
                {
                    id          = index.Count;
                    inverse[id] = key;
                    index[key]  = id;
                }
                keys.Add(id);
                values.Add(e.Value);
            }
            return(new CompressedFeatureVector(keys, values));
        }
Beispiel #13
0
        /// <summary>
        /// Performs one stochastic-gradient update on the weights for the given
        /// example, with per-feature learning rates and regularization applied
        /// lazily (only when a feature is touched).
        /// </summary>
        /// <param name="features">feature vector of the training example</param>
        /// <param name="label">target label value</param>
        /// <param name="weight">example weight; scales both the gradient step and the regularization</param>
        /// <param name="loss">loss function whose derivative drives the update</param>
        public virtual void Learn(ICounter <string> features, double label, double weight, SimpleLinearClassifier.ILoss loss)
        {
            examplesSeen++;
            // Loss derivative at the current prediction; shared by all features.
            double dloss = loss.Derivative(label, WeightFeatureProduct(features));

            foreach (KeyValuePair <string, double> feature in features.EntrySet())
            {
                double dfeature = weight * (-dloss * feature.Value);
                if (dfeature != 0)
                {
                    string featureName = feature.Key;
                    learningRateSchedule.Update(featureName, dfeature);
                    double lr       = learningRateSchedule.GetLearningRate(featureName);
                    double w        = weights.GetCount(featureName);
                    // Regularization accumulated since this feature was last updated
                    // (accessTimes records the examplesSeen at the previous touch).
                    double dreg     = weight * regularizationStrength * (examplesSeen - accessTimes.GetCount(featureName));
                    double afterReg = (w - Math.Signum(w) * dreg * lr);
                    // If regularization would flip the weight's sign, clip it to zero
                    // before adding the gradient step.
                    weights.SetCount(featureName, (Math.Signum(afterReg) != Math.Signum(w) ? 0 : afterReg) + dfeature * lr);
                    accessTimes.SetCount(featureName, examplesSeen);
                }
            }
        }
Beispiel #14
0
        private static ICollection <string> LoadVocabulary(string wordCountsPath)
        {
            // Reads a serialized word-count counter and keeps only the words whose
            // count is strictly greater than MinWordCount.
            ICollection <string> vocabulary = new HashSet <string>();

            try
            {
                ICounter <string> wordCounts = IOUtils.ReadObjectFromURLOrClasspathOrFileSystem(wordCountsPath);
                foreach (KeyValuePair <string, double> entry in wordCounts.EntrySet())
                {
                    if (entry.Value > MinWordCount)
                    {
                        vocabulary.Add(entry.Key);
                    }
                }
            }
            catch (Exception cause)
            {
                // Wrap so callers see the path-loading context with the original cause.
                throw new Exception("Error loading word counts", cause);
            }
            return vocabulary;
        }
Beispiel #15
0
        // ----------------------------------------------------------------------------
        /// <summary>
        /// Creates a Distribution from the given counter using Gale &amp; Sampsons'
        /// "simple Good-Turing" smoothing.
        /// </summary>
        /// <param name="counter">observed counts; must contain no NaN or negative counts</param>
        /// <param name="numberOfKeys">total number of possible keys; must exceed counter.Size()</param>
        /// <returns>a new simple Good-Turing smoothed Distribution.</returns>
        /// <exception cref="ArgumentException">if numberOfKeys leaves no unseen mass.</exception>
        public static Edu.Stanford.Nlp.Stats.Distribution <E> SimpleGoodTuring <E>(ICounter <E> counter, int numberOfKeys)
        {
            // check arguments
            ValidateCounter(counter);
            int numUnseen = numberOfKeys - counter.Size();

            if (numUnseen < 1)
            {
                // BUG FIX: the format string used Java's %d placeholders, which
                // .NET's string.Format leaves unsubstituted; use {0}/{1} instead.
                throw new ArgumentException(string.Format("ERROR: numberOfKeys {0} must be > size of counter {1}!", numberOfKeys, counter.Size()));
            }
            // do smoothing
            int[][] cc = CountCounts2IntArrays(CollectCountCounts(counter));
            int[]   r  = cc[0];
            // counts
            int[] n = cc[1];
            // counts of counts
            Edu.Stanford.Nlp.Stats.SimpleGoodTuring sgt = new Edu.Stanford.Nlp.Stats.SimpleGoodTuring(r, n);
            // collate results: map each raw count to its smoothed probability
            ICounter <int> probsByCount = new ClassicCounter <int>();

            double[] probs = sgt.GetProbabilities();
            for (int i = 0; i < probs.Length; i++)
            {
                probsByCount.SetCount(r[i], probs[i]);
            }
            // make smoothed distribution: every item's probability is looked up by
            // its (rounded) raw count; leftover mass is reserved for unseen keys
            Edu.Stanford.Nlp.Stats.Distribution <E> dist = new Edu.Stanford.Nlp.Stats.Distribution <E>();
            dist.counter = new ClassicCounter <E>();
            foreach (KeyValuePair <E, double> entry in counter.EntrySet())
            {
                E   item  = entry.Key;
                int count = (int)Math.Round(entry.Value);
                dist.counter.SetCount(item, probsByCount.GetCount(count));
            }
            dist.numberOfKeys = numberOfKeys;
            dist.reservedMass = sgt.GetProbabilityForUnseen();
            return(dist);
        }