/// <summary>Scores the given example against every label known to this classifier.</summary>
        /// <remarks>
        /// Feature values are taken from the datum's feature counter when it is an
        /// <c>RVFDatum&lt;L, F&gt;</c>; for any other datum every feature gets the value 1.0.
        /// NOTE(review): this relies on <c>LogisticUtils.IndicesOf</c> and the counter's <c>Values()</c>
        /// producing entries in the same order - confirm against those helpers.
        /// </remarks>
        /// <param name="example">The datum to score.</param>
        /// <returns>
        /// A counter that maps each label in <c>labelIndex</c> to the matching entry of the array
        /// returned by <c>LogisticUtils.CalculateSigmoids</c>.
        /// </returns>
        public virtual ICounter<L> ProbabilityOf(IDatum<L, F> example)
        {
            // calculate the feature indices and feature values
            int[] featureIndices = LogisticUtils.IndicesOf(example.AsFeatures(), featureIndex);
            double[] featureValues;

            // Test against the closed type RVFDatum<L, F>. The previous check against
            // RVFDatum<object, object> (a leftover of Java's RVFDatum<?, ?>) does not match
            // RVFDatum<L, F> because generic classes are invariant in C#, so real-valued
            // features were silently replaced by 1.0.
            RVFDatum<L, F> rvfExample = example as RVFDatum<L, F>;
            if (rvfExample != null)
            {
                ICollection<double> featureValuesCollection = rvfExample.AsFeaturesCounter().Values();
                featureValues = LogisticUtils.ConvertToArray(featureValuesCollection);
            }
            else
            {
                // binary features: every present feature has value 1.0
                featureValues = new double[example.AsFeatures().Count];
                Arrays.Fill(featureValues, 1.0);
            }

            // calculate probability of each class
            ICounter<L> result = new ClassicCounter<L>();
            int numClasses = labelIndex.Size();
            double[] sigmoids = LogisticUtils.CalculateSigmoids(weights, featureIndices, featureValues);
            for (int c = 0; c < numClasses; c++)
            {
                L label = labelIndex.Get(c);
                result.IncrementCount(label, sigmoids[c]);
            }
            return result;
        }
        /// <summary>Converts a datum's features from counts to tf-idf weights scaled by their sum.</summary>
        /// <remarks>
        /// Every occurrence of a feature that is present in <paramref name="featureDocCounts"/> adds 1.0
        /// to its term frequency; features missing from that counter are dropped. The inverse document
        /// frequency is <c>log((Size() + 1) / (docCount + 0.5))</c>, and each tf-idf value is finally
        /// divided by the sum of all tf-idf values.
        /// </remarks>
        /// <param name="datum">Datum with a collection of features.</param>
        /// <param name="featureDocCounts">A counter holding the document count of each feature.</param>
        /// <returns>An RVFDatum carrying the normalized tf-idf features and the label of <paramref name="datum"/>.</returns>
        public virtual RVFDatum<L, F> GetL1NormalizedTFIDFDatum(IDatum<L, F> datum, ICounter<F> featureDocCounts)
        {
            ICounter<F> tfidfByFeature = new ClassicCounter<F>();

            // Pass 1: term frequencies of the features that have a document count.
            foreach (F f in datum.AsFeatures())
            {
                if (!featureDocCounts.ContainsKey(f))
                {
                    continue;
                }
                tfidfByFeature.IncrementCount(f, 1.0);
            }

            // Pass 2: replace each term frequency by tf * idf and accumulate the normalizer.
            double total = 0;
            foreach (F f in tfidfByFeature.KeySet())
            {
                double docCount = featureDocCounts.GetCount(f);
                double inverseDocFrequency = Math.Log(((double)(this.Size() + 1)) / (docCount + 0.5));
                double termFrequency = tfidfByFeature.GetCount(f);
                double weighted = termFrequency * inverseDocFrequency;
                tfidfByFeature.SetCount(f, weighted);
                total += weighted;
            }

            // Pass 3: divide every tf-idf value by the accumulated sum.
            foreach (F f in tfidfByFeature.KeySet())
            {
                tfidfByFeature.SetCount(f, tfidfByFeature.GetCount(f) / total);
            }

            return new RVFDatum<L, F>(tfidfByFeature, datum.Label());
        }
Beispiel #3
0
 /// <summary>Returns the probability this classifier assigns to the example's own label.</summary>
 /// <remarks>
 /// Real-valued datums are routed to <c>ProbabilityOfRVFDatum</c>; every other datum is scored
 /// from its feature collection and label.
 /// </remarks>
 /// <param name="example">The datum to score.</param>
 /// <returns>The value produced by the overload selected for the datum's type.</returns>
 public virtual double ProbabilityOf(IDatum<L, F> example)
 {
     // The former test against RVFDatum<object, object> (from Java's RVFDatum<?, ?>) does not match
     // RVFDatum<L, F> because generic classes are invariant, so the RVF path was effectively dead.
     RVFDatum<L, F> rvfExample = example as RVFDatum<L, F>;
     if (rvfExample != null)
     {
         return ProbabilityOfRVFDatum(rvfExample);
     }
     return ProbabilityOf(example.AsFeatures(), example.Label());
 }
Beispiel #4
0
 /// <summary>Returns the label this classifier predicts for the datum.</summary>
 /// <remarks>
 /// Real-valued datums are routed to <c>ClassOfRVFDatum</c>; every other datum is classified
 /// from its feature collection.
 /// </remarks>
 /// <param name="datum">The datum to classify.</param>
 /// <returns>The label returned by the overload selected for the datum's type.</returns>
 public virtual L ClassOf(IDatum<L, F> datum)
 {
     // The former test against RVFDatum<object, object> (from Java's RVFDatum<?, ?>) does not match
     // RVFDatum<L, F> because generic classes are invariant, so feature values were ignored.
     RVFDatum<L, F> rvfDatum = datum as RVFDatum<L, F>;
     if (rvfDatum != null)
     {
         return ClassOfRVFDatum(rvfDatum);
     }
     return ClassOf(datum.AsFeatures());
 }
 /// <summary>Creates an RVFDatum from the features and label of another datum.</summary>
 /// <remarks>
 /// A fresh counter is allocated and every feature occurrence of <paramref name="m"/> adds 1.0 to
 /// that feature's count. <i>Implementation note:</i> the copy is only guaranteed correct if the
 /// label and feature names are immutable.
 /// </remarks>
 /// <param name="m">The Datum to copy.</param>
 public RVFDatum(IDatum<L, F> m)
 {
     var counts = new ClassicCounter<F>();
     foreach (F name in m.AsFeatures())
     {
         counts.IncrementCount(name, 1.0);
     }
     this.features = counts;
     SetLabel(m.Label());
 }
Beispiel #6
0
        /// <summary>Returns whether the given Datum contains the same features as this Datum.</summary>
        /// <remarks>
        /// Only the features are compared; labels are not checked (should we change this?).
        /// (CDM Feb 2012: Also doesn't correctly respect the contract for equals,
        /// since it gives one way equality with other Datum's.)
        /// NOTE(review): the comparison target is <c>IDatum&lt;object, object&gt;</c>; whether datums
        /// with other type arguments convert to it depends on how <c>IDatum</c> is declared - confirm.
        /// NOTE(review): no matching <c>GetHashCode</c> override is visible here - verify one exists.
        /// </remarks>
        /// <param name="o">The object to test equality with</param>
        /// <returns>
        /// <c>true</c> when <paramref name="o"/> is a compatible datum whose features equal this
        /// datum's features; <c>false</c> otherwise, including for <c>null</c>.
        /// </returns>
        public override bool Equals(object o)
        {
            // `as` yields null for null or incompatible objects, so Equals reports inequality instead
            // of throwing InvalidCastException from an unchecked cast.
            IDatum<object, object> other = o as IDatum<object, object>;
            if (other == null)
            {
                return false;
            }
            return features.Equals(other.AsFeatures());
        }
        /// <summary>Returns whether the given Datum contains the same features as this Datum.</summary>
        /// <remarks>
        /// Only the features are compared; labels are not checked (should we change this?).
        /// NOTE(review): no matching <c>GetHashCode</c> override is visible here - verify one exists.
        /// </remarks>
        /// <param name="o">The object to test equality with.</param>
        /// <returns>
        /// <c>true</c> when <paramref name="o"/> is an <c>IDatum&lt;LabelType, FeatureType&gt;</c> whose
        /// features equal this datum's features; <c>false</c> otherwise, including for <c>null</c>.
        /// </returns>
        public override bool Equals(object o)
        {
            // A datum with different type arguments is not convertible to
            // IDatum<LabelType, FeatureType>; `as` turns that case into "not equal" rather than an
            // InvalidCastException thrown from Equals.
            IDatum<LabelType, FeatureType> other = o as IDatum<LabelType, FeatureType>;
            if (other == null)
            {
                return false;
            }
            return features.Equals(other.AsFeatures());
        }
Beispiel #8
0
        /// <summary>
        /// Builds the per-GE-feature empirical label distributions and the inverted index from GE
        /// features to the unlabeled datums that contain them.
        /// </summary>
        /// <remarks>
        /// Fills <c>geFeature2EmpiricalDist</c> (one label histogram per GE feature, counted over
        /// <c>labeledDataset</c>, then normalized and smoothed) and <c>geFeature2DatumList</c> (for each
        /// GE feature, the positions in <c>unlabeledDataList</c> of the datums containing it), and prints
        /// the number of unlabeled examples that contain at least one GE feature.
        /// </remarks>
        /// <param name="geFeatures">The GE features; position in this list is the feature's GE index.</param>
        private void ComputeEmpiricalStatistics(IList<F> geFeatures)
        {
            int numGeFeatures = geFeatures.Count;
            // NOTE(review): assumes the label index exposes Size() like the dataset does - confirm.
            int numLabels = labeledDataset.labelIndex.Size();

            // Allocate the containers. The outer array must have one row per GE feature; the previous
            // `new double[][] { }` produced an empty array, so the first row access was out of range.
            geFeature2EmpiricalDist = new double[numGeFeatures][];
            geFeature2DatumList = new List<IList<int>>(numGeFeatures);
            IDictionary<F, int> geFeatureMap = new Dictionary<F, int>();
            ICollection<int> activeUnlabeledExamples = new HashSet<int>();

            for (int n = 0; n < numGeFeatures; n++)
            {
                geFeature2DatumList.Add(new List<int>());
                // a freshly allocated double[] is already zero-filled
                geFeature2EmpiricalDist[n] = new double[numLabels];
                geFeatureMap[geFeatures[n]] = n;
            }

            // compute the empirical label distribution for each GE feature
            for (int i = 0; i < labeledDataset.Size(); i++)
            {
                IDatum<L, F> datum = labeledDataset.GetDatum(i);
                int labelID = labeledDataset.labelIndex.IndexOf(datum.Label());
                foreach (F feature in datum.AsFeatures())
                {
                    int geFnum;
                    // single key lookup; IDictionary.Contains is not a key test
                    if (geFeatureMap.TryGetValue(feature, out geFnum))
                    {
                        geFeature2EmpiricalDist[geFnum][labelID]++;
                    }
                }
            }

            // now normalize and smooth the label distribution for each feature.
            for (int n = 0; n < numGeFeatures; n++)
            {
                ArrayMath.Normalize(geFeature2EmpiricalDist[n]);
                SmoothDistribution(geFeature2EmpiricalDist[n]);
            }

            // now build the inverted index from each GE feature to unlabeled datums that contain it.
            for (int i = 0; i < unlabeledDataList.Count; i++)
            {
                IDatum<L, F> datum = unlabeledDataList[i];
                foreach (F feature in datum.AsFeatures())
                {
                    int geFnum;
                    if (geFeatureMap.TryGetValue(feature, out geFnum))
                    {
                        geFeature2DatumList[geFnum].Add(i);
                        activeUnlabeledExamples.Add(i);
                    }
                }
            }
            System.Console.Out.WriteLine("Number of active unlabeled examples:" + activeUnlabeledExamples.Count);
        }
Beispiel #9
0
        /// <summary>Returns the scores for both the classes.</summary>
        /// <remarks>
        /// Real-valued datums are routed to <c>ScoresOfRVFDatum</c>. For any other datum the score of
        /// its feature collection is computed once; <c>classes[0]</c> receives the negated score and
        /// <c>classes[1]</c> the score itself.
        /// </remarks>
        /// <param name="datum">The datum to score.</param>
        /// <returns>A counter holding one score per class.</returns>
        public virtual ICounter<L> ScoresOf(IDatum<L, F> datum)
        {
            // The former test against RVFDatum<object, object> (from Java's RVFDatum<?, ?>) does not
            // match RVFDatum<L, F> because generic classes are invariant, so feature values were ignored.
            RVFDatum<L, F> rvfDatum = datum as RVFDatum<L, F>;
            if (rvfDatum != null)
            {
                return ScoresOfRVFDatum(rvfDatum);
            }

            ICollection<F> features = datum.AsFeatures();
            double sum = ScoreOf(features);
            ICounter<L> c = new ClassicCounter<L>();
            c.SetCount(classes[0], -sum);
            c.SetCount(classes[1], sum);
            return c;
        }
 /// <summary>Appends a datum's label and features to this dataset and increments its size.</summary>
 /// <remarks>
 /// An <c>RVFDatum&lt;L, F&gt;</c> contributes its own feature counter; any other datum has its
 /// feature collection converted with <c>Counters.AsCounter</c>.
 /// </remarks>
 /// <param name="d">The datum to add.</param>
 // TODO: Check that this does what we want for Datum other than RVFDatum
 public override void Add(IDatum<L, F> d)
 {
     // If you edit me, also take care of WeightedRVFDataset
     AddLabel(d.Label());

     // The former test against RVFDatum<object, object> (from Java's RVFDatum<?, ?>) does not match
     // RVFDatum<L, F> because generic classes are invariant, so real feature values were dropped.
     RVFDatum<L, F> rvfDatum = d as RVFDatum<L, F>;
     if (rvfDatum != null)
     {
         AddFeatures(rvfDatum.AsFeaturesCounter());
     }
     else
     {
         AddFeatures(Counters.AsCounter(d.AsFeatures()));
     }
     size++;
 }
 /// <summary>
 /// Appends a datum's label and features together with its source and id, then increments the size.
 /// </summary>
 /// <remarks>
 /// An <c>RVFDatum&lt;L, F&gt;</c> contributes its own feature counter; any other datum has its
 /// feature collection converted with <c>Counters.AsCounter</c>.
 /// </remarks>
 /// <param name="d">The datum to add.</param>
 /// <param name="src">Source recorded for the datum via <c>AddSourceAndId</c>.</param>
 /// <param name="id">Id recorded for the datum via <c>AddSourceAndId</c>.</param>
 // If you edit me, also take care of WeightedRVFDataset
 public virtual void Add(IDatum<L, F> d, string src, string id)
 {
     AddLabel(d.Label());

     // The former test against RVFDatum<object, object> (from Java's RVFDatum<?, ?>) does not match
     // RVFDatum<L, F> because generic classes are invariant, so real feature values were dropped.
     RVFDatum<L, F> rvfDatum = d as RVFDatum<L, F>;
     if (rvfDatum != null)
     {
         AddFeatures(rvfDatum.AsFeaturesCounter());
     }
     else
     {
         AddFeatures(Counters.AsCounter(d.AsFeatures()));
     }
     AddSourceAndId(src, id);
     size++;
 }
Beispiel #12
0
        /// <summary>Creates a copy of a datum whose label is translated through a label mapping.</summary>
        /// <remarks>
        /// The result is an <c>RVFDatum</c> sharing the source's feature counter when the source is an
        /// <c>RVFDatum&lt;L, F&gt;</c>, and a <c>BasicDatum</c> over the source's feature collection otherwise.
        /// </remarks>
        /// <param name="d">The datum to map.</param>
        /// <param name="labelMapping">Maps original labels to new labels.</param>
        /// <param name="defaultLabel">
        /// Label used when the datum's label is null, has no entry in <paramref name="labelMapping"/>,
        /// or is mapped to null.
        /// </param>
        /// <returns>A datum with the same features as <paramref name="d"/> and the mapped label.</returns>
        public static IDatum<L2, F> MapDatum<L, L2, F>(IDatum<L, F> d, IDictionary<L, L2> labelMapping, L2 defaultLabel)
        {
            // TODO: How to copy datum?
            L label = d.Label();
            L2 newLabel;

            // The dictionary indexer throws KeyNotFoundException for an unmapped label, which made the
            // default-label fallback unreachable; TryGetValue restores the intended lookup-or-default.
            if (label == null || !labelMapping.TryGetValue(label, out newLabel) || newLabel == null)
            {
                newLabel = defaultLabel;
            }

            // Test against the closed generic type that the result is built from.
            RVFDatum<L, F> rvfDatum = d as RVFDatum<L, F>;
            if (rvfDatum != null)
            {
                return new RVFDatum<L2, F>(rvfDatum.AsFeaturesCounter(), newLabel);
            }
            return new BasicDatum<L2, F>(d.AsFeatures(), newLabel);
        }
Beispiel #13
0
 /// <summary>
 /// Adds one datum's contribution to the per-(feature, class, class) derivative counter.
 /// </summary>
 /// <remarks>
 /// For every feature of the datum that has a non-negative index in
 /// <c>labeledDataset.featureIndex</c>, and for every ordered class pair (c, cPrime), the counter
 /// entry <c>(featureId, c, cPrime)</c> is incremented by the feature's value times
 /// <c>-probs[c] * (1 - probs[c])</c> when the classes coincide, or <c>probs[c] * probs[cPrime]</c>
 /// when they differ.
 /// </remarks>
 /// <param name="datum">The datum whose features are visited.</param>
 /// <param name="probs">Per-class values indexed by class id.</param>
 /// <param name="feature2classPairDerivatives">Counter that accumulates the increments.</param>
 private void UpdateDerivative(IDatum<L, F> datum, double[] probs, ICounter<Triple<int, int, int>> feature2classPairDerivatives)
 {
     foreach (F feature in datum.AsFeatures())
     {
         int featureId = labeledDataset.featureIndex.IndexOf(feature);
         if (featureId < 0)
         {
             // feature has no index in the labeled dataset; nothing to accumulate
             continue;
         }

         for (int row = 0; row < numClasses; row++)
         {
             for (int col = 0; col < numClasses; col++)
             {
                 // diagonal entries use -p(1 - p), off-diagonal entries use p_row * p_col
                 double coefficient = (col == row)
                     ? -probs[row] * (1 - probs[row])
                     : probs[row] * probs[col];
                 Triple<int, int, int> key = new Triple<int, int, int>(featureId, row, col);
                 feature2classPairDerivatives.IncrementCount(key, coefficient * ValueOfFeature(feature, datum));
             }
         }
     }
 }
Beispiel #14
0
 /// <summary>Adds a datum with the given weight by forwarding its features and label.</summary>
 /// <param name="d">The datum whose features and label are added.</param>
 /// <param name="weight">Weight passed through to the features/label overload.</param>
 public virtual void Add(IDatum<L, F> d, float weight)
 {
     var datumFeatures = d.AsFeatures();
     var datumLabel = d.Label();
     Add(datumFeatures, datumLabel, weight);
 }
 /// <summary>Adds a datum by forwarding its features and label to the features/label overload.</summary>
 /// <param name="d">The datum whose features and label are added.</param>
 public override void Add(IDatum<L, F> d)
 {
     var datumFeatures = d.AsFeatures();
     var datumLabel = d.Label();
     Add(datumFeatures, datumLabel);
 }