Exemple #1
0
        /// <summary>Trains a naive Bayes classifier from a dataset of RVFDatum examples.</summary>
        /// <remarks>
        /// The examples are assumed to be a list of RVFDatum.
        /// The datums are assumed to not contain the zeroes: a feature that is absent from a datum
        /// simply keeps the default count of zero in the dense count matrix built here.
        /// This method also replaces the <c>labelIndex</c> and <c>featureIndex</c> members with
        /// freshly built indices.
        /// </remarks>
        /// <param name="examples">Dataset whose items are read through <c>GetRVFDatum</c>.</param>
        /// <param name="featureSet">Features to index; its size fixes the width of every data row.</param>
        /// <returns>
        /// The classifier produced by the array-based <c>TrainClassifier</c> overload, which receives the
        /// dense count matrix, the label ids, the feature and class counts, and the two indices.
        /// </returns>
        public virtual NaiveBayesClassifier <L, F> TrainClassifier(GeneralDataset <L, F> examples, ICollection <F> featureSet)
        {
            int numFeatures = featureSet.Count;
            int numExamples = examples.Size();

            // One row per example; each row is allocated inside the loop below.
            // (An empty jagged array here would make the first data[d][fNo] write throw.)
            int[][] data   = new int[numExamples][];
            int[]   labels = new int[numExamples];
            labelIndex   = new HashIndex <L>();
            featureIndex = new HashIndex <F>();
            foreach (F feat in featureSet)
            {
                featureIndex.Add(feat);
            }
            for (int d = 0; d < numExamples; d++)
            {
                // Zero-initialised row: features not mentioned by the datum stay at 0.
                data[d] = new int[numFeatures];

                RVFDatum <L, F> datum = examples.GetRVFDatum(d);
                ICounter <F>    c     = datum.AsFeaturesCounter();
                foreach (F feature in c.KeySet())
                {
                    // NOTE(review): assumes every datum feature is present in featureSet;
                    // an unknown feature would yield an invalid column index here - confirm with callers.
                    int fNo   = featureIndex.IndexOf(feature);
                    int value = (int)c.GetCount(feature);
                    data[d][fNo] = value;
                }
                L label = datum.Label();
                labelIndex.Add(label);
                labels[d] = labelIndex.IndexOf(label);
            }
            int numClasses = labelIndex.Size();

            return(TrainClassifier(data, labels, numFeatures, numClasses, labelIndex, featureIndex));
        }
Exemple #2
0
        /// <summary>
        /// Builds the per-GE-feature statistics used by this object:
        /// <c>geFeature2EmpiricalDist</c> (label distribution of each GE feature over the labeled dataset,
        /// normalized and then smoothed) and <c>geFeature2DatumList</c> (for each GE feature, the positions
        /// in <c>unlabeledDataList</c> of the datums that contain it).
        /// </summary>
        /// <remarks>
        /// Also writes the number of unlabeled examples that contain at least one GE feature to standard output.
        /// </remarks>
        /// <param name="geFeatures">The GE features; position <c>n</c> in this list is the row index used in both containers.</param>
        private void ComputeEmpiricalStatistics(IList <F> geFeatures)
        {
            int numGeFeatures = geFeatures.Count;
            int numLabels     = labeledDataset.labelIndex.Size();

            //allocate memory to the containers and initialize them
            geFeature2EmpiricalDist = new double[numGeFeatures][];
            geFeature2DatumList     = new List <IList <int> >(numGeFeatures);
            IDictionary <F, int> geFeatureMap            = Generics.NewHashMap();
            ICollection <int>    activeUnlabeledExamples = Generics.NewHashSet();

            for (int n = 0; n < numGeFeatures; n++)
            {
                F geFeature = geFeatures[n];
                geFeature2DatumList.Add(new List <int>());
                // A freshly allocated double[] is already all zeros, so no explicit fill is needed.
                geFeature2EmpiricalDist[n] = new double[numLabels];
                geFeatureMap[geFeature] = n;
            }
            //compute the empirical label distribution for each GE feature
            for (int i = 0; i < labeledDataset.Size(); i++)
            {
                IDatum <L, F> datum   = labeledDataset.GetDatum(i);
                int           labelID = labeledDataset.labelIndex.IndexOf(datum.Label());
                foreach (F feature in datum.AsFeatures())
                {
                    // Single keyed lookup instead of a containment check followed by the indexer.
                    if (geFeatureMap.TryGetValue(feature, out int geFnum))
                    {
                        geFeature2EmpiricalDist[geFnum][labelID]++;
                    }
                }
            }
            //now normalize and smooth the label distribution for each feature.
            for (int n = 0; n < numGeFeatures; n++)
            {
                ArrayMath.Normalize(geFeature2EmpiricalDist[n]);
                SmoothDistribution(geFeature2EmpiricalDist[n]);
            }
            //now build the inverted index from each GE feature to unlabeled datums that contain it.
            for (int i = 0; i < unlabeledDataList.Count; i++)
            {
                IDatum <L, F> datum = unlabeledDataList[i];
                foreach (F feature in datum.AsFeatures())
                {
                    if (geFeatureMap.TryGetValue(feature, out int geFnum))
                    {
                        geFeature2DatumList[geFnum].Add(i);
                        activeUnlabeledExamples.Add(i);
                    }
                }
            }
            System.Console.Out.WriteLine("Number of active unlabeled examples:" + activeUnlabeledExamples.Count);
        }
Exemple #3
0
        /// <summary>
        /// Fits the sigmoid model that maps the classifier's raw outputs to probabilities.
        /// </summary>
        /// <remarks>
        /// Every datum of <paramref name="dataset"/> is scored by <paramref name="classifier"/>; the score
        /// counter (plus one increment on the null key) becomes the feature vector of a new datum that keeps
        /// the original label. A linear classifier is then trained on those datums with a
        /// <c>LogPrior.LogPriorType.Null</c> prior.
        /// </remarks>
        /// <param name="classifier">Classifier whose <c>ScoresOf</c> output supplies the features.</param>
        /// <param name="dataset">Dataset whose datums are scored and whose labels are reused.</param>
        /// <returns>The linear classifier trained on the score-based dataset.</returns>
        private LinearClassifier <L, L> FitSigmoid(SVMLightClassifier <L, F> classifier, GeneralDataset <L, F> dataset)
        {
            RVFDataset <L, L> calibrationData = new RVFDataset <L, L>();

            int index = 0;
            while (index < dataset.Size())
            {
                RVFDatum <L, F> example     = dataset.GetRVFDatum(index);
                ICounter <L>    labelScores = classifier.ScoresOf((IDatum <L, F>)example);

                // NOTE(review): the null key presumably serves as a constant bias feature - confirm.
                labelScores.IncrementCount(null);

                RVFDatum <L, L> scoredExample = new RVFDatum <L, L>(labelScores, example.Label());
                calibrationData.Add(scoredExample);
                index++;
            }

            LinearClassifierFactory <L, L> sigmoidFactory = new LinearClassifierFactory <L, L>();
            LogPrior nullPrior = new LogPrior(LogPrior.LogPriorType.Null);
            sigmoidFactory.SetPrior(nullPrior);

            LinearClassifier <L, L> sigmoidModel = sigmoidFactory.TrainClassifier(calibrationData);
            return sigmoidModel;
        }