/// <summary>Trains a Naive Bayes classifier from a dataset of RVFDatum instances.</summary>
/// <remarks>
/// The examples are assumed to be a list of RVFDatum.
/// The datums are assumed not to contain explicit zero counts; the zero entries
/// are filled in here when each instance's dense feature-count row is built.
/// </remarks>
public virtual NaiveBayesClassifier<L, F> TrainClassifier(GeneralDataset<L, F> examples, ICollection<F> featureSet)
{
    int numFeatures = featureSet.Count;
    // One dense row of feature counts per example.
    int[][] data = new int[examples.Size()][];
    int[] labels = new int[examples.Size()];
    labelIndex = new HashIndex<L>();
    featureIndex = new HashIndex<F>();
    foreach (F feat in featureSet)
    {
        featureIndex.Add(feat);
    }
    for (int d = 0; d < examples.Size(); d++)
    {
        data[d] = new int[numFeatures];
        RVFDatum<L, F> datum = examples.GetRVFDatum(d);
        ICounter<F> c = datum.AsFeaturesCounter();
        foreach (F feature in c.KeySet())
        {
            int fNo = featureIndex.IndexOf(feature);
            int value = (int)c.GetCount(feature);
            data[d][fNo] = value;
        }
        labelIndex.Add(datum.Label());
        labels[d] = labelIndex.IndexOf(datum.Label());
    }
    int numClasses = labelIndex.Size();
    return TrainClassifier(data, labels, numFeatures, numClasses, labelIndex, featureIndex);
}
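// Illustrative sketch (a hypothetical helper, not part of the original class):
// collects the observed feature set from the dataset itself and delegates to
// TrainClassifier above. It uses only members already referenced in this file.
private NaiveBayesClassifier<L, F> TrainOnObservedFeatures(GeneralDataset<L, F> examples)
{
    ICollection<F> featureSet = new HashSet<F>();
    for (int i = 0; i < examples.Size(); i++)
    {
        // Every feature with a nonzero count in any datum joins the feature set.
        foreach (F feature in examples.GetRVFDatum(i).AsFeaturesCounter().KeySet())
        {
            featureSet.Add(feature);
        }
    }
    return TrainClassifier(examples, featureSet);
}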
private void ComputeEmpiricalStatistics(IList<F> geFeatures)
{
    // Allocate the containers and initialize them.
    geFeature2EmpiricalDist = new double[geFeatures.Count][];
    geFeature2DatumList = new List<IList<int>>(geFeatures.Count);
    IDictionary<F, int> geFeatureMap = Generics.NewHashMap();
    ICollection<int> activeUnlabeledExamples = Generics.NewHashSet();
    for (int n = 0; n < geFeatures.Count; n++)
    {
        F geFeature = geFeatures[n];
        geFeature2DatumList.Add(new List<int>());
        geFeature2EmpiricalDist[n] = new double[labeledDataset.labelIndex.Size()];
        Arrays.Fill(geFeature2EmpiricalDist[n], 0);
        geFeatureMap[geFeature] = n;
    }
    // Compute the empirical label distribution for each GE feature.
    for (int i = 0; i < labeledDataset.Size(); i++)
    {
        IDatum<L, F> datum = labeledDataset.GetDatum(i);
        int labelID = labeledDataset.labelIndex.IndexOf(datum.Label());
        foreach (F feature in datum.AsFeatures())
        {
            if (geFeatureMap.ContainsKey(feature))
            {
                int geFnum = geFeatureMap[feature];
                geFeature2EmpiricalDist[geFnum][labelID]++;
            }
        }
    }
    // Now normalize and smooth the label distribution for each feature.
    for (int n = 0; n < geFeatures.Count; n++)
    {
        ArrayMath.Normalize(geFeature2EmpiricalDist[n]);
        SmoothDistribution(geFeature2EmpiricalDist[n]);
    }
    // Now build the inverted index from each GE feature to the unlabeled datums that contain it.
    for (int i = 0; i < unlabeledDataList.Count; i++)
    {
        IDatum<L, F> datum = unlabeledDataList[i];
        foreach (F feature in datum.AsFeatures())
        {
            if (geFeatureMap.ContainsKey(feature))
            {
                int geFnum = geFeatureMap[feature];
                geFeature2DatumList[geFnum].Add(i);
                activeUnlabeledExamples.Add(i);
            }
        }
    }
    System.Console.Out.WriteLine("Number of active unlabeled examples: " + activeUnlabeledExamples.Count);
}
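// Worked sketch (a hypothetical helper, not in the original source): shows the
// normalize-then-smooth step above on one GE feature's raw label counts.
// ArrayMath.Normalize rescales the counts into a probability distribution;
// SmoothDistribution, whose exact scheme is defined elsewhere in this class,
// then adjusts it so no label is left with exactly zero probability.
private double[] ExampleEmpiricalDistribution()
{
    double[] dist = new double[] { 3.0, 1.0, 0.0 };  // raw counts for three labels
    ArrayMath.Normalize(dist);                       // -> { 0.75, 0.25, 0.0 }
    SmoothDistribution(dist);                        // smooths away the zero entry
    return dist;
}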
/// <summary>Builds a sigmoid (Platt scaling) model that turns the classifier's raw scores into probabilities.</summary>
private LinearClassifier<L, L> FitSigmoid(SVMLightClassifier<L, F> classifier, GeneralDataset<L, F> dataset)
{
    RVFDataset<L, L> plattDataset = new RVFDataset<L, L>();
    for (int i = 0; i < dataset.Size(); i++)
    {
        RVFDatum<L, F> d = dataset.GetRVFDatum(i);
        ICounter<L> scores = classifier.ScoresOf((IDatum<L, F>)d);
        // The null key serves as a constant bias feature for the sigmoid fit.
        scores.IncrementCount(null);
        plattDataset.Add(new RVFDatum<L, L>(scores, d.Label()));
    }
    LinearClassifierFactory<L, L> factory = new LinearClassifierFactory<L, L>();
    factory.SetPrior(new LogPrior(LogPrior.LogPriorType.Null));
    return factory.TrainClassifier(plattDataset);
}
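// Usage sketch (a hypothetical helper, not in the original source): applies the
// fitted sigmoid model to a new datum, mirroring the training loop above by
// wrapping the raw scores plus the same null bias key in an RVFDatum. That
// LinearClassifier exposes ProbabilityOf returning a counter over labels is an
// assumption carried over from the Java original.
private ICounter<L> ExampleProbabilitiesOf(SVMLightClassifier<L, F> classifier, LinearClassifier<L, L> platt, IDatum<L, F> datum)
{
    ICounter<L> scores = classifier.ScoresOf(datum);
    scores.IncrementCount(null);  // same constant bias key used in FitSigmoid
    return platt.ProbabilityOf(new RVFDatum<L, L>(scores));
}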