/// <summary>Constructs a new RVFDatum taking the data from a Datum.</summary>
/// <remarks>
/// Constructs a new RVFDatum taking the data from a Datum. <i>Implementation
/// note:</i> This constructor allocates its own counter over features, but is
/// only guaranteed correct if the label and feature names are immutable.
/// </remarks>
/// <param name="m">The Datum to copy.</param>
public RVFDatum(IDatum<L, F> m)
{
    this.features = new ClassicCounter<F>();
    foreach (F key in m.AsFeatures())
    {
        features.IncrementCount(key, 1.0);
    }
    SetLabel(m.Label());
}
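// Usage sketch (the label/feature types and names below are illustrative, not
// fixed by this class):
//
//   IDatum<string, string> source =
//       new BasicDatum<string, string>(new[] { "featA", "featB" }, "POSITIVE");
//   RVFDatum<string, string> copy = new RVFDatum<string, string>(source);
//   // each feature of the source now has count 1.0 in copy's counter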
private void ComputeEmpiricalStatistics(IList<F> geFeatures)
{
    // allocate memory to the containers and initialize them
    geFeature2EmpiricalDist = new double[geFeatures.Count][];
    geFeature2DatumList = new List<IList<int>>(geFeatures.Count);
    IDictionary<F, int> geFeatureMap = Generics.NewHashMap();
    ICollection<int> activeUnlabeledExamples = Generics.NewHashSet();
    for (int n = 0; n < geFeatures.Count; n++)
    {
        F geFeature = geFeatures[n];
        geFeature2DatumList.Add(new List<int>());
        // one empirical label distribution per GE feature; new double[] is zero-initialized
        geFeature2EmpiricalDist[n] = new double[labeledDataset.labelIndex.Size()];
        geFeatureMap[geFeature] = n;
    }
    // compute the empirical label distribution for each GE feature
    for (int i = 0; i < labeledDataset.Size(); i++)
    {
        IDatum<L, F> datum = labeledDataset.GetDatum(i);
        int labelID = labeledDataset.labelIndex.IndexOf(datum.Label());
        foreach (F feature in datum.AsFeatures())
        {
            if (geFeatureMap.ContainsKey(feature))
            {
                int geFnum = geFeatureMap[feature];
                geFeature2EmpiricalDist[geFnum][labelID]++;
            }
        }
    }
    // now normalize and smooth the label distribution for each feature
    for (int n = 0; n < geFeatures.Count; n++)
    {
        ArrayMath.Normalize(geFeature2EmpiricalDist[n]);
        SmoothDistribution(geFeature2EmpiricalDist[n]);
    }
    // now build the inverted index from each GE feature to the unlabeled datums that contain it
    for (int i = 0; i < unlabeledDataList.Count; i++)
    {
        IDatum<L, F> datum = unlabeledDataList[i];
        foreach (F feature in datum.AsFeatures())
        {
            if (geFeatureMap.ContainsKey(feature))
            {
                int geFnum = geFeatureMap[feature];
                geFeature2DatumList[geFnum].Add(i);
                activeUnlabeledExamples.Add(i);
            }
        }
    }
    System.Console.Out.WriteLine("Number of active unlabeled examples: " + activeUnlabeledExamples.Count);
}
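// Worked sketch (illustrative numbers): if a GE feature occurs in three labeled
// datums with labels {A, A, B} and labelIndex orders the labels (A, B), its raw
// counts become [2, 1]; ArrayMath.Normalize turns that into roughly
// [0.667, 0.333], which SmoothDistribution then smooths before it is used as
// the target label distribution for that feature.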
public static IDatum<L2, F> MapDatum<L, L2, F>(IDatum<L, F> d, IDictionary<L, L2> labelMapping, L2 defaultLabel)
{
    // TODO: How to copy datum?
    L2 newLabel;
    if (!labelMapping.TryGetValue(d.Label(), out newLabel))
    {
        newLabel = defaultLabel;
    }
    if (d is RVFDatum)
    {
        return new RVFDatum<L2, F>(((RVFDatum<L, F>)d).AsFeaturesCounter(), newLabel);
    }
    else
    {
        return new BasicDatum<L2, F>(d.AsFeatures(), newLabel);
    }
}
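// Usage sketch (hypothetical labels): collapse fine-grained labels into a
// coarser scheme, sending anything unmapped to "OTHER":
//
//   IDictionary<string, string> mapping = new Dictionary<string, string>
//   {
//       { "LOC", "ENTITY" },
//       { "PER", "ENTITY" }
//   };
//   IDatum<string, string> coarse = MapDatum(fineDatum, mapping, "OTHER");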
public virtual void InitMC<F>(IProbabilisticClassifier<L, F> classifier, GeneralDataset<L, F> data)
{
    //if (!(gData instanceof Dataset)) {
    //  throw new UnsupportedOperationException("Can only handle Datasets, not " + gData.getClass().getName());
    //}
    //
    //Dataset data = (Dataset)gData;
    IPriorityQueue<Pair<int, Pair<double, bool>>> q = new BinaryHeapPriorityQueue<Pair<int, Pair<double, bool>>>();
    total = 0;
    correct = 0;
    logLikelihood = 0.0;
    for (int i = 0; i < data.Size(); i++)
    {
        IDatum<L, F> d = data.GetRVFDatum(i);
        ICounter<L> logProbs = classifier.LogProbabilityOf(d);
        L guess = Counters.Argmax(logProbs);
        L correctLab = d.Label();
        double guessScore = logProbs.GetCount(guess);
        double correctScore = logProbs.GetCount(correctLab);
        int guessInd = data.LabelIndex().IndexOf(guess);
        int correctInd = data.LabelIndex().IndexOf(correctLab);
        total++;
        if (guessInd == correctInd)
        {
            correct++;
        }
        logLikelihood += correctScore;
        q.Add(new Pair<int, Pair<double, bool>>(i, new Pair<double, bool>(guessScore, guessInd == correctInd)), -guessScore);
    }
    accuracy = (double)correct / (double)total;
    IList<Pair<int, Pair<double, bool>>> sorted = q.ToSortedList();
    scores = new double[sorted.Count];
    isCorrect = new bool[sorted.Count];
    for (int i = 0; i < sorted.Count; i++)
    {
        Pair<double, bool> next = sorted[i].Second();
        scores[i] = next.First();
        isCorrect[i] = next.Second();
    }
}
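// Note on the queue: each datum is enqueued with priority -guessScore, so
// ToSortedList() hands the datums back ordered by the confidence of the guess
// (following BinaryHeapPriorityQueue's sort convention). scores[] and
// isCorrect[] are therefore parallel, confidence-ordered arrays suitable for
// plotting accuracy against confidence.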
public virtual double Score<F>(IProbabilisticClassifier<L, F> classifier, GeneralDataset<L, F> data)
{
    List<Pair<double, int>> dataScores = new List<Pair<double, int>>();
    for (int i = 0; i < data.Size(); i++)
    {
        IDatum<L, F> d = data.GetRVFDatum(i);
        ICounter<L> scores = classifier.LogProbabilityOf(d);
        // 1 if the gold label is the positive class, 0 otherwise
        int labelD = d.Label().Equals(posLabel) ? 1 : 0;
        // convert the log-probability of the positive class back to a probability
        dataScores.Add(new Pair<double, int>(Math.Exp(scores.GetCount(posLabel)), labelD));
    }
    PRCurve prc = new PRCurve(dataScores);
    confWeightedAccuracy = prc.Cwa();
    accuracy = prc.Accuracy();
    optAccuracy = prc.OptimalAccuracy();
    optConfWeightedAccuracy = prc.OptimalCwa();
    logLikelihood = prc.LogLikelihood();
    accrecall = prc.CwaArray();
    optaccrecall = prc.OptimalCwaArray();
    return accuracy;
}
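// Usage sketch (names illustrative): after training a binary classifier,
//
//   double acc = stats.Score(classifier, testData);
//
// returns raw accuracy; the PRCurve-derived fields set above
// (confWeightedAccuracy, optAccuracy, logLikelihood, accrecall, ...) remain
// available for inspection afterwards.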
public PrecisionRecallStats(IClassifier<L, F> classifier, Dataset<L, F> data, L positiveClass)
{
    for (int i = 0; i < data.Size(); ++i)
    {
        IDatum<L, F> d = data.GetDatum(i);
        L guess = classifier.ClassOf(d);
        L label = d.Label();
        bool guessPositive = guess.Equals(positiveClass);
        bool isPositive = label.Equals(positiveClass);
        if (isPositive && guessPositive)
        {
            tpCount++;
        }
        else if (isPositive)
        {
            fnCount++;
        }
        else if (guessPositive)
        {
            fpCount++;
        }
    }
}
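// From the counts accumulated above: precision = tp / (tp + fp) and
// recall = tp / (tp + fn). True negatives are never tallied, since neither
// metric depends on them.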
public virtual void Add(IDatum<L, F> d, float weight)
{
    Add(d.AsFeatures(), d.Label(), weight);
}
public override void Add(IDatum<L, F> d)
{
    Add(d.AsFeatures(), d.Label());
}
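// Usage sketch: both overloads above decompose the datum into its feature
// collection and label; the weighted variant lets callers down-weight
// individual examples (names below are illustrative):
//
//   dataset.Add(datum);        // unweighted
//   dataset.Add(datum, 0.5f);  // counted at half weight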