/// <summary>
/// Populates <c>geFeature2EmpiricalDist</c> and <c>geFeature2DatumList</c> for the given GE features.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Allocates one label-count row per GE feature and one empty datum list per GE feature.
/// 2. Counts, over <c>labeledDataset</c>, how often each GE feature co-occurs with each label.
/// 3. Passes each row to <c>ArrayMath.Normalize</c> and then <c>SmoothDistribution</c>.
/// 4. Records, for each GE feature, the indices of the entries in <c>unlabeledDataList</c> that contain it.
/// Finally writes the number of distinct unlabeled indices that matched at least one GE feature to standard output.
/// </remarks>
/// <param name="geFeatures">
/// The GE features; a feature's position in this list is its row index in both output containers.
/// If a feature occurs more than once, the last position wins in the lookup map.
/// </param>
private void ComputeEmpiricalStatistics(IList<F> geFeatures)
{
    int numFeatures = geFeatures.Count;
    // NOTE(review): assumes labelIndex exposes Size() (as labeledDataset does) and that it
    // returns the number of distinct labels, so every labelID from IndexOf fits in a row — confirm.
    int numLabels = labeledDataset.labelIndex.Size();

    // Allocate the containers. The previous code created a zero-length outer array and then
    // indexed into it, which throws as soon as there is at least one GE feature.
    geFeature2EmpiricalDist = new double[numFeatures][];
    geFeature2DatumList = new List<IList<int>>(numFeatures);
    IDictionary<F, int> geFeatureMap = new Dictionary<F, int>();
    ICollection<int> activeUnlabeledExamples = new HashSet<int>();

    for (int n = 0; n < numFeatures; n++)
    {
        // New arrays are zero-initialised, so no explicit fill is needed.
        geFeature2EmpiricalDist[n] = new double[numLabels];
        geFeature2DatumList.Add(new List<int>());
        geFeatureMap[geFeatures[n]] = n;
    }

    // Compute the empirical label counts for each GE feature.
    for (int i = 0; i < labeledDataset.Size(); i++)
    {
        IDatum<L, F> datum = labeledDataset.GetDatum(i);
        int labelID = labeledDataset.labelIndex.IndexOf(datum.Label());
        foreach (F feature in datum.AsFeatures())
        {
            // Single lookup: skips features that are not GE features.
            if (geFeatureMap.TryGetValue(feature, out int geFnum))
            {
                geFeature2EmpiricalDist[geFnum][labelID]++;
            }
        }
    }

    // Normalize and smooth the label distribution of each feature.
    for (int n = 0; n < numFeatures; n++)
    {
        ArrayMath.Normalize(geFeature2EmpiricalDist[n]);
        SmoothDistribution(geFeature2EmpiricalDist[n]);
    }

    // Build the inverted index from each GE feature to the unlabeled datums that contain it.
    for (int i = 0; i < unlabeledDataList.Count; i++)
    {
        IDatum<L, F> datum = unlabeledDataList[i];
        foreach (F feature in datum.AsFeatures())
        {
            if (geFeatureMap.TryGetValue(feature, out int geFnum))
            {
                geFeature2DatumList[geFnum].Add(i);
                // Set semantics: an index is counted once even if several GE features match.
                activeUnlabeledExamples.Add(i);
            }
        }
    }

    System.Console.Out.WriteLine("Number of active unlabeled examples:" + activeUnlabeledExamples.Count);
}