/// <summary>
/// Classifies <paramref name="vector"/> with the Naive Bayes decision rule:
/// classify(v) = argmax_c P(c)P(v|c), evaluated in log space to avoid underflow.
/// </summary>
/// <param name="vector">The feature vector to classify.</param>
/// <param name="details">Receives the normalized per-class probability distribution.</param>
/// <returns>The index of the most probable class.</returns>
protected override int Test(FeatureVector vector, out double[] details)
{
    // Score every class: log P(c) + log P(v|c), computed by the model-specific helper.
    double[] logProbs = new double[NoOfClasses];
    for (int c = 0; c < NoOfClasses; c++)
    {
        logProbs[c] = CalculateLogProb_v_c(vector, c);
    }

    // Convert the base-10 log scores back into a normalized probability distribution.
    NormalizationHelper.NormalizeLogs(logProbs, 10);

    details = logProbs;
    return StatisticsHelper.ArgMax(details);
}
/// <summary>
/// Classifies <paramref name="vector"/> by majority vote among its K nearest training
/// vectors (distance given by <c>distanceFunc</c>).
/// Invariant maintained throughout the scan: <c>nearestNeighbors</c> holds at most K
/// entries sorted by ascending distance, and <c>votes_c</c> holds one vote per class
/// for exactly the vectors currently in <c>nearestNeighbors</c> — every Insert is
/// paired with a vote increment and every RemoveAt with a vote decrement.
/// </summary>
/// <param name="vector">The feature vector to classify.</param>
/// <param name="details">Receives the vote counts normalized into a distribution.</param>
/// <returns>The index of the class with the most votes.</returns>
protected override int Test(FeatureVector vector, out double[] details)
{
    double[] votes_c = new double[NoOfClasses];
    var nearestNeighbors = new List <IdValuePair <double> >();
    for (int v_i = 0; v_i < TrainingVectors.Count; v_i++)
    {
        double distance = distanceFunc(TrainingVectors[v_i], vector);
        // Case 1: the list of nearest neighbors is empty OR this is the closest distance we've seen,
        // so add this neighbor as the closest neighbor (i.e. insert at position 0).
        if (nearestNeighbors.Count == 0 || distance < nearestNeighbors[0].Value)
        {
            nearestNeighbors.Insert(0, new IdValuePair <double>(v_i, distance));
            votes_c[TrainingVectors[v_i].Headers[Gold_i]]++;
            // If we have too many neighbors, then remove the furthest one
            // (and retract its vote, keeping votes_c in sync with the list).
            if (nearestNeighbors.Count > K)
            {
                votes_c[TrainingVectors[nearestNeighbors[nearestNeighbors.Count - 1].Id].Headers[Gold_i]]--;
                nearestNeighbors.RemoveAt(nearestNeighbors.Count - 1);
            }
        }
        // Case 2: we still have room for more neighbors, OR this one is closer than the
        // current furthest neighbor — insert it at its sorted position.
        else if (nearestNeighbors.Count < K || distance < nearestNeighbors[nearestNeighbors.Count - 1].Value)
        {
            var newNeighbor = new IdValuePair <double>(v_i, distance);
            int insert_b = SearchHelper.FindInsertIndex(nearestNeighbors, newNeighbor);
            if (insert_b <= K)
            {
                nearestNeighbors.Insert(insert_b, newNeighbor);
                votes_c[TrainingVectors[v_i].Headers[Gold_i]]++;
            }
            // If we have too many neighbors, then remove the furthest one
            // (and retract its vote). Note: if the new neighbor landed at index K it is
            // itself the furthest, so its vote is immediately retracted — net no-op.
            if (nearestNeighbors.Count > K)
            {
                votes_c[TrainingVectors[nearestNeighbors[nearestNeighbors.Count - 1].Id].Headers[Gold_i]]--;
                nearestNeighbors.RemoveAt(nearestNeighbors.Count - 1);
            }
        }
        Debug.Assert(nearestNeighbors.Count <= K);
    }
    // Fewer than K training vectors were available (or none qualified); warn but proceed.
    if (nearestNeighbors.Count < K)
    {
        Console.Error.WriteLine("Warning: K nearest neighbors could not be found.");
    }
    details = NormalizationHelper.CreateNormalizedDistribution(votes_c);
    return(StatisticsHelper.ArgMax(votes_c));
}
/// <summary>
/// Classifies <paramref name="vector"/> with a MaxEnt-style score:
/// score(c) = lambda_c + sum over the vector's used features of the per-(class, feature)
/// log weight, then normalizes the scores (base e) into a probability distribution.
/// </summary>
/// <param name="vector">The feature vector to classify.</param>
/// <param name="details">Receives the normalized per-class probability distribution.</param>
/// <returns>The index of the highest-scoring class.</returns>
protected override int Test(FeatureVector vector, out double[] details)
{
    double[] distribution = new double[NoOfClasses];

    // Score each class: start from its lambda and accumulate the contribution of
    // every feature actually present in the vector.
    for (int c = 0; c < NoOfClasses; c++)
    {
        double score = _lambda_c[c];
        foreach (int f_i in vector.UsedFeatures)
        {
            score += CalculateLogProb_c_f(c, f_i);
        }
        distribution[c] = score;
    }

    // Exponentiate (natural base) and normalize so the scores form a distribution.
    NormalizationHelper.NormalizeLogs(distribution, Math.E);

    details = distribution;
    return StatisticsHelper.ArgMax(details);
}
/// <summary>
/// Scans every (binary) feature and finds the one whose presence/absence split of
/// <paramref name="vectors"/> yields the highest information gain.
/// </summary>
/// <param name="vectors">Training vectors to evaluate; must be non-empty.</param>
/// <param name="noOfCategories">Number of gold classes.</param>
/// <param name="gold_i">Index into <c>Headers</c> holding each vector's gold class.</param>
/// <param name="bestFeature">Receives the index of the best feature (-1 if there are no features).</param>
/// <param name="maxInformationGain">Receives the information gain of the best feature.</param>
/// <param name="bestSplit">Receives the two-way split for the best feature:
/// index 0 = vectors where the feature is absent, index 1 = present.</param>
/// <exception cref="ArgumentOutOfRangeException">Thrown when <paramref name="vectors"/> is empty.</exception>
private static void FindFeatureWithMaxInformationGain(
    List <FeatureVector> vectors
    , int noOfCategories
    , int gold_i
    , out int bestFeature
    , out double maxInformationGain
    , out List <FeatureVector>[] bestSplit)
{
    // Input Validation:
    if (vectors.Count <= 0)
    {
        // FIX: paramName was previously the non-existent "vector"; use the real parameter name.
        throw new ArgumentOutOfRangeException(nameof(vectors), "Parameter is expected to have at least one training vector.");
    }

    int noOfFeatures = vectors[0].Features.Length;

    // Initialize Output:
    bestFeature = -1;
    maxInformationGain = 0;
    bestSplit = null;

    // Iterate over each of the features, ...
    for (int featureIndex = 0; featureIndex < noOfFeatures; featureIndex++)
    {
        // distribution_byClass[class][bucket] counts vectors per (gold class, feature bucket),
        // where bucket 1 = feature present (> 0) and bucket 0 = absent.
        // TODO: It would be nice to make this less binary-feature dependent.
        int[][] distribution_byClass = new int[noOfCategories][];
        for (int i = 0; i < noOfCategories; i++)
        {
            distribution_byClass[i] = new int[2];
        }

        List <FeatureVector>[] split = new List <FeatureVector> [2];
        split[0] = new List <FeatureVector>();
        split[1] = new List <FeatureVector>();

        // Split the data BY FEATURE and tally the class distribution on each side.
        // (A previous version also pre-counted feature presence and grouped vectors by
        // class here; both results were never read, so that dead work has been removed.)
        foreach (FeatureVector vector in vectors)
        {
            int bucket = vector.Features[featureIndex] > 0 ? 1 : 0;
            split[bucket].Add(vector);
            distribution_byClass[vector.Headers[gold_i]][bucket]++;
        }

        double informationGain = StatisticsHelper.CalculateInformationGain(distribution_byClass);

        // Track the best feature seen so far; the bestFeature == -1 guard ensures the
        // first feature is accepted even if its gain is 0.
        if (bestFeature == -1 || informationGain > maxInformationGain)
        {
            maxInformationGain = informationGain;
            bestFeature = featureIndex;
            bestSplit = split;
        }
    }
}