Example #1
 protected override int Test(FeatureVector vector, out double[] details)
 {
     // classify(v) = argmax(P(c)P(v|c)) = argmax_c(prob_c * prob_v_c)
     double[] v_logProb_c = new double[NoOfClasses];
     for (int c_i = 0; c_i < NoOfClasses; c_i++)
     {
         v_logProb_c[c_i] = CalculateLogProb_v_c(vector, c_i);
     }
     // Convert the base-10 log probabilities back to linear scale and normalize them into a distribution.
     NormalizationHelper.NormalizeLogs(v_logProb_c, 10);
     details = v_logProb_c;
     return(StatisticsHelper.ArgMax(details));
 }
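CalculateLogProb_v_c and NormalizationHelper.NormalizeLogs are helpers defined elsewhere in the project and are not shown in this listing. Under the usual multinomial Naive Bayes formulation, CalculateLogProb_v_c would return log10 P(c) plus the sum of log10 P(f|c) over the features present in the vector; a minimal sketch under that assumption (the fields _logPrior_c and _logProb_f_c are hypothetical names, not part of the original class):

 // Hypothetical sketch -- the real class may store its model parameters differently.
 // _logPrior_c[c]     ~ log10 P(c)
 // _logProb_f_c[c][f] ~ log10 P(f | c), already smoothed
 private double CalculateLogProb_v_c(FeatureVector vector, int c_i)
 {
     double logProb = _logPrior_c[c_i];
     for (int u_i = 0; u_i < vector.UsedFeatures.Length; u_i++)
     {
         int f_i = vector.UsedFeatures[u_i];
         logProb += _logProb_f_c[c_i][f_i];
     }
     return(logProb);
 }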
Example #2
        protected override int Test(FeatureVector vector, out double[] details)
        {
            double[] votes_c          = new double[NoOfClasses];
            var      nearestNeighbors = new List <IdValuePair <double> >();

            for (int v_i = 0; v_i < TrainingVectors.Count; v_i++)
            {
                double distance = distanceFunc(TrainingVectors[v_i], vector);
                // If the list of nearest neighbors is empty OR this is the closest distance we've seen,
                // then add this neighbor as the closest neighbor (i.e. insert at position 0).
                if (nearestNeighbors.Count == 0 || distance < nearestNeighbors[0].Value)
                {
                    nearestNeighbors.Insert(0, new IdValuePair <double>(v_i, distance));
                    votes_c[TrainingVectors[v_i].Headers[Gold_i]]++;
                    // If we have too many neighbors, then remove the furthest one.
                    if (nearestNeighbors.Count > K)
                    {
                        votes_c[TrainingVectors[nearestNeighbors[nearestNeighbors.Count - 1].Id].Headers[Gold_i]]--;
                        nearestNeighbors.RemoveAt(nearestNeighbors.Count - 1);
                    }
                }
                else if (nearestNeighbors.Count < K || distance < nearestNeighbors[nearestNeighbors.Count - 1].Value)
                {
                    var newNeighbor = new IdValuePair <double>(v_i, distance);
                    int insert_b    = SearchHelper.FindInsertIndex(nearestNeighbors, newNeighbor);
                    if (insert_b <= K)
                    {
                        nearestNeighbors.Insert(insert_b, newNeighbor);
                        votes_c[TrainingVectors[v_i].Headers[Gold_i]]++;
                    }
                    // If we have too many neighbors, then remove the furthest one.
                    if (nearestNeighbors.Count > K)
                    {
                        votes_c[TrainingVectors[nearestNeighbors[nearestNeighbors.Count - 1].Id].Headers[Gold_i]]--;
                        nearestNeighbors.RemoveAt(nearestNeighbors.Count - 1);
                    }
                }
                Debug.Assert(nearestNeighbors.Count <= K);
            }
            if (nearestNeighbors.Count < K)
            {
                Console.Error.WriteLine("Warning: K nearest neighbors could not be found.");
            }

            details = NormalizationHelper.CreateNormalizedDistribution(votes_c);
            return(StatisticsHelper.ArgMax(votes_c));
        }
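SearchHelper.FindInsertIndex is not shown in this listing. Since nearestNeighbors is kept sorted by ascending distance, it presumably performs a binary search for the insertion point; a minimal sketch under that assumption:

        // Hypothetical sketch: binary search for the position at which newNeighbor keeps
        // the list sorted by ascending distance (the Value field of IdValuePair<double>).
        public static int FindInsertIndex(List <IdValuePair <double> > sorted, IdValuePair <double> newNeighbor)
        {
            int lo = 0;
            int hi = sorted.Count;
            while (lo < hi)
            {
                int mid = (lo + hi) / 2;
                if (sorted[mid].Value < newNeighbor.Value)
                {
                    lo = mid + 1;
                }
                else
                {
                    hi = mid;
                }
            }
            return(lo);
        }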
Example #3
        protected override int Test(FeatureVector vector, out double[] details)
        {
            double[] distribution = new double[NoOfClasses];

            // Compute the unnormalized log score of each class c_i for document <c>vector</c>:
            // the class weight lambda_c plus the weights of the features that fire in the vector.
            // NormalizeLogs then exponentiates these scores and divides by Z to turn them into a distribution.
            for (int c_i = 0; c_i < NoOfClasses; c_i++)
            {
                double logProb = _lambda_c[c_i];
                for (int u_i = 0; u_i < vector.UsedFeatures.Length; u_i++)
                {
                    int f_i = vector.UsedFeatures[u_i];
                    logProb += CalculateLogProb_c_f(c_i, f_i);
                }
                distribution[c_i] = logProb;
            }
            NormalizationHelper.NormalizeLogs(distribution, Math.E);
            details = distribution;
            return(StatisticsHelper.ArgMax(details));
        }
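Both Example #1 and this example finish with NormalizationHelper.NormalizeLogs, whose second argument appears to be the base of the logarithms (10 and Math.E respectively). A plausible sketch, assuming it exponentiates the scores in place, shifting by the maximum first for numerical stability, and normalizes them to sum to 1:

        // Hypothetical sketch of NormalizationHelper.NormalizeLogs(logValues, logBase):
        // converts log-domain scores in the given base into a probability distribution, in place.
        public static void NormalizeLogs(double[] logValues, double logBase)
        {
            // Subtract the maximum before exponentiating to avoid overflow/underflow.
            double max = double.NegativeInfinity;
            for (int i = 0; i < logValues.Length; i++)
            {
                max = Math.Max(max, logValues[i]);
            }
            double sum = 0;
            for (int i = 0; i < logValues.Length; i++)
            {
                logValues[i] = Math.Pow(logBase, logValues[i] - max);
                sum         += logValues[i];
            }
            for (int i = 0; i < logValues.Length; i++)
            {
                logValues[i] /= sum;
            }
        }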
Example #4
        private static void FindFeatureWithMaxInformationGain(
            List <FeatureVector> vectors
            , int noOfCategories
            , int gold_i
            , out int bestFeature
            , out double maxInformationGain
            , out List <FeatureVector>[] bestSplit)
        {
            // Input Validation:
            if (vectors.Count <= 0)
            {
                throw new ArgumentOutOfRangeException("vectors", "Parameter is expected to have at least one training vector.");
            }

            int noOfFeatures = vectors[0].Features.Length;

            //Debug.Assert(noOfFeatures >= 0);

            // Initialize Output:
            bestFeature        = -1;
            maxInformationGain = 0;
            bestSplit          = null;

            // Iterate over each of the features, ...
            for (int featureIndex = 0; featureIndex < noOfFeatures; featureIndex++)
            {
                int count_t = 0;
                int count_f = 0;
                foreach (FeatureVector vector in vectors)
                {
                    if (vector.Features[featureIndex] > 0)
                    {
                        count_t++;
                    }
                    else
                    {
                        count_f++;
                    }
                }

                // Group the vectors by class.
                List <FeatureVector>[] vectors_byClass = new List <FeatureVector> [noOfCategories];
                int[][] distribution_byClass           = new int[noOfCategories][];
                //TODO: It would be nice to make this less binary-feature dependent.
                List <FeatureVector>[] split = new List <FeatureVector> [2];
                split[0] = new List <FeatureVector>();
                split[1] = new List <FeatureVector>();
                for (int i = 0; i < noOfCategories; i++)
                {
                    vectors_byClass[i]      = new List <FeatureVector>();
                    distribution_byClass[i] = new int[2];
                }
                // Iterate over each of the training vectors and add the vector to the relevant group BY CATEGORY
                // AND split the data BY FEATURE.
                foreach (FeatureVector vector in vectors)
                {
                    vectors_byClass[vector.Headers[gold_i]].Add(vector);
                    //TODO: It would be nice to make this less binary-feature dependent.
                    if (vector.Features[featureIndex] > 0)
                    {
                        split[1].Add(vector);
                        distribution_byClass[vector.Headers[gold_i]][1]++;
                    }
                    else
                    {
                        split[0].Add(vector);
                        distribution_byClass[vector.Headers[gold_i]][0]++;
                    }
                }
                double informationGain = StatisticsHelper.CalculateInformationGain(distribution_byClass);
                if (bestFeature == -1 || informationGain > maxInformationGain)
                {
                    maxInformationGain = informationGain;
                    bestFeature        = featureIndex;
                    bestSplit          = split;
                }
            }
        }
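StatisticsHelper.CalculateInformationGain is defined elsewhere; distribution_byClass[c][b] holds the number of vectors of class c that fall into branch b of the candidate split. A sketch of the standard information-gain computation it presumably performs, the entropy of the class distribution minus the weighted entropy within each branch:

        // Hypothetical sketch: information gain of a split described by class-by-branch counts.
        public static double CalculateInformationGain(int[][] distribution_byClass)
        {
            int noOfClasses  = distribution_byClass.Length;
            int noOfBranches = distribution_byClass[0].Length;

            int   total        = 0;
            int[] classTotals  = new int[noOfClasses];
            int[] branchTotals = new int[noOfBranches];
            for (int c = 0; c < noOfClasses; c++)
            {
                for (int b = 0; b < noOfBranches; b++)
                {
                    classTotals[c]  += distribution_byClass[c][b];
                    branchTotals[b] += distribution_byClass[c][b];
                    total           += distribution_byClass[c][b];
                }
            }

            // Entropy of the class distribution before the split.
            double parentEntropy = 0;
            for (int c = 0; c < noOfClasses; c++)
            {
                double p = (double)classTotals[c] / total;
                if (p > 0) { parentEntropy -= p * Math.Log(p, 2); }
            }

            // Weighted entropy of the class distribution within each branch of the split.
            double childEntropy = 0;
            for (int b = 0; b < noOfBranches; b++)
            {
                if (branchTotals[b] == 0) { continue; }
                double branchEntropy = 0;
                for (int c = 0; c < noOfClasses; c++)
                {
                    double p = (double)distribution_byClass[c][b] / branchTotals[b];
                    if (p > 0) { branchEntropy -= p * Math.Log(p, 2); }
                }
                childEntropy += (double)branchTotals[b] / total * branchEntropy;
            }

            return(parentEntropy - childEntropy);
        }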