Example #1
        public static int SelectLevels(number[] x, int[][] J, int Kmin, int Kmax, double[] BIC)
        {
            int N = x.Length;

            if (Kmin >= Kmax || N < 2)
            {
                return(Math.Min(Kmin, Kmax));
            }

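            // Note: BIC is passed by value, so the Array.Resize below only rebinds this
            // method's local reference; callers should size BIC to Kmax - Kmin + 1 up front.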
            if (BIC.Length != Kmax - Kmin + 1)
            {
                Array.Resize(ref BIC, Kmax - Kmin + 1);
            }

            // double variance_min, variance_max;
            // range_of_variance(x, variance_min, variance_max);

            int Kopt = Kmin;

            double maxBIC = 0;

            double[] lambda = new double[Kmax];
            number[] mu     = new number[Kmax];
            number[] sigma2 = new number[Kmax];
            double[] coeff  = new double[Kmax];

            for (int K = Kmin; K <= Kmax; ++K)
            {
                int[] size = new int[K];

                // Backtrack the matrix to determine boundaries between the bins.
                DynamicProgramming.Backtrack(x, J, size, K);

                int indexLeft = 0;
                int indexRight;

                for (int k = 0; k < K; ++k)
                { // Estimate GMM parameters first
                    lambda[k] = size[k] / (double)N;

                    indexRight = indexLeft + size[k] - 1;

                    ShiftedDataVariance(x, indexLeft, indexRight, out mu[k], out sigma2[k]);

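                    // Degenerate cluster (zero variance or a single point): fall back to a
                    // small variance derived from the gap to the neighbouring points so the
                    // Gaussian density below stays finite.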
                    if (sigma2[k] == 0 || size[k] == 1)
                    {
                        number dmin;

                        if (indexLeft > 0 && indexRight < N - 1)
                        {
                            dmin = Math.Min(x[indexLeft] - x[indexLeft - 1], x[indexRight + 1] - x[indexRight]);
                        }
                        else if (indexLeft > 0)
                        {
                            dmin = x[indexLeft] - x[indexLeft - 1];
                        }
                        else
                        {
                            dmin = x[indexRight + 1] - x[indexRight];
                        }

                        // std::cout << "sigma2[k]=" << sigma2[k] << "==>";
                        if (sigma2[k] == 0)
                        {
                            sigma2[k] = dmin * dmin / ((number)4) / ((number)9);
                        }
                        if (size[k] == 1)
                        {
                            sigma2[k] = dmin * dmin;
                        }
                        // std::cout << sigma2[k] << std::endl;
                    }

                    /*
                     * if(sigma2[k] == 0) sigma2[k] = variance_min;
                     * if(size[k] == 1) sigma2[k] = variance_max;
                     */

                    coeff[k] = lambda[k] / Math.Sqrt((double)(2 * PI * sigma2[k]));

                    indexLeft = indexRight + 1;
                }

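                // Log-likelihood of the data under the fitted K-component Gaussian mixture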
                double loglikelihood = 0;

                for (int i = 0; i < N; ++i)
                {
                    double L = 0;
                    for (int k = 0; k < K; ++k)
                    {
                        L += coeff[k] * Math.Exp((double)(-(x[i] - mu[k]) * (x[i] - mu[k]) / (2 * sigma2[k])));
                    }

                    loglikelihood += Math.Log(L);
                }

                // Compute the Bayesian information criterion
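                // A univariate Gaussian mixture with K components has 3K - 1 free parameters:
                // K means, K variances and K - 1 mixing proportions.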
                BIC[K - Kmin] = 2d * loglikelihood - (3d * K - 1d) * Math.Log(N);  //(K*3-1)

                double bic = BIC[K - Kmin];

                // std::cout << "k=" << K << ": Loglh=" << loglikelihood << ", BIC=" << BIC << std::endl;

                if (K == Kmin)
                {
                    maxBIC = bic;
                    Kopt   = Kmin;
                }
                else
                {
                    if (bic > maxBIC)
                    {
                        maxBIC = bic;
                        Kopt   = K;
                    }
                }
            }
            return(Kopt);
        }
Example #2
        public static int SelectLevels(number[] x, number[] y, int[][] J, int Kmin, int Kmax, double[] BIC)
        {
            int N = x.Length;

            /*if (Kmin == Kmax)
             * {
             *  return Kmin;
             * }*/


            if (Kmin > Kmax || N < 2)
            {
                return(Math.Min(Kmin, Kmax));
            }


            if (BIC.Length != Kmax - Kmin + 1)
            {
                Array.Resize(ref BIC, Kmax - Kmin + 1);
            }

            // double variance_min, variance_max;
            // range_of_variance(x, variance_min, variance_max);

            int Kopt = Kmin;

            double maxBIC = 0;

            number[] lambda  = new number[Kmax];
            number[] mu      = new number[Kmax];
            number[] sigma2  = new number[Kmax];
            double[] coeff   = new double[Kmax];
            int[]    counts  = new int[Kmax];
            number[] weights = new number[Kmax];

            for (int K = Kmin; K <= Kmax; ++K)
            {
                // std::vector< std::vector< size_t > > JK(J.begin(), J.begin()+K);

                // Backtrack the matrix to determine boundaries between the bins.
                DynamicProgramming.BacktrackWeighted(x, y, J, counts, weights, K);

                // double totalweight = std::accumulate(weights.begin(), weights.begin() + K, 0, std::plus<double>());

                number totalweight = 0;
                for (int k = 0; k < K; k++)
                {
                    totalweight += weights[k];
                }

                int indexLeft = 0;
                int indexRight;

                for (int k = 0; k < K; ++k)
                { // Estimate GMM parameters first
                    lambda[k] = weights[k] / totalweight;

                    indexRight = indexLeft + counts[k] - 1;

                    ShiftedDataVariance(x, y, weights[k], indexLeft, indexRight, out mu[k], out sigma2[k]);

                    if (sigma2[k] == 0 || counts[k] == 1)
                    {
                        number dmin;

                        if (indexLeft > 0 && indexRight < N - 1)
                        {
                            dmin = Math.Min(x[indexLeft] - x[indexLeft - 1], x[indexRight + 1] - x[indexRight]);
                        }
                        else if (indexLeft > 0)
                        {
                            dmin = x[indexLeft] - x[indexLeft - 1];
                        }
                        else
                        {
                            dmin = x[indexRight + 1] - x[indexRight];
                        }

                        // std::cout << "sigma2[k]=" << sigma2[k] << "==>";
                        if (sigma2[k] == 0)
                        {
                            sigma2[k] = (dmin * dmin) / (number)4 / (number)9;
                        }
                        if (counts[k] == 1)
                        {
                            sigma2[k] = (dmin * dmin);
                        }
                        // std::cout << sigma2[k] << std::endl;
                    }

                    /*
                     * if(sigma2[k] == 0) sigma2[k] = variance_min;
                     * if(size[k] == 1) sigma2[k] = variance_max;
                     */

                    coeff[k] = (double)lambda[k] / Math.Sqrt((double)(2 * PI * sigma2[k]));

                    indexLeft = indexRight + 1;
                }

                double loglikelihood = 0;

                for (int i = 0; i < N; ++i)
                {
                    double L = 0;
                    for (int k = 0; k < K; ++k)
                    {
                        L += (coeff[k]) * Math.Exp((double)(-(x[i] - mu[k]) * (x[i] - mu[k]) / (2 * sigma2[k])));
                    }

                    loglikelihood += ((double)y[i]) * Math.Log(L);
                }

                // double & bic = BIC[K-Kmin];

                // Compute the Bayesian information criterion

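                // For weighted data the effective sample size in the BIC penalty is the total
                // weight rather than the number of points; the 3K - 1 term counts K means,
                // K variances and K - 1 mixing proportions.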
                double bic = 2 * loglikelihood - (3 * K - 1) * Math.Log((double)totalweight);  //(K*3-1)

                // std::cout << "k=" << K << ": Loglh=" << loglikelihood << ", BIC=" << BIC << std::endl;

                if (K == Kmin)
                {
                    maxBIC = bic;
                    Kopt   = Kmin;
                }
                else
                {
                    if (bic > maxBIC)
                    {
                        maxBIC = bic;
                        Kopt   = K;
                    }
                }

                BIC[K - Kmin] = bic;
            }
            return(Kopt);
        }
Example #3
        private static void KMeans(number[] x, number[] y, int Kmin, int Kmax, out int[] clusters, out number[] centers, out number[] withinss, out number[] size, out double[] BIC, Method method, DissimilarityType criterion)
        {
            // Input:
            // x -- an array of double precision numbers, not necessarily sorted
            // y -- optional per-point weights; null or a constant array means equally weighted
            // Kmin -- the minimum number of clusters expected
            // Kmax -- the maximum number of clusters expected
            // NOTE: All vectors in this program are zero-based.

            int N = x.Length;

            clusters = new int[N];
            BIC      = new double[Kmax - Kmin + 1]; // one BIC value per candidate K in [Kmin, Kmax]

            int[] order = new int[N];

            for (int i = 0; i < order.Length; ++i)
            {
                order[i] = i;
            }

            bool is_sorted = true;

            for (int i = 0; i < N - 1; ++i)
            {
                if (x[i] > x[i + 1])
                {
                    is_sorted = false;
                    break;
                }
            }

            number[] x_sorted = null;

            number[] y_sorted            = null;
            bool     is_equally_weighted = true;

            if (!is_sorted)
            {
                x_sorted = new number[x.Length];

                Array.Copy(x, x_sorted, x.Length);
                Array.Sort(x_sorted, order);

                for (int i = 0; i < x_sorted.Length; i++)
                {
                    x_sorted[i] = x[order[i]];
                }
            }
            else
            {
                x_sorted = x;
            }

            // check to see if unequal weight is provided
            if (y != null)
            {
                is_equally_weighted = true;
                for (int i = 1; i < N; ++i)
                {
                    if (y[i] != y[i - 1])
                    {
                        is_equally_weighted = false;
                        break;
                    }
                }
            }

            if (!is_equally_weighted)
            {
                y_sorted = new number[N];

                for (int i = 0; i < N; ++i)
                {
                    y_sorted[i] = y[order[i]];
                }
            }
            else
            {
                y = null;
            }

            int nUnique = 1;

            if (N == 0)
            {
                nUnique = 0;
            }

            if (N > 1)
            {
                for (int i = 1; i < N; i++)
                {
                    if (x_sorted[i - 1] != x_sorted[i])
                    {
                        nUnique++;
                    }
                }
            }

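            // There cannot be more clusters than distinct values, so cap Kmax at nUnique.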
            Kmax = nUnique < Kmax ? nUnique : Kmax;

            if (nUnique > 1)
            { // The case when not all elements are equal.
                number[][] S = new number[Kmax][];

                for (int i = 0; i < Kmax; i++)
                {
                    S[i] = new number[N];
                }

                int[][] J = new int[Kmax][];

                for (int i = 0; i < Kmax; i++)
                {
                    J[i] = new int[N];
                }

                int Kopt;

                // Fill in the dynamic programming matrices
                DynamicProgramming.FillMatrix(x_sorted, y_sorted, S, J, method, criterion);

                // Select the optimal number of clusters by BIC
                if (is_equally_weighted)
                {
                    Kopt = NonWeighted.SelectLevels(x_sorted, J, Kmin, Kmax, BIC);
                }
                else
                {
                    switch (criterion)
                    {
                    case DissimilarityType.L2Y:
                        Kopt = NonWeighted.SelectLevels(y_sorted, J, Kmin, Kmax, BIC);
                        break;

                    default:
                        Kopt = Weighted.SelectLevels(x_sorted, y_sorted, J, Kmin, Kmax, BIC);

                        break;
                    }
                }

                centers  = new number[Kopt];
                withinss = new number[Kopt];
                size     = new number[Kopt];

                if (Kopt < Kmax)
                { // Shrink the backtracking matrix J to the chosen number of clusters
                    Array.Resize(ref J, Kopt);
                }

                int[] cluster_sorted = new int[N];

                // Backtrack to find the clusters beginning and ending indices
                if (is_equally_weighted && criterion == DissimilarityType.L1)
                {
                    DynamicProgramming.BacktrackL1(x_sorted, J, cluster_sorted, centers, withinss, size);
                }
                else if (is_equally_weighted && criterion == DissimilarityType.L2)
                {
                    DynamicProgramming.Backtrack(x_sorted, J, cluster_sorted, centers, withinss, size);
                }
                else if (criterion == DissimilarityType.L2Y)
                {
                    DynamicProgramming.BacktrackL2Y(x_sorted, y_sorted, J, cluster_sorted, centers, withinss, size);
                }
                else
                {
                    DynamicProgramming.BacktrackWeighted(x_sorted, y_sorted, J, cluster_sorted, centers, withinss, size);
                }

                /*#if DEBUG
                 *              std::cout << "backtrack done." << std::endl;
                 #endif*/

                for (int i = 0; i < N; ++i)
                {
                    // Obtain clustering on data in the original order
                    clusters[order[i]] = cluster_sorted[i];
                }
            }
            else
            {
                // A single cluster that contains all elements
                for (int i = 0; i < N; ++i)
                {
                    clusters[i] = 0;
                }


                centers  = new number[1];
                withinss = new number[1];
                size     = new number[1];

                centers[0]  = x[0];
                withinss[0] = 0;
                size[0]     = N * (is_equally_weighted ? 1 : y[0]);
            }
        }
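Because this overload is private, it would normally be reached through a public wrapper on the same class; below is a sketch of such a call under the same assumptions as above (the number alias and the hypothetical Method.Linear value), using the DissimilarityType.L2 criterion that Example #3 treats as the standard squared-distance case.

        // Hypothetical call site inside the declaring class.
        number[] x = { 3.1, 0.9, 1.0, 5.2, 1.1, 5.0 };   // unsorted input is fine; a sorted copy is made internally
        int Kmin = 1, Kmax = 3;

        KMeans(x, null, Kmin, Kmax,            // null weights => equally weighted
               out int[]    clusters,          // cluster index per point, in the original input order
               out number[] centers,           // cluster means
               out number[] withinss,          // within-cluster sums of squares
               out number[] size,              // cluster sizes (total weights when weighted)
               out double[] BIC,               // BIC value per candidate K
               Method.Linear, DissimilarityType.L2);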