/// <summary>
/// Picks the number of clusters K in [Kmin, Kmax] whose Gaussian-mixture fit to
/// the (equally weighted) data has the largest Bayesian information criterion.
/// </summary>
/// <param name="x">Data values. Neighbouring elements are differenced below, so the
/// array is presumably sorted ascending (KMeans passes its sorted copy) - confirm
/// for other callers.</param>
/// <param name="J">Backtracking matrix, handed unchanged to DynamicProgramming.Backtrack.</param>
/// <param name="Kmin">Smallest number of clusters to evaluate.</param>
/// <param name="Kmax">Largest number of clusters to evaluate.</param>
/// <param name="BIC">Optional caller-owned buffer. BIC[K - Kmin] receives the score of
/// every evaluated K that fits in the buffer; it may be null when the scores are not
/// needed. The buffer is never reallocated, because the array is passed by value and a
/// reallocation would be invisible to the caller.</param>
/// <returns>The K with the highest score (the smallest such K on ties), or
/// Math.Min(Kmin, Kmax) when Kmin is not below Kmax or fewer than two values are given.
/// In that shortcut case BIC is left untouched.</returns>
public static int SelectLevels(number[] x, int[][] J, int Kmin, int Kmax, double[] BIC)
{
    int N = x.Length;

    // Nothing to choose between: a single candidate K, an inverted range, or too little data.
    if (Kmin >= Kmax || N < 2)
    {
        return(Math.Min(Kmin, Kmax));
    }

    int Kopt = Kmin;
    double maxBIC = 0;

    // Per-component mixture parameters, sized for the largest K and reused for every K.
    double[] lambda = new double[Kmax];  // mixing proportions
    number[] mu = new number[Kmax];      // component means
    number[] sigma2 = new number[Kmax];  // component variances
    double[] coeff = new double[Kmax];   // lambda / sqrt(2 * PI * sigma2)

    for (int K = Kmin; K <= Kmax; ++K)
    {
        int[] size = new int[K];

        // Backtrack the matrix to determine boundaries between the bins.
        DynamicProgramming.Backtrack(x, J, size, K);

        int indexLeft = 0;
        int indexRight;

        for (int k = 0; k < K; ++k)
        {
            // Estimate GMM parameters first.
            lambda[k] = size[k] / (double)N;
            indexRight = indexLeft + size[k] - 1;

            ShiftedDataVariance(x, indexLeft, indexRight, out mu[k], out sigma2[k]);

            if (sigma2[k] == 0 || size[k] == 1)
            {
                // Degenerate bin: derive a surrogate variance from the gap to the
                // nearest neighbouring value outside the bin.
                // NOTE(review): if one bin spans the whole input (indexLeft == 0 and
                // indexRight == N - 1) the last branch reads x[N]; KMeans only calls
                // this when the data holds more than one distinct value - confirm
                // that other callers guarantee the same.
                number dmin;

                if (indexLeft > 0 && indexRight < N - 1)
                {
                    dmin = Math.Min(x[indexLeft] - x[indexLeft - 1], x[indexRight + 1] - x[indexRight]);
                }
                else if (indexLeft > 0)
                {
                    dmin = x[indexLeft] - x[indexLeft - 1];
                }
                else
                {
                    dmin = x[indexRight + 1] - x[indexRight];
                }

                if (sigma2[k] == 0)
                {
                    // dmin^2 / 36, i.e. a standard deviation of dmin / 6.
                    sigma2[k] = dmin * dmin / ((number)4) / ((number)9);
                }

                if (size[k] == 1)
                {
                    // A singleton bin overrides the value above with the full squared gap.
                    sigma2[k] = dmin * dmin;
                }
            }

            coeff[k] = lambda[k] / Math.Sqrt((double)(2 * PI * sigma2[k]));
            indexLeft = indexRight + 1;
        }

        // Log-likelihood of the data under the K-component mixture.
        double loglikelihood = 0;

        for (int i = 0; i < N; ++i)
        {
            double L = 0;

            for (int k = 0; k < K; ++k)
            {
                L += coeff[k] * Math.Exp((double)(-(x[i] - mu[k]) * (x[i] - mu[k]) / (2 * sigma2[k])));
            }

            loglikelihood += Math.Log(L);
        }

        // Compute the Bayesian information criterion; the penalty uses 3K - 1 parameters
        // (presumably K means, K variances and K - 1 free mixing proportions).
        double bic = 2d * loglikelihood - (3d * K - 1d) * Math.Log(N);

        // Report the score to the caller whenever the buffer has room for it.
        int slot = K - Kmin;
        if (BIC != null && slot < BIC.Length)
        {
            BIC[slot] = bic;
        }

        // The first candidate seeds the maximum; later ones must strictly beat it.
        if (K == Kmin || bic > maxBIC)
        {
            maxBIC = bic;
            Kopt = K;
        }
    }

    return(Kopt);
}
/// <summary>
/// Weighted variant: picks the number of clusters K in [Kmin, Kmax] whose
/// Gaussian-mixture fit to the weighted data has the largest Bayesian
/// information criterion.
/// </summary>
/// <param name="x">Data values. Neighbouring elements are differenced below, so the
/// array is presumably sorted ascending (KMeans passes its sorted copy) - confirm
/// for other callers.</param>
/// <param name="y">Per-element weights, aligned with <paramref name="x"/>; each element's
/// log-likelihood contribution is multiplied by its weight.</param>
/// <param name="J">Backtracking matrix, handed unchanged to DynamicProgramming.BacktrackWeighted.</param>
/// <param name="Kmin">Smallest number of clusters to evaluate.</param>
/// <param name="Kmax">Largest number of clusters to evaluate.</param>
/// <param name="BIC">Optional caller-owned buffer. BIC[K - Kmin] receives the score of
/// every evaluated K that fits in the buffer; it may be null when the scores are not
/// needed. The buffer is never reallocated, because the array is passed by value and a
/// reallocation would be invisible to the caller.</param>
/// <returns>The K with the highest score (the smallest such K on ties), or
/// Math.Min(Kmin, Kmax) when Kmin exceeds Kmax or fewer than two values are given.</returns>
public static int SelectLevels(number[] x, number[] y, int[][] J, int Kmin, int Kmax, double[] BIC)
{
    int N = x.Length;

    // Inverted range or too little data: nothing to evaluate.
    // (Kmin == Kmax is still evaluated so that its score is reported.)
    if (Kmin > Kmax || N < 2)
    {
        return(Math.Min(Kmin, Kmax));
    }

    int Kopt = Kmin;
    double maxBIC = 0;

    // Per-component mixture parameters, sized for the largest K and reused for every K.
    number[] lambda = new number[Kmax];   // mixing proportions
    number[] mu = new number[Kmax];       // component means
    number[] sigma2 = new number[Kmax];   // component variances
    double[] coeff = new double[Kmax];    // lambda / sqrt(2 * PI * sigma2)
    int[] counts = new int[Kmax];         // elements per bin
    number[] weights = new number[Kmax];  // total weight per bin

    for (int K = Kmin; K <= Kmax; ++K)
    {
        // Backtrack the matrix to determine boundaries between the bins.
        DynamicProgramming.BacktrackWeighted(x, y, J, counts, weights, K);

        number totalweight = 0;
        for (int k = 0; k < K; k++)
        {
            totalweight += weights[k];
        }

        int indexLeft = 0;
        int indexRight;

        for (int k = 0; k < K; ++k)
        {
            // Estimate GMM parameters first.
            lambda[k] = weights[k] / totalweight;
            indexRight = indexLeft + counts[k] - 1;

            ShiftedDataVariance(x, y, weights[k], indexLeft, indexRight, out mu[k], out sigma2[k]);

            if (sigma2[k] == 0 || counts[k] == 1)
            {
                // Degenerate bin: derive a surrogate variance from the gap to the
                // nearest neighbouring value outside the bin.
                // NOTE(review): if one bin spans the whole input (indexLeft == 0 and
                // indexRight == N - 1) the last branch reads x[N]; KMeans only calls
                // this when the data holds more than one distinct value - confirm
                // that other callers guarantee the same.
                number dmin;

                if (indexLeft > 0 && indexRight < N - 1)
                {
                    dmin = Math.Min(x[indexLeft] - x[indexLeft - 1], x[indexRight + 1] - x[indexRight]);
                }
                else if (indexLeft > 0)
                {
                    dmin = x[indexLeft] - x[indexLeft - 1];
                }
                else
                {
                    dmin = x[indexRight + 1] - x[indexRight];
                }

                if (sigma2[k] == 0)
                {
                    // dmin^2 / 36, i.e. a standard deviation of dmin / 6.
                    sigma2[k] = (dmin * dmin) / (number)4 / (number)9;
                }

                if (counts[k] == 1)
                {
                    // A singleton bin overrides the value above with the full squared gap.
                    sigma2[k] = (dmin * dmin);
                }
            }

            coeff[k] = (double)lambda[k] / Math.Sqrt((double)(2 * PI * sigma2[k]));
            indexLeft = indexRight + 1;
        }

        // Weighted log-likelihood of the data under the K-component mixture.
        double loglikelihood = 0;

        for (int i = 0; i < N; ++i)
        {
            double L = 0;

            for (int k = 0; k < K; ++k)
            {
                L += (coeff[k]) * Math.Exp((double)(-(x[i] - mu[k]) * (x[i] - mu[k]) / (2 * sigma2[k])));
            }

            loglikelihood += ((double)y[i]) * Math.Log(L);
        }

        // Compute the Bayesian information criterion; the total weight plays the role
        // of the sample size and the penalty uses 3K - 1 parameters (presumably K means,
        // K variances and K - 1 free mixing proportions).
        double bic = 2 * loglikelihood - (3 * K - 1) * Math.Log((double)totalweight);

        // The first candidate seeds the maximum; later ones must strictly beat it.
        if (K == Kmin || bic > maxBIC)
        {
            maxBIC = bic;
            Kopt = K;
        }

        // Report the score to the caller whenever the buffer has room for it.
        int slot = K - Kmin;
        if (BIC != null && slot < BIC.Length)
        {
            BIC[slot] = bic;
        }
    }

    return(Kopt);
}
/// <summary>
/// Clusters one-dimensional data into between Kmin and Kmax groups using the
/// dynamic-programming matrices built by DynamicProgramming.FillMatrix, choosing
/// the number of groups with SelectLevels.
/// </summary>
/// <param name="x">Data values, not necessarily sorted. All arrays are 0-based.</param>
/// <param name="y">Optional per-element weights aligned with <paramref name="x"/>. Null, or
/// all entries equal, selects the equally weighted code path.</param>
/// <param name="Kmin">The minimum number of clusters expected.</param>
/// <param name="Kmax">The maximum number of clusters expected; internally capped at the
/// number of distinct values in <paramref name="x"/>.</param>
/// <param name="clusters">Receives, for every input element in its original position,
/// the index of the cluster it was assigned to.</param>
/// <param name="centers">Receives one centre per selected cluster.</param>
/// <param name="withinss">Receives one within-cluster dissimilarity total per selected cluster.</param>
/// <param name="size">Receives one size per selected cluster.</param>
/// <param name="BIC">Receives Kmax - Kmin + 1 entries (none when Kmax is below Kmin), where
/// entry K - Kmin is the score SelectLevels computed for K clusters. Entries for K values
/// that were not evaluated (above the number of distinct values, or when SelectLevels
/// short-circuits) stay 0.</param>
/// <param name="method">Passed through to DynamicProgramming.FillMatrix.</param>
/// <param name="criterion">Dissimilarity criterion; selects the FillMatrix behaviour, the
/// SelectLevels variant and the backtracking routine.</param>
private static void KMeans(number[] x, number[] y, int Kmin, int Kmax, out int[] clusters, out number[] centers, out number[] withinss, out number[] size, out double[] BIC, Method method, DissimilarityType criterion)
{
    int N = x.Length;

    clusters = new int[N];

    // One slot per requested K in [Kmin, Kmax]; never negative so an inverted range
    // yields an empty array instead of an allocation failure.
    BIC = new double[Math.Max(0, Kmax - Kmin + 1)];

    // Empty input: there is nothing to cluster and no x[0] to use as a centre.
    if (N == 0)
    {
        centers = new number[0];
        withinss = new number[0];
        size = new number[0];
        return;
    }

    // order[i] is the original position of the i-th smallest value.
    int[] order = new int[N];
    for (int i = 0; i < order.Length; ++i)
    {
        order[i] = i;
    }

    bool is_sorted = true;
    for (int i = 0; i < N - 1; ++i)
    {
        if (x[i] > x[i + 1])
        {
            is_sorted = false;
            break;
        }
    }

    number[] x_sorted;
    if (!is_sorted)
    {
        // Sort a copy so the caller's array is left untouched; Array.Sort permutes
        // 'order' alongside the keys, so x_sorted[i] == x[order[i]] afterwards.
        x_sorted = new number[N];
        Array.Copy(x, x_sorted, N);
        Array.Sort(x_sorted, order);
    }
    else
    {
        x_sorted = x;
    }

    // Check to see if unequal weight is provided.
    bool is_equally_weighted = true;
    if (y != null)
    {
        for (int i = 1; i < N; ++i)
        {
            if (y[i] != y[i - 1])
            {
                is_equally_weighted = false;
                break;
            }
        }
    }

    // Weights follow their values into sorted order; stays null when weights are equal.
    number[] y_sorted = null;
    if (!is_equally_weighted)
    {
        y_sorted = new number[N];
        for (int i = 0; i < N; ++i)
        {
            y_sorted[i] = y[order[i]];
        }
    }

    // Count distinct values; more clusters than that cannot be formed.
    int nUnique = 1;
    for (int i = 1; i < N; i++)
    {
        if (x_sorted[i - 1] != x_sorted[i])
        {
            nUnique++;
        }
    }

    Kmax = nUnique < Kmax ? nUnique : Kmax;

    if (nUnique > 1)
    {
        // The case when not all elements are equal.
        number[][] S = new number[Kmax][];
        int[][] J = new int[Kmax][];
        for (int i = 0; i < Kmax; i++)
        {
            S[i] = new number[N];
            J[i] = new int[N];
        }

        // Fill in dynamic programming matrix.
        DynamicProgramming.FillMatrix(x_sorted, y_sorted, S, J, method, criterion);

        // SelectLevels is given a buffer sized exactly for the capped range so that it
        // writes every score in place; the scores are copied to the caller's array below.
        double[] bicScratch = new double[Math.Max(0, Kmax - Kmin + 1)];

        int Kopt;
        if (is_equally_weighted)
        {
            Kopt = NonWeighted.SelectLevels(x_sorted, J, Kmin, Kmax, bicScratch);
        }
        else
        {
            switch (criterion)
            {
            case DissimilarityType.L2Y:
                Kopt = NonWeighted.SelectLevels(y_sorted, J, Kmin, Kmax, bicScratch);
                break;

            default:
                Kopt = Weighted.SelectLevels(x_sorted, y_sorted, J, Kmin, Kmax, bicScratch);
                break;
            }
        }

        // The capped range is never longer than the requested one, so this always fits.
        Array.Copy(bicScratch, BIC, bicScratch.Length);

        centers = new number[Kopt];
        withinss = new number[Kopt];
        size = new number[Kopt];

        if (Kopt < Kmax)
        {
            // Reform the dynamic programming matrix J to the selected number of rows.
            Array.Resize(ref J, Kopt);
        }

        int[] cluster_sorted = new int[N];

        // Backtrack to find the clusters beginning and ending indices.
        if (is_equally_weighted && criterion == DissimilarityType.L1)
        {
            DynamicProgramming.BacktrackL1(x_sorted, J, cluster_sorted, centers, withinss, size);
        }
        else if (is_equally_weighted && criterion == DissimilarityType.L2)
        {
            DynamicProgramming.Backtrack(x_sorted, J, cluster_sorted, centers, withinss, size);
        }
        else if (criterion == DissimilarityType.L2Y)
        {
            DynamicProgramming.BacktrackL2Y(x_sorted, y_sorted, J, cluster_sorted, centers, withinss, size);
        }
        else
        {
            DynamicProgramming.BacktrackWeighted(x_sorted, y_sorted, J, cluster_sorted, centers, withinss, size);
        }

        // Obtain clustering on data in the original order.
        for (int i = 0; i < N; ++i)
        {
            clusters[order[i]] = cluster_sorted[i];
        }
    }
    else
    {
        // A single cluster that contains all elements (clusters is already all zeros).
        centers = new number[1];
        withinss = new number[1];
        size = new number[1];

        centers[0] = x[0];
        withinss[0] = 0;

        if (is_equally_weighted)
        {
            size[0] = N;
        }
        else
        {
            // With unequal weights the cluster size is the total weight, not N times
            // whichever weight happens to come first.
            number totalWeight = 0;
            for (int i = 0; i < N; ++i)
            {
                totalWeight += y[i];
            }
            size[0] = totalWeight;
        }
    }
}