/// <summary>
/// Fills row <paramref name="q"/> of the dynamic programming matrices for columns
/// <paramref name="imin"/> through <paramref name="imax"/> by trying every admissible
/// start index of the last cluster.
/// </summary>
/// <param name="imin">First column (index of the last point) to fill.</param>
/// <param name="imax">Last column to fill, inclusive.</param>
/// <param name="q">Row to fill; row q - 1 is read and must already be filled.</param>
/// <param name="S">Cost matrix; S[q][i] receives the smallest cost found.</param>
/// <param name="J">Backtrack matrix; J[q][i] receives the start index that achieved that cost.</param>
/// <param name="sum_x">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_x_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="criterion">Dissimilarity measure forwarded to WithinCluster.Dissimilarity.</param>
public static void Quadratic(
    int imin, int imax, int q,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    // Assumption: each cluster must have at least one point.
    int last = imin;
    while (last <= imax)
    {
        // Baseline candidate: the last cluster consists of point `last` alone.
        S[q][last] = S[q - 1][last - 1];
        J[q][last] = last;

        // A start index can be neither below q nor below the start recorded
        // for the same column in the previous row.
        int lowestStart = Math.Max(q, (int)J[q - 1][last]);

        int start = last - 1;
        while (start >= lowestStart)
        {
            number candidate = S[q - 1][start - 1]
                + WithinCluster.Dissimilarity(criterion, start, last, sum_x, sum_x_sq, sum_w, sum_w_sq);

            if (candidate < S[q][last])
            {
                S[q][last] = candidate;
                J[q][last] = start;
            }

            --start;
        }

        ++last;
    }
}
/// <summary>
/// Entry point of the SMAWK-based row fill: builds the full candidate list
/// q, q + 1, ..., <paramref name="imax"/> and hands it to the recursive SMAWK overload
/// with a row step of 1.
/// </summary>
/// <param name="imin">First column to fill.</param>
/// <param name="imax">Last column to fill, inclusive; also the largest candidate start index.</param>
/// <param name="q">Row to fill; also the smallest candidate start index.</param>
/// <param name="S">Cost matrix, forwarded to the recursive overload.</param>
/// <param name="J">Backtrack matrix, forwarded to the recursive overload.</param>
/// <param name="sum_x">Forwarded unchanged.</param>
/// <param name="sum_x_sq">Forwarded unchanged.</param>
/// <param name="sum_w">Forwarded unchanged.</param>
/// <param name="sum_w_sq">Forwarded unchanged.</param>
/// <param name="criterion">Dissimilarity measure, forwarded unchanged.</param>
public static void SMAWK(
    int imin, int imax, int q,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    int candidateCount = imax - q + 1;
    int[] candidates = new int[candidateCount];

    // Consecutive start indices beginning at q.
    for (int offset = 0; offset < candidates.Length; offset++)
    {
        candidates[offset] = q + offset;
    }

    SMAWK(imin, imax, 1, q, candidates, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
}
/// <summary>
/// For every column from <paramref name="imin"/> to <paramref name="imax"/> in steps of
/// <paramref name="istep"/>, scans the candidate start indices in <paramref name="js"/>
/// and stores the smallest cost in S[q][i] and the matching start index in J[q][i].
/// </summary>
/// <param name="imin">First column to process.</param>
/// <param name="imax">Last column to process, inclusive.</param>
/// <param name="istep">Distance between processed columns.</param>
/// <param name="q">Row being filled; row q - 1 is read.</param>
/// <param name="js">Candidate start indices, scanned from the position of the most recent improvement.</param>
/// <param name="S">Cost matrix (row q is written).</param>
/// <param name="J">Backtrack matrix (row q is written, row q - 1 is read).</param>
/// <param name="sum_x">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_x_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="criterion">Dissimilarity measure.</param>
private static void FindMinFromCandidates(
    int imin, int imax, int istep, int q, int[] js,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    // Position in js of the most recent improvement; carried over between columns.
    int carriedPos = 0;

    for (int col = imin; col <= imax; col += istep)
    {
        int firstPos = carriedPos;
        int firstStart = js[firstPos];

        // Initialization of S[q][col] and J[q][col] from the first candidate.
        S[q][col] = S[q - 1][firstStart - 1]
            + WithinCluster.Dissimilarity(criterion, firstStart, col, sum_x, sum_x_sq, sum_w, sum_w_sq);
        J[q][col] = firstStart;

        for (int pos = firstPos + 1; pos < js.Length; pos++)
        {
            int start = js[pos];

            // Skip candidates below the start recorded for this column in the previous row.
            if (start < J[q - 1][col])
            {
                continue;
            }

            // A cluster cannot start after its last point.
            if (start > col)
            {
                break;
            }

            number total = S[q - 1][start - 1]
                + WithinCluster.Dissimilarity(criterion, start, col, sum_x, sum_x_sq, sum_w, sum_w_sq);

            // Ties are resolved in favour of the later candidate.
            if (total <= S[q][col])
            {
                S[q][col] = total;
                J[q][col] = js[pos];
                carriedPos = pos;
            }
        }
    }
}
/// <summary>
/// Dispatches to the within-cluster cost routine selected by <paramref name="disType"/>
/// for the index range <paramref name="j"/>..<paramref name="i"/>.
/// </summary>
/// <param name="disType">Which cost routine to use.</param>
/// <param name="j">First index of the range, forwarded to the selected routine.</param>
/// <param name="i">Last index of the range, forwarded to the selected routine.</param>
/// <param name="sum_x">Used by the L1 and L2 cases.</param>
/// <param name="sum_x_sq">Used by the L2 case.</param>
/// <param name="sum_w">Used by every case; for L2Y it is passed where L2 passes sum_x.</param>
/// <param name="sum_w_sq">Used by the L2Y case only.</param>
/// <returns>The value of the selected routine, or 0 when no case matches.</returns>
public static number Dissimilarity(DissimilarityType disType, int j, int i, number[] sum_x, number[] sum_x_sq, number[] sum_w, number[] sum_w_sq = null)
{
    switch (disType)
    {
        case DissimilarityType.L1:
            return SABS(j, i, sum_x, sum_w);

        case DissimilarityType.L2:
            return SSQ(j, i, sum_x, sum_x_sq, sum_w);

        case DissimilarityType.L2Y:
            // Same routine as L2, applied to the weight sums instead of the x sums.
            return SSQ(j, i, sum_w, sum_w_sq);

        default:
            return 0;
    }
}
/// <summary>
/// Fills the dynamic programming cost matrix <paramref name="S"/> and the backtrack
/// matrix <paramref name="J"/> row by row, using the row-fill routine selected by
/// <paramref name="method"/>.
/// </summary>
/// <remarks>
/// S[0][i] is the dissimilarity of the single cluster 0..i. Rows q = 1 .. K - 1 are built
/// from the previous row by the selected routine; for the last row only column N - 1 is
/// computed. All indices start at 0.
/// </remarks>
/// <param name="x">One-dimensional values to cluster; the original notes require them to be sorted.</param>
/// <param name="w">Optional weights; null or empty means equally weighted.</param>
/// <param name="S">K x N cost matrix to fill (K = S.Length, N = S[0].Length).</param>
/// <param name="J">K x N backtrack matrix to fill.</param>
/// <param name="method">Row-fill routine: Linear (SMAWK), LogLinear or Quadratic.</param>
/// <param name="criterion">Dissimilarity measure forwarded to the row-fill routines.</param>
/// <exception cref="Exception">Thrown for an unrecognised method once a row beyond the first has to be filled.</exception>
public static void FillMatrix(number[] x, number[] w, number[][] S, int[][] J, Method method, DissimilarityType criterion)
{
    int K = S.Length;
    int N = S[0].Length;

    // Loop-invariant: an absent or empty weight vector means "equally weighted".
    bool weighted = w != null && w.Length != 0;

    // Prefix sums over the shifted values; the weight sums exist only in the weighted case.
    number[] sum_x = new number[N];
    number[] sum_x_sq = new number[N];
    number[] sum_w = null;
    number[] sum_w_sq = null;

    // Median of the (sorted) input; used to shift the values of x to improve numerical stability.
    number shift = x[N / 2];

    if (weighted)
    {
        sum_x[0] = w[0] * (x[0] - shift);
        sum_x_sq[0] = w[0] * (x[0] - shift) * (x[0] - shift);
        sum_w = new number[N];
        sum_w_sq = new number[N];
        sum_w[0] = w[0];
        sum_w_sq[0] = w[0] * w[0];
    }
    else
    {
        sum_x[0] = x[0] - shift;
        sum_x_sq[0] = (x[0] - shift) * (x[0] - shift);
    }

    S[0][0] = 0;
    J[0][0] = 0;

    for (int i = 1; i < N; ++i)
    {
        if (weighted)
        {
            sum_x[i] = sum_x[i - 1] + w[i] * (x[i] - shift);
            sum_x_sq[i] = sum_x_sq[i - 1] + w[i] * (x[i] - shift) * (x[i] - shift);
            sum_w[i] = sum_w[i - 1] + w[i];
            sum_w_sq[i] = sum_w_sq[i - 1] + w[i] * w[i];
        }
        else
        {
            // Expression order kept exactly as before so floating-point results do not change.
            sum_x[i] = sum_x[i - 1] + x[i] - shift;
            sum_x_sq[i] = sum_x_sq[i - 1] + (x[i] - shift) * (x[i] - shift);
        }

        // Initialize for q = 0: one cluster covering points 0..i.
        S[0][i] = WithinCluster.Dissimilarity(criterion, 0, i, sum_x, sum_x_sq, sum_w, sum_w_sq);
        J[0][i] = 0;
    }

    for (int q = 1; q < K; ++q)
    {
        int imin;
        if (q < K - 1)
        {
            imin = Math.Max(1, q);
        }
        else
        {
            // No need to compute S[K-1][0] ... S[K-1][N-2]
            imin = N - 1;
        }

        if (method == Method.Linear)
        {
            Fill.SMAWK(imin, N - 1, q, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
        }
        else if (method == Method.LogLinear)
        {
            Fill.LogLinear(imin, N - 1, q, q, N - 1, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
        }
        else if (method == Method.Quadratic)
        {
            Fill.Quadratic(imin, N - 1, q, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
        }
        else
        {
            throw new Exception("ERROR: unknown method " + method + "!");
        }
    }
}
/// <summary>
/// Clusters the one-dimensional values in <paramref name="x"/> (optionally weighted by
/// <paramref name="y"/>) by sorting them, filling the dynamic programming matrices,
/// selecting the number of clusters and backtracking the cluster borders.
/// </summary>
/// <remarks>
/// All vectors are indexed from position 0. Empty input yields empty
/// <paramref name="clusters"/>, <paramref name="centers"/>, <paramref name="withinss"/>
/// and <paramref name="size"/> arrays instead of an index-out-of-range failure.
/// </remarks>
/// <param name="x">Values to cluster, not necessarily sorted.</param>
/// <param name="y">Optional weights; null or all-equal values are treated as equally weighted.</param>
/// <param name="Kmin">The minimum number of clusters expected.</param>
/// <param name="Kmax">The maximum number of clusters expected; capped at the number of distinct adjacent values.</param>
/// <param name="clusters">Cluster index of every point, in the original order of <paramref name="x"/>.</param>
/// <param name="centers">One entry per cluster, filled by the backtracking routine.</param>
/// <param name="withinss">One entry per cluster, filled by the backtracking routine.</param>
/// <param name="size">One entry per cluster, filled by the backtracking routine.</param>
/// <param name="BIC">Array handed to the level-selection routine.</param>
/// <param name="method">Row-fill routine forwarded to DynamicProgramming.FillMatrix.</param>
/// <param name="criterion">Dissimilarity measure; also selects the backtracking routine.</param>
private static void KMeans(number[] x, number[] y, int Kmin, int Kmax, out int[] clusters, out number[] centers, out number[] withinss, out number[] size, out double[] BIC, Method method, DissimilarityType criterion)
{
    int N = x.Length;

    clusters = new int[N];

    // NOTE(review): if SelectLevels writes one entry for every K in Kmin..Kmax inclusive,
    // this needs Kmax - Kmin + 1 slots. SelectLevels is not visible here, so the length
    // is left as it was - confirm against its implementation.
    BIC = new double[Kmax - Kmin];

    if (N == 0)
    {
        // Nothing to cluster. Previously this fell into the single-cluster branch
        // below and failed on x[0].
        centers = new number[0];
        withinss = new number[0];
        size = new number[0];
        return;
    }

    // order[i] is the original position of the i-th smallest value.
    int[] order = new int[N];
    for (int i = 0; i < order.Length; ++i)
    {
        order[i] = i;
    }

    bool is_sorted = true;
    for (int i = 0; i < N - 1; ++i)
    {
        if (x[i] > x[i + 1])
        {
            is_sorted = false;
            break;
        }
    }

    number[] x_sorted;
    if (is_sorted)
    {
        x_sorted = x;
    }
    else
    {
        x_sorted = new number[N];
        Array.Copy(x, x_sorted, N);
        // Sorts the keys and applies the same permutation to `order`, so afterwards
        // x_sorted[i] == x[order[i]] and no separate gather pass is needed.
        Array.Sort(x_sorted, order);
    }

    // Check to see if unequal weight is provided.
    bool is_equally_weighted = true;
    if (y != null)
    {
        for (int i = 1; i < N; ++i)
        {
            if (y[i] != y[i - 1])
            {
                is_equally_weighted = false;
                break;
            }
        }
    }

    number[] y_sorted = null;
    if (!is_equally_weighted)
    {
        y_sorted = new number[N];
        for (int i = 0; i < N; ++i)
        {
            y_sorted[i] = y[order[i]];
        }
    }

    // Count the runs of equal adjacent values in the sorted input.
    int nUnique = 1;
    for (int i = 1; i < N; i++)
    {
        if (x_sorted[i - 1] != x_sorted[i])
        {
            nUnique++;
        }
    }

    Kmax = nUnique < Kmax ? nUnique : Kmax;

    if (nUnique > 1)
    {
        // The case when not all elements are equal.
        number[][] S = new number[Kmax][];
        int[][] J = new int[Kmax][];
        for (int i = 0; i < Kmax; i++)
        {
            S[i] = new number[N];
            J[i] = new int[N];
        }

        // Fill in dynamic programming matrix.
        DynamicProgramming.FillMatrix(x_sorted, y_sorted, S, J, method, criterion);

        int Kopt;
        if (is_equally_weighted)
        {
            Kopt = NonWeighted.SelectLevels(x_sorted, J, Kmin, Kmax, BIC);
        }
        else
        {
            switch (criterion)
            {
                case DissimilarityType.L2Y:
                    Kopt = NonWeighted.SelectLevels(y_sorted, J, Kmin, Kmax, BIC);
                    break;
                default:
                    Kopt = Weighted.SelectLevels(x_sorted, y_sorted, J, Kmin, Kmax, BIC);
                    break;
            }
        }

        centers = new number[Kopt];
        withinss = new number[Kopt];
        size = new number[Kopt];

        if (Kopt < Kmax)
        {
            // Reform the dynamic programming matrix J: keep only the first Kopt rows.
            Array.Resize(ref J, Kopt);
        }

        int[] cluster_sorted = new int[N];

        // Backtrack to find the clusters beginning and ending indices.
        if (is_equally_weighted && criterion == DissimilarityType.L1)
        {
            DynamicProgramming.BacktrackL1(x_sorted, J, cluster_sorted, centers, withinss, size);
        }
        else if (is_equally_weighted && criterion == DissimilarityType.L2)
        {
            DynamicProgramming.Backtrack(x_sorted, J, cluster_sorted, centers, withinss, size);
        }
        else if (criterion == DissimilarityType.L2Y)
        {
            DynamicProgramming.BacktrackL2Y(x_sorted, y_sorted, J, cluster_sorted, centers, withinss, size);
        }
        else
        {
            DynamicProgramming.BacktrackWeighted(x_sorted, y_sorted, J, cluster_sorted, centers, withinss, size);
        }

        for (int i = 0; i < N; ++i)
        {
            // Obtain clustering on data in the original order.
            clusters[order[i]] = cluster_sorted[i];
        }
    }
    else
    {
        // A single cluster that contains all elements.
        for (int i = 0; i < N; ++i)
        {
            clusters[i] = 0;
        }

        centers = new number[1];
        withinss = new number[1];
        size = new number[1];

        centers[0] = x[0];
        withinss[0] = 0;
        size[0] = N * (is_equally_weighted ? 1 : y[0]);
    }
}
/// <summary>
/// Fills row <paramref name="q"/> for columns <paramref name="imin"/> through
/// <paramref name="imax"/> by divide and conquer: the middle column is solved first and
/// its start index then bounds the search range of the two recursive halves.
/// </summary>
/// <remarks>
/// Every cluster cost is obtained through WithinCluster.Dissimilarity with
/// <paramref name="criterion"/>, the same routine the other row-fill methods use.
/// </remarks>
/// <param name="imin">First column to fill.</param>
/// <param name="imax">Last column to fill, inclusive.</param>
/// <param name="q">Row being filled; row q - 1 is read.</param>
/// <param name="jmin">Lower bound for the start index; applied only when imin is greater than q.</param>
/// <param name="jmax">Upper bound for the start index; applied only when imax is below the last column.</param>
/// <param name="S">Cost matrix (row q is written).</param>
/// <param name="J">Backtrack matrix (row q is written).</param>
/// <param name="sum_x">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_x_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="criterion">Dissimilarity measure.</param>
public static void LogLinear(
    int imin, int imax, int q, int jmin, int jmax,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    if (imin > imax)
    {
        return;
    }

    int N = S[0].Length;
    int i = (imin + imax) / 2;

    // Initialization of S[q][i]: the last cluster consists of point i alone.
    S[q][i] = S[q - 1][i - 1];
    J[q][i] = i;

    int jlow = q; // the lower end for j
    if (imin > q)
    {
        jlow = Math.Max(jlow, jmin);
    }
    jlow = Math.Max(jlow, J[q - 1][i]);

    int jhigh = i - 1; // the upper end for j
    if (imax < N - 1)
    {
        jhigh = Math.Min(jhigh, jmax);
    }

    for (int j = jhigh; j >= jlow; --j)
    {
        // Compute s(j, i) with the requested criterion. This used to be a hard-coded
        // SSQ call, which mixed two cost functions for any criterion other than L2.
        number sji = WithinCluster.Dissimilarity(criterion, j, i, sum_x, sum_x_sq, sum_w, sum_w_sq);

        // MS May 11, 2016 Added: stop once s(j, i) plus the previous-row cost at jlow
        // can no longer improve on the current best. (Presumably relies on s(j, i)
        // growing as j decreases - not verifiable from this method alone.)
        if (sji + S[q - 1][jlow - 1] >= S[q][i])
        {
            break;
        }

        // Examine the lower bound of the cluster border: compute s(jlow, i).
        number sjlowi = WithinCluster.Dissimilarity(criterion, jlow, i, sum_x, sum_x_sq, sum_w, sum_w_sq);
        number SSQ_jlow = sjlowi + S[q - 1][jlow - 1];

        if (SSQ_jlow < S[q][i])
        {
            // Shrink the lower bound.
            S[q][i] = SSQ_jlow;
            J[q][i] = jlow;
        }
        jlow++;

        number SSQ_j = sji + S[q - 1][j - 1];
        if (SSQ_j < S[q][i])
        {
            S[q][i] = SSQ_j;
            J[q][i] = j;
        }
    }

    // Left half: start indices lie between the one of column imin - 1 and the one of column i.
    jmin = (imin > q) ? (int)J[q][imin - 1] : q;
    jmax = (int)J[q][i];
    LogLinear(imin, i - 1, q, jmin, jmax, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);

    // Right half: start indices lie between the one of column i and the one of column imax + 1.
    jmin = (int)J[q][i];
    jmax = (imax < N - 1) ? (int)J[q][imax + 1] : imax;
    LogLinear(i + 1, imax, q, jmin, jmax, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
}
/// <summary>
/// Recursive SMAWK step for row <paramref name="q"/>: reduces the candidate list,
/// recurses on every second position (the odd ones, 0-based) with a doubled step, then
/// fills the remaining even positions.
/// </summary>
/// <param name="imin">First column handled at this recursion level.</param>
/// <param name="imax">Last column handled at this recursion level, inclusive.</param>
/// <param name="istep">Distance between the columns handled at this level.</param>
/// <param name="q">Row being filled.</param>
/// <param name="js">Candidate start indices for this level.</param>
/// <param name="S">Cost matrix, forwarded to the helpers.</param>
/// <param name="J">Backtrack matrix, forwarded to the helpers.</param>
/// <param name="sum_x">Forwarded unchanged to the helpers.</param>
/// <param name="sum_x_sq">Forwarded unchanged to the helpers.</param>
/// <param name="sum_w">Forwarded unchanged to the helpers.</param>
/// <param name="sum_w_sq">Forwarded unchanged to the helpers.</param>
/// <param name="criterion">Dissimilarity measure, forwarded unchanged.</param>
private static void SMAWK(
    int imin, int imax, int istep, int q, int[] js,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    if (imax - imin <= 0)
    {
        // Base case: only one element left.
        FindMinFromCandidates(imin, imax, istep, q, js, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
        return;
    }

    // REDUCE
    int[] js_odd;
    ReduceInPlace(imin, imax, istep, q, js, out js_odd, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);

    int istepx2 = istep << 1;
    int imin_odd = imin + istep;
    // Largest position of the form imin_odd + k * istepx2 that does not exceed imax.
    int imax_odd = imin_odd + (imax - imin_odd) / istepx2 * istepx2;

    // Recursion on odd rows (0-based):
    SMAWK(imin_odd, imax_odd, istepx2, q, js_odd, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);

    FillEvenPositions(imin, imax, istep, q, js, S, J, sum_x, sum_x_sq, sum_w, sum_w_sq, criterion);
}
/// <summary>
/// Derives S[q][i] and J[q][i] for the even positions (0-based) imin, imin + 2 * istep, ...
/// by scanning <paramref name="js"/> between the start index used for the previous even
/// position's upper neighbour and the start index already stored for position i + istep.
/// </summary>
/// <param name="imin">First even position.</param>
/// <param name="imax">Last position handled at this level, inclusive.</param>
/// <param name="istep">Distance between neighbouring positions; even positions are 2 * istep apart.</param>
/// <param name="q">Row being filled; row q - 1 is read.</param>
/// <param name="js">Candidate start indices; scanned left to right with a cursor shared across positions.</param>
/// <param name="S">Cost matrix (row q is written).</param>
/// <param name="J">Backtrack matrix (row q is written; entries at i + istep are read).</param>
/// <param name="sum_x">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_x_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="criterion">Dissimilarity measure.</param>
private static void FillEvenPositions(
    int imin, int imax, int istep, int q, int[] js,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    int candidateCount = js.Length;
    int doubleStep = istep << 1;

    // Smallest start index still admissible for the current position.
    int lowerStart = js[0];

    // Cursor into js; it carries over from one even position to the next.
    int pos = 0;

    for (int col = imin; col <= imax; col += doubleStep)
    {
        // Advance the cursor until it points to a value of at least lowerStart.
        while (js[pos] < lowerStart)
        {
            pos++;
        }

        // Initialize S[q][col] and J[q][col] from the first admissible candidate.
        int firstStart = js[pos];
        S[q][col] = S[q - 1][firstStart - 1]
            + WithinCluster.Dissimilarity(criterion, firstStart, col, sum_x, sum_x_sq, sum_w, sum_w_sq);
        J[q][col] = firstStart;

        // Upper end of the search: the start stored for the next position, or the last candidate.
        int upperStart;
        if (col + istep <= imax)
        {
            upperStart = J[q][col + istep];
        }
        else
        {
            upperStart = js[candidateCount - 1];
        }
        int lastStart = Math.Min(upperStart, col);

        // Cost of the cluster lastStart..col, used by the early-exit test below.
        number tailCost = WithinCluster.Dissimilarity(criterion, lastStart, col, sum_x, sum_x_sq, sum_w, sum_w_sq);

        for (++pos; pos < candidateCount && js[pos] <= lastStart; pos++)
        {
            int start = js[pos];

            if (start > col)
            {
                break;
            }

            if (start < J[q - 1][col])
            {
                continue;
            }

            number clusterCost = WithinCluster.Dissimilarity(criterion, start, col, sum_x, sum_x_sq, sum_w, sum_w_sq);
            number total = S[q - 1][start - 1] + clusterCost;

            if (total <= S[q][col])
            {
                S[q][col] = total;
                J[q][col] = js[pos];
            }
            else if (S[q - 1][start - 1] + tailCost > S[q][col])
            {
                break;
            }
        }

        // Step back one candidate so the next position re-examines it.
        pos--;
        lowerStart = upperStart;
    }
}
// SMAWK
/// <summary>
/// REDUCE step of SMAWK: produces in <paramref name="js_red"/> a candidate list with at
/// most as many entries as there are positions imin, imin + istep, ..., imax, by
/// discarding candidates that lose the pairwise cost comparison below.
/// </summary>
/// <remarks>
/// The reduction runs on a private copy, so <paramref name="js"/> is never modified.
/// Arrays are reference types, so writing through an alias of <paramref name="js"/>
/// would overwrite the caller's candidate list. When no reduction is needed
/// <paramref name="js_red"/> is the same array instance as <paramref name="js"/>.
/// </remarks>
/// <param name="imin">First position.</param>
/// <param name="imax">Last position, inclusive.</param>
/// <param name="istep">Distance between positions.</param>
/// <param name="q">Row being filled; row q - 1 of <paramref name="S"/> is read.</param>
/// <param name="js">Candidate start indices (read only).</param>
/// <param name="js_red">Receives the reduced candidate list.</param>
/// <param name="S">Cost matrix (read only here).</param>
/// <param name="J">Backtrack matrix (not used by this method).</param>
/// <param name="sum_x">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_x_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="sum_w_sq">Forwarded unchanged to WithinCluster.Dissimilarity.</param>
/// <param name="criterion">Dissimilarity measure.</param>
private static void ReduceInPlace(
    int imin, int imax, int istep, int q,
    int[] js, out int[] js_red,
    number[][] S, int[][] J,
    number[] sum_x, number[] sum_x_sq,
    number[] sum_w, number[] sum_w_sq,
    DissimilarityType criterion)
{
    int N = (imax - imin) / istep + 1;

    if (N >= js.Length)
    {
        // Nothing to discard; the list is handed back untouched.
        js_red = js;
        return;
    }

    // Work on a copy so the caller's candidate list stays intact.
    js_red = (int[])js.Clone();

    // Two positions to move candidate j's back and forth.
    int left = -1; // points to last favorable position / column
    int right = 0; // points to current position / column

    int m = js_red.Length;

    while (m > N)
    {
        // js_red has more than N positions / columns.
        int p = left + 1;
        int i = imin + p * istep;

        int j = js_red[right];
        number Sl = S[q - 1][j - 1]
            + WithinCluster.Dissimilarity(criterion, j, i, sum_x, sum_x_sq, sum_w, sum_w_sq);

        int jplus1 = js_red[right + 1];
        number Slplus1 = S[q - 1][jplus1 - 1]
            + WithinCluster.Dissimilarity(criterion, jplus1, i, sum_x, sum_x_sq, sum_w, sum_w_sq);

        if (Sl < Slplus1 && p < N - 1)
        {
            js_red[++left] = j;
            right++; // move on to next position / column p+1
        }
        else if (Sl < Slplus1 && p == N - 1)
        {
            js_red[++right] = j; // delete position / column p+1
            m--;
        }
        else
        {
            // (Sl >= Slplus1)
            if (p > 0)
            {
                // Delete position / column p and move back to previous position / column p-1.
                js_red[right] = js_red[left--];
            }
            else
            {
                right++; // delete position / column 0
            }
            m--;
        }
    }

    // Close the gap between the kept prefix and the unprocessed tail.
    for (int r = left + 1; r < m; ++r)
    {
        js_red[r] = js_red[right++];
    }

    Array.Resize(ref js_red, m);
}