/*************************************************************************
Resets this object's work storage: the three matrices become empty 0x0
arrays and the two nested buffer objects are replaced by freshly
constructed instances.
*************************************************************************/
public override void init()
{
    // empty matrices
    this.xy = new double[0,0];
    this.d = new double[0,0];
    this.tmpd = new double[0,0];

    // nested reusable buffers
    this.distbuf = new apserv.apbuffers();
    this.kmeanstmp = new kmeansbuffers();
}
/*************************************************************************
K-means++ clusterization

INPUT PARAMETERS:
    XY          -   dataset, array [0..NPoints-1,0..NVars-1].
    NPoints     -   dataset size, NPoints>=K
    NVars       -   number of variables, NVars>=1
    K           -   desired number of clusters, K>=1
    InitAlgo    -   initialization algorithm:
                    * 0 - automatic selection of best algorithm
                    * 1 - random selection of centers
                    * 2 - k-means++
                    * 3 - fast-greedy init
                    *-1 - first K rows of dataset are used
                          (special debug algorithm)
    MaxIts      -   iterations limit or zero for no limit
    Restarts    -   number of restarts, Restarts>=1
    KMeansDbgNoIts- debug flag; if set, Lloyd's iteration is not performed,
                    only initialization phase.
    NeedCCol    -   True in case caller requires to store result in CCol
    NeedCRow    -   True in case caller requires to store result in CRow
    Buf         -   special reusable structure which stores previously
                    allocated memory, intended to avoid memory fragmentation
                    when solving multiple subsequent problems:
                    * MUST BE INITIALIZED WITH kmeansinitbuf() CALL BEFORE
                      FIRST PASS TO THIS FUNCTION!
                    * subsequent passes must be made without re-initialization

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -3, if task is degenerate (number of distinct points
                          is less than K)
                    * -1, if incorrect NPoints/NFeatures/K/Restarts was passed
                    *  1, if subroutine finished successfully
    IterationsCount- actual number of iterations performed by clusterizer
                    (accumulated over all restarts)
    CCol        -   array[0..NVars-1,0..K-1], matrix whose columns store
                    cluster's centers. Filled only when NeedCCol is True,
                    otherwise it is an empty 0x0 matrix.
    CRow        -   array[0..K-1,0..NVars-1], same as CCol, but centers are
                    stored in rows. Filled only when NeedCRow is True,
                    otherwise it is an empty 0x0 matrix.
    XYC         -   array[NPoints], which contains cluster indexes
    Energy      -   merit function of clusterization

NOTE: on Info=-1 all array outputs are empty; on Info=-3 the function
      returns immediately, so CCol/CRow are empty and XYC/Energy hold
      intermediate values which must not be used.

NOTE: if iteration limit MaxIts is hit right after empty clusters were
      re-seeded by FixCenters(), point assignments and energy are
      recalculated for the re-seeded centers before the pass is compared
      with the best one. Without it the pass would be judged by a stale
      energy value and by XYC which does not correspond to stored centers.

  -- ALGLIB --
     Copyright 21.03.2009 by Bochkanov Sergey
*************************************************************************/
public static void kmeansgenerateinternal(double[,] xy,
    int npoints,
    int nvars,
    int k,
    int initalgo,
    int maxits,
    int restarts,
    bool kmeansdbgnoits,
    ref int info,
    ref int iterationscount,
    ref double[,] ccol,
    bool needccol,
    ref double[,] crow,
    bool needcrow,
    ref int[] xyc,
    ref double energy,
    kmeansbuffers buf)
{
    int i = 0;
    int j = 0;
    int i1 = 0;
    double e = 0;
    double eprev = 0;
    double v = 0;
    double vv = 0;
    bool waschanges = false;
    bool zerosizeclusters = false;
    bool centersreseeded = false;
    int pass = 0;
    int itcnt = 0;
    hqrnd.hqrndstate rs = new hqrnd.hqrndstate();
    int i_ = 0;

    info = 0;
    iterationscount = 0;
    ccol = new double[0,0];
    crow = new double[0,0];
    xyc = new int[0];
    energy = 0;

    //
    // Test parameters
    //
    if( ((npoints<k || nvars<1) || k<1) || restarts<1 )
    {
        info = -1;
        iterationscount = 0;
        return;
    }

    //
    // TODO: special case K=1
    // TODO: special case K=NPoints
    //
    info = 1;
    iterationscount = 0;

    //
    // Multiple passes of k-means++ algorithm
    //
    xyc = new int[npoints];
    apserv.rmatrixsetlengthatleast(ref buf.ct, k, nvars);
    apserv.rmatrixsetlengthatleast(ref buf.ctbest, k, nvars);
    apserv.ivectorsetlengthatleast(ref buf.xycprev, npoints);
    apserv.ivectorsetlengthatleast(ref buf.xycbest, npoints);
    apserv.rvectorsetlengthatleast(ref buf.d2, npoints);
    apserv.ivectorsetlengthatleast(ref buf.csizes, k);
    energy = math.maxrealnumber;
    hqrnd.hqrndrandomize(rs);
    for(pass=1; pass<=restarts; pass++)
    {
        //
        // Select initial centers.
        //
        // Note that for performance reasons centers are stored in ROWS of CT, not
        // in columns. We'll transpose CT in the end and store it in the C.
        //
        // Also note that SelectInitialCenters() may return degenerate set of centers
        // (some of them have no corresponding points in dataset, some are non-distinct).
        // Algorithm below is robust enough to deal with such set.
        //
        selectinitialcenters(xy, npoints, nvars, initalgo, k, ref buf.ct, buf.initbuf, buf.updatepool);

        //
        // Lloyd's iteration
        //
        if( !kmeansdbgnoits )
        {
            //
            // Perform iteration as usual, in normal mode
            //
            for(i=0; i<=npoints-1; i++)
            {
                xyc[i] = -1;
            }
            eprev = math.maxrealnumber;
            e = math.maxrealnumber;
            itcnt = 0;
            centersreseeded = false;
            while( maxits==0 || itcnt<maxits )
            {
                //
                // Update iteration counter
                //
                itcnt = itcnt+1;
                apserv.inc(ref iterationscount);
                centersreseeded = false;

                //
                // Call KMeansUpdateDistances(), fill XYC with center numbers,
                // D2 with center distances.
                //
                for(i=0; i<=npoints-1; i++)
                {
                    buf.xycprev[i] = xyc[i];
                }
                kmeansupdatedistances(xy, 0, npoints, nvars, buf.ct, 0, k, xyc, buf.d2, buf.updatepool);
                waschanges = false;
                for(i=0; i<=npoints-1; i++)
                {
                    waschanges = waschanges || xyc[i]!=buf.xycprev[i];
                }

                //
                // Update centers: accumulate per-cluster sums and sizes,
                // then divide sums by sizes for non-empty clusters.
                //
                for(j=0; j<=k-1; j++)
                {
                    buf.csizes[j] = 0;
                }
                for(i=0; i<=k-1; i++)
                {
                    for(j=0; j<=nvars-1; j++)
                    {
                        buf.ct[i,j] = 0;
                    }
                }
                for(i=0; i<=npoints-1; i++)
                {
                    buf.csizes[xyc[i]] = buf.csizes[xyc[i]]+1;
                    for(i_=0; i_<=nvars-1;i_++)
                    {
                        buf.ct[xyc[i],i_] = buf.ct[xyc[i],i_] + xy[i,i_];
                    }
                }
                zerosizeclusters = false;
                for(j=0; j<=k-1; j++)
                {
                    if( buf.csizes[j]!=0 )
                    {
                        v = (double)1/(double)buf.csizes[j];
                        for(i_=0; i_<=nvars-1;i_++)
                        {
                            buf.ct[j,i_] = v*buf.ct[j,i_];
                        }
                    }
                    zerosizeclusters = zerosizeclusters || buf.csizes[j]==0;
                }
                if( zerosizeclusters )
                {
                    //
                    // Some clusters have zero size - rare, but possible.
                    // We'll choose new centers for such clusters using k-means++ rule
                    // and restart algorithm
                    //
                    if( !fixcenters(xy, npoints, nvars, buf.ct, k, buf.initbuf, buf.updatepool) )
                    {
                        info = -3;
                        return;
                    }

                    //
                    // Remember that CT was modified after XYC/E were calculated;
                    // flag is cleared at the beginning of the next iteration.
                    //
                    centersreseeded = true;
                    continue;
                }

                //
                // Stop if one of two conditions is met:
                // 1. nothing has changed during iteration
                // 2. energy function increased after recalculation on new centers
                //
                e = 0;
                for(i=0; i<=npoints-1; i++)
                {
                    v = 0.0;
                    i1 = xyc[i];
                    for(j=0; j<=nvars-1; j++)
                    {
                        vv = xy[i,j]-buf.ct[i1,j];
                        v = v+vv*vv;
                    }
                    e = e+v;
                }
                if( !waschanges || (double)(e)>=(double)(eprev) )
                {
                    break;
                }

                //
                // Update EPrev
                //
                eprev = e;
            }
            if( centersreseeded )
            {
                //
                // Iteration limit was reached immediately after centers were
                // re-seeded: E is stale and XYC refers to centers which were
                // overwritten. Recalculate both for current CT in the same
                // way as it is done in the debug branch below.
                //
                kmeansupdatedistances(xy, 0, npoints, nvars, buf.ct, 0, k, xyc, buf.d2, buf.updatepool);
                e = 0;
                for(i=0; i<=npoints-1; i++)
                {
                    e = e+buf.d2[i];
                }
            }
        }
        else
        {
            //
            // Debug mode: no Lloyd's iteration.
            // We just calculate potential E.
            //
            kmeansupdatedistances(xy, 0, npoints, nvars, buf.ct, 0, k, xyc, buf.d2, buf.updatepool);
            e = 0;
            for(i=0; i<=npoints-1; i++)
            {
                e = e+buf.d2[i];
            }
        }

        //
        // Compare E with best centers found so far
        //
        if( (double)(e)<(double)(energy) )
        {
            //
            // store partition.
            //
            energy = e;
            blas.copymatrix(buf.ct, 0, k-1, 0, nvars-1, ref buf.ctbest, 0, k-1, 0, nvars-1);
            for(i=0; i<=npoints-1; i++)
            {
                buf.xycbest[i] = xyc[i];
            }
        }
    }

    //
    // Copy and transpose
    //
    if( needccol )
    {
        ccol = new double[nvars, k];
        blas.copyandtranspose(buf.ctbest, 0, k-1, 0, nvars-1, ref ccol, 0, nvars-1, 0, k-1);
    }
    if( needcrow )
    {
        crow = new double[k, nvars];
        ablas.rmatrixcopy(k, nvars, buf.ctbest, 0, 0, ref crow, 0, 0);
    }
    for(i=0; i<=npoints-1; i++)
    {
        xyc[i] = buf.xycbest[i];
    }
}
/*************************************************************************
Creates a new kmeansbuffers instance whose arrays are clones of this
object's arrays and whose nested objects (InitBuf, UpdatePool) are
produced by their own make_copy() methods.
*************************************************************************/
public override alglib.apobject make_copy()
{
    kmeansbuffers copy = new kmeansbuffers();

    // center matrices (current and best)
    copy.ct = (double[,])this.ct.Clone();
    copy.ctbest = (double[,])this.ctbest.Clone();

    // per-point and per-cluster work vectors
    copy.xycbest = (int[])this.xycbest.Clone();
    copy.xycprev = (int[])this.xycprev.Clone();
    copy.d2 = (double[])this.d2.Clone();
    copy.csizes = (int[])this.csizes.Clone();

    // nested objects are duplicated through their own copy routines
    copy.initbuf = (apserv.apbuffers)this.initbuf.make_copy();
    copy.updatepool = (alglib.smp.shared_pool)this.updatepool.make_copy();

    return copy;
}
/*************************************************************************
Initialization of the reusable k-means buffer structure.

Creates a new apserv.apbuffers object and registers it as the seed of
Buf.UpdatePool by means of ae_shared_pool_set_seed().

INPUT PARAMETERS:
    Buf         -   special reusable structure which stores previously
                    allocated memory, intended to avoid memory fragmentation
                    when solving multiple subsequent problems. Must be
                    initialized (by this function) prior to usage.

OUTPUT PARAMETERS:
    Buf         -   initialized structure

  -- ALGLIB --
     Copyright 24.07.2015 by Bochkanov Sergey
*************************************************************************/
public static void kmeansinitbuf(kmeansbuffers buf)
{
    apserv.apbuffers poolseed = new apserv.apbuffers();

    alglib.smp.ae_shared_pool_set_seed(buf.updatepool, poolseed);
}