/*************************************************************************
This function performs clustering by k-means++ algorithm.

You may change algorithm properties by calling:
* ClusterizerSetKMeansLimits() to change number of restarts or iterations
* ClusterizerSetKMeansInit() to change initialization algorithm

By default, one restart and unlimited number of iterations are used.
Initialization algorithm is chosen automatically.

COMMERCIAL EDITION OF ALGLIB:

  ! Commercial version of ALGLIB includes two important improvements of
  ! this function:
  ! * multicore support (can be used from C# and C++)
  ! * access to high-performance C++ core (relevant for C# users)
  !
  ! K-means clustering algorithm has two phases: selection of initial
  ! centers and clustering itself. ALGLIB parallelizes both phases.
  ! Parallel version is optimized for the following scenario: medium or
  ! high-dimensional problem (20 or more dimensions) with large number of
  ! points and clusters. However, some speed-up can be obtained even when
  ! assumptions above are violated.
  !
  ! As for native-vs-managed comparison, working with native core brings
  ! 30-40% improvement in speed over pure C# version of ALGLIB.
  !
  ! We recommend you to read 'Working with commercial version' section of
  ! ALGLIB Reference Manual in order to find out how to use performance-
  ! related features provided by commercial edition of ALGLIB.

INPUT PARAMETERS:
    S       -   clusterizer state, initialized by ClusterizerCreate()
    K       -   number of clusters, K>=0.
                K can be zero only when algorithm is called for an empty
                dataset; in this case completion code is set to
                success (+1).
                If K=0 and dataset size is non-zero, we cannot
                meaningfully assign points to some center (there are no
                centers because K=0) and return -3 as completion code
                (failure).

OUTPUT PARAMETERS:
    Rep     -   clustering results; see description of KMeansReport
                structure for more information.

NOTE 1: k-means clustering can be performed only for datasets with
        Euclidean distance function. Algorithm will return a negative
        completion code in Rep.TerminationType in case the dataset was
        added to the clusterizer with DistType other than Euclidean (or
        the dataset was specified by a distance matrix instead of
        explicitly given points).

  -- ALGLIB --
     Copyright 10.07.2012 by Bochkanov Sergey
*************************************************************************/
public static void clusterizerrunkmeans(clusterizerstate s,
    int k,
    kmeansreport rep)
{
    double[,] dummy = new double[0,0];

    alglib.ap.assert(k>=0, "ClusterizerRunKMeans: K<0");

    //
    // Incorrect distance type
    //
    if( s.disttype!=2 )
    {
        rep.npoints = s.npoints;
        rep.terminationtype = -5;
        rep.k = k;
        rep.iterationscount = 0;
        rep.energy = 0.0;
        return;
    }

    //
    // K>NPoints or (K=0 and NPoints>0)
    //
    if( k>s.npoints || (k==0 && s.npoints>0) )
    {
        rep.npoints = s.npoints;
        rep.terminationtype = -3;
        rep.k = k;
        rep.iterationscount = 0;
        rep.energy = 0.0;
        return;
    }

    //
    // No points
    //
    if( s.npoints==0 )
    {
        rep.npoints = 0;
        rep.terminationtype = 1;
        rep.k = k;
        rep.iterationscount = 0;
        rep.energy = 0.0;
        return;
    }

    //
    // Normal case:
    // 1<=K<=NPoints, Euclidean distance
    //
    rep.npoints = s.npoints;
    rep.nfeatures = s.nfeatures;
    rep.k = k;
    kmeansgenerateinternal(s.xy, s.npoints, s.nfeatures, k, s.kmeansinitalgo, s.kmeansmaxits, s.kmeansrestarts, s.kmeansdbgnoits, ref rep.terminationtype, ref rep.iterationscount, ref dummy, false, ref rep.c, true, ref rep.cidx, ref rep.energy, s.kmeanstmp);
}
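/*************************************************************************
Editorial usage sketch (illustrative, not part of ALGLIB itself): the
typical k-means workflow is create -> set points -> tune parameters ->
run. The snippet below assumes the public C# wrappers
alglib.clusterizercreate(), alglib.clusterizersetpoints(),
alglib.clusterizersetkmeanslimits() and alglib.clusterizerrunkmeans();
exact overloads may differ between ALGLIB versions.

    double[,] xy = new double[,]{{1,1},{1,2},{4,1},{2,3},{4,1.5}};
    alglib.clusterizerstate s;
    alglib.kmeansreport rep;
    alglib.clusterizercreate(out s);
    alglib.clusterizersetpoints(s, xy, 2);       // DistType=2 (Euclidean) is required by k-means
    alglib.clusterizersetkmeanslimits(s, 5, 0);  // 5 restarts, unlimited iterations per run
    alglib.clusterizerrunkmeans(s, 2, out rep);  // K=2 clusters
    // On success rep.terminationtype>0, rep.cidx[i] is the cluster index of
    // point i, and rep.c is the K x NFeatures matrix of cluster centers.
*************************************************************************/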
/************************************************************************* Single-threaded stub. HPC ALGLIB replaces it by multithreaded code. *************************************************************************/ public static void _pexec_clusterizerrunkmeans(clusterizerstate s, int k, kmeansreport rep) { clusterizerrunkmeans(s,k,rep); }
/************************************************************************* This function performs agglomerative hierarchical clustering COMMERCIAL EDITION OF ALGLIB: ! Commercial version of ALGLIB includes two important improvements of ! this function, which can be used from C++ and C#: ! * Intel MKL support (lightweight Intel MKL is shipped with ALGLIB) ! * multicore support ! ! Agglomerative hierarchical clustering algorithm has two phases: ! distance matrix calculation and clustering itself. Only first phase ! (distance matrix calculation) is accelerated by Intel MKL and multi- ! threading. Thus, acceleration is significant only for medium or high- ! dimensional problems. ! ! We recommend you to read 'Working with commercial version' section of ! ALGLIB Reference Manual in order to find out how to use performance- ! related features provided by commercial edition of ALGLIB. INPUT PARAMETERS: S - clusterizer state, initialized by ClusterizerCreate() OUTPUT PARAMETERS: Rep - clustering results; see description of AHCReport structure for more information. NOTE 1: hierarchical clustering algorithms require large amounts of memory. In particular, this implementation needs sizeof(double)*NPoints^2 bytes, which are used to store distance matrix. In case we work with user-supplied matrix, this amount is multiplied by 2 (we have to store original matrix and to work with its copy). For example, problem with 10000 points would require 800M of RAM, even when working in a 1-dimensional space. -- ALGLIB -- Copyright 10.07.2012 by Bochkanov Sergey *************************************************************************/ public static void clusterizerrunahc(clusterizerstate s, ahcreport rep) { int npoints = 0; int nfeatures = 0; npoints = s.npoints; nfeatures = s.nfeatures; // // Fill Rep.NPoints, quick exit when NPoints<=1 // rep.npoints = npoints; if( npoints==0 ) { rep.p = new int[0]; rep.z = new int[0, 0]; rep.pz = new int[0, 0]; rep.pm = new int[0, 0]; rep.mergedist = new double[0]; rep.terminationtype = 1; return; } if( npoints==1 ) { rep.p = new int[1]; rep.z = new int[0, 0]; rep.pz = new int[0, 0]; rep.pm = new int[0, 0]; rep.mergedist = new double[0]; rep.p[0] = 0; rep.terminationtype = 1; return; } // // More than one point // if( s.disttype==-1 ) { // // Run clusterizer with user-supplied distance matrix // clusterizerrunahcinternal(s, ref s.d, rep); return; } else { // // Check combination of AHC algo and distance type // if( s.ahcalgo==4 && s.disttype!=2 ) { rep.terminationtype = -5; return; } // // Build distance matrix D. // clusterizergetdistancesbuf(s.distbuf, s.xy, npoints, nfeatures, s.disttype, ref s.tmpd); // // Run clusterizer // clusterizerrunahcinternal(s, ref s.tmpd, rep); return; } }
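/*************************************************************************
Editorial usage sketch (illustrative, not part of ALGLIB itself): a
typical AHC run on points stored row-wise in XY. The snippet assumes the
public C# wrappers alglib.clusterizercreate(), alglib.clusterizersetpoints(),
alglib.clusterizerrunahc() and alglib.clusterizergetkclusters(); exact
overloads may differ between ALGLIB versions.

    double[,] xy = new double[,]{{1,1},{1,2},{4,1},{2,3},{4,1.5}};
    alglib.clusterizerstate s;
    alglib.ahcreport rep;
    alglib.clusterizercreate(out s);
    alglib.clusterizersetpoints(s, xy, 2);       // DistType=2: Euclidean
    alglib.clusterizerrunahc(s, out rep);
    // rep.z[i,0] and rep.z[i,1] are the clusters merged at step i,
    // rep.mergedist[i] is the corresponding merge distance.
    // To cut the dendrogram into, say, 2 clusters:
    int[] cidx;
    int[] cz;
    alglib.clusterizergetkclusters(rep, 2, out cidx, out cz);
*************************************************************************/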
/************************************************************************* Single-threaded stub. HPC ALGLIB replaces it by multithreaded code. *************************************************************************/ public static void _pexec_clusterizerrunahc(clusterizerstate s, ahcreport rep) { clusterizerrunahc(s,rep); }
/************************************************************************* This function sets k-means properties: number of restarts and maximum number of iterations per one run. INPUT PARAMETERS: S - clusterizer state, initialized by ClusterizerCreate() Restarts- restarts count, >=1. k-means++ algorithm performs several restarts and chooses best set of centers (one with minimum squared distance). MaxIts - maximum number of k-means iterations performed during one run. >=0, zero value means that algorithm performs unlimited number of iterations. -- ALGLIB -- Copyright 10.07.2012 by Bochkanov Sergey *************************************************************************/ public static void clusterizersetkmeanslimits(clusterizerstate s, int restarts, int maxits) { alglib.ap.assert(restarts>=1, "ClusterizerSetKMeansLimits: Restarts<=0"); alglib.ap.assert(maxits>=0, "ClusterizerSetKMeansLimits: MaxIts<0"); s.kmeansrestarts = restarts; s.kmeansmaxits = maxits; }
/************************************************************************* This function sets k-means initialization algorithm. Several different algorithms can be chosen, including k-means++. INPUT PARAMETERS: S - clusterizer state, initialized by ClusterizerCreate() InitAlgo- initialization algorithm: * 0 automatic selection ( different versions of ALGLIB may select different algorithms) * 1 random initialization * 2 k-means++ initialization (best quality of initial centers, but long non-parallelizable initialization phase with bad cache locality) * 3 "fast-greedy" algorithm with efficient, easy to parallelize initialization. Quality of initial centers is somewhat worse than that of k-means++. This algorithm is a default one in the current version of ALGLIB. *-1 "debug" algorithm which always selects first K rows of dataset; this algorithm is used for debug purposes only. Do not use it in the industrial code! -- ALGLIB -- Copyright 21.01.2015 by Bochkanov Sergey *************************************************************************/ public static void clusterizersetkmeansinit(clusterizerstate s, int initalgo) { alglib.ap.assert(initalgo>=-1 && initalgo<=3, "ClusterizerSetKMeansInit: InitAlgo is incorrect"); s.kmeansinitalgo = initalgo; }
/************************************************************************* This function adds dataset given by distance matrix to the clusterizer structure. It is important that dataset is not given explicitly - only distance matrix is given. This function overrides all previous calls of ClusterizerSetPoints() or ClusterizerSetDistances(). INPUT PARAMETERS: S - clusterizer state, initialized by ClusterizerCreate() D - array[NPoints,NPoints], distance matrix given by its upper or lower triangle (main diagonal is ignored because its entries are expected to be zero). NPoints - number of points IsUpper - whether upper or lower triangle of D is given. NOTE 1: different clustering algorithms have different limitations: * agglomerative hierarchical clustering algorithms may be used with any kind of distance metric, including one which is given by distance matrix * k-means++ clustering algorithm may be used only with Euclidean distance function and explicitly given points - it can not be used with dataset given by distance matrix Thus, if you call this function, you will be unable to use k-means clustering algorithm to process your problem. -- ALGLIB -- Copyright 10.07.2012 by Bochkanov Sergey *************************************************************************/ public static void clusterizersetdistances(clusterizerstate s, double[,] d, int npoints, bool isupper) { int i = 0; int j = 0; int j0 = 0; int j1 = 0; alglib.ap.assert(npoints>=0, "ClusterizerSetDistances: NPoints<0"); alglib.ap.assert(alglib.ap.rows(d)>=npoints, "ClusterizerSetDistances: Rows(D)<NPoints"); alglib.ap.assert(alglib.ap.cols(d)>=npoints, "ClusterizerSetDistances: Cols(D)<NPoints"); s.npoints = npoints; s.nfeatures = 0; s.disttype = -1; apserv.rmatrixsetlengthatleast(ref s.d, npoints, npoints); for(i=0; i<=npoints-1; i++) { if( isupper ) { j0 = i+1; j1 = npoints-1; } else { j0 = 0; j1 = i-1; } for(j=j0; j<=j1; j++) { alglib.ap.assert(math.isfinite(d[i,j]) && (double)(d[i,j])>=(double)(0), "ClusterizerSetDistances: D contains infinite, NAN or negative elements"); s.d[i,j] = d[i,j]; s.d[j,i] = d[i,j]; } s.d[i,i] = 0; } }
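/*************************************************************************
Editorial usage sketch (illustrative, not part of ALGLIB itself): running
AHC on a dataset that is given only by its distance matrix. The snippet
assumes the public C# wrappers alglib.clusterizercreate(),
alglib.clusterizersetdistances(), alglib.clusterizersetahcalgo() and
alglib.clusterizerrunahc(); exact overloads may differ between ALGLIB
versions.

    // 3x3 symmetric distance matrix; only the upper triangle is supplied
    // (IsUpper=true), the diagonal and lower triangle are ignored.
    double[,] d = new double[,]{
        {0, 1, 3},
        {0, 0, 2},
        {0, 0, 0}};
    alglib.clusterizerstate s;
    alglib.ahcreport rep;
    alglib.clusterizercreate(out s);
    alglib.clusterizersetdistances(s, d, 3, true);
    alglib.clusterizersetahcalgo(s, 1);          // single linkage
    alglib.clusterizerrunahc(s, out rep);        // rep.z stores the merge sequence
    // Note: after ClusterizerSetDistances() k-means cannot be used -
    // ClusterizerRunKMeans() would report failure via Rep.TerminationType.
*************************************************************************/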
/************************************************************************* This function sets agglomerative hierarchical clustering algorithm INPUT PARAMETERS: S - clusterizer state, initialized by ClusterizerCreate() Algo - algorithm type: * 0 complete linkage (default algorithm) * 1 single linkage * 2 unweighted average linkage * 3 weighted average linkage * 4 Ward's method NOTE: Ward's method works correctly only with Euclidean distance, that's why algorithm will return negative termination code (failure) for any other distance type. It is possible, however, to use this method with user-supplied distance matrix. It is your responsibility to pass one which was calculated with Euclidean distance function. -- ALGLIB -- Copyright 10.07.2012 by Bochkanov Sergey *************************************************************************/ public static void clusterizersetahcalgo(clusterizerstate s, int algo) { alglib.ap.assert((((algo==0 || algo==1) || algo==2) || algo==3) || algo==4, "ClusterizerSetHCAlgo: incorrect algorithm type"); s.ahcalgo = algo; }
/*************************************************************************
This function initializes clusterizer object. Newly initialized object is
empty, i.e. it does not contain dataset. You should use it as follows:
  1. creation
  2. dataset is added with ClusterizerSetPoints()
  3. additional parameters are set
  4. clusterization is performed with one of the clustering functions

  -- ALGLIB --
     Copyright 10.07.2012 by Bochkanov Sergey
*************************************************************************/
public static void clusterizercreate(clusterizerstate s)
{
    s.npoints = 0;
    s.nfeatures = 0;
    s.disttype = 2;
    s.ahcalgo = 0;
    s.kmeansrestarts = 1;
    s.kmeansmaxits = 0;
    s.kmeansinitalgo = 0;
    s.kmeansdbgnoits = false;
    kmeansinitbuf(s.kmeanstmp);
}
/*************************************************************************
This function adds dataset to the clusterizer structure.

This function overrides all previous calls of ClusterizerSetPoints() or
ClusterizerSetDistances().

INPUT PARAMETERS:
    S       -   clusterizer state, initialized by ClusterizerCreate()
    XY      -   array[NPoints,NFeatures], dataset
    NPoints -   number of points, >=0
    NFeatures-  number of features, >=1
    DistType-   distance function:
                *  0    Chebyshev distance  (L-inf norm)
                *  1    city block distance (L1 norm)
                *  2    Euclidean distance  (L2 norm), non-squared
                * 10    Pearson correlation:
                        dist(a,b) = 1-corr(a,b)
                * 11    Absolute Pearson correlation:
                        dist(a,b) = 1-|corr(a,b)|
                * 12    Uncentered Pearson correlation (cosine of the angle):
                        dist(a,b) = a'*b/(|a|*|b|)
                * 13    Absolute uncentered Pearson correlation
                        dist(a,b) = |a'*b|/(|a|*|b|)
                * 20    Spearman rank correlation:
                        dist(a,b) = 1-rankcorr(a,b)
                * 21    Absolute Spearman rank correlation
                        dist(a,b) = 1-|rankcorr(a,b)|

NOTE 1: different distance functions have different performance penalty:
        * Euclidean or Pearson correlation distances are the fastest ones
        * Spearman correlation distance function is a bit slower
        * city block and Chebyshev distances are an order of magnitude slower

        The reason behind the difference in performance is that
        correlation-based distance functions are computed using optimized
        linear algebra kernels, while Chebyshev and city block distance
        functions are computed using simple nested loops with two branches
        at each iteration.

NOTE 2: different clustering algorithms have different limitations:
        * agglomerative hierarchical clustering algorithms may be used
          with any kind of distance metric
        * k-means++ clustering algorithm may be used only with Euclidean
          distance function
        Thus, the list of specific clustering algorithms you may use
        depends on the distance function you specify when you set your
        dataset.

  -- ALGLIB --
     Copyright 10.07.2012 by Bochkanov Sergey
*************************************************************************/
public static void clusterizersetpoints(clusterizerstate s,
    double[,] xy,
    int npoints,
    int nfeatures,
    int disttype)
{
    int i = 0;
    int i_ = 0;

    alglib.ap.assert((((((((disttype==0 || disttype==1) || disttype==2) || disttype==10) || disttype==11) || disttype==12) || disttype==13) || disttype==20) || disttype==21, "ClusterizerSetPoints: incorrect DistType");
    alglib.ap.assert(npoints>=0, "ClusterizerSetPoints: NPoints<0");
    alglib.ap.assert(nfeatures>=1, "ClusterizerSetPoints: NFeatures<1");
    alglib.ap.assert(alglib.ap.rows(xy)>=npoints, "ClusterizerSetPoints: Rows(XY)<NPoints");
    alglib.ap.assert(alglib.ap.cols(xy)>=nfeatures, "ClusterizerSetPoints: Cols(XY)<NFeatures");
    alglib.ap.assert(apserv.apservisfinitematrix(xy, npoints, nfeatures), "ClusterizerSetPoints: XY contains NAN/INF");
    s.npoints = npoints;
    s.nfeatures = nfeatures;
    s.disttype = disttype;
    apserv.rmatrixsetlengthatleast(ref s.xy, npoints, nfeatures);
    for(i=0; i<=npoints-1; i++)
    {
        for(i_=0; i_<=nfeatures-1; i_++)
        {
            s.xy[i,i_] = xy[i,i_];
        }
    }
}
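/*************************************************************************
Editorial usage sketch (illustrative, not part of ALGLIB itself): choosing
a non-Euclidean metric. With DistType=20 (Spearman rank correlation
distance) the dataset can still be clustered hierarchically, but a
subsequent call to ClusterizerRunKMeans() would report failure through
Rep.TerminationType, because k-means requires DistType=2. The snippet
assumes the same public C# wrappers as the examples above.

    double[,] xy = new double[,]{{1,2,3},{2,4,6},{3,1,2}};
    alglib.clusterizerstate s;
    alglib.ahcreport rep;
    alglib.clusterizercreate(out s);
    alglib.clusterizersetpoints(s, xy, 20);      // DistType=20: Spearman rank correlation distance
    alglib.clusterizerrunahc(s, out rep);        // AHC is allowed for any DistType
*************************************************************************/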
public override alglib.apobject make_copy() { clusterizerstate _result = new clusterizerstate(); _result.npoints = npoints; _result.nfeatures = nfeatures; _result.disttype = disttype; _result.xy = (double[,])xy.Clone(); _result.d = (double[,])d.Clone(); _result.ahcalgo = ahcalgo; _result.kmeansrestarts = kmeansrestarts; _result.kmeansmaxits = kmeansmaxits; _result.kmeansinitalgo = kmeansinitalgo; _result.kmeansdbgnoits = kmeansdbgnoits; _result.tmpd = (double[,])tmpd.Clone(); _result.distbuf = (apserv.apbuffers)distbuf.make_copy(); _result.kmeanstmp = (kmeansbuffers)kmeanstmp.make_copy(); return _result; }
/*************************************************************************
This function performs agglomerative hierarchical clustering using
precomputed distance matrix. Internal function, should not be called
directly.

INPUT PARAMETERS:
    S       -   clusterizer state, initialized by ClusterizerCreate()
    D       -   distance matrix, array[S.NPoints,S.NPoints]
                Contents of the matrix are destroyed during algorithm
                operation.

OUTPUT PARAMETERS:
    Rep     -   clustering results; see description of AHCReport
                structure for more information.

  -- ALGLIB --
     Copyright 10.07.2012 by Bochkanov Sergey
*************************************************************************/
private static void clusterizerrunahcinternal(clusterizerstate s,
    ref double[,] d,
    ahcreport rep)
{
    int i = 0;
    int j = 0;
    int k = 0;
    double v = 0;
    int mergeidx = 0;
    int c0 = 0;
    int c1 = 0;
    int s0 = 0;
    int s1 = 0;
    int ar = 0;
    int br = 0;
    int npoints = 0;
    int[] cidx = new int[0];
    int[] csizes = new int[0];
    int[] nnidx = new int[0];
    int[,] cinfo = new int[0,0];
    int n0 = 0;
    int n1 = 0;
    int ni = 0;
    double d01 = 0;

    npoints = s.npoints;

    //
    // Fill Rep.NPoints, quick exit when NPoints<=1
    //
    rep.npoints = npoints;
    if( npoints==0 )
    {
        rep.p = new int[0];
        rep.z = new int[0, 0];
        rep.pz = new int[0, 0];
        rep.pm = new int[0, 0];
        rep.mergedist = new double[0];
        rep.terminationtype = 1;
        return;
    }
    if( npoints==1 )
    {
        rep.p = new int[1];
        rep.z = new int[0, 0];
        rep.pz = new int[0, 0];
        rep.pm = new int[0, 0];
        rep.mergedist = new double[0];
        rep.p[0] = 0;
        rep.terminationtype = 1;
        return;
    }
    rep.z = new int[npoints-1, 2];
    rep.mergedist = new double[npoints-1];
    rep.terminationtype = 1;

    //
    // Build list of nearest neighbors
    //
    nnidx = new int[npoints];
    for(i=0; i<=npoints-1; i++)
    {
        //
        // Calculate index of the nearest neighbor
        //
        k = -1;
        v = math.maxrealnumber;
        for(j=0; j<=npoints-1; j++)
        {
            if( j!=i && (double)(d[i,j])<(double)(v) )
            {
                k = j;
                v = d[i,j];
            }
        }
        alglib.ap.assert((double)(v)<(double)(math.maxrealnumber), "ClusterizerRunAHC: internal error");
        nnidx[i] = k;
    }

    //
    // For AHCAlgo=4 (Ward's method) replace distances by their squares times 0.5
    //
    if( s.ahcalgo==4 )
    {
        for(i=0; i<=npoints-1; i++)
        {
            for(j=0; j<=npoints-1; j++)
            {
                d[i,j] = 0.5*d[i,j]*d[i,j];
            }
        }
    }

    //
    // Distance matrix is built, perform merges.
    //
    // NOTE 1: CIdx is array[NPoints] which maps rows/columns of the
    //         distance matrix D to indexes of clusters. Values of CIdx
    //         from [0,NPoints) denote single-point clusters, and values
    //         from [NPoints,2*NPoints-1) denote ones obtained by merging
    //         smaller clusters. Negative values correspond to absent
    //         clusters.
    //
    //         Initially it contains [0...NPoints-1], after each merge
    //         one element of CIdx (one with index C0) is replaced by
    //         NPoints+MergeIdx, and another one with index C1 is
    //         rewritten by -1.
    //
    // NOTE 2: CSizes is array[NPoints] which stores sizes of clusters.
    //
    cidx = new int[npoints];
    csizes = new int[npoints];
    for(i=0; i<=npoints-1; i++)
    {
        cidx[i] = i;
        csizes[i] = 1;
    }
    for(mergeidx=0; mergeidx<=npoints-2; mergeidx++)
    {
        //
        // Select pair of clusters (C0,C1) with CIdx[C0]<CIdx[C1] to merge.
        //
        c0 = -1;
        c1 = -1;
        d01 = math.maxrealnumber;
        for(i=0; i<=npoints-1; i++)
        {
            if( cidx[i]>=0 )
            {
                if( (double)(d[i,nnidx[i]])<(double)(d01) )
                {
                    c0 = i;
                    c1 = nnidx[i];
                    d01 = d[i,nnidx[i]];
                }
            }
        }
        alglib.ap.assert((double)(d01)<(double)(math.maxrealnumber), "ClusterizerRunAHC: internal error");
        if( cidx[c0]>cidx[c1] )
        {
            i = c1;
            c1 = c0;
            c0 = i;
        }

        //
        // Fill one row of Rep.Z and one element of Rep.MergeDist
        //
        rep.z[mergeidx,0] = cidx[c0];
        rep.z[mergeidx,1] = cidx[c1];
        rep.mergedist[mergeidx] = d01;

        //
        // Update distance matrix:
        // * row/column C0 are updated by distances to the new cluster
        // * row/column C1 are considered empty (we can fill them by zeros,
        //   but do not want to spend time - we just ignore them)
        //
        // NOTE: it is important to update distance matrix BEFORE CIdx/CSizes
        //       are updated.
        //
        alglib.ap.assert((((s.ahcalgo==0 || s.ahcalgo==1) || s.ahcalgo==2) || s.ahcalgo==3) || s.ahcalgo==4, "ClusterizerRunAHC: internal error");
        for(i=0; i<=npoints-1; i++)
        {
            if( i!=c0 && i!=c1 )
            {
                n0 = csizes[c0];
                n1 = csizes[c1];
                ni = csizes[i];
                if( s.ahcalgo==0 )
                {
                    d[i,c0] = Math.Max(d[i,c0], d[i,c1]);
                }
                if( s.ahcalgo==1 )
                {
                    d[i,c0] = Math.Min(d[i,c0], d[i,c1]);
                }
                if( s.ahcalgo==2 )
                {
                    d[i,c0] = (csizes[c0]*d[i,c0]+csizes[c1]*d[i,c1])/(csizes[c0]+csizes[c1]);
                }
                if( s.ahcalgo==3 )
                {
                    d[i,c0] = (d[i,c0]+d[i,c1])/2;
                }
                if( s.ahcalgo==4 )
                {
                    d[i,c0] = ((n0+ni)*d[i,c0]+(n1+ni)*d[i,c1]-ni*d01)/(n0+n1+ni);
                }
                d[c0,i] = d[i,c0];
            }
        }

        //
        // Update CIdx and CSizes
        //
        cidx[c0] = npoints+mergeidx;
        cidx[c1] = -1;
        csizes[c0] = csizes[c0]+csizes[c1];
        csizes[c1] = 0;

        //
        // Update nearest neighbors array:
        // * update nearest neighbors of everything except for C0/C1
        // * update neighbors of C0/C1
        //
        for(i=0; i<=npoints-1; i++)
        {
            if( (cidx[i]>=0 && i!=c0) && (nnidx[i]==c0 || nnidx[i]==c1) )
            {
                //
                // I-th cluster which is distinct from C0/C1 has former C0/C1 cluster as its nearest
                // neighbor. We handle this issue depending on specific AHC algorithm being used.
                //
                if( s.ahcalgo==1 )
                {
                    //
                    // Single linkage. Merging of two clusters together
                    // does NOT change distances between new cluster and
                    // other clusters.
                    //
                    // The only thing we have to do is to update nearest neighbor index
                    //
                    nnidx[i] = c0;
                }
                else
                {
                    //
                    // Something other than single linkage. We have to re-examine
                    // the whole row to find the nearest neighbor.
                    //
                    k = -1;
                    v = math.maxrealnumber;
                    for(j=0; j<=npoints-1; j++)
                    {
                        if( (cidx[j]>=0 && j!=i) && (double)(d[i,j])<(double)(v) )
                        {
                            k = j;
                            v = d[i,j];
                        }
                    }
                    alglib.ap.assert((double)(v)<(double)(math.maxrealnumber) || mergeidx==npoints-2, "ClusterizerRunAHC: internal error");
                    nnidx[i] = k;
                }
            }
        }
        k = -1;
        v = math.maxrealnumber;
        for(j=0; j<=npoints-1; j++)
        {
            if( (cidx[j]>=0 && j!=c0) && (double)(d[c0,j])<(double)(v) )
            {
                k = j;
                v = d[c0,j];
            }
        }
        alglib.ap.assert((double)(v)<(double)(math.maxrealnumber) || mergeidx==npoints-2, "ClusterizerRunAHC: internal error");
        nnidx[c0] = k;
    }

    //
    // Calculate Rep.P and Rep.PM.
    //
    // In order to do that, we fill CInfo matrix - (2*NPoints-1)*4 matrix,
    // with I-th row containing:
    // * CInfo[I,0] - size of I-th cluster
    // * CInfo[I,1] - beginning of I-th cluster
    // * CInfo[I,2] - end of I-th cluster
    // * CInfo[I,3] - height of I-th cluster
    //
    // We perform it as follows:
    // * first NPoints clusters have unit size (CInfo[I,0]=1) and zero
    //   height (CInfo[I,3]=0)
    // * we replay NPoints-1 merges from first to last and fill sizes of
    //   corresponding clusters (new size is a sum of sizes of clusters
    //   being merged) and height (new height is max(heights)+1).
    // * now we are ready to determine locations of clusters. Last cluster
    //   spans entire dataset, we know it. We replay merges from last to
    //   first, during each merge we already know location of the merge
    //   result, and we can position first cluster to the left part of
    //   the result, and second cluster to the right part.
    //
    rep.p = new int[npoints];
    rep.pm = new int[npoints-1, 6];
    cinfo = new int[2*npoints-1, 4];
    for(i=0; i<=npoints-1; i++)
    {
        cinfo[i,0] = 1;
        cinfo[i,3] = 0;
    }
    for(i=0; i<=npoints-2; i++)
    {
        cinfo[npoints+i,0] = cinfo[rep.z[i,0],0]+cinfo[rep.z[i,1],0];
        cinfo[npoints+i,3] = Math.Max(cinfo[rep.z[i,0],3], cinfo[rep.z[i,1],3])+1;
    }
    cinfo[2*npoints-2,1] = 0;
    cinfo[2*npoints-2,2] = npoints-1;
    for(i=npoints-2; i>=0; i--)
    {
        //
        // We merge C0 which spans [A0,B0] and C1 (spans [A1,B1]),
        // with unknown A0, B0, A1, B1. However, we know that result
        // is CR, which spans [AR,BR] with known AR/BR, and we know
        // sizes of C0, C1, CR (denoted as S0, S1, SR).
        //
        c0 = rep.z[i,0];
        c1 = rep.z[i,1];
        s0 = cinfo[c0,0];
        s1 = cinfo[c1,0];
        ar = cinfo[npoints+i,1];
        br = cinfo[npoints+i,2];
        cinfo[c0,1] = ar;
        cinfo[c0,2] = ar+s0-1;
        cinfo[c1,1] = br-(s1-1);
        cinfo[c1,2] = br;
        rep.pm[i,0] = cinfo[c0,1];
        rep.pm[i,1] = cinfo[c0,2];
        rep.pm[i,2] = cinfo[c1,1];
        rep.pm[i,3] = cinfo[c1,2];
        rep.pm[i,4] = cinfo[c0,3];
        rep.pm[i,5] = cinfo[c1,3];
    }
    for(i=0; i<=npoints-1; i++)
    {
        alglib.ap.assert(cinfo[i,1]==cinfo[i,2]);
        rep.p[i] = cinfo[i,1];
    }

    //
    // Calculate Rep.PZ
    //
    rep.pz = new int[npoints-1, 2];
    for(i=0; i<=npoints-2; i++)
    {
        rep.pz[i,0] = rep.z[i,0];
        rep.pz[i,1] = rep.z[i,1];
        if( rep.pz[i,0]<npoints )
        {
            rep.pz[i,0] = rep.p[rep.pz[i,0]];
        }
        if( rep.pz[i,1]<npoints )
        {
            rep.pz[i,1] = rep.p[rep.pz[i,1]];
        }
    }
}
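/*************************************************************************
Editorial note (not part of ALGLIB itself): the per-merge distance updates
in ClusterizerRunAHCInternal() are instances of the Lance-Williams
recurrence

    d(Cnew,i) = a0*d(C0,i) + a1*d(C1,i) + b*d(C0,C1) + g*|d(C0,i)-d(C1,i)|

with the following coefficients (N0, N1, Ni are the sizes of C0, C1 and
the i-th cluster):
    * complete linkage (AHCAlgo=0):  a0=a1=1/2, b=0, g=+1/2  (i.e. max)
    * single linkage   (AHCAlgo=1):  a0=a1=1/2, b=0, g=-1/2  (i.e. min)
    * unweighted avg   (AHCAlgo=2):  a0=N0/(N0+N1), a1=N1/(N0+N1), b=g=0
    * weighted avg     (AHCAlgo=3):  a0=a1=1/2, b=g=0
    * Ward's method    (AHCAlgo=4):  a0=(N0+Ni)/S, a1=(N1+Ni)/S, b=-Ni/S,
                                     g=0, where S=N0+N1+Ni; applied to
                                     0.5*squared Euclidean distances, as
                                     prepared before the merge loop.
*************************************************************************/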