Esempio n. 1
0
        /*************************************************************************
        This function performs clustering by k-means++ algorithm.

        You may change algorithm properties by calling:
        * ClusterizerSetKMeansLimits() to change number of restarts or iterations
        * ClusterizerSetKMeansInit() to change initialization algorithm

        By  default,  one  restart  and  unlimited number of iterations are  used.
        Initialization algorithm is chosen automatically.

        COMMERCIAL EDITION OF ALGLIB:

          ! Commercial version of ALGLIB includes  two important  improvements  of
          ! this function:
          ! * multicore support (can be used from C# and C++)
          ! * access to high-performance C++ core (actual for C# users)
          !
          ! K-means clustering  algorithm has two  phases:  selection  of  initial
          ! centers  and  clustering  itself.  ALGLIB  parallelizes  both  phases.
          ! Parallel version is optimized for the following  scenario:  medium  or
          ! high-dimensional problem (20 or more dimensions) with large number  of
          ! points and clusters. However, some speed-up can be obtained even  when
          ! assumptions above are violated.
          !
          ! As for native-vs-managed comparison, working with native  core  brings
          ! 30-40% improvement in speed over pure C# version of ALGLIB.
          !
          ! We recommend you to read 'Working with commercial version' section  of
          ! ALGLIB Reference Manual in order to find out how to  use  performance-
          ! related features provided by commercial edition of ALGLIB.

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            K       -   number of clusters, K>=0.
                        K  can  be  zero only when algorithm is called  for  empty
                        dataset,  in   this   case   completion  code  is  set  to
                        success (+1).
                        If  K=0  and  dataset  size  is  non-zero,  we   can   not
                        meaningfully assign points to some center  (there  are  no
                        centers because K=0) and  return  -3  as  completion  code
                        (failure).

        OUTPUT PARAMETERS:
            Rep     -   clustering results; see description of KMeansReport
                        structure for more information.

        NOTE 1: k-means  clustering  can  be  performed  only  for  datasets  with
                Euclidean  distance  function.  Algorithm  will  return   negative
                completion code in Rep.TerminationType in case dataset  was  added
                to clusterizer with DistType other than Euclidean (or dataset  was
                specified by distance matrix instead of explicitly given points).

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizerrunkmeans(clusterizerstate s,
            int k,
            kmeansreport rep)
        {
            // Throwaway matrix handed by reference to the internal routine;
            // its contents are never read back here.
            double[,] scratch = new double[0,0];

            alglib.ap.assert(k>=0, "ClusterizerRunKMeans: K<0");

            int pointcount = s.npoints;

            //
            // Decide whether we can finish without running the algorithm.
            // EarlyCode=0 means "no early exit"; any other value is the
            // completion code to report:
            // * -5  distance type is not Euclidean (DistType<>2)
            // * -3  K>NPoints, or K=0 with non-empty dataset
            // * +1  empty dataset (nothing to do, success)
            //
            int earlycode = 0;
            if( s.disttype!=2 )
            {
                earlycode = -5;
            }
            else if( k>pointcount || (k==0 && pointcount>0) )
            {
                earlycode = -3;
            }
            else if( pointcount==0 )
            {
                earlycode = 1;
            }
            if( earlycode!=0 )
            {
                rep.npoints = pointcount;
                rep.terminationtype = earlycode;
                rep.k = k;
                rep.iterationscount = 0;
                rep.energy = 0.0;
                return;
            }

            //
            // Normal case:
            // 1<=K<=NPoints, Euclidean distance.
            // Report header is filled here, everything else is written
            // by the internal k-means routine through REF arguments.
            //
            rep.npoints = pointcount;
            rep.nfeatures = s.nfeatures;
            rep.k = k;
            kmeansgenerateinternal(s.xy, pointcount, s.nfeatures, k, s.kmeansinitalgo, s.kmeansmaxits, s.kmeansrestarts, s.kmeansdbgnoits, ref rep.terminationtype, ref rep.iterationscount, ref scratch, false, ref rep.c, true, ref rep.cidx, ref rep.energy, s.kmeanstmp);
        }
Esempio n. 2
0
 /*************************************************************************
 Single-threaded stub. HPC ALGLIB replaces it by multithreaded code.
 *************************************************************************/
 public static void _pexec_clusterizerrunkmeans(clusterizerstate s,
     int k,
     kmeansreport rep)
 {
     // Forwards to the serial implementation with exactly the same arguments;
     // results are delivered through Rep, nothing is returned.
     clusterizerrunkmeans(s,k,rep);
 }
Esempio n. 3
0
        /*************************************************************************
        This function performs agglomerative hierarchical clustering

        COMMERCIAL EDITION OF ALGLIB:

          ! Commercial version of ALGLIB includes two  important  improvements  of
          ! this function, which can be used from C++ and C#:
          ! * Intel MKL support (lightweight Intel MKL is shipped with ALGLIB)
          ! * multicore support
          !
          ! Agglomerative  hierarchical  clustering  algorithm  has  two   phases:
          ! distance matrix calculation  and  clustering  itself. Only first phase
          ! (distance matrix calculation) is accelerated by Intel MKL  and  multi-
          ! threading. Thus, acceleration is significant only for  medium or high-
          ! dimensional problems.
          !
          ! We recommend you to read 'Working with commercial version' section  of
          ! ALGLIB Reference Manual in order to find out how to  use  performance-
          ! related features provided by commercial edition of ALGLIB.

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()

        OUTPUT PARAMETERS:
            Rep     -   clustering results; see description of AHCReport
                        structure for more information.

        NOTE 1: hierarchical clustering algorithms require large amounts of memory.
                In particular, this implementation needs  sizeof(double)*NPoints^2
                bytes, which are used to store distance matrix. In  case  we  work
                with user-supplied matrix, this amount is multiplied by 2 (we have
                to store original matrix and to work with its copy).
                
                For example, problem with 10000 points  would require 800M of RAM,
                even when working in a 1-dimensional space.

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizerrunahc(clusterizerstate s,
            ahcreport rep)
        {
            int pointcount = s.npoints;
            int featurecount = s.nfeatures;

            //
            // Rep.NPoints is always filled.
            //
            // Datasets with zero or one point need no merges at all: we return
            // empty Z/PZ/PM/MergeDist and a permutation P whose length equals
            // NPoints (for one point it is the single zero-valued entry that
            // array allocation already gives us).
            //
            rep.npoints = pointcount;
            if( pointcount==0 || pointcount==1 )
            {
                rep.p = new int[pointcount];
                rep.z = new int[0, 0];
                rep.pz = new int[0, 0];
                rep.pm = new int[0, 0];
                rep.mergedist = new double[0];
                rep.terminationtype = 1;
                return;
            }

            //
            // Dataset was given by distance matrix (DistType=-1):
            // cluster directly on the user-supplied matrix S.D.
            //
            if( s.disttype==-1 )
            {
                clusterizerrunahcinternal(s, ref s.d, rep);
                return;
            }

            //
            // Dataset was given by points.
            // AHCAlgo=4 is accepted only together with DistType=2;
            // any other pairing is reported as failure (-5).
            //
            if( s.ahcalgo==4 && s.disttype!=2 )
            {
                rep.terminationtype = -5;
                return;
            }

            //
            // Build distance matrix in S.TmpD, then cluster on it.
            //
            clusterizergetdistancesbuf(s.distbuf, s.xy, pointcount, featurecount, s.disttype, ref s.tmpd);
            clusterizerrunahcinternal(s, ref s.tmpd, rep);
        }
Esempio n. 4
0
 /*************************************************************************
 Single-threaded stub. HPC ALGLIB replaces it by multithreaded code.
 *************************************************************************/
 public static void _pexec_clusterizerrunahc(clusterizerstate s,
     ahcreport rep)
 {
     // Forwards to the serial implementation with exactly the same arguments;
     // results are delivered through Rep, nothing is returned.
     clusterizerrunahc(s,rep);
 }
Esempio n. 5
0
        /*************************************************************************
        This  function  sets k-means properties:  number  of  restarts and maximum
        number of iterations per one run.

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            Restarts-   restarts count, >=1.
                        k-means++ algorithm performs several restarts and  chooses
                        best set of centers (one with minimum squared distance).
            MaxIts  -   maximum number of k-means iterations performed during  one
                        run. >=0, zero value means that algorithm performs unlimited
                        number of iterations.

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizersetkmeanslimits(clusterizerstate s,
            int restarts,
            int maxits)
        {
            //
            // Validate both limits before storing anything:
            // at least one restart is required, MaxIts must be non-negative.
            //
            alglib.ap.assert(restarts>0, "ClusterizerSetKMeansLimits: Restarts<=0");
            alglib.ap.assert(!(maxits<0), "ClusterizerSetKMeansLimits: MaxIts<0");

            //
            // Store limits in the clusterizer state
            //
            s.kmeansmaxits = maxits;
            s.kmeansrestarts = restarts;
        }
Esempio n. 6
0
        /*************************************************************************
        This function sets k-means  initialization  algorithm.  Several  different
        algorithms can be chosen, including k-means++.

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            InitAlgo-   initialization algorithm:
                        * 0  automatic selection ( different  versions  of  ALGLIB
                             may select different algorithms)
                        * 1  random initialization
                        * 2  k-means++ initialization  (best  quality  of  initial
                             centers, but long  non-parallelizable  initialization
                             phase with bad cache locality)
                        * 3  "fast-greedy"  algorithm  with  efficient,  easy   to
                             parallelize initialization. Quality of initial centers
                             is  somewhat  worse  than  that  of  k-means++.  This
                             algorithm is a default one in the current version  of
                             ALGLIB.
                        *-1  "debug" algorithm which always selects first  K  rows
                             of dataset; this algorithm is used for debug purposes
                             only. Do not use it in the industrial code!

          -- ALGLIB --
             Copyright 21.01.2015 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizersetkmeansinit(clusterizerstate s,
            int initalgo)
        {
            //
            // Accepted codes form the contiguous range [-1,3]
            //
            bool isknownalgo = -1<=initalgo && initalgo<=3;
            alglib.ap.assert(isknownalgo, "ClusterizerSetKMeansInit: InitAlgo is incorrect");

            s.kmeansinitalgo = initalgo;
        }
Esempio n. 7
0
        /*************************************************************************
        This function adds dataset given by distance  matrix  to  the  clusterizer
        structure. It is important that dataset is not  given  explicitly  -  only
        distance matrix is given.

        This function overrides all previous calls  of  ClusterizerSetPoints()  or
        ClusterizerSetDistances().

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            D       -   array[NPoints,NPoints], distance matrix given by its upper
                        or lower triangle (main diagonal is  ignored  because  its
                        entries are expected to be zero).
            NPoints -   number of points
            IsUpper -   whether upper or lower triangle of D is given.
                
        NOTE 1: different clustering algorithms have different limitations:
                * agglomerative hierarchical clustering algorithms may be used with
                  any kind of distance metric, including one  which  is  given  by
                  distance matrix
                * k-means++ clustering algorithm may be used only  with  Euclidean
                  distance function and explicitly given points - it  can  not  be
                  used with dataset given by distance matrix
                Thus, if you call this function, you will be unable to use k-means
                clustering algorithm to process your problem.

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizersetdistances(clusterizerstate s,
            double[,] d,
            int npoints,
            bool isupper)
        {
            int i = 0;
            int j = 0;
            int j0 = 0;
            int j1 = 0;

            alglib.ap.assert(npoints>=0, "ClusterizerSetDistances: NPoints<0");
            alglib.ap.assert(alglib.ap.rows(d)>=npoints, "ClusterizerSetDistances: Rows(D)<NPoints");
            alglib.ap.assert(alglib.ap.cols(d)>=npoints, "ClusterizerSetDistances: Cols(D)<NPoints");
            
            //
            // Validate the selected triangle of D BEFORE touching the state.
            //
            // If any entry is rejected, the clusterizer keeps its previous
            // dataset intact instead of being left with new NPoints/DistType
            // and a partially overwritten distance matrix.
            //
            // For row I the selected triangle spans columns:
            // * [I+1,NPoints-1] when upper triangle is given
            // * [0,I-1]         when lower triangle is given
            // Main diagonal is never read.
            //
            for(i=0; i<=npoints-1; i++)
            {
                j0 = isupper ? i+1 : 0;
                j1 = isupper ? npoints-1 : i-1;
                for(j=j0; j<=j1; j++)
                {
                    alglib.ap.assert(math.isfinite(d[i,j]) && (double)(d[i,j])>=(double)(0), "ClusterizerSetDistances: D contains infinite, NAN or negative elements");
                }
            }
            
            //
            // Input is valid, commit it to the state.
            // NFeatures=0 and DistType=-1 mark dataset as "given by distance matrix".
            //
            s.npoints = npoints;
            s.nfeatures = 0;
            s.disttype = -1;
            apserv.rmatrixsetlengthatleast(ref s.d, npoints, npoints);
            
            //
            // Copy selected triangle into S.D, mirror it to the other triangle
            // (so S.D is a full symmetric matrix) and zero the main diagonal.
            //
            for(i=0; i<=npoints-1; i++)
            {
                j0 = isupper ? i+1 : 0;
                j1 = isupper ? npoints-1 : i-1;
                for(j=j0; j<=j1; j++)
                {
                    s.d[i,j] = d[i,j];
                    s.d[j,i] = d[i,j];
                }
                s.d[i,i] = 0;
            }
        }
Esempio n. 8
0
        /*************************************************************************
        This function sets agglomerative hierarchical clustering algorithm

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            Algo    -   algorithm type:
                        * 0     complete linkage (default algorithm)
                        * 1     single linkage
                        * 2     unweighted average linkage
                        * 3     weighted average linkage
                        * 4     Ward's method

        NOTE: Ward's method works correctly only with Euclidean  distance,  that's
              why algorithm will return negative termination  code  (failure)  for
              any other distance type.
              
              It is possible, however,  to  use  this  method  with  user-supplied
              distance matrix. It  is  your  responsibility  to pass one which was
              calculated with Euclidean distance function.

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizersetahcalgo(clusterizerstate s,
            int algo)
        {
            //
            // Supported algorithm codes are the contiguous range [0,4]:
            // complete, single, unweighted average, weighted average, Ward.
            //
            // Message prefix names this very function (ClusterizerSetAHCAlgo),
            // so that a failed check can be traced back to the right call.
            //
            alglib.ap.assert(algo>=0 && algo<=4, "ClusterizerSetAHCAlgo: incorrect algorithm type");
            s.ahcalgo = algo;
        }
Esempio n. 9
0
        /*************************************************************************
        This function initializes clusterizer object. Newly initialized object  is
        empty, i.e. it does not contain dataset. You should use it as follows:
        1. creation
        2. dataset is added with ClusterizerSetPoints()
        3. additional parameters are set
        4. clusterization is performed with one of the clustering functions

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizercreate(clusterizerstate s)
        {
            //
            // Empty dataset; Euclidean distance (DistType=2) is the default metric
            //
            s.disttype = 2;
            s.nfeatures = 0;
            s.npoints = 0;

            //
            // AHC defaults: algorithm 0
            //
            s.ahcalgo = 0;

            //
            // K-means defaults: automatic initialization (0), one restart,
            // MaxIts=0, debug "no iterations" flag switched off
            //
            s.kmeansinitalgo = 0;
            s.kmeansrestarts = 1;
            s.kmeansmaxits = 0;
            s.kmeansdbgnoits = false;

            //
            // Prepare k-means temporaries
            //
            kmeansinitbuf(s.kmeanstmp);
        }
Esempio n. 10
0
        /*************************************************************************
        This function adds dataset to the clusterizer structure.

        This function overrides all previous calls  of  ClusterizerSetPoints()  or
        ClusterizerSetDistances().

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            XY      -   array[NPoints,NFeatures], dataset
            NPoints -   number of points, >=0
            NFeatures-  number of features, >=1
            DistType-   distance function:
                        *  0    Chebyshev distance  (L-inf norm)
                        *  1    city block distance (L1 norm)
                        *  2    Euclidean distance  (L2 norm), non-squared
                        * 10    Pearson correlation:
                                dist(a,b) = 1-corr(a,b)
                        * 11    Absolute Pearson correlation:
                                dist(a,b) = 1-|corr(a,b)|
                        * 12    Uncentered Pearson correlation (cosine of the angle):
                                dist(a,b) = a'*b/(|a|*|b|)
                        * 13    Absolute uncentered Pearson correlation
                                dist(a,b) = |a'*b|/(|a|*|b|)
                        * 20    Spearman rank correlation:
                                dist(a,b) = 1-rankcorr(a,b)
                        * 21    Absolute Spearman rank correlation
                                dist(a,b) = 1-|rankcorr(a,b)|

        NOTE 1: different distance functions have different performance penalty:
                * Euclidean or Pearson correlation distances are the fastest ones
                * Spearman correlation distance function is a bit slower
                * city block and Chebyshev distances are order of magnitude slower
               
                The reason behind the difference in performance is that correlation-based
                distance functions are computed using optimized linear algebra kernels,
                while Chebyshev and city block distance functions are computed using
                simple nested loops with two branches at each iteration.
                
        NOTE 2: different clustering algorithms have different limitations:
                * agglomerative hierarchical clustering algorithms may be used with
                  any kind of distance metric
                * k-means++ clustering algorithm may be used only  with  Euclidean
                  distance function
                Thus, list of specific clustering algorithms you may  use  depends
                on distance function you specify when you set your dataset.
               
          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        public static void clusterizersetpoints(clusterizerstate s,
            double[,] xy,
            int npoints,
            int nfeatures,
            int disttype)
        {
            //
            // Supported distance codes: 0..2 (norm-based), 10..13 (Pearson family),
            // 20 and 21 (Spearman family)
            //
            bool isknowndisttype = (disttype>=0 && disttype<=2) || (disttype>=10 && disttype<=13) || disttype==20 || disttype==21;

            //
            // Check inputs; nothing is written to the state until all checks pass
            //
            alglib.ap.assert(isknowndisttype, "ClusterizerSetPoints: incorrect DistType");
            alglib.ap.assert(npoints>=0, "ClusterizerSetPoints: NPoints<0");
            alglib.ap.assert(nfeatures>=1, "ClusterizerSetPoints: NFeatures<1");
            alglib.ap.assert(alglib.ap.rows(xy)>=npoints, "ClusterizerSetPoints: Rows(XY)<NPoints");
            alglib.ap.assert(alglib.ap.cols(xy)>=nfeatures, "ClusterizerSetPoints: Cols(XY)<NFeatures");
            alglib.ap.assert(apserv.apservisfinitematrix(xy, npoints, nfeatures), "ClusterizerSetPoints: XY contains NAN/INF");

            //
            // Store dataset description
            //
            s.disttype = disttype;
            s.nfeatures = nfeatures;
            s.npoints = npoints;

            //
            // Copy leading NPoints*NFeatures submatrix of XY into internal storage
            //
            apserv.rmatrixsetlengthatleast(ref s.xy, npoints, nfeatures);
            for(int row=0; row<npoints; row++)
            {
                for(int col=0; col<nfeatures; col++)
                {
                    s.xy[row,col] = xy[row,col];
                }
            }
        }
Esempio n. 11
0
 public override alglib.apobject make_copy()
 {
     clusterizerstate copy = new clusterizerstate();

     // Plain value settings are copied as is.
     copy.npoints = this.npoints;
     copy.nfeatures = this.nfeatures;
     copy.disttype = this.disttype;
     copy.ahcalgo = this.ahcalgo;
     copy.kmeansrestarts = this.kmeansrestarts;
     copy.kmeansmaxits = this.kmeansmaxits;
     copy.kmeansinitalgo = this.kmeansinitalgo;
     copy.kmeansdbgnoits = this.kmeansdbgnoits;

     // Matrices are cloned so the copy gets its own array instances.
     copy.xy = (double[,])this.xy.Clone();
     copy.d = (double[,])this.d.Clone();
     copy.tmpd = (double[,])this.tmpd.Clone();

     // Nested buffer objects are duplicated through their own make_copy().
     copy.distbuf = (apserv.apbuffers)this.distbuf.make_copy();
     copy.kmeanstmp = (kmeansbuffers)this.kmeanstmp.make_copy();

     return copy;
 }
Esempio n. 12
0
        /*************************************************************************
        This  function  performs  agglomerative  hierarchical  clustering    using
        precomputed  distance  matrix.  Internal  function,  should  not be called
        directly.

        INPUT PARAMETERS:
            S       -   clusterizer state, initialized by ClusterizerCreate()
            D       -   distance matrix, array[S.NPoints,S.NPoints]
                        Contents of the matrix is destroyed during
                        algorithm operation.

        OUTPUT PARAMETERS:
            Rep     -   clustering results; see description of AHCReport
                        structure for more information.

          -- ALGLIB --
             Copyright 10.07.2012 by Bochkanov Sergey
        *************************************************************************/
        private static void clusterizerrunahcinternal(clusterizerstate s,
            ref double[,] d,
            ahcreport rep)
        {
            int i = 0;
            int j = 0;
            int k = 0;
            double v = 0;
            int mergeidx = 0;
            int c0 = 0;
            int c1 = 0;
            int s0 = 0;
            int s1 = 0;
            int ar = 0;
            int br = 0;
            int npoints = 0;
            int[] cidx = new int[0];
            int[] csizes = new int[0];
            int[] nnidx = new int[0];
            int[,] cinfo = new int[0,0];
            int n0 = 0;
            int n1 = 0;
            int ni = 0;
            double d01 = 0;

            npoints = s.npoints;
            
            //
            // Fill Rep.NPoints, quick exit when NPoints<=1
            //
            rep.npoints = npoints;
            if( npoints==0 )
            {
                rep.p = new int[0];
                rep.z = new int[0, 0];
                rep.pz = new int[0, 0];
                rep.pm = new int[0, 0];
                rep.mergedist = new double[0];
                rep.terminationtype = 1;
                return;
            }
            if( npoints==1 )
            {
                rep.p = new int[1];
                rep.z = new int[0, 0];
                rep.pz = new int[0, 0];
                rep.pm = new int[0, 0];
                rep.mergedist = new double[0];
                rep.p[0] = 0;
                rep.terminationtype = 1;
                return;
            }
            rep.z = new int[npoints-1, 2];
            rep.mergedist = new double[npoints-1];
            rep.terminationtype = 1;
            
            //
            // Build list of nearest neighbors
            //
            nnidx = new int[npoints];
            for(i=0; i<=npoints-1; i++)
            {
                
                //
                // Calculate index of the nearest neighbor
                //
                k = -1;
                v = math.maxrealnumber;
                for(j=0; j<=npoints-1; j++)
                {
                    if( j!=i && (double)(d[i,j])<(double)(v) )
                    {
                        k = j;
                        v = d[i,j];
                    }
                }
                alglib.ap.assert((double)(v)<(double)(math.maxrealnumber), "ClusterizerRunAHC: internal error");
                nnidx[i] = k;
            }
            
            //
            // For AHCAlgo=4 (Ward's method) replace distances by their squares times 0.5
            //
            if( s.ahcalgo==4 )
            {
                for(i=0; i<=npoints-1; i++)
                {
                    for(j=0; j<=npoints-1; j++)
                    {
                        d[i,j] = 0.5*d[i,j]*d[i,j];
                    }
                }
            }
            
            //
            // Distance matrix is built, perform merges.
            //
            // NOTE 1: CIdx is array[NPoints] which maps rows/columns of the
            //         distance matrix D to indexes of clusters. Values of CIdx
            //         from [0,NPoints) denote single-point clusters, and values
            //         from [NPoints,2*NPoints-1) denote ones obtained by merging
            //         smaller clusters. Negative calues correspond to absent clusters.
            //
            //         Initially it contains [0...NPoints-1], after each merge
            //         one element of CIdx (one with index C0) is replaced by
            //         NPoints+MergeIdx, and another one with index C1 is
            //         rewritten by -1.
            // 
            // NOTE 2: CSizes is array[NPoints] which stores sizes of clusters.
            //         
            //
            cidx = new int[npoints];
            csizes = new int[npoints];
            for(i=0; i<=npoints-1; i++)
            {
                cidx[i] = i;
                csizes[i] = 1;
            }
            for(mergeidx=0; mergeidx<=npoints-2; mergeidx++)
            {
                
                //
                // Select pair of clusters (C0,C1) with CIdx[C0]<CIdx[C1] to merge.
                //
                c0 = -1;
                c1 = -1;
                d01 = math.maxrealnumber;
                for(i=0; i<=npoints-1; i++)
                {
                    if( cidx[i]>=0 )
                    {
                        if( (double)(d[i,nnidx[i]])<(double)(d01) )
                        {
                            c0 = i;
                            c1 = nnidx[i];
                            d01 = d[i,nnidx[i]];
                        }
                    }
                }
                alglib.ap.assert((double)(d01)<(double)(math.maxrealnumber), "ClusterizerRunAHC: internal error");
                if( cidx[c0]>cidx[c1] )
                {
                    i = c1;
                    c1 = c0;
                    c0 = i;
                }
                
                //
                // Fill one row of Rep.Z and one element of Rep.MergeDist
                //
                rep.z[mergeidx,0] = cidx[c0];
                rep.z[mergeidx,1] = cidx[c1];
                rep.mergedist[mergeidx] = d01;
                
                //
                // Update distance matrix:
                // * row/column C0 are updated by distances to the new cluster
                // * row/column C1 are considered empty (we can fill them by zeros,
                //   but do not want to spend time - we just ignore them)
                //
                // NOTE: it is important to update distance matrix BEFORE CIdx/CSizes
                //       are updated.
                //
                alglib.ap.assert((((s.ahcalgo==0 || s.ahcalgo==1) || s.ahcalgo==2) || s.ahcalgo==3) || s.ahcalgo==4, "ClusterizerRunAHC: internal error");
                for(i=0; i<=npoints-1; i++)
                {
                    if( i!=c0 && i!=c1 )
                    {
                        n0 = csizes[c0];
                        n1 = csizes[c1];
                        ni = csizes[i];
                        if( s.ahcalgo==0 )
                        {
                            d[i,c0] = Math.Max(d[i,c0], d[i,c1]);
                        }
                        if( s.ahcalgo==1 )
                        {
                            d[i,c0] = Math.Min(d[i,c0], d[i,c1]);
                        }
                        if( s.ahcalgo==2 )
                        {
                            d[i,c0] = (csizes[c0]*d[i,c0]+csizes[c1]*d[i,c1])/(csizes[c0]+csizes[c1]);
                        }
                        if( s.ahcalgo==3 )
                        {
                            d[i,c0] = (d[i,c0]+d[i,c1])/2;
                        }
                        if( s.ahcalgo==4 )
                        {
                            d[i,c0] = ((n0+ni)*d[i,c0]+(n1+ni)*d[i,c1]-ni*d01)/(n0+n1+ni);
                        }
                        d[c0,i] = d[i,c0];
                    }
                }
                
                //
                // Update CIdx and CSizes
                //
                cidx[c0] = npoints+mergeidx;
                cidx[c1] = -1;
                csizes[c0] = csizes[c0]+csizes[c1];
                csizes[c1] = 0;
                
                //
                // Update nearest neighbors array:
                // * update nearest neighbors of everything except for C0/C1
                // * update neighbors of C0/C1
                //
                for(i=0; i<=npoints-1; i++)
                {
                    if( (cidx[i]>=0 && i!=c0) && (nnidx[i]==c0 || nnidx[i]==c1) )
                    {
                        
                        //
                        // I-th cluster which is distinct from C0/C1 has former C0/C1 cluster as its nearest
                        // neighbor. We handle this issue depending on specific AHC algorithm being used.
                        //
                        if( s.ahcalgo==1 )
                        {
                            
                            //
                            // Single linkage. Merging of two clusters together
                            // does NOT change distances between new cluster and
                            // other clusters.
                            //
                            // The only thing we have to do is to update nearest neighbor index
                            //
                            nnidx[i] = c0;
                        }
                        else
                        {
                            
                            //
                            // Something other than single linkage. We have to re-examine
                            // all the row to find nearest neighbor.
                            //
                            k = -1;
                            v = math.maxrealnumber;
                            for(j=0; j<=npoints-1; j++)
                            {
                                if( (cidx[j]>=0 && j!=i) && (double)(d[i,j])<(double)(v) )
                                {
                                    k = j;
                                    v = d[i,j];
                                }
                            }
                            alglib.ap.assert((double)(v)<(double)(math.maxrealnumber) || mergeidx==npoints-2, "ClusterizerRunAHC: internal error");
                            nnidx[i] = k;
                        }
                    }
                }
                k = -1;
                v = math.maxrealnumber;
                for(j=0; j<=npoints-1; j++)
                {
                    if( (cidx[j]>=0 && j!=c0) && (double)(d[c0,j])<(double)(v) )
                    {
                        k = j;
                        v = d[c0,j];
                    }
                }
                alglib.ap.assert((double)(v)<(double)(math.maxrealnumber) || mergeidx==npoints-2, "ClusterizerRunAHC: internal error");
                nnidx[c0] = k;
            }
            
            //
            // Calculate Rep.P and Rep.PM.
            //
            // In order to do that, we fill CInfo matrix - (2*NPoints-1)*4 matrix,
            // with I-th row containing:
            // * CInfo[I,0]     -   size of I-th cluster
            // * CInfo[I,1]     -   beginning of I-th cluster
            // * CInfo[I,2]     -   end of I-th cluster
            // * CInfo[I,3]     -   height of I-th cluster
            //
            // We perform it as follows:
            // * first NPoints clusters have unit size (CInfo[I,0]=1) and zero
            //   height (CInfo[I,3]=0)
            // * we replay NPoints-1 merges from first to last and fill sizes of
            //   corresponding clusters (new size is a sum of sizes of clusters
            //   being merged) and height (new height is max(heights)+1).
            // * now we ready to determine locations of clusters. Last cluster
            //   spans entire dataset, we know it. We replay merges from last to
            //   first, during each merge we already know location of the merge
            //   result, and we can position first cluster to the left part of
            //   the result, and second cluster to the right part.
            //
            rep.p = new int[npoints];
            rep.pm = new int[npoints-1, 6];
            cinfo = new int[2*npoints-1, 4];
            for(i=0; i<=npoints-1; i++)
            {
                cinfo[i,0] = 1;
                cinfo[i,3] = 0;
            }
            for(i=0; i<=npoints-2; i++)
            {
                cinfo[npoints+i,0] = cinfo[rep.z[i,0],0]+cinfo[rep.z[i,1],0];
                cinfo[npoints+i,3] = Math.Max(cinfo[rep.z[i,0],3], cinfo[rep.z[i,1],3])+1;
            }
            cinfo[2*npoints-2,1] = 0;
            cinfo[2*npoints-2,2] = npoints-1;
            for(i=npoints-2; i>=0; i--)
            {
                
                //
                // We merge C0 which spans [A0,B0] and C1 (spans [A1,B1]),
                // with unknown A0, B0, A1, B1. However, we know that result
                // is CR, which spans [AR,BR] with known AR/BR, and we know
                // sizes of C0, C1, CR (denotes as S0, S1, SR).
                //
                c0 = rep.z[i,0];
                c1 = rep.z[i,1];
                s0 = cinfo[c0,0];
                s1 = cinfo[c1,0];
                ar = cinfo[npoints+i,1];
                br = cinfo[npoints+i,2];
                cinfo[c0,1] = ar;
                cinfo[c0,2] = ar+s0-1;
                cinfo[c1,1] = br-(s1-1);
                cinfo[c1,2] = br;
                rep.pm[i,0] = cinfo[c0,1];
                rep.pm[i,1] = cinfo[c0,2];
                rep.pm[i,2] = cinfo[c1,1];
                rep.pm[i,3] = cinfo[c1,2];
                rep.pm[i,4] = cinfo[c0,3];
                rep.pm[i,5] = cinfo[c1,3];
            }
            for(i=0; i<=npoints-1; i++)
            {
                alglib.ap.assert(cinfo[i,1]==cinfo[i,2]);
                rep.p[i] = cinfo[i,1];
            }
            
            //
            // Calculate Rep.PZ
            //
            rep.pz = new int[npoints-1, 2];
            for(i=0; i<=npoints-2; i++)
            {
                rep.pz[i,0] = rep.z[i,0];
                rep.pz[i,1] = rep.z[i,1];
                if( rep.pz[i,0]<npoints )
                {
                    rep.pz[i,0] = rep.p[rep.pz[i,0]];
                }
                if( rep.pz[i,1]<npoints )
                {
                    rep.pz[i,1] = rep.p[rep.pz[i,1]];
                }
            }
        }
Esempio n. 13
0
            /*************************************************************************
            This function performs agglomerative hierarchical clustering

            FOR USERS OF SMP EDITION:

              ! This function can utilize multicore capabilities of  your system.  In
              ! order to do this you have to call version with "smp_" prefix,   which
              ! indicates that multicore code will be used.
              ! 
              ! This note is given for users of SMP edition; if you use GPL  edition,
              ! or commercial edition of ALGLIB without SMP support, you  still  will
              ! be able to call smp-version of this function,  but  all  computations
              ! will be done serially.
              !
              ! We recommend you to carefully read ALGLIB Reference  Manual,  section
              ! called 'SMP support', before using parallel version of this function.
              !
              ! You should remember that starting/stopping worker threads always has
              ! a non-zero cost. The multicore version is pretty efficient  on  large
              ! problems  which  need  more  than  1.000.000 operations to be solved,
              ! gives  moderate  speed-up in mid-range (from 100.000 to 1.000.000 CPU
              ! cycles), but gives no speed-up for small problems (less than  100.000
              ! operations).

            INPUT PARAMETERS:
                S       -   clusterizer state, initialized by ClusterizerCreate()

            OUTPUT PARAMETERS:
                Rep     -   clustering results; see description of AHCReport
                            structure for more information.

            NOTE 1: hierarchical clustering algorithms require large amounts of memory.
                    In particular, this implementation needs  sizeof(double)*NPoints^2
                    bytes, which are used to store distance matrix. In  case  we  work
                    with user-supplied matrix, this amount is multiplied by 2 (we have
                    to store original matrix and to work with its copy).
                
                    For example, problem with 10000 points  would require 800M of RAM,
                    even when working in a 1-dimensional space.

              -- ALGLIB --
                 Copyright 10.07.2012 by Bochkanov Sergey
            *************************************************************************/
            public static void clusterizerrunahc(clusterizerstate s,
                ahcreport rep)
            {
                // Snapshot the dataset dimensions held by the clusterizer state.
                int pointCount = s.npoints;
                int featureCount = s.nfeatures;

                // The report always carries the dataset size, including the
                // degenerate cases handled right below.
                rep.npoints = pointCount;

                //
                // Degenerate datasets (exactly zero or exactly one point): there is
                // nothing to merge, so all merge-related outputs are empty arrays.
                // Rep.P gets one entry per point; for the single-point case that
                // entry is 0, which is what a freshly allocated int array holds.
                //
                if (pointCount == 0 || pointCount == 1)
                {
                    rep.p = new int[pointCount];
                    rep.z = new int[0, 0];
                    rep.pz = new int[0, 0];
                    rep.pm = new int[0, 0];
                    rep.mergedist = new double[0];
                    return;
                }

                //
                // DistType=-1 marks a user-supplied distance matrix: hand S.D
                // directly to the internal AHC routine.
                //
                if (s.disttype == -1)
                {
                    clusterizerrunahcinternal(s, ref s.d, rep);
                    return;
                }

                //
                // Otherwise build the distance matrix from the stored points first,
                // then run the internal AHC routine on that freshly built matrix.
                //
                double[,] distances = new double[0, 0];
                clusterizergetdistances(s.xy, pointCount, featureCount, s.disttype, ref distances);
                clusterizerrunahcinternal(s, ref distances, rep);
            }
Esempio n. 14
0
            /*************************************************************************
            This function initializes clusterizer object. Newly initialized object  is
            empty, i.e. it does not contain dataset. You should use it as follows:
            1. creation
            2. dataset is added with ClusterizerSetPoints()
            3. additional parameters are set
            4. clusterization is performed with one of the clustering functions

              -- ALGLIB --
                 Copyright 10.07.2012 by Bochkanov Sergey
            *************************************************************************/
            public static void clusterizercreate(clusterizerstate s)
            {
                // Algorithm settings: distance type code 2 and AHC algorithm code 0.
                // NOTE(review): the meaning of these codes is defined by the
                // corresponding setter functions, which are not shown here.
                s.ahcalgo = 0;
                s.disttype = 2;

                // K-means limits: one restart and MaxIts=0.
                // NOTE(review): 0 presumably means "unlimited iterations" - confirm
                // against ClusterizerSetKMeansLimits().
                s.kmeansmaxits = 0;
                s.kmeansrestarts = 1;

                // Empty dataset: no points, no features.
                s.nfeatures = 0;
                s.npoints = 0;
            }
Esempio n. 15
0
 // Creates a new clusterizerstate carrying the same settings as this instance.
 // Scalar fields are copied by value; the XY and D matrices are duplicated
 // with Array.Clone(), so the copy gets its own array objects.
 public override alglib.apobject make_copy()
 {
     clusterizerstate copy = new clusterizerstate
     {
         npoints = npoints,
         nfeatures = nfeatures,
         disttype = disttype,
         xy = (double[,])xy.Clone(),
         d = (double[,])d.Clone(),
         ahcalgo = ahcalgo,
         kmeansrestarts = kmeansrestarts,
         kmeansmaxits = kmeansmaxits
     };
     return copy;
 }