Example #1
        public static void dfbuildinternal(double[,] xy,
            int npoints,
            int nvars,
            int nclasses,
            int ntrees,
            int samplesize,
            int nfeatures,
            int flags,
            ref int info,
            decisionforest df,
            dfreport rep)
        {
            int i = 0;
            int j = 0;
            int k = 0;
            int tmpi = 0;
            int lasttreeoffs = 0;
            int offs = 0;
            int ooboffs = 0;
            int treesize = 0;
            int nvarsinpool = 0;
            bool useevs = false;
            dfinternalbuffers bufs = new dfinternalbuffers();
            int[] permbuf = new int[0];
            double[] oobbuf = new double[0];
            int[] oobcntbuf = new int[0];
            double[,] xys = new double[0,0];
            double[] x = new double[0];
            double[] y = new double[0];
            int oobcnt = 0;
            int oobrelcnt = 0;
            double v = 0;
            double vmin = 0;
            double vmax = 0;
            bool bflag = false;
            int i_ = 0;
            int i1_ = 0;

            info = 0;

            
            //
            // Validate inputs
            //
            if( npoints<1 || samplesize<1 || samplesize>npoints || nvars<1 || nclasses<1 || ntrees<1 || nfeatures<1 )
            {
                info = -1;
                return;
            }
            if( nclasses>1 )
            {
                for(i=0; i<=npoints-1; i++)
                {
                    if( (int)Math.Round(xy[i,nvars])<0 || (int)Math.Round(xy[i,nvars])>=nclasses )
                    {
                        info = -2;
                        return;
                    }
                }
            }
            info = 1;
            
            //
            // Flags
            //
            useevs = flags/dfuseevs%2!=0;
            
            //
            // Allocate data, prepare header
            //
            treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize;
            permbuf = new int[npoints];
            bufs.treebuf = new double[treesize];
            bufs.idxbuf = new int[npoints];
            bufs.tmpbufr = new double[npoints];
            bufs.tmpbufr2 = new double[npoints];
            bufs.tmpbufi = new int[npoints];
            bufs.sortrbuf = new double[npoints];
            bufs.sortrbuf2 = new double[npoints];
            bufs.sortibuf = new int[npoints];
            bufs.varpool = new int[nvars];
            bufs.evsbin = new bool[nvars];
            bufs.evssplits = new double[nvars];
            bufs.classibuf = new int[2*nclasses];
            oobbuf = new double[nclasses*npoints];
            oobcntbuf = new int[npoints];
            df.trees = new double[ntrees*treesize];
            xys = new double[samplesize, nvars+1];
            x = new double[nvars];
            y = new double[nclasses];
            for(i=0; i<=npoints-1; i++)
            {
                permbuf[i] = i;
            }
            for(i=0; i<=npoints*nclasses-1; i++)
            {
                oobbuf[i] = 0;
            }
            for(i=0; i<=npoints-1; i++)
            {
                oobcntbuf[i] = 0;
            }
            
            //
            // Prepare variable pool; when EVS (extended variable
            // selection/splitting) is enabled, additionally:
            // 1. detect binary variables and pre-calculate splits for them
            // 2. detect variables whose values are all identical and exclude
            //    them from the pool
            //
            for(i=0; i<=nvars-1; i++)
            {
                bufs.varpool[i] = i;
            }
            nvarsinpool = nvars;
            if( useevs )
            {
                for(j=0; j<=nvars-1; j++)
                {
                    vmin = xy[0,j];
                    vmax = vmin;
                    for(i=0; i<=npoints-1; i++)
                    {
                        v = xy[i,j];
                        vmin = Math.Min(vmin, v);
                        vmax = Math.Max(vmax, v);
                    }
                    if( (double)(vmin)==(double)(vmax) )
                    {
                        
                        //
                        // exclude variable from pool
                        //
                        bufs.varpool[j] = bufs.varpool[nvarsinpool-1];
                        bufs.varpool[nvarsinpool-1] = -1;
                        nvarsinpool = nvarsinpool-1;
                        continue;
                    }
                    bflag = false;
                    for(i=0; i<=npoints-1; i++)
                    {
                        v = xy[i,j];
                        if( (double)(v)!=(double)(vmin) && (double)(v)!=(double)(vmax) )
                        {
                            bflag = true;
                            break;
                        }
                    }
                    if( bflag )
                    {
                        
                        //
                        // non-binary variable
                        //
                        bufs.evsbin[j] = false;
                    }
                    else
                    {
                        
                        //
                        // binary variable: pre-calculate its split
                        //
                        bufs.evsbin[j] = true;
                        bufs.evssplits[j] = 0.5*(vmin+vmax);
                        if( (double)(bufs.evssplits[j])<=(double)(vmin) )
                        {
                            bufs.evssplits[j] = vmax;
                        }
                    }
                }
            }
            
            //
            // RANDOM FOREST FORMAT
            // W[0]         -   size of array
            // W[1]         -   version number
            // W[2]         -   NVars
            // W[3]         -   NClasses (1 for regression)
            // W[4]         -   NTrees
            // W[5]         -   trees offset
            //
            //
            // TREE FORMAT
            // W[Offs]      -   size of sub-array
            //     node info:
            // W[K+0]       -   variable number        (-1 for leaf node)
            // W[K+1]       -   threshold              (class/value for leaf node)
            // W[K+2]       -   ">=" branch index      (absent for leaf node)
            //
            //
            df.nvars = nvars;
            df.nclasses = nclasses;
            df.ntrees = ntrees;
            
            //
            // Build forest
            //
            offs = 0;
            for(i=0; i<=ntrees-1; i++)
            {
                
                //
                // Prepare sample: a partial Fisher-Yates shuffle selects SampleSize
                // points without replacement; the rest serve as the out-of-bag set
                //
                for(k=0; k<=samplesize-1; k++)
                {
                    j = k+math.randominteger(npoints-k);
                    tmpi = permbuf[k];
                    permbuf[k] = permbuf[j];
                    permbuf[j] = tmpi;
                    j = permbuf[k];
                    for(i_=0; i_<=nvars;i_++)
                    {
                        xys[k,i_] = xy[j,i_];
                    }
                }
                
                //
                // build tree on the current sample, then append it to df.trees
                //
                dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs);
                j = (int)Math.Round(bufs.treebuf[0]);
                i1_ = (0) - (offs);
                for(i_=offs; i_<=offs+j-1;i_++)
                {
                    df.trees[i_] = bufs.treebuf[i_+i1_];
                }
                lasttreeoffs = offs;
                offs = offs+j;
                
                //
                // OOB estimates
                //
                for(k=samplesize; k<=npoints-1; k++)
                {
                    for(j=0; j<=nclasses-1; j++)
                    {
                        y[j] = 0;
                    }
                    j = permbuf[k];
                    for(i_=0; i_<=nvars-1;i_++)
                    {
                        x[i_] = xy[j,i_];
                    }
                    dfprocessinternal(df, lasttreeoffs, x, ref y);
                    i1_ = (0) - (j*nclasses);
                    for(i_=j*nclasses; i_<=(j+1)*nclasses-1;i_++)
                    {
                        oobbuf[i_] = oobbuf[i_] + y[i_+i1_];
                    }
                    oobcntbuf[j] = oobcntbuf[j]+1;
                }
            }
            df.bufsize = offs;
            
            //
            // Normalize OOB results
            //
            for(i=0; i<=npoints-1; i++)
            {
                if( oobcntbuf[i]!=0 )
                {
                    v = (double)1/(double)oobcntbuf[i];
                    for(i_=i*nclasses; i_<=i*nclasses+nclasses-1;i_++)
                    {
                        oobbuf[i_] = v*oobbuf[i_];
                    }
                }
            }
            
            //
            // Calculate training set estimates
            //
            rep.relclserror = dfrelclserror(df, xy, npoints);
            rep.avgce = dfavgce(df, xy, npoints);
            rep.rmserror = dfrmserror(df, xy, npoints);
            rep.avgerror = dfavgerror(df, xy, npoints);
            rep.avgrelerror = dfavgrelerror(df, xy, npoints);
            
            //
            // Calculate OOB estimates.
            //
            rep.oobrelclserror = 0;
            rep.oobavgce = 0;
            rep.oobrmserror = 0;
            rep.oobavgerror = 0;
            rep.oobavgrelerror = 0;
            oobcnt = 0;
            oobrelcnt = 0;
            for(i=0; i<=npoints-1; i++)
            {
                if( oobcntbuf[i]!=0 )
                {
                    ooboffs = i*nclasses;
                    if( nclasses>1 )
                    {
                        
                        //
                        // classification-specific code
                        //
                        k = (int)Math.Round(xy[i,nvars]);
                        tmpi = 0;
                        for(j=1; j<=nclasses-1; j++)
                        {
                            if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) )
                            {
                                tmpi = j;
                            }
                        }
                        if( tmpi!=k )
                        {
                            rep.oobrelclserror = rep.oobrelclserror+1;
                        }
                        if( (double)(oobbuf[ooboffs+k])!=(double)(0) )
                        {
                            rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]);
                        }
                        else
                        {
                            rep.oobavgce = rep.oobavgce-Math.Log(math.minrealnumber);
                        }
                        for(j=0; j<=nclasses-1; j++)
                        {
                            if( j==k )
                            {
                                rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]-1);
                                rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1);
                                rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1);
                                oobrelcnt = oobrelcnt+1;
                            }
                            else
                            {
                                rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]);
                                rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]);
                            }
                        }
                    }
                    else
                    {
                        
                        //
                        // regression-specific code
                        //
                        rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs]-xy[i,nvars]);
                        rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]);
                        if( (double)(xy[i,nvars])!=(double)(0) )
                        {
                            rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]);
                            oobrelcnt = oobrelcnt+1;
                        }
                    }
                    
                    //
                    // update OOB estimates count.
                    //
                    oobcnt = oobcnt+1;
                }
            }
            if( oobcnt>0 )
            {
                rep.oobrelclserror = rep.oobrelclserror/oobcnt;
                rep.oobavgce = rep.oobavgce/oobcnt;
                rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses));
                rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
                if( oobrelcnt>0 )
                {
                    rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
                }
            }
        }
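
The TREE FORMAT described above is easy to traverse. The sketch below is a hypothetical illustration of walking one serialized tree, not the library's dfprocessinternal; it assumes innernodewidth=3 (so the "<" child is stored immediately after its parent) and that -1 in the variable-number slot marks a leaf, as the format comment states:

        // Hypothetical sketch, not part of ALGLIB: evaluate one tree stored in
        // the flat layout documented in dfbuildinternal.
        private static double evaltreesketch(double[] trees, int treeoffs, double[] x)
        {
            int k = treeoffs+1;                 // skip W[Offs], the sub-array size
            while( true )
            {
                if( (double)(trees[k])==(double)(-1) )
                {
                    return trees[k+1];          // leaf: class index or regression value
                }
                if( (double)(x[(int)Math.Round(trees[k])])<(double)(trees[k+1]) )
                {
                    k = k+3;                    // "<" branch: assumed innernodewidth=3
                }
                else
                {
                    k = treeoffs+(int)Math.Round(trees[k+2]);   // ">=" branch index
                }
            }
        }

Judging from the OOB loop above, the library's dfprocessinternal accumulates each tree's leaf output into Y (one vote per class for classification, the predicted value for regression) rather than returning it.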
Example #2
        /*************************************************************************
        This subroutine builds a random decision forest. Unlike the basic
        version, it lets you tune the number of variables used when choosing
        the best split.

        INPUT PARAMETERS:
            XY          -   training set
            NPoints     -   training set size, NPoints>=1
            NVars       -   number of independent variables, NVars>=1
            NClasses    -   task type:
                            * NClasses=1 - regression task with one
                                           dependent variable
                            * NClasses>1 - classification task with
                                           NClasses classes.
            NTrees      -   number of trees in a forest, NTrees>=1.
                            recommended values: 50-100.
            NRndVars    -   number of variables used when choosing the best
                            split, 1<=NRndVars<=NVars.
            R           -   fraction of the training set used to build
                            individual trees. 0<R<=1.
                            recommended values: 0.1 <= R <= 0.66.

        OUTPUT PARAMETERS:
            Info        -   return code:
                            * -2, if there is a point with class number
                                  outside of [0..NClasses-1].
                            * -1, if incorrect parameters were passed
                                  (NPoints<1, NVars<1, NClasses<1, NTrees<1,
                                  NRndVars outside [1..NVars], R<=0 or R>1).
                            *  1, if task has been solved
            DF          -   model built
            Rep         -   training report, contains error on a training set
                            and out-of-bag estimates of generalization error.

          -- ALGLIB --
             Copyright 19.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfbuildrandomdecisionforestx1(double[,] xy,
            int npoints,
            int nvars,
            int nclasses,
            int ntrees,
            int nrndvars,
            double r,
            ref int info,
            decisionforest df,
            dfreport rep)
        {
            int samplesize = 0;

            info = 0;

            if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) )
            {
                info = -1;
                return;
            }
            if( nrndvars<=0 || nrndvars>nvars )
            {
                info = -1;
                return;
            }
            samplesize = Math.Max((int)Math.Round(r*npoints), 1);
            dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, nrndvars, dfusestrongsplits+dfuseevs, ref info, df, rep);
        }
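
A hypothetical call site for dfbuildrandomdecisionforestx1 (the data shapes and values below are illustrative assumptions, not taken from the library):

            // Hypothetical usage sketch: 2-class task, 2 features, 100 points.
            // Columns 0..1 of xy hold features, column 2 holds the class label.
            double[,] xy = new double[100, 3];
            // ... fill xy with training data ...
            int info = 0;
            decisionforest df = new decisionforest();
            dfreport rep = new dfreport();
            dfbuildrandomdecisionforestx1(xy, 100, 2, 2, 50, 1, 0.5, ref info, df, rep);
            // info==1 on success; rep.oobrelclserror is the out-of-bag
            // classification error estimate.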
    /*************************************************************************
    This subroutine builds a random decision forest.

    INPUT PARAMETERS:
        XY          -   training set
        NPoints     -   training set size, NPoints>=1
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   task type:
                        * NClasses=1 - regression task with one
                                       dependent variable
                        * NClasses>1 - classification task with
                                       NClasses classes.
        NTrees      -   number of trees in a forest, NTrees>=1.
                        recommended values: 50-100.
        R           -   fraction of the training set used to build
                        individual trees. 0<R<=1.
                        recommended values: 0.1 <= R <= 0.66.

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                              or R>1).
                        *  1, if task has been solved
        DF          -   model built
        Rep         -   training report, contains error on a training set
                        and out-of-bag estimates of generalization error.

      -- ALGLIB --
         Copyright 19.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfbuildrandomdecisionforest(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, out int info, out decisionforest df, out dfreport rep)
    {
        info = 0;
        df = new decisionforest();
        rep = new dfreport();
        dforest.dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, ntrees, r, ref info, df.innerobj, rep.innerobj);
    }
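
Usage of the out-parameter wrapper is simpler, since the outputs need no pre-initialization (a sketch; xy and the size arguments are assumed to be defined by the caller):

    // Hypothetical usage sketch for the out-parameter wrapper.
    int info;
    decisionforest df;
    dfreport rep;
    dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, 50, 0.5, out info, out df, out rep);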
Example #4
 public override alglib.apobject make_copy()
 {
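     // All dfreport fields are value-type doubles, so this field-by-field
     // copy yields a fully independent report object.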
     dfreport _result = new dfreport();
     _result.relclserror = relclserror;
     _result.avgce = avgce;
     _result.rmserror = rmserror;
     _result.avgerror = avgerror;
     _result.avgrelerror = avgrelerror;
     _result.oobrelclserror = oobrelclserror;
     _result.oobavgce = oobavgce;
     _result.oobrmserror = oobrmserror;
     _result.oobavgerror = oobavgerror;
     _result.oobavgrelerror = oobavgrelerror;
     return _result;
 }
        /*************************************************************************
        This subroutine builds a random decision forest.

        INPUT PARAMETERS:
            XY          -   training set
            NPoints     -   training set size, NPoints>=1
            NVars       -   number of independent variables, NVars>=1
            NClasses    -   task type:
                            * NClasses=1 - regression task with one
                                           dependent variable
                            * NClasses>1 - classification task with
                                           NClasses classes.
            NTrees      -   number of trees in a forest, NTrees>=1.
                            recommended values: 50-100.
            R           -   fraction of the training set used to build
                            individual trees. 0<R<=1.
                            recommended values: 0.1 <= R <= 0.66.

        OUTPUT PARAMETERS:
            Info        -   return code:
                            * -2, if there is a point with class number
                                  outside of [0..NClasses-1].
                            * -1, if incorrect parameters were passed
                                  (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                                  or R>1).
                            *  1, if task has been solved
            DF          -   model built
            Rep         -   training report, contains error on a training set
                            and out-of-bag estimates of generalization error.

          -- ALGLIB --
             Copyright 19.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfbuildrandomdecisionforest(ref double[,] xy,
            int npoints,
            int nvars,
            int nclasses,
            int ntrees,
            double r,
            ref int info,
            ref decisionforest df,
            ref dfreport rep)
        {
            int samplesize = 0;

            if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) )
            {
                info = -1;
                return;
            }
            samplesize = Math.Max((int)Math.Round(r*npoints), 1);
            dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, Math.Max(nvars/2, 1), dfusestrongsplits+dfuseevs, ref info, df, rep);
        }
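
For contrast with the out-parameter wrapper above, the legacy ref-based entry point is called like this (a sketch under the same assumptions; xy is defined by the caller, and every ref argument must be initialized before the call):

            // Hypothetical usage of the legacy ref-parameter API.
            int info = 0;
            decisionforest df = new decisionforest();
            dfreport rep = new dfreport();
            dfbuildrandomdecisionforest(ref xy, npoints, nvars, nclasses, 50, 0.5, ref info, ref df, ref rep);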