Example #1
        /*************************************************************************
        Processing

        INPUT PARAMETERS:
            DF      -   decision forest model
            X       -   input vector,  array[0..NVars-1].

        OUTPUT PARAMETERS:
            Y       -   result. Regression estimate when solving regression  task,
                        vector of posterior probabilities for classification task.

        See also DFProcessI.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfprocess(decisionforest df,
            double[] x,
            ref double[] y)
        {
            int offs = 0;
            int i = 0;
            double v = 0;
            int i_ = 0;

            
            //
            // Proceed
            //
            if( alglib.ap.len(y)<df.nclasses )
            {
                y = new double[df.nclasses];
            }
            offs = 0;
            for(i=0; i<=df.nclasses-1; i++)
            {
                y[i] = 0;
            }
            for(i=0; i<=df.ntrees-1; i++)
            {
                
                //
                // Process basic tree
                //
                dfprocessinternal(df, offs, x, ref y);
                
                //
                // Next tree
                //
                offs = offs+(int)Math.Round(df.trees[offs]);
            }
            v = (double)1/(double)df.ntrees;
            for(i_=0; i_<=df.nclasses-1;i_++)
            {
                y[i_] = v*y[i_];
            }
        }
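A minimal usage sketch for the function above (the trained model df and the input sizes are assumed for illustration; note that dfprocess grows Y itself when it is shorter than NClasses):

        // Hypothetical usage: 'df' is a forest trained on 3 inputs, 2 classes
        double[] x = new double[] { 0.1, 0.5, -0.3 };
        double[] y = new double[0];     // too short: dfprocess reallocates it
        dfprocess(df, x, ref y);
        // y[0], y[1] now hold the posterior probabilities of classes 0 and 1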
Example #2
        /*************************************************************************
        'interactive' variant of DFProcess for languages like Python, which support
        constructs like "Y = DFProcessI(DF,X)" and an interactive interpreter mode.

        This function allocates a new array on each call, so it  is  significantly
        slower than its 'non-interactive' counterpart, but it is  more  convenient
        when you call it from the command line.

          -- ALGLIB --
             Copyright 28.02.2010 by Bochkanov Sergey
        *************************************************************************/
        public static void dfprocessi(decisionforest df,
            double[] x,
            ref double[] y)
        {
            y = new double[0];

            dfprocess(df, x, ref y);
        }
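The behavioral difference from dfprocess is only in allocation; a sketch, reusing the hypothetical df and x from the previous example:

        double[] y = new double[0];
        dfprocessi(df, x, ref y);   // Y is discarded and freshly allocated inside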
Example #3
        /*************************************************************************
        Classification error
        *************************************************************************/
        private static int dfclserror(decisionforest df,
            double[,] xy,
            int npoints)
        {
            int result = 0;
            double[] x = new double[0];
            double[] y = new double[0];
            int i = 0;
            int j = 0;
            int k = 0;
            int tmpi = 0;
            int i_ = 0;

            if( df.nclasses<=1 )
            {
                result = 0;
                return result;
            }
            x = new double[df.nvars-1+1];
            y = new double[df.nclasses-1+1];
            result = 0;
            for(i=0; i<=npoints-1; i++)
            {
                for(i_=0; i_<=df.nvars-1;i_++)
                {
                    x[i_] = xy[i,i_];
                }
                dfprocess(df, x, ref y);
                k = (int)Math.Round(xy[i,df.nvars]);
                tmpi = 0;
                for(j=1; j<=df.nclasses-1; j++)
                {
                    if( (double)(y[j])>(double)(y[tmpi]) )
                    {
                        tmpi = j;
                    }
                }
                if( tmpi!=k )
                {
                    result = result+1;
                }
            }
            return result;
        }
Example #4
        /*************************************************************************
        Processing

        INPUT PARAMETERS:
            DF      -   decision forest model
            X       -   input vector,  array[0..NVars-1].

        OUTPUT PARAMETERS:
            Y       -   result. Regression estimate when solving regression  task,
                        vector of posterior probabilities for classification task.
                        Subroutine does not allocate memory for this vector, it is
                        responsibility of a caller to allocate it. Array  must  be
                        at least [0..NClasses-1].

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfprocess(ref decisionforest df,
            ref double[] x,
            ref double[] y)
        {
            int offs = 0;
            int i = 0;
            double v = 0;
            int i_ = 0;

            
            //
            // Proceed
            //
            offs = 0;
            for(i=0; i<=df.nclasses-1; i++)
            {
                y[i] = 0;
            }
            for(i=0; i<=df.ntrees-1; i++)
            {
                
                //
                // Process basic tree
                //
                dfprocessinternal(ref df, offs, ref x, ref y);
                
                //
                // Next tree
                //
                offs = offs+(int)Math.Round(df.trees[offs]);
            }
            v = (double)(1)/(double)(df.ntrees);
            for(i_=0; i_<=df.nclasses-1;i_++)
            {
                y[i_] = v*y[i_];
            }
        }
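In this older ref-based variant there is no length check, so the caller must allocate Y beforehand, as the comment block states. A sketch under that assumption (df and x hypothetical):

            double[] y = new double[df.nclasses];   // caller-allocated, per the docs
            dfprocess(ref df, ref x, ref y);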
Example #5
        /*************************************************************************
        Copying of DecisionForest structure

        INPUT PARAMETERS:
            DF1 -   original

        OUTPUT PARAMETERS:
            DF2 -   copy

          -- ALGLIB --
             Copyright 13.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfcopy(decisionforest df1,
            decisionforest df2)
        {
            int i_ = 0;

            df2.nvars = df1.nvars;
            df2.nclasses = df1.nclasses;
            df2.ntrees = df1.ntrees;
            df2.bufsize = df1.bufsize;
            df2.trees = new double[df1.bufsize-1+1];
            for(i_=0; i_<=df1.bufsize-1;i_++)
            {
                df2.trees[i_] = df1.trees[i_];
            }
        }
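A short sketch of the copy semantics (df1 assumed to be a built forest): since trees is reallocated and copied element by element, mutating the copy does not affect the original.

        decisionforest df2 = new decisionforest();
        dfcopy(df1, df2);
        df2.trees[0] = 0;   // df1.trees[0] is unchanged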
Example #6
        /*************************************************************************
        Serializer: serialization

          -- ALGLIB --
             Copyright 14.03.2011 by Bochkanov Sergey
        *************************************************************************/
        public static void dfserialize(alglib.serializer s,
            decisionforest forest)
        {
            s.serialize_int(scodes.getrdfserializationcode());
            s.serialize_int(dffirstversion);
            s.serialize_int(forest.nvars);
            s.serialize_int(forest.nclasses);
            s.serialize_int(forest.ntrees);
            s.serialize_int(forest.bufsize);
            apserv.serializerealarray(s, forest.trees, forest.bufsize);
        }
Example #7
    /*************************************************************************
    Processing

    INPUT PARAMETERS:
        DF      -   decision forest model
        X       -   input vector,  array[0..NVars-1].

    OUTPUT PARAMETERS:
        Y       -   result. Regression estimate when solving regression  task,
                    vector of posterior probabilities for classification task.

    See also DFProcessI.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfprocess(decisionforest df, double[] x, ref double[] y)
    {

        dforest.dfprocess(df.innerobj, x, ref y);
        return;
    }
Example #8
        /*************************************************************************
        RMS error on the test set

        INPUT PARAMETERS:
            DF      -   decision forest model
            XY      -   test set
            NPoints -   test set size

        RESULT:
            root mean square error.
            Its meaning for a regression task is obvious. For a classification
            task, RMS error is the error of the posterior probability estimates.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static double dfrmserror(decisionforest df,
            double[,] xy,
            int npoints)
        {
            double result = 0;
            double[] x = new double[0];
            double[] y = new double[0];
            int i = 0;
            int j = 0;
            int k = 0;
            int tmpi = 0;
            int i_ = 0;

            x = new double[df.nvars-1+1];
            y = new double[df.nclasses-1+1];
            result = 0;
            for(i=0; i<=npoints-1; i++)
            {
                for(i_=0; i_<=df.nvars-1;i_++)
                {
                    x[i_] = xy[i,i_];
                }
                dfprocess(df, x, ref y);
                if( df.nclasses>1 )
                {
                    
                    //
                    // classification-specific code
                    //
                    k = (int)Math.Round(xy[i,df.nvars]);
                    tmpi = 0;
                    for(j=1; j<=df.nclasses-1; j++)
                    {
                        if( (double)(y[j])>(double)(y[tmpi]) )
                        {
                            tmpi = j;
                        }
                    }
                    for(j=0; j<=df.nclasses-1; j++)
                    {
                        if( j==k )
                        {
                            result = result+math.sqr(y[j]-1);
                        }
                        else
                        {
                            result = result+math.sqr(y[j]);
                        }
                    }
                }
                else
                {
                    
                    //
                    // regression-specific code
                    //
                    result = result+math.sqr(y[0]-xy[i,df.nvars]);
                }
            }
            result = Math.Sqrt(result/(npoints*df.nclasses));
            return result;
        }
Example #9
        /*************************************************************************
        Unserialization of DecisionForest structure

        INPUT PARAMETERS:
            RA      -   real array which stores decision forest

        OUTPUT PARAMETERS:
            DF      -   restored structure

          -- ALGLIB --
             Copyright 13.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfunserialize(ref double[] ra,
            ref decisionforest df)
        {
            int i_ = 0;
            int i1_ = 0;

            System.Diagnostics.Debug.Assert((int)Math.Round(ra[0])==dfvnum, "DFUnserialize: incorrect array!");
            df.nvars = (int)Math.Round(ra[1]);
            df.nclasses = (int)Math.Round(ra[2]);
            df.ntrees = (int)Math.Round(ra[3]);
            df.bufsize = (int)Math.Round(ra[4]);
            df.trees = new double[df.bufsize-1+1];
            i1_ = (5) - (0);
            for(i_=0; i_<=df.bufsize-1;i_++)
            {
                df.trees[i_] = ra[i_+i1_];
            }
        }
Example #10
    /*************************************************************************
    This subroutine builds random decision forest.

    INPUT PARAMETERS:
        XY          -   training set
        NPoints     -   training set size, NPoints>=1
        NVars       -   number of independent variables, NVars>=1
        NClasses    -   task type:
                        * NClasses=1 - regression task with one
                                       dependent variable
                        * NClasses>1 - classification task with
                                       NClasses classes.
        NTrees      -   number of trees in a forest, NTrees>=1.
                        recommended values: 50-100.
        R           -   fraction of the training set used to build
                        individual trees. 0<R<=1.
                        recommended values: 0.1 <= R <= 0.66.

    OUTPUT PARAMETERS:
        Info        -   return code:
                        * -2, if there is a point with class number
                              outside of [0..NClasses-1].
                        * -1, if incorrect parameters were passed
                              (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                              or R>1).
                        *  1, if task has been solved
        DF          -   model built
        Rep         -   training report, contains error on a training set
                        and out-of-bag estimates of generalization error.

      -- ALGLIB --
         Copyright 19.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static void dfbuildrandomdecisionforest(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, out int info, out decisionforest df, out dfreport rep)
    {
        info = 0;
        df = new decisionforest();
        rep = new dfreport();
        dforest.dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, ntrees, r, ref info, df.innerobj, rep.innerobj);
        return;
    }
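An end-to-end training sketch using the wrapper above (data shapes and values are hypothetical; the class label occupies the last column of XY, as the builder expects):

    double[,] xy = new double[100, 3];      // 100 points, 2 features + class label
    // ... fill xy[i,0], xy[i,1] with features and xy[i,2] with labels 0 or 1 ...
    int info;
    decisionforest df;
    dfreport rep;
    dfbuildrandomdecisionforest(xy, 100, 2, 2, 50, 0.5, out info, out df, out rep);
    if( info==1 )
    {
        // rep holds training-set errors and out-of-bag estimates,
        // e.g. rep.oobrelclserror for the OOB classification error
    }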
Example #11
        /*************************************************************************
        Serialization of DecisionForest structure

        INPUT PARAMETERS:
            DF      -   original

        OUTPUT PARAMETERS:
            RA      -   array of real numbers which stores decision forest,
                        array[0..RLen-1]
            RLen    -   RA length

          -- ALGLIB --
             Copyright 13.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfserialize(ref decisionforest df,
            ref double[] ra,
            ref int rlen)
        {
            int i_ = 0;
            int i1_ = 0;

            ra = new double[df.bufsize+5-1+1];
            ra[0] = dfvnum;
            ra[1] = df.nvars;
            ra[2] = df.nclasses;
            ra[3] = df.ntrees;
            ra[4] = df.bufsize;
            i1_ = (0) - (5);
            for(i_=5; i_<=5+df.bufsize-1;i_++)
            {
                ra[i_] = df.trees[i_+i1_];
            }
            rlen = 5+df.bufsize;
        }
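Together with the array-based DFUnserialize from Example #9, this gives a self-contained round trip; a sketch (df assumed to be a built forest):

        double[] ra = new double[0];
        int rlen = 0;
        dfserialize(ref df, ref ra, ref rlen);      // ra[0..rlen-1] stores the forest
        decisionforest df2 = new decisionforest();
        dfunserialize(ref ra, ref df2);             // df2 is equivalent to df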
Example #12
        /*************************************************************************
        Average error on the test set

        INPUT PARAMETERS:
            DF      -   decision forest model
            XY      -   test set
            NPoints -   test set size

        RESULT:
            Its meaning for a regression task is obvious. For a classification
            task, it is the average error of the posterior probability
            estimates.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static double dfavgerror(ref decisionforest df,
            ref double[,] xy,
            int npoints)
        {
            double result = 0;
            double[] x = new double[0];
            double[] y = new double[0];
            int i = 0;
            int j = 0;
            int k = 0;
            int i_ = 0;

            x = new double[df.nvars-1+1];
            y = new double[df.nclasses-1+1];
            result = 0;
            for(i=0; i<=npoints-1; i++)
            {
                for(i_=0; i_<=df.nvars-1;i_++)
                {
                    x[i_] = xy[i,i_];
                }
                dfprocess(ref df, ref x, ref y);
                if( df.nclasses>1 )
                {
                    
                    //
                    // classification-specific code
                    //
                    k = (int)Math.Round(xy[i,df.nvars]);
                    for(j=0; j<=df.nclasses-1; j++)
                    {
                        if( j==k )
                        {
                            result = result+Math.Abs(y[j]-1);
                        }
                        else
                        {
                            result = result+Math.Abs(y[j]);
                        }
                    }
                }
                else
                {
                    
                    //
                    // regression-specific code
                    //
                    result = result+Math.Abs(y[0]-xy[i,df.nvars]);
                }
            }
            result = result/(npoints*df.nclasses);
            return result;
        }
Example #13
        /*************************************************************************
        Relative classification error on the test set

        INPUT PARAMETERS:
            DF      -   decision forest model
            XY      -   test set
            NPoints -   test set size

        RESULT:
            fraction of incorrectly classified cases.
            Zero if model solves regression task.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static double dfrelclserror(ref decisionforest df,
            ref double[,] xy,
            int npoints)
        {
            double result = 0;

            result = (double)(dfclserror(ref df, ref xy, npoints))/(double)(npoints);
            return result;
        }
Example #14
        /*************************************************************************
        Relative classification error on the test set

        INPUT PARAMETERS:
            DF      -   decision forest model
            XY      -   test set
            NPoints -   test set size

        RESULT:
            fraction of incorrectly classified cases.
            Zero if model solves regression task.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static double dfrelclserror(decisionforest df,
            double[,] xy,
            int npoints)
        {
            double result = 0;

            result = (double)dfclserror(df, xy, npoints)/(double)npoints;
            return result;
        }
Example #15
    /*************************************************************************
    'interactive' variant of DFProcess for languages like Python, which support
    constructs like "Y = DFProcessI(DF,X)" and an interactive interpreter mode.

    This function allocates a new array on each call, so it  is  significantly
    slower than its 'non-interactive' counterpart, but it is  more  convenient
    when you call it from the command line.

      -- ALGLIB --
         Copyright 28.02.2010 by Bochkanov Sergey
    *************************************************************************/
    public static void dfprocessi(decisionforest df, double[] x, out double[] y)
    {
        y = new double[0];
        dforest.dfprocessi(df.innerobj, x, ref y);
        return;
    }
Example #16
        /*************************************************************************
        Average cross-entropy (in bits per element) on the test set

        INPUT PARAMETERS:
            DF      -   decision forest model
            XY      -   test set
            NPoints -   test set size

        RESULT:
            CrossEntropy/(NPoints*LN(2)).
            Zero if model solves regression task.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static double dfavgce(decisionforest df,
            double[,] xy,
            int npoints)
        {
            double result = 0;
            double[] x = new double[0];
            double[] y = new double[0];
            int i = 0;
            int j = 0;
            int k = 0;
            int tmpi = 0;
            int i_ = 0;

            x = new double[df.nvars-1+1];
            y = new double[df.nclasses-1+1];
            result = 0;
            for(i=0; i<=npoints-1; i++)
            {
                for(i_=0; i_<=df.nvars-1;i_++)
                {
                    x[i_] = xy[i,i_];
                }
                dfprocess(df, x, ref y);
                if( df.nclasses>1 )
                {
                    
                    //
                    // classification-specific code
                    //
                    k = (int)Math.Round(xy[i,df.nvars]);
                    tmpi = 0;
                    for(j=1; j<=df.nclasses-1; j++)
                    {
                        if( (double)(y[j])>(double)(y[tmpi]) )
                        {
                            tmpi = j;
                        }
                    }
                    if( (double)(y[k])!=(double)(0) )
                    {
                        result = result-Math.Log(y[k]);
                    }
                    else
                    {
                        result = result-Math.Log(math.minrealnumber);
                    }
                }
            }
            result = result/npoints;
            return result;
        }
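A worked illustration of the accumulation above (values invented): each point contributes the negative natural log of the posterior assigned to its true class.

        // y[k] = 0.5  for the true class k  ->  contributes -Math.Log(0.5) = 0.693
        // y[k] = 0.99                       ->  contributes about 0.01
        // y[k] = 0    is clamped to math.minrealnumber to avoid Math.Log(0)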
Example #17
    /*************************************************************************
    Average relative error on the test set

    INPUT PARAMETERS:
        DF      -   decision forest model
        XY      -   test set
        NPoints -   test set size

    RESULT:
        Its meaning for a regression task is obvious. For a classification
        task, it is the average relative error of the estimated posterior
        probability of the correct class.

      -- ALGLIB --
         Copyright 16.02.2009 by Bochkanov Sergey
    *************************************************************************/
    public static double dfavgrelerror(decisionforest df, double[,] xy, int npoints)
    {

        double result = dforest.dfavgrelerror(df.innerobj, xy, npoints);
        return result;
    }
Example #18
0
        /*************************************************************************
        Average relative error on the test set

        INPUT PARAMETERS:
            DF      -   decision forest model
            XY      -   test set
            NPoints -   test set size

        RESULT:
            Its meaning for a regression task is obvious. For a classification
            task, it is the average relative error of the estimated posterior
            probability of the correct class.

          -- ALGLIB --
             Copyright 16.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static double dfavgrelerror(decisionforest df,
            double[,] xy,
            int npoints)
        {
            double result = 0;
            double[] x = new double[0];
            double[] y = new double[0];
            int relcnt = 0;
            int i = 0;
            int j = 0;
            int k = 0;
            int i_ = 0;

            x = new double[df.nvars-1+1];
            y = new double[df.nclasses-1+1];
            result = 0;
            relcnt = 0;
            for(i=0; i<=npoints-1; i++)
            {
                for(i_=0; i_<=df.nvars-1;i_++)
                {
                    x[i_] = xy[i,i_];
                }
                dfprocess(df, x, ref y);
                if( df.nclasses>1 )
                {
                    
                    //
                    // classification-specific code
                    //
                    k = (int)Math.Round(xy[i,df.nvars]);
                    for(j=0; j<=df.nclasses-1; j++)
                    {
                        if( j==k )
                        {
                            result = result+Math.Abs(y[j]-1);
                            relcnt = relcnt+1;
                        }
                    }
                }
                else
                {
                    
                    //
                    // regression-specific code
                    //
                    if( (double)(xy[i,df.nvars])!=(double)(0) )
                    {
                        result = result+Math.Abs((y[0]-xy[i,df.nvars])/xy[i,df.nvars]);
                        relcnt = relcnt+1;
                    }
                }
            }
            if( relcnt>0 )
            {
                result = result/relcnt;
            }
            return result;
        }
Example #19
 /*************************************************************************
 Creates an independent copy of the decisionforest structure (the trees
 array is cloned, so the copy does not share state with the original)
 *************************************************************************/
 public override alglib.apobject make_copy()
 {
     decisionforest _result = new decisionforest();
     _result.nvars = nvars;
     _result.nclasses = nclasses;
     _result.ntrees = ntrees;
     _result.bufsize = bufsize;
     _result.trees = (double[])trees.Clone();
     return _result;
 }
Example #20
        /*************************************************************************
        Serializer: allocation

          -- ALGLIB --
             Copyright 14.03.2011 by Bochkanov Sergey
        *************************************************************************/
        public static void dfalloc(alglib.serializer s,
            decisionforest forest)
        {
            //
            // 6 header entries: serialization code, version, NVars, NClasses,
            // NTrees, BufSize (see DFSerialize above)
            //
            s.alloc_entry();
            s.alloc_entry();
            s.alloc_entry();
            s.alloc_entry();
            s.alloc_entry();
            s.alloc_entry();
            apserv.allocrealarray(s, forest.trees, forest.bufsize);
        }
Example #21
        /*************************************************************************
        This subroutine builds random decision forest.
        This function gives ability to tune number of variables used when choosing
        best split.

        INPUT PARAMETERS:
            XY          -   training set
            NPoints     -   training set size, NPoints>=1
            NVars       -   number of independent variables, NVars>=1
            NClasses    -   task type:
                            * NClasses=1 - regression task with one
                                           dependent variable
                            * NClasses>1 - classification task with
                                           NClasses classes.
            NTrees      -   number of trees in a forest, NTrees>=1.
                            recommended values: 50-100.
            NRndVars    -   number of variables used when choosing best split
            R           -   fraction of the training set used to build
                            individual trees. 0<R<=1.
                            recommended values: 0.1 <= R <= 0.66.

        OUTPUT PARAMETERS:
            Info        -   return code:
                            * -2, if there is a point with class number
                                  outside of [0..NClasses-1].
                            * -1, if incorrect parameters were passed
                                  (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                                  or R>1).
                            *  1, if task has been solved
            DF          -   model built
            Rep         -   training report, contains error on a training set
                            and out-of-bag estimates of generalization error.

          -- ALGLIB --
             Copyright 19.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfbuildrandomdecisionforestx1(double[,] xy,
            int npoints,
            int nvars,
            int nclasses,
            int ntrees,
            int nrndvars,
            double r,
            ref int info,
            decisionforest df,
            dfreport rep)
        {
            int samplesize = 0;

            info = 0;

            if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) )
            {
                info = -1;
                return;
            }
            if( nrndvars<=0 || nrndvars>nvars )
            {
                info = -1;
                return;
            }
            samplesize = Math.Max((int)Math.Round(r*npoints), 1);
            dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, nrndvars, dfusestrongsplits+dfuseevs, ref info, df, rep);
        }
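A hypothetical call that mirrors the plain builder but pins the number of candidate variables tried per split (here 3 of 10, a sqrt(NVars)-style choice); xy and npoints are assumed training data with the label in the last column:

        int info = 0;
        decisionforest df = new decisionforest();
        dfreport rep = new dfreport();
        dfbuildrandomdecisionforestx1(xy, npoints, 10, 2, 100, 3, 0.5, ref info, df, rep);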
Example #22
        /*************************************************************************
        Serializer: unserialization

          -- ALGLIB --
             Copyright 14.03.2011 by Bochkanov Sergey
        *************************************************************************/
        public static void dfunserialize(alglib.serializer s,
            decisionforest forest)
        {
            int i0 = 0;
            int i1 = 0;

            
            //
            // check correctness of header
            //
            i0 = s.unserialize_int();
            alglib.ap.assert(i0==scodes.getrdfserializationcode(), "DFUnserialize: stream header corrupted");
            i1 = s.unserialize_int();
            alglib.ap.assert(i1==dffirstversion, "DFUnserialize: stream header corrupted");
            
            //
            // Unserialize data
            //
            forest.nvars = s.unserialize_int();
            forest.nclasses = s.unserialize_int();
            forest.ntrees = s.unserialize_int();
            forest.bufsize = s.unserialize_int();
            apserv.unserializerealarray(s, ref forest.trees);
        }
Example #23
        /*************************************************************************
        Internal subroutine which builds a forest with explicitly specified
        sample size, feature count and flags; used by the
        DFBuildRandomDecisionForest* builders above
        *************************************************************************/
        public static void dfbuildinternal(double[,] xy,
            int npoints,
            int nvars,
            int nclasses,
            int ntrees,
            int samplesize,
            int nfeatures,
            int flags,
            ref int info,
            decisionforest df,
            dfreport rep)
        {
            int i = 0;
            int j = 0;
            int k = 0;
            int tmpi = 0;
            int lasttreeoffs = 0;
            int offs = 0;
            int ooboffs = 0;
            int treesize = 0;
            int nvarsinpool = 0;
            bool useevs = new bool();
            dfinternalbuffers bufs = new dfinternalbuffers();
            int[] permbuf = new int[0];
            double[] oobbuf = new double[0];
            int[] oobcntbuf = new int[0];
            double[,] xys = new double[0,0];
            double[] x = new double[0];
            double[] y = new double[0];
            int oobcnt = 0;
            int oobrelcnt = 0;
            double v = 0;
            double vmin = 0;
            double vmax = 0;
            bool bflag = new bool();
            int i_ = 0;
            int i1_ = 0;

            info = 0;

            
            //
            // Test for inputs
            //
            if( (((((npoints<1 || samplesize<1) || samplesize>npoints) || nvars<1) || nclasses<1) || ntrees<1) || nfeatures<1 )
            {
                info = -1;
                return;
            }
            if( nclasses>1 )
            {
                for(i=0; i<=npoints-1; i++)
                {
                    if( (int)Math.Round(xy[i,nvars])<0 || (int)Math.Round(xy[i,nvars])>=nclasses )
                    {
                        info = -2;
                        return;
                    }
                }
            }
            info = 1;
            
            //
            // Flags
            //
            useevs = flags/dfuseevs%2!=0;
            
            //
            // Allocate data, prepare header
            //
            treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize;
            permbuf = new int[npoints-1+1];
            bufs.treebuf = new double[treesize-1+1];
            bufs.idxbuf = new int[npoints-1+1];
            bufs.tmpbufr = new double[npoints-1+1];
            bufs.tmpbufr2 = new double[npoints-1+1];
            bufs.tmpbufi = new int[npoints-1+1];
            bufs.sortrbuf = new double[npoints];
            bufs.sortrbuf2 = new double[npoints];
            bufs.sortibuf = new int[npoints];
            bufs.varpool = new int[nvars-1+1];
            bufs.evsbin = new bool[nvars-1+1];
            bufs.evssplits = new double[nvars-1+1];
            bufs.classibuf = new int[2*nclasses-1+1];
            oobbuf = new double[nclasses*npoints-1+1];
            oobcntbuf = new int[npoints-1+1];
            df.trees = new double[ntrees*treesize-1+1];
            xys = new double[samplesize-1+1, nvars+1];
            x = new double[nvars-1+1];
            y = new double[nclasses-1+1];
            for(i=0; i<=npoints-1; i++)
            {
                permbuf[i] = i;
            }
            for(i=0; i<=npoints*nclasses-1; i++)
            {
                oobbuf[i] = 0;
            }
            for(i=0; i<=npoints-1; i++)
            {
                oobcntbuf[i] = 0;
            }
            
            //
            // Prepare variable pool and EVS (extended variable selection/splitting) buffers
            // (whether EVS is turned on or not):
            // 1. detect binary variables and pre-calculate splits for them
            // 2. detect variables with non-distinct values and exclude them from pool
            //
            for(i=0; i<=nvars-1; i++)
            {
                bufs.varpool[i] = i;
            }
            nvarsinpool = nvars;
            if( useevs )
            {
                for(j=0; j<=nvars-1; j++)
                {
                    vmin = xy[0,j];
                    vmax = vmin;
                    for(i=0; i<=npoints-1; i++)
                    {
                        v = xy[i,j];
                        vmin = Math.Min(vmin, v);
                        vmax = Math.Max(vmax, v);
                    }
                    if( (double)(vmin)==(double)(vmax) )
                    {
                        
                        //
                        // exclude variable from pool
                        //
                        bufs.varpool[j] = bufs.varpool[nvarsinpool-1];
                        bufs.varpool[nvarsinpool-1] = -1;
                        nvarsinpool = nvarsinpool-1;
                        continue;
                    }
                    bflag = false;
                    for(i=0; i<=npoints-1; i++)
                    {
                        v = xy[i,j];
                        if( (double)(v)!=(double)(vmin) && (double)(v)!=(double)(vmax) )
                        {
                            bflag = true;
                            break;
                        }
                    }
                    if( bflag )
                    {
                        
                        //
                        // non-binary variable
                        //
                        bufs.evsbin[j] = false;
                    }
                    else
                    {
                        
                        //
                        // Prepare
                        //
                        bufs.evsbin[j] = true;
                        bufs.evssplits[j] = 0.5*(vmin+vmax);
                        if( (double)(bufs.evssplits[j])<=(double)(vmin) )
                        {
                            bufs.evssplits[j] = vmax;
                        }
                    }
                }
            }
            
            //
            // RANDOM FOREST FORMAT
            // W[0]         -   size of array
            // W[1]         -   version number
            // W[2]         -   NVars
            // W[3]         -   NClasses (1 for regression)
            // W[4]         -   NTrees
            // W[5]         -   trees offset
            //
            //
            // TREE FORMAT
            // W[Offs]      -   size of sub-array
            //     node info:
            // W[K+0]       -   variable number        (-1 for leaf node)
            // W[K+1]       -   threshold              (class/value for leaf node)
            // W[K+2]       -   ">=" branch index      (absent for leaf node)
            //
            //
            df.nvars = nvars;
            df.nclasses = nclasses;
            df.ntrees = ntrees;
            
            //
            // Build forest
            //
            offs = 0;
            for(i=0; i<=ntrees-1; i++)
            {
                
                //
                // Prepare sample
                //
                for(k=0; k<=samplesize-1; k++)
                {
                    j = k+math.randominteger(npoints-k);
                    tmpi = permbuf[k];
                    permbuf[k] = permbuf[j];
                    permbuf[j] = tmpi;
                    j = permbuf[k];
                    for(i_=0; i_<=nvars;i_++)
                    {
                        xys[k,i_] = xy[j,i_];
                    }
                }
                
                //
                // build tree, copy
                //
                dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs);
                j = (int)Math.Round(bufs.treebuf[0]);
                i1_ = (0) - (offs);
                for(i_=offs; i_<=offs+j-1;i_++)
                {
                    df.trees[i_] = bufs.treebuf[i_+i1_];
                }
                lasttreeoffs = offs;
                offs = offs+j;
                
                //
                // OOB estimates
                //
                for(k=samplesize; k<=npoints-1; k++)
                {
                    for(j=0; j<=nclasses-1; j++)
                    {
                        y[j] = 0;
                    }
                    j = permbuf[k];
                    for(i_=0; i_<=nvars-1;i_++)
                    {
                        x[i_] = xy[j,i_];
                    }
                    dfprocessinternal(df, lasttreeoffs, x, ref y);
                    i1_ = (0) - (j*nclasses);
                    for(i_=j*nclasses; i_<=(j+1)*nclasses-1;i_++)
                    {
                        oobbuf[i_] = oobbuf[i_] + y[i_+i1_];
                    }
                    oobcntbuf[j] = oobcntbuf[j]+1;
                }
            }
            df.bufsize = offs;
            
            //
            // Normalize OOB results
            //
            for(i=0; i<=npoints-1; i++)
            {
                if( oobcntbuf[i]!=0 )
                {
                    v = (double)1/(double)oobcntbuf[i];
                    for(i_=i*nclasses; i_<=i*nclasses+nclasses-1;i_++)
                    {
                        oobbuf[i_] = v*oobbuf[i_];
                    }
                }
            }
            
            //
            // Calculate training set estimates
            //
            rep.relclserror = dfrelclserror(df, xy, npoints);
            rep.avgce = dfavgce(df, xy, npoints);
            rep.rmserror = dfrmserror(df, xy, npoints);
            rep.avgerror = dfavgerror(df, xy, npoints);
            rep.avgrelerror = dfavgrelerror(df, xy, npoints);
            
            //
            // Calculate OOB estimates.
            //
            rep.oobrelclserror = 0;
            rep.oobavgce = 0;
            rep.oobrmserror = 0;
            rep.oobavgerror = 0;
            rep.oobavgrelerror = 0;
            oobcnt = 0;
            oobrelcnt = 0;
            for(i=0; i<=npoints-1; i++)
            {
                if( oobcntbuf[i]!=0 )
                {
                    ooboffs = i*nclasses;
                    if( nclasses>1 )
                    {
                        
                        //
                        // classification-specific code
                        //
                        k = (int)Math.Round(xy[i,nvars]);
                        tmpi = 0;
                        for(j=1; j<=nclasses-1; j++)
                        {
                            if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) )
                            {
                                tmpi = j;
                            }
                        }
                        if( tmpi!=k )
                        {
                            rep.oobrelclserror = rep.oobrelclserror+1;
                        }
                        if( (double)(oobbuf[ooboffs+k])!=(double)(0) )
                        {
                            rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]);
                        }
                        else
                        {
                            rep.oobavgce = rep.oobavgce-Math.Log(math.minrealnumber);
                        }
                        for(j=0; j<=nclasses-1; j++)
                        {
                            if( j==k )
                            {
                                rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]-1);
                                rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1);
                                rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1);
                                oobrelcnt = oobrelcnt+1;
                            }
                            else
                            {
                                rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]);
                                rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]);
                            }
                        }
                    }
                    else
                    {
                        
                        //
                        // regression-specific code
                        //
                        rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs]-xy[i,nvars]);
                        rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]);
                        if( (double)(xy[i,nvars])!=(double)(0) )
                        {
                            rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]);
                            oobrelcnt = oobrelcnt+1;
                        }
                    }
                    
                    //
                    // update OOB estimates count.
                    //
                    oobcnt = oobcnt+1;
                }
            }
            if( oobcnt>0 )
            {
                rep.oobrelclserror = rep.oobrelclserror/oobcnt;
                rep.oobavgce = rep.oobavgce/oobcnt;
                rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses));
                rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
                if( oobrelcnt>0 )
                {
                    rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
                }
            }
        }
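A note on the buffer sizing above: a tree grown on SampleSize points has at most SampleSize leaves and SampleSize-1 split nodes, which is what the treesize formula bounds. A worked example, assuming the node widths 3 and 2 implied by the traversal in Example #24:

        // samplesize = 4  ->  treesize = 1 + 3*(4-1) + 2*4 = 18 slots per tree:
        // one length slot, up to 3 split nodes, up to 4 leaves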
Example #24
        /*************************************************************************
        Internal subroutine for processing one decision tree starting at Offs
        *************************************************************************/
        private static void dfprocessinternal(decisionforest df,
            int offs,
            double[] x,
            ref double[] y)
        {
            int k = 0;
            int idx = 0;

            
            //
            // Set pointer to the root
            //
            k = offs+1;
            
            //
            // Navigate through the tree
            //
            while( true )
            {
                if( (double)(df.trees[k])==(double)(-1) )
                {
                    if( df.nclasses==1 )
                    {
                        y[0] = y[0]+df.trees[k+1];
                    }
                    else
                    {
                        idx = (int)Math.Round(df.trees[k+1]);
                        y[idx] = y[idx]+1;
                    }
                    break;
                }
                if( (double)(x[(int)Math.Round(df.trees[k])])<(double)(df.trees[k+1]) )
                {
                    k = k+innernodewidth;
                }
                else
                {
                    k = offs+(int)Math.Round(df.trees[k+2]);
                }
            }
        }
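To make the flat layout concrete, here is a hypothetical one-split regression tree written out against the TREE FORMAT notes from Example #23 (all values invented; the ">=" branch index is relative to Offs, as the traversal above shows):

        // trees[offs+0] = 8      size of this sub-array (next tree starts at offs+8)
        // trees[offs+1] = 0      root: split on variable 0 ...
        // trees[offs+2] = 2.5    ... at threshold 2.5
        // trees[offs+3] = 6      ">=" branch stored at trees[offs+6]
        // trees[offs+4] = -1     "<" branch (reached as k+innernodewidth): leaf marker
        // trees[offs+5] = 0.7    leaf value (added to Y[0] for regression)
        // trees[offs+6] = -1     ">=" branch: leaf marker
        // trees[offs+7] = 1.3    leaf value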
Example #25
        /*************************************************************************
        This subroutine builds random decision forest.

        INPUT PARAMETERS:
            XY          -   training set
            NPoints     -   training set size, NPoints>=1
            NVars       -   number of independent variables, NVars>=1
            NClasses    -   task type:
                            * NClasses=1 - regression task with one
                                           dependent variable
                            * NClasses>1 - classification task with
                                           NClasses classes.
            NTrees      -   number of trees in a forest, NTrees>=1.
                            recommended values: 50-100.
            R           -   fraction of the training set used to build
                            individual trees. 0<R<=1.
                            recommended values: 0.1 <= R <= 0.66.

        OUTPUT PARAMETERS:
            Info        -   return code:
                            * -2, if there is a point with class number
                                  outside of [0..NClasses-1].
                                * -1, if incorrect parameters were passed
                                  (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                                  or R>1).
                            *  1, if task has been solved
            DF          -   model built
            Rep         -   training report, contains error on a training set
                            and out-of-bag estimates of generalization error.

          -- ALGLIB --
             Copyright 19.02.2009 by Bochkanov Sergey
        *************************************************************************/
        public static void dfbuildrandomdecisionforest(ref double[,] xy,
            int npoints,
            int nvars,
            int nclasses,
            int ntrees,
            double r,
            ref int info,
            ref decisionforest df,
            ref dfreport rep)
        {
            int samplesize = 0;

            if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) )
            {
                info = -1;
                return;
            }
            samplesize = Math.Max((int)Math.Round(r*npoints), 1);
            dfbuildinternal(ref xy, npoints, nvars, nclasses, ntrees, samplesize, Math.Max(nvars/2, 1), dfusestrongsplits+dfuseevs, ref info, ref df, ref rep);
        }