/************************************************************************* Procesing INPUT PARAMETERS: DF - decision forest model X - input vector, array[0..NVars-1]. OUTPUT PARAMETERS: Y - result. Regression estimate when solving regression task, vector of posterior probabilities for classification task. See also DFProcessI. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfprocess(decisionforest df, double[] x, ref double[] y) { int offs = 0; int i = 0; double v = 0; int i_ = 0; // // Proceed // if( alglib.ap.len(y)<df.nclasses ) { y = new double[df.nclasses]; } offs = 0; for(i=0; i<=df.nclasses-1; i++) { y[i] = 0; } for(i=0; i<=df.ntrees-1; i++) { // // Process basic tree // dfprocessinternal(df, offs, x, ref y); // // Next tree // offs = offs+(int)Math.Round(df.trees[offs]); } v = (double)1/(double)df.ntrees; for(i_=0; i_<=df.nclasses-1;i_++) { y[i_] = v*y[i_]; } }
/************************************************************************* 'interactive' variant of DFProcess for languages like Python which support constructs like "Y = DFProcessI(DF,X)" and interactive mode of interpreter This function allocates new array on each call, so it is significantly slower than its 'non-interactive' counterpart, but it is more convenient when you call it from command line. -- ALGLIB -- Copyright 28.02.2010 by Bochkanov Sergey *************************************************************************/ public static void dfprocessi(decisionforest df, double[] x, ref double[] y) { y = new double[0]; dfprocess(df, x, ref y); }
/************************************************************************* Classification error *************************************************************************/ private static int dfclserror(decisionforest df, double[,] xy, int npoints) { int result = 0; double[] x = new double[0]; double[] y = new double[0]; int i = 0; int j = 0; int k = 0; int tmpi = 0; int i_ = 0; if( df.nclasses<=1 ) { result = 0; return result; } x = new double[df.nvars-1+1]; y = new double[df.nclasses-1+1]; result = 0; for(i=0; i<=npoints-1; i++) { for(i_=0; i_<=df.nvars-1;i_++) { x[i_] = xy[i,i_]; } dfprocess(df, x, ref y); k = (int)Math.Round(xy[i,df.nvars]); tmpi = 0; for(j=1; j<=df.nclasses-1; j++) { if( (double)(y[j])>(double)(y[tmpi]) ) { tmpi = j; } } if( tmpi!=k ) { result = result+1; } } return result; }
/************************************************************************* Procesing INPUT PARAMETERS: DF - decision forest model X - input vector, array[0..NVars-1]. OUTPUT PARAMETERS: Y - result. Regression estimate when solving regression task, vector of posterior probabilities for classification task. Subroutine does not allocate memory for this vector, it is responsibility of a caller to allocate it. Array must be at least [0..NClasses-1]. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfprocess(ref decisionforest df, ref double[] x, ref double[] y) { int offs = 0; int i = 0; double v = 0; int i_ = 0; // // Proceed // offs = 0; for(i=0; i<=df.nclasses-1; i++) { y[i] = 0; } for(i=0; i<=df.ntrees-1; i++) { // // Process basic tree // dfprocessinternal(ref df, offs, ref x, ref y); // // Next tree // offs = offs+(int)Math.Round(df.trees[offs]); } v = (double)(1)/(double)(df.ntrees); for(i_=0; i_<=df.nclasses-1;i_++) { y[i_] = v*y[i_]; } }
/************************************************************************* Copying of DecisionForest strucure INPUT PARAMETERS: DF1 - original OUTPUT PARAMETERS: DF2 - copy -- ALGLIB -- Copyright 13.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfcopy(decisionforest df1, decisionforest df2) { int i_ = 0; df2.nvars = df1.nvars; df2.nclasses = df1.nclasses; df2.ntrees = df1.ntrees; df2.bufsize = df1.bufsize; df2.trees = new double[df1.bufsize-1+1]; for(i_=0; i_<=df1.bufsize-1;i_++) { df2.trees[i_] = df1.trees[i_]; } }
/************************************************************************* Serializer: serialization -- ALGLIB -- Copyright 14.03.2011 by Bochkanov Sergey *************************************************************************/ public static void dfserialize(alglib.serializer s, decisionforest forest) { s.serialize_int(scodes.getrdfserializationcode()); s.serialize_int(dffirstversion); s.serialize_int(forest.nvars); s.serialize_int(forest.nclasses); s.serialize_int(forest.ntrees); s.serialize_int(forest.bufsize); apserv.serializerealarray(s, forest.trees, forest.bufsize); }
/************************************************************************* Procesing INPUT PARAMETERS: DF - decision forest model X - input vector, array[0..NVars-1]. OUTPUT PARAMETERS: Y - result. Regression estimate when solving regression task, vector of posterior probabilities for classification task. See also DFProcessI. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfprocess(decisionforest df, double[] x, ref double[] y) { dforest.dfprocess(df.innerobj, x, ref y); return; }
/************************************************************************* RMS error on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: root mean square error. Its meaning for regression task is obvious. As for classification task, RMS error means error when estimating posterior probabilities. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfrmserror(decisionforest df, double[,] xy, int npoints) { double result = 0; double[] x = new double[0]; double[] y = new double[0]; int i = 0; int j = 0; int k = 0; int tmpi = 0; int i_ = 0; x = new double[df.nvars-1+1]; y = new double[df.nclasses-1+1]; result = 0; for(i=0; i<=npoints-1; i++) { for(i_=0; i_<=df.nvars-1;i_++) { x[i_] = xy[i,i_]; } dfprocess(df, x, ref y); if( df.nclasses>1 ) { // // classification-specific code // k = (int)Math.Round(xy[i,df.nvars]); tmpi = 0; for(j=1; j<=df.nclasses-1; j++) { if( (double)(y[j])>(double)(y[tmpi]) ) { tmpi = j; } } for(j=0; j<=df.nclasses-1; j++) { if( j==k ) { result = result+math.sqr(y[j]-1); } else { result = result+math.sqr(y[j]); } } } else { // // regression-specific code // result = result+math.sqr(y[0]-xy[i,df.nvars]); } } result = Math.Sqrt(result/(npoints*df.nclasses)); return result; }
/************************************************************************* Unserialization of DecisionForest strucure INPUT PARAMETERS: RA - real array which stores decision forest OUTPUT PARAMETERS: DF - restored structure -- ALGLIB -- Copyright 13.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfunserialize(ref double[] ra, ref decisionforest df) { int i_ = 0; int i1_ = 0; System.Diagnostics.Debug.Assert((int)Math.Round(ra[0])==dfvnum, "DFUnserialize: incorrect array!"); df.nvars = (int)Math.Round(ra[1]); df.nclasses = (int)Math.Round(ra[2]); df.ntrees = (int)Math.Round(ra[3]); df.bufsize = (int)Math.Round(ra[4]); df.trees = new double[df.bufsize-1+1]; i1_ = (5) - (0); for(i_=0; i_<=df.bufsize-1;i_++) { df.trees[i_] = ra[i_+i1_]; } }
/************************************************************************* This subroutine builds random decision forest. INPUT PARAMETERS: XY - training set NPoints - training set size, NPoints>=1 NVars - number of independent variables, NVars>=1 NClasses - task type: * NClasses=1 - regression task with one dependent variable * NClasses>1 - classification task with NClasses classes. NTrees - number of trees in a forest, NTrees>=1. recommended values: 50-100. R - percent of a training set used to build individual trees. 0<R<=1. recommended values: 0.1 <= R <= 0.66. OUTPUT PARAMETERS: Info - return code: * -2, if there is a point with class number outside of [0..NClasses-1]. * -1, if incorrect parameters was passed (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0 or R>1). * 1, if task has been solved DF - model built Rep - training report, contains error on a training set and out-of-bag estimates of generalization error. -- ALGLIB -- Copyright 19.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfbuildrandomdecisionforest(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, out int info, out decisionforest df, out dfreport rep) { info = 0; df = new decisionforest(); rep = new dfreport(); dforest.dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, ntrees, r, ref info, df.innerobj, rep.innerobj); return; }
/************************************************************************* Serialization of DecisionForest strucure INPUT PARAMETERS: DF - original OUTPUT PARAMETERS: RA - array of real numbers which stores decision forest, array[0..RLen-1] RLen - RA lenght -- ALGLIB -- Copyright 13.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfserialize(ref decisionforest df, ref double[] ra, ref int rlen) { int i_ = 0; int i1_ = 0; ra = new double[df.bufsize+5-1+1]; ra[0] = dfvnum; ra[1] = df.nvars; ra[2] = df.nclasses; ra[3] = df.ntrees; ra[4] = df.bufsize; i1_ = (0) - (5); for(i_=5; i_<=5+df.bufsize-1;i_++) { ra[i_] = df.trees[i_+i1_]; } rlen = 5+df.bufsize; }
/************************************************************************* Average error on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: Its meaning for regression task is obvious. As for classification task, it means average error when estimating posterior probabilities. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfavgerror(ref decisionforest df, ref double[,] xy, int npoints) { double result = 0; double[] x = new double[0]; double[] y = new double[0]; int i = 0; int j = 0; int k = 0; int i_ = 0; x = new double[df.nvars-1+1]; y = new double[df.nclasses-1+1]; result = 0; for(i=0; i<=npoints-1; i++) { for(i_=0; i_<=df.nvars-1;i_++) { x[i_] = xy[i,i_]; } dfprocess(ref df, ref x, ref y); if( df.nclasses>1 ) { // // classification-specific code // k = (int)Math.Round(xy[i,df.nvars]); for(j=0; j<=df.nclasses-1; j++) { if( j==k ) { result = result+Math.Abs(y[j]-1); } else { result = result+Math.Abs(y[j]); } } } else { // // regression-specific code // result = result+Math.Abs(y[0]-xy[i,df.nvars]); } } result = result/(npoints*df.nclasses); return result; }
/************************************************************************* Relative classification error on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: percent of incorrectly classified cases. Zero if model solves regression task. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfrelclserror(ref decisionforest df, ref double[,] xy, int npoints) { double result = 0; result = (double)(dfclserror(ref df, ref xy, npoints))/(double)(npoints); return result; }
/************************************************************************* Relative classification error on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: percent of incorrectly classified cases. Zero if model solves regression task. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfrelclserror(decisionforest df, double[,] xy, int npoints) { double result = 0; result = (double)dfclserror(df, xy, npoints)/(double)npoints; return result; }
/************************************************************************* 'interactive' variant of DFProcess for languages like Python which support constructs like "Y = DFProcessI(DF,X)" and interactive mode of interpreter This function allocates new array on each call, so it is significantly slower than its 'non-interactive' counterpart, but it is more convenient when you call it from command line. -- ALGLIB -- Copyright 28.02.2010 by Bochkanov Sergey *************************************************************************/ public static void dfprocessi(decisionforest df, double[] x, out double[] y) { y = new double[0]; dforest.dfprocessi(df.innerobj, x, ref y); return; }
/************************************************************************* Average cross-entropy (in bits per element) on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: CrossEntropy/(NPoints*LN(2)). Zero if model solves regression task. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfavgce(decisionforest df, double[,] xy, int npoints) { double result = 0; double[] x = new double[0]; double[] y = new double[0]; int i = 0; int j = 0; int k = 0; int tmpi = 0; int i_ = 0; x = new double[df.nvars-1+1]; y = new double[df.nclasses-1+1]; result = 0; for(i=0; i<=npoints-1; i++) { for(i_=0; i_<=df.nvars-1;i_++) { x[i_] = xy[i,i_]; } dfprocess(df, x, ref y); if( df.nclasses>1 ) { // // classification-specific code // k = (int)Math.Round(xy[i,df.nvars]); tmpi = 0; for(j=1; j<=df.nclasses-1; j++) { if( (double)(y[j])>(double)(y[tmpi]) ) { tmpi = j; } } if( (double)(y[k])!=(double)(0) ) { result = result-Math.Log(y[k]); } else { result = result-Math.Log(math.minrealnumber); } } } result = result/npoints; return result; }
/************************************************************************* Average relative error on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: Its meaning for regression task is obvious. As for classification task, it means average relative error when estimating posterior probability of belonging to the correct class. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfavgrelerror(decisionforest df, double[,] xy, int npoints) { double result = dforest.dfavgrelerror(df.innerobj, xy, npoints); return result; }
/************************************************************************* Average relative error on the test set INPUT PARAMETERS: DF - decision forest model XY - test set NPoints - test set size RESULT: Its meaning for regression task is obvious. As for classification task, it means average relative error when estimating posterior probability of belonging to the correct class. -- ALGLIB -- Copyright 16.02.2009 by Bochkanov Sergey *************************************************************************/ public static double dfavgrelerror(decisionforest df, double[,] xy, int npoints) { double result = 0; double[] x = new double[0]; double[] y = new double[0]; int relcnt = 0; int i = 0; int j = 0; int k = 0; int i_ = 0; x = new double[df.nvars-1+1]; y = new double[df.nclasses-1+1]; result = 0; relcnt = 0; for(i=0; i<=npoints-1; i++) { for(i_=0; i_<=df.nvars-1;i_++) { x[i_] = xy[i,i_]; } dfprocess(df, x, ref y); if( df.nclasses>1 ) { // // classification-specific code // k = (int)Math.Round(xy[i,df.nvars]); for(j=0; j<=df.nclasses-1; j++) { if( j==k ) { result = result+Math.Abs(y[j]-1); relcnt = relcnt+1; } } } else { // // regression-specific code // if( (double)(xy[i,df.nvars])!=(double)(0) ) { result = result+Math.Abs((y[0]-xy[i,df.nvars])/xy[i,df.nvars]); relcnt = relcnt+1; } } } if( relcnt>0 ) { result = result/relcnt; } return result; }
public override alglib.apobject make_copy() { decisionforest _result = new decisionforest(); _result.nvars = nvars; _result.nclasses = nclasses; _result.ntrees = ntrees; _result.bufsize = bufsize; _result.trees = (double[])trees.Clone(); return _result; }
/************************************************************************* Serializer: allocation -- ALGLIB -- Copyright 14.03.2011 by Bochkanov Sergey *************************************************************************/ public static void dfalloc(alglib.serializer s, decisionforest forest) { s.alloc_entry(); s.alloc_entry(); s.alloc_entry(); s.alloc_entry(); s.alloc_entry(); s.alloc_entry(); apserv.allocrealarray(s, forest.trees, forest.bufsize); }
/************************************************************************* This subroutine builds random decision forest. This function gives ability to tune number of variables used when choosing best split. INPUT PARAMETERS: XY - training set NPoints - training set size, NPoints>=1 NVars - number of independent variables, NVars>=1 NClasses - task type: * NClasses=1 - regression task with one dependent variable * NClasses>1 - classification task with NClasses classes. NTrees - number of trees in a forest, NTrees>=1. recommended values: 50-100. NRndVars - number of variables used when choosing best split R - percent of a training set used to build individual trees. 0<R<=1. recommended values: 0.1 <= R <= 0.66. OUTPUT PARAMETERS: Info - return code: * -2, if there is a point with class number outside of [0..NClasses-1]. * -1, if incorrect parameters was passed (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0 or R>1). * 1, if task has been solved DF - model built Rep - training report, contains error on a training set and out-of-bag estimates of generalization error. -- ALGLIB -- Copyright 19.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfbuildrandomdecisionforestx1(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, int nrndvars, double r, ref int info, decisionforest df, dfreport rep) { int samplesize = 0; info = 0; if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) ) { info = -1; return; } if( nrndvars<=0 || nrndvars>nvars ) { info = -1; return; } samplesize = Math.Max((int)Math.Round(r*npoints), 1); dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, nrndvars, dfusestrongsplits+dfuseevs, ref info, df, rep); }
/************************************************************************* Serializer: unserialization -- ALGLIB -- Copyright 14.03.2011 by Bochkanov Sergey *************************************************************************/ public static void dfunserialize(alglib.serializer s, decisionforest forest) { int i0 = 0; int i1 = 0; // // check correctness of header // i0 = s.unserialize_int(); alglib.ap.assert(i0==scodes.getrdfserializationcode(), "DFUnserialize: stream header corrupted"); i1 = s.unserialize_int(); alglib.ap.assert(i1==dffirstversion, "DFUnserialize: stream header corrupted"); // // Unserialize data // forest.nvars = s.unserialize_int(); forest.nclasses = s.unserialize_int(); forest.ntrees = s.unserialize_int(); forest.bufsize = s.unserialize_int(); apserv.unserializerealarray(s, ref forest.trees); }
public static void dfbuildinternal(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, int samplesize, int nfeatures, int flags, ref int info, decisionforest df, dfreport rep) { int i = 0; int j = 0; int k = 0; int tmpi = 0; int lasttreeoffs = 0; int offs = 0; int ooboffs = 0; int treesize = 0; int nvarsinpool = 0; bool useevs = new bool(); dfinternalbuffers bufs = new dfinternalbuffers(); int[] permbuf = new int[0]; double[] oobbuf = new double[0]; int[] oobcntbuf = new int[0]; double[,] xys = new double[0,0]; double[] x = new double[0]; double[] y = new double[0]; int oobcnt = 0; int oobrelcnt = 0; double v = 0; double vmin = 0; double vmax = 0; bool bflag = new bool(); int i_ = 0; int i1_ = 0; info = 0; // // Test for inputs // if( (((((npoints<1 || samplesize<1) || samplesize>npoints) || nvars<1) || nclasses<1) || ntrees<1) || nfeatures<1 ) { info = -1; return; } if( nclasses>1 ) { for(i=0; i<=npoints-1; i++) { if( (int)Math.Round(xy[i,nvars])<0 || (int)Math.Round(xy[i,nvars])>=nclasses ) { info = -2; return; } } } info = 1; // // Flags // useevs = flags/dfuseevs%2!=0; // // Allocate data, prepare header // treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize; permbuf = new int[npoints-1+1]; bufs.treebuf = new double[treesize-1+1]; bufs.idxbuf = new int[npoints-1+1]; bufs.tmpbufr = new double[npoints-1+1]; bufs.tmpbufr2 = new double[npoints-1+1]; bufs.tmpbufi = new int[npoints-1+1]; bufs.sortrbuf = new double[npoints]; bufs.sortrbuf2 = new double[npoints]; bufs.sortibuf = new int[npoints]; bufs.varpool = new int[nvars-1+1]; bufs.evsbin = new bool[nvars-1+1]; bufs.evssplits = new double[nvars-1+1]; bufs.classibuf = new int[2*nclasses-1+1]; oobbuf = new double[nclasses*npoints-1+1]; oobcntbuf = new int[npoints-1+1]; df.trees = new double[ntrees*treesize-1+1]; xys = new double[samplesize-1+1, nvars+1]; x = new double[nvars-1+1]; y = new double[nclasses-1+1]; for(i=0; i<=npoints-1; i++) { permbuf[i] = i; } for(i=0; i<=npoints*nclasses-1; i++) { oobbuf[i] = 0; } for(i=0; i<=npoints-1; i++) { oobcntbuf[i] = 0; } // // Prepare variable pool and EVS (extended variable selection/splitting) buffers // (whether EVS is turned on or not): // 1. detect binary variables and pre-calculate splits for them // 2. detect variables with non-distinct values and exclude them from pool // for(i=0; i<=nvars-1; i++) { bufs.varpool[i] = i; } nvarsinpool = nvars; if( useevs ) { for(j=0; j<=nvars-1; j++) { vmin = xy[0,j]; vmax = vmin; for(i=0; i<=npoints-1; i++) { v = xy[i,j]; vmin = Math.Min(vmin, v); vmax = Math.Max(vmax, v); } if( (double)(vmin)==(double)(vmax) ) { // // exclude variable from pool // bufs.varpool[j] = bufs.varpool[nvarsinpool-1]; bufs.varpool[nvarsinpool-1] = -1; nvarsinpool = nvarsinpool-1; continue; } bflag = false; for(i=0; i<=npoints-1; i++) { v = xy[i,j]; if( (double)(v)!=(double)(vmin) && (double)(v)!=(double)(vmax) ) { bflag = true; break; } } if( bflag ) { // // non-binary variable // bufs.evsbin[j] = false; } else { // // Prepare // bufs.evsbin[j] = true; bufs.evssplits[j] = 0.5*(vmin+vmax); if( (double)(bufs.evssplits[j])<=(double)(vmin) ) { bufs.evssplits[j] = vmax; } } } } // // RANDOM FOREST FORMAT // W[0] - size of array // W[1] - version number // W[2] - NVars // W[3] - NClasses (1 for regression) // W[4] - NTrees // W[5] - trees offset // // // TREE FORMAT // W[Offs] - size of sub-array // node info: // W[K+0] - variable number (-1 for leaf mode) // W[K+1] - threshold (class/value for leaf node) // W[K+2] - ">=" branch index (absent for leaf node) // // df.nvars = nvars; df.nclasses = nclasses; df.ntrees = ntrees; // // Build forest // offs = 0; for(i=0; i<=ntrees-1; i++) { // // Prepare sample // for(k=0; k<=samplesize-1; k++) { j = k+math.randominteger(npoints-k); tmpi = permbuf[k]; permbuf[k] = permbuf[j]; permbuf[j] = tmpi; j = permbuf[k]; for(i_=0; i_<=nvars;i_++) { xys[k,i_] = xy[j,i_]; } } // // build tree, copy // dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs); j = (int)Math.Round(bufs.treebuf[0]); i1_ = (0) - (offs); for(i_=offs; i_<=offs+j-1;i_++) { df.trees[i_] = bufs.treebuf[i_+i1_]; } lasttreeoffs = offs; offs = offs+j; // // OOB estimates // for(k=samplesize; k<=npoints-1; k++) { for(j=0; j<=nclasses-1; j++) { y[j] = 0; } j = permbuf[k]; for(i_=0; i_<=nvars-1;i_++) { x[i_] = xy[j,i_]; } dfprocessinternal(df, lasttreeoffs, x, ref y); i1_ = (0) - (j*nclasses); for(i_=j*nclasses; i_<=(j+1)*nclasses-1;i_++) { oobbuf[i_] = oobbuf[i_] + y[i_+i1_]; } oobcntbuf[j] = oobcntbuf[j]+1; } } df.bufsize = offs; // // Normalize OOB results // for(i=0; i<=npoints-1; i++) { if( oobcntbuf[i]!=0 ) { v = (double)1/(double)oobcntbuf[i]; for(i_=i*nclasses; i_<=i*nclasses+nclasses-1;i_++) { oobbuf[i_] = v*oobbuf[i_]; } } } // // Calculate training set estimates // rep.relclserror = dfrelclserror(df, xy, npoints); rep.avgce = dfavgce(df, xy, npoints); rep.rmserror = dfrmserror(df, xy, npoints); rep.avgerror = dfavgerror(df, xy, npoints); rep.avgrelerror = dfavgrelerror(df, xy, npoints); // // Calculate OOB estimates. // rep.oobrelclserror = 0; rep.oobavgce = 0; rep.oobrmserror = 0; rep.oobavgerror = 0; rep.oobavgrelerror = 0; oobcnt = 0; oobrelcnt = 0; for(i=0; i<=npoints-1; i++) { if( oobcntbuf[i]!=0 ) { ooboffs = i*nclasses; if( nclasses>1 ) { // // classification-specific code // k = (int)Math.Round(xy[i,nvars]); tmpi = 0; for(j=1; j<=nclasses-1; j++) { if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) ) { tmpi = j; } } if( tmpi!=k ) { rep.oobrelclserror = rep.oobrelclserror+1; } if( (double)(oobbuf[ooboffs+k])!=(double)(0) ) { rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]); } else { rep.oobavgce = rep.oobavgce-Math.Log(math.minrealnumber); } for(j=0; j<=nclasses-1; j++) { if( j==k ) { rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]-1); rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1); rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1); oobrelcnt = oobrelcnt+1; } else { rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]); rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]); } } } else { // // regression-specific code // rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs]-xy[i,nvars]); rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]); if( (double)(xy[i,nvars])!=(double)(0) ) { rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]); oobrelcnt = oobrelcnt+1; } } // // update OOB estimates count. // oobcnt = oobcnt+1; } } if( oobcnt>0 ) { rep.oobrelclserror = rep.oobrelclserror/oobcnt; rep.oobavgce = rep.oobavgce/oobcnt; rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses)); rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses); if( oobrelcnt>0 ) { rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt; } } }
/************************************************************************* Internal subroutine for processing one decision tree starting at Offs *************************************************************************/ private static void dfprocessinternal(decisionforest df, int offs, double[] x, ref double[] y) { int k = 0; int idx = 0; // // Set pointer to the root // k = offs+1; // // Navigate through the tree // while( true ) { if( (double)(df.trees[k])==(double)(-1) ) { if( df.nclasses==1 ) { y[0] = y[0]+df.trees[k+1]; } else { idx = (int)Math.Round(df.trees[k+1]); y[idx] = y[idx]+1; } break; } if( (double)(x[(int)Math.Round(df.trees[k])])<(double)(df.trees[k+1]) ) { k = k+innernodewidth; } else { k = offs+(int)Math.Round(df.trees[k+2]); } } }
/************************************************************************* This subroutine builds random decision forest. INPUT PARAMETERS: XY - training set NPoints - training set size, NPoints>=1 NVars - number of independent variables, NVars>=1 NClasses - task type: * NClasses=1 - regression task with one dependent variable * NClasses>1 - classification task with NClasses classes. NTrees - number of trees in a forest, NTrees>=1. recommended values: 50-100. R - percent of a training set used to build individual trees. 0<R<=1. recommended values: 0.1 <= R <= 0.66. OUTPUT PARAMETERS: Info - return code: * -2, if there is a point with class number outside of [0..NClasses-1]. * -1, if incorrect parameters was passed (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0 or R>1). * 1, if task has been solved DF - model built Rep - training report, contains error on a training set and out-of-bag estimates of generalization error. -- ALGLIB -- Copyright 19.02.2009 by Bochkanov Sergey *************************************************************************/ public static void dfbuildrandomdecisionforest(ref double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, ref int info, ref decisionforest df, ref dfreport rep) { int samplesize = 0; if( (double)(r)<=(double)(0) | (double)(r)>(double)(1) ) { info = -1; return; } samplesize = Math.Max((int)Math.Round(r*npoints), 1); dfbuildinternal(ref xy, npoints, nvars, nclasses, ntrees, samplesize, Math.Max(nvars/2, 1), dfusestrongsplits+dfuseevs, ref info, ref df, ref rep); }