public static void dfbuildinternal(double[,] xy,
    int npoints,
    int nvars,
    int nclasses,
    int ntrees,
    int samplesize,
    int nfeatures,
    int flags,
    ref int info,
    decisionforest df,
    dfreport rep)
{
    int i = 0;
    int j = 0;
    int k = 0;
    int tmpi = 0;
    int lasttreeoffs = 0;
    int offs = 0;
    int ooboffs = 0;
    int treesize = 0;
    int nvarsinpool = 0;
    bool useevs = new bool();
    dfinternalbuffers bufs = new dfinternalbuffers();
    int[] permbuf = new int[0];
    double[] oobbuf = new double[0];
    int[] oobcntbuf = new int[0];
    double[,] xys = new double[0,0];
    double[] x = new double[0];
    double[] y = new double[0];
    int oobcnt = 0;
    int oobrelcnt = 0;
    double v = 0;
    double vmin = 0;
    double vmax = 0;
    bool bflag = new bool();
    int i_ = 0;
    int i1_ = 0;

    info = 0;
    
    //
    // Test for inputs
    //
    if( npoints<1 || samplesize<1 || samplesize>npoints || nvars<1 || nclasses<1 || ntrees<1 || nfeatures<1 )
    {
        info = -1;
        return;
    }
    if( nclasses>1 )
    {
        for(i=0; i<=npoints-1; i++)
        {
            if( (int)Math.Round(xy[i,nvars])<0 || (int)Math.Round(xy[i,nvars])>=nclasses )
            {
                info = -2;
                return;
            }
        }
    }
    info = 1;
    
    //
    // Flags: extract the DFUseEVS bit from the packed Flags value
    //
    useevs = flags/dfuseevs%2!=0;
    
    //
    // Allocate data, prepare header
    //
    treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize;
    permbuf = new int[npoints-1+1];
    bufs.treebuf = new double[treesize-1+1];
    bufs.idxbuf = new int[npoints-1+1];
    bufs.tmpbufr = new double[npoints-1+1];
    bufs.tmpbufr2 = new double[npoints-1+1];
    bufs.tmpbufi = new int[npoints-1+1];
    bufs.sortrbuf = new double[npoints];
    bufs.sortrbuf2 = new double[npoints];
    bufs.sortibuf = new int[npoints];
    bufs.varpool = new int[nvars-1+1];
    bufs.evsbin = new bool[nvars-1+1];
    bufs.evssplits = new double[nvars-1+1];
    bufs.classibuf = new int[2*nclasses-1+1];
    oobbuf = new double[nclasses*npoints-1+1];
    oobcntbuf = new int[npoints-1+1];
    df.trees = new double[ntrees*treesize-1+1];
    xys = new double[samplesize-1+1, nvars+1];
    x = new double[nvars-1+1];
    y = new double[nclasses-1+1];
    for(i=0; i<=npoints-1; i++)
    {
        permbuf[i] = i;
    }
    for(i=0; i<=npoints*nclasses-1; i++)
    {
        oobbuf[i] = 0;
    }
    for(i=0; i<=npoints-1; i++)
    {
        oobcntbuf[i] = 0;
    }
    
    //
    // Prepare variable pool and EVS (extended variable selection/splitting) buffers
    // (whether EVS is turned on or not):
    // 1. detect binary variables and pre-calculate splits for them
    // 2. detect variables with non-distinct values and exclude them from pool
    //
    for(i=0; i<=nvars-1; i++)
    {
        bufs.varpool[i] = i;
    }
    nvarsinpool = nvars;
    if( useevs )
    {
        for(j=0; j<=nvars-1; j++)
        {
            vmin = xy[0,j];
            vmax = vmin;
            for(i=0; i<=npoints-1; i++)
            {
                v = xy[i,j];
                vmin = Math.Min(vmin, v);
                vmax = Math.Max(vmax, v);
            }
            if( (double)(vmin)==(double)(vmax) )
            {
                //
                // exclude variable from pool
                //
                bufs.varpool[j] = bufs.varpool[nvarsinpool-1];
                bufs.varpool[nvarsinpool-1] = -1;
                nvarsinpool = nvarsinpool-1;
                continue;
            }
            bflag = false;
            for(i=0; i<=npoints-1; i++)
            {
                v = xy[i,j];
                if( (double)(v)!=(double)(vmin) && (double)(v)!=(double)(vmax) )
                {
                    bflag = true;
                    break;
                }
            }
            if( bflag )
            {
                //
                // non-binary variable
                //
                bufs.evsbin[j] = false;
            }
            else
            {
                //
                // Prepare binary variable: pre-calculate split halfway
                // between its two values
                //
                bufs.evsbin[j] = true;
                bufs.evssplits[j] = 0.5*(vmin+vmax);
                if( (double)(bufs.evssplits[j])<=(double)(vmin) )
                {
                    bufs.evssplits[j] = vmax;
                }
            }
        }
    }
    
    //
    // RANDOM FOREST FORMAT
    // W[0]         -   size of array
    // W[1]         -   version number
    // W[2]         -   NVars
    // W[3]         -   NClasses (1 for regression)
    // W[4]         -   NTrees
    // W[5]         -   trees offset
    //
    //
    // TREE FORMAT
    // W[Offs]      -   size of sub-array
    //     node info:
    // W[K+0]       -   variable number        (-1 for leaf mode)
    // W[K+1]       -   threshold              (class/value for leaf node)
    // W[K+2]       -   ">=" branch index      (absent for leaf node)
    //
    df.nvars = nvars;
    df.nclasses = nclasses;
    df.ntrees = ntrees;
    
    //
    // Build forest
    //
    offs = 0;
    for(i=0; i<=ntrees-1; i++)
    {
        //
        // Prepare sample: random partial permutation of PermBuf.
        // Its first SampleSize entries form this tree's training sample,
        // the remaining NPoints-SampleSize entries are its OOB set.
        //
        for(k=0; k<=samplesize-1; k++)
        {
            j = k+math.randominteger(npoints-k);
            tmpi = permbuf[k];
            permbuf[k] = permbuf[j];
            permbuf[j] = tmpi;
            j = permbuf[k];
            for(i_=0; i_<=nvars; i_++)
            {
                xys[k,i_] = xy[j,i_];
            }
        }
        
        //
        // build tree, copy
        //
        dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs);
        j = (int)Math.Round(bufs.treebuf[0]);
        i1_ = (0) - (offs);
        for(i_=offs; i_<=offs+j-1; i_++)
        {
            df.trees[i_] = bufs.treebuf[i_+i1_];
        }
        lasttreeoffs = offs;
        offs = offs+j;
        
        //
        // OOB estimates
        //
        for(k=samplesize; k<=npoints-1; k++)
        {
            for(j=0; j<=nclasses-1; j++)
            {
                y[j] = 0;
            }
            j = permbuf[k];
            for(i_=0; i_<=nvars-1; i_++)
            {
                x[i_] = xy[j,i_];
            }
            dfprocessinternal(df, lasttreeoffs, x, ref y);
            i1_ = (0) - (j*nclasses);
            for(i_=j*nclasses; i_<=(j+1)*nclasses-1; i_++)
            {
                oobbuf[i_] = oobbuf[i_] + y[i_+i1_];
            }
            oobcntbuf[j] = oobcntbuf[j]+1;
        }
    }
    df.bufsize = offs;
    
    //
    // Normalize OOB results
    //
    for(i=0; i<=npoints-1; i++)
    {
        if( oobcntbuf[i]!=0 )
        {
            v = (double)1/(double)oobcntbuf[i];
            for(i_=i*nclasses; i_<=i*nclasses+nclasses-1; i_++)
            {
                oobbuf[i_] = v*oobbuf[i_];
            }
        }
    }
    
    //
    // Calculate training set estimates
    //
    rep.relclserror = dfrelclserror(df, xy, npoints);
    rep.avgce = dfavgce(df, xy, npoints);
    rep.rmserror = dfrmserror(df, xy, npoints);
    rep.avgerror = dfavgerror(df, xy, npoints);
    rep.avgrelerror = dfavgrelerror(df, xy, npoints);
    
    //
    // Calculate OOB estimates
    //
    rep.oobrelclserror = 0;
    rep.oobavgce = 0;
    rep.oobrmserror = 0;
    rep.oobavgerror = 0;
    rep.oobavgrelerror = 0;
    oobcnt = 0;
    oobrelcnt = 0;
    for(i=0; i<=npoints-1; i++)
    {
        if( oobcntbuf[i]!=0 )
        {
            ooboffs = i*nclasses;
            if( nclasses>1 )
            {
                //
                // classification-specific code
                //
                k = (int)Math.Round(xy[i,nvars]);
                tmpi = 0;
                for(j=1; j<=nclasses-1; j++)
                {
                    if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) )
                    {
                        tmpi = j;
                    }
                }
                if( tmpi!=k )
                {
                    rep.oobrelclserror = rep.oobrelclserror+1;
                }
                if( (double)(oobbuf[ooboffs+k])!=(double)(0) )
                {
                    rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]);
                }
                else
                {
                    rep.oobavgce = rep.oobavgce-Math.Log(math.minrealnumber);
                }
                for(j=0; j<=nclasses-1; j++)
                {
                    if( j==k )
                    {
                        rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]-1);
                        rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1);
                        rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1);
                        oobrelcnt = oobrelcnt+1;
                    }
                    else
                    {
                        rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]);
                        rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]);
                    }
                }
            }
            else
            {
                //
                // regression-specific code
                //
                rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs]-xy[i,nvars]);
                rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]);
                if( (double)(xy[i,nvars])!=(double)(0) )
                {
                    rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]);
                    oobrelcnt = oobrelcnt+1;
                }
            }
            
            //
            // update OOB estimates count
            //
            oobcnt = oobcnt+1;
        }
    }
    if( oobcnt>0 )
    {
        rep.oobrelclserror = rep.oobrelclserror/oobcnt;
        rep.oobavgce = rep.oobavgce/oobcnt;
        rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses));
        rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
        if( oobrelcnt>0 )
        {
            rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
        }
    }
}
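

/*************************************************************************
Illustrative sketch (not part of ALGLIB): shows how a single tree stored
in the flat TREE FORMAT documented above can be walked for one input
vector X, returning the leaf payload. This mirrors what DFProcessInternal
(called during the OOB pass above) is assumed to do; the helper name
WalkTreeSketch is hypothetical. The root node starts at TreeOffs+1, just
past the sub-array size slot, and a leaf is marked by variable number -1.
*************************************************************************/
private static double walktreesketch(double[] trees, int treeoffs, double[] x)
{
    int k = treeoffs+1;
    while( (int)Math.Round(trees[k])!=-1 )
    {
        if( (double)(x[(int)Math.Round(trees[k])])<(double)(trees[k+1]) )
        {
            // "<" branch: next node is stored immediately after this one
            k = k+innernodewidth;
        }
        else
        {
            // ">=" branch: W[K+2] is an index relative to the tree start
            k = treeoffs+(int)Math.Round(trees[k+2]);
        }
    }
    
    // leaf payload: class number (classification) or value (regression)
    return trees[k+1];
}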
/*************************************************************************
This subroutine builds a random decision forest.

This function allows tuning the number of variables used when choosing
the best split.

INPUT PARAMETERS:
    XY          -   training set
    NPoints     -   training set size, NPoints>=1
    NVars       -   number of independent variables, NVars>=1
    NClasses    -   task type:
                    * NClasses=1 - regression task with one
                                   dependent variable
                    * NClasses>1 - classification task with
                                   NClasses classes.
    NTrees      -   number of trees in a forest, NTrees>=1.
                    recommended values: 50-100.
    NRndVars    -   number of variables used when choosing best split
    R           -   fraction of the training set used to build individual
                    trees. 0<R<=1.
                    recommended values: 0.1 <= R <= 0.66.

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -2, if there is a point with class number
                          outside of [0..NClasses-1].
                    * -1, if incorrect parameters were passed
                          (NPoints<1, NVars<1, NClasses<1, NTrees<1,
                          NRndVars<1, NRndVars>NVars, R<=0 or R>1).
                    *  1, if task has been solved
    DF          -   model built
    Rep         -   training report, contains error on a training set
                    and out-of-bag estimates of generalization error.

  -- ALGLIB --
     Copyright 19.02.2009 by Bochkanov Sergey
*************************************************************************/
public static void dfbuildrandomdecisionforestx1(double[,] xy,
    int npoints,
    int nvars,
    int nclasses,
    int ntrees,
    int nrndvars,
    double r,
    ref int info,
    decisionforest df,
    dfreport rep)
{
    int samplesize = 0;

    info = 0;
    if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) )
    {
        info = -1;
        return;
    }
    if( nrndvars<=0 || nrndvars>nvars )
    {
        info = -1;
        return;
    }
    samplesize = Math.Max((int)Math.Round(r*npoints), 1);
    dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, nrndvars, dfusestrongsplits+dfuseevs, ref info, df, rep);
}
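

/*************************************************************************
Usage sketch (illustrative, not part of ALGLIB): trains a classification
forest with an explicitly tuned NRndVars. The dataset shape (5 inputs,
3 classes) and the choice NRndVars=2 (roughly sqrt(NVars), a common
heuristic for classification forests) are assumptions for illustration;
the method name is hypothetical.
*************************************************************************/
public static void dfbuildx1example(double[,] xy, int npoints)
{
    int info = 0;
    decisionforest df = new decisionforest();
    dfreport rep = new dfreport();
    
    // XY must have NVars+1 columns; the last column holds the class label
    dfbuildrandomdecisionforestx1(xy, npoints, 5, 3, 100, 2, 0.5, ref info, df, rep);
    
    // Info==1 on success; Rep then holds training errors and
    // out-of-bag estimates such as Rep.OOBRelClsError
}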
/*************************************************************************
This subroutine builds a random decision forest.

INPUT PARAMETERS:
    XY          -   training set
    NPoints     -   training set size, NPoints>=1
    NVars       -   number of independent variables, NVars>=1
    NClasses    -   task type:
                    * NClasses=1 - regression task with one
                                   dependent variable
                    * NClasses>1 - classification task with
                                   NClasses classes.
    NTrees      -   number of trees in a forest, NTrees>=1.
                    recommended values: 50-100.
    R           -   fraction of the training set used to build individual
                    trees. 0<R<=1.
                    recommended values: 0.1 <= R <= 0.66.

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -2, if there is a point with class number
                          outside of [0..NClasses-1].
                    * -1, if incorrect parameters were passed
                          (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                          or R>1).
                    *  1, if task has been solved
    DF          -   model built
    Rep         -   training report, contains error on a training set
                    and out-of-bag estimates of generalization error.

  -- ALGLIB --
     Copyright 19.02.2009 by Bochkanov Sergey
*************************************************************************/
public static void dfbuildrandomdecisionforest(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, double r, out int info, out decisionforest df, out dfreport rep)
{
    info = 0;
    df = new decisionforest();
    rep = new dfreport();
    dforest.dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, ntrees, r, ref info, df.innerobj, rep.innerobj);
    return;
}
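

/*************************************************************************
Usage sketch (illustrative, not part of ALGLIB): regression forest on a
tiny synthetic dataset, using the out-parameter wrapper above. With
NClasses=1 the last column of XY holds the dependent variable. The data
values and the method name are made up for illustration.
*************************************************************************/
public static void dfregressionexample()
{
    double[,] xy = new double[,]{
        {0.0, 0.0},
        {1.0, 1.1},
        {2.0, 1.9},
        {3.0, 3.2}};
    int info;
    decisionforest df;
    dfreport rep;
    
    // 4 points, 1 input variable, regression (NClasses=1), 50 trees, R=0.66
    dfbuildrandomdecisionforest(xy, 4, 1, 1, 50, 0.66, out info, out df, out rep);
    
    // Info==1 on success; Rep.RMSError and Rep.OOBRMSError give training
    // and out-of-bag error estimates respectively
}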
public override alglib.apobject make_copy()
{
    dfreport _result = new dfreport();
    _result.relclserror = relclserror;
    _result.avgce = avgce;
    _result.rmserror = rmserror;
    _result.avgerror = avgerror;
    _result.avgrelerror = avgrelerror;
    _result.oobrelclserror = oobrelclserror;
    _result.oobavgce = oobavgce;
    _result.oobrmserror = oobrmserror;
    _result.oobavgerror = oobavgerror;
    _result.oobavgrelerror = oobavgrelerror;
    return _result;
}
/*************************************************************************
This subroutine builds a random decision forest.

INPUT PARAMETERS:
    XY          -   training set
    NPoints     -   training set size, NPoints>=1
    NVars       -   number of independent variables, NVars>=1
    NClasses    -   task type:
                    * NClasses=1 - regression task with one
                                   dependent variable
                    * NClasses>1 - classification task with
                                   NClasses classes.
    NTrees      -   number of trees in a forest, NTrees>=1.
                    recommended values: 50-100.
    R           -   fraction of the training set used to build individual
                    trees. 0<R<=1.
                    recommended values: 0.1 <= R <= 0.66.

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -2, if there is a point with class number
                          outside of [0..NClasses-1].
                    * -1, if incorrect parameters were passed
                          (NPoints<1, NVars<1, NClasses<1, NTrees<1, R<=0
                          or R>1).
                    *  1, if task has been solved
    DF          -   model built
    Rep         -   training report, contains error on a training set
                    and out-of-bag estimates of generalization error.

  -- ALGLIB --
     Copyright 19.02.2009 by Bochkanov Sergey
*************************************************************************/
public static void dfbuildrandomdecisionforest(double[,] xy,
    int npoints,
    int nvars,
    int nclasses,
    int ntrees,
    double r,
    ref int info,
    decisionforest df,
    dfreport rep)
{
    int samplesize = 0;

    info = 0;
    if( (double)(r)<=(double)(0) || (double)(r)>(double)(1) )
    {
        info = -1;
        return;
    }
    samplesize = Math.Max((int)Math.Round(r*npoints), 1);
    dfbuildinternal(xy, npoints, nvars, nclasses, ntrees, samplesize, Math.Max(nvars/2, 1), dfusestrongsplits+dfuseevs, ref info, df, rep);
}
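

/*************************************************************************
Equivalence note (illustrative, not part of ALGLIB): judging from the
code above, this overload simply fixes NRndVars at Max(NVars/2,1), so
the two calls below are expected to use identical split settings (the
trees themselves still differ run-to-run due to random sampling). The
method name is hypothetical.
*************************************************************************/
public static void dfoverloadequivalenceexample(double[,] xy, int npoints, int nvars, int nclasses)
{
    int info = 0;
    decisionforest df = new decisionforest();
    dfreport rep = new dfreport();
    
    // default-NRndVars overload ...
    dfbuildrandomdecisionforest(xy, npoints, nvars, nclasses, 100, 0.5, ref info, df, rep);
    
    // ... and the equivalent explicit call via the X1 variant
    dfbuildrandomdecisionforestx1(xy, npoints, nvars, nclasses, 100, Math.Max(nvars/2, 1), 0.5, ref info, df, rep);
}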