/*************************************************************************
Builds one decision tree (internal recursive subroutine)

Parameters:
    TreeBuf     -   large enough array, at least TreeSize
    IdxBuf      -   at least NPoints elements
    TmpBufR     -   at least NPoints
    TmpBufR2    -   at least NPoints
    TmpBufI     -   at least NPoints
    TmpBufI2    -   at least NPoints+1

Notes (review):
* XY rows are addressed indirectly through Bufs.IdxBuf; only the slice
  [Idx1..Idx2] of IdxBuf belongs to this node. Column NVars of XY holds
  the target (class label for classification, value for regression).
* NumProcessed is a write cursor into Bufs.TreeBuf, advanced in place
  (leafnodewidth per leaf, innernodewidth per inner node).
*************************************************************************/
private static void dfbuildtreerec(double[,] xy, int npoints, int nvars, int nclasses, int nfeatures, int nvarsinpool, int flags, ref int numprocessed, int idx1, int idx2, dfinternalbuffers bufs)
{
    int i = 0;
    int j = 0;
    int k = 0;
    bool bflag = new bool();
    int i1 = 0;
    int i2 = 0;
    int info = 0;
    double sl = 0;
    double sr = 0;
    double w = 0;
    int idxbest = 0;
    double ebest = 0;
    double tbest = 0;
    int varcur = 0;
    double s = 0;
    double v = 0;
    double v1 = 0;
    double v2 = 0;
    double threshold = 0;
    int oldnp = 0;
    double currms = 0;
    bool useevs = new bool();

    //
    // these initializers are not really necessary,
    // but without them compiler complains about uninitialized locals
    //
    tbest = 0;

    //
    // Prepare
    //
    alglib.ap.assert(npoints>0);
    alglib.ap.assert(idx2>=idx1);
    // EVS flag is encoded as a bit of Flags (dfuseevs is a power of two)
    useevs = flags/dfuseevs%2!=0;

    //
    // Leaf node: single sample left. Store marker -1 plus the sample's
    // target value (class label or regression value) and return.
    //
    if( idx2==idx1 )
    {
        bufs.treebuf[numprocessed] = -1;
        bufs.treebuf[numprocessed+1] = xy[bufs.idxbuf[idx1],nvars];
        numprocessed = numprocessed+leafnodewidth;
        return;
    }

    //
    // Non-leaf node.
    // Select random variable, prepare split:
    // 1. prepare default solution - no splitting, class at random
    // 2. investigate possible splits, compare with default/best
    //
    idxbest = -1;
    if( nclasses>1 )
    {
        //
        // default solution for classification:
        // RMS error of predicting the empirical class distribution
        //
        for(i=0; i<=nclasses-1; i++)
        {
            bufs.classibuf[i] = 0;
        }
        s = idx2-idx1+1;
        for(i=idx1; i<=idx2; i++)
        {
            j = (int)Math.Round(xy[bufs.idxbuf[i],nvars]);
            bufs.classibuf[j] = bufs.classibuf[j]+1;
        }
        ebest = 0;
        for(i=0; i<=nclasses-1; i++)
        {
            ebest = ebest+bufs.classibuf[i]*math.sqr(1-bufs.classibuf[i]/s)+(s-bufs.classibuf[i])*math.sqr(bufs.classibuf[i]/s);
        }
        ebest = Math.Sqrt(ebest/(nclasses*(idx2-idx1+1)));
    }
    else
    {
        //
        // default solution for regression:
        // RMS error of predicting the mean target value
        //
        v = 0;
        for(i=idx1; i<=idx2; i++)
        {
            v = v+xy[bufs.idxbuf[i],nvars];
        }
        v = v/(idx2-idx1+1);
        ebest = 0;
        for(i=idx1; i<=idx2; i++)
        {
            ebest = ebest+math.sqr(xy[bufs.idxbuf[i],nvars]-v);
        }
        ebest = Math.Sqrt(ebest/(idx2-idx1+1));
    }
    // Try up to min(NFeatures, NVarsInPool) candidate variables.
    // NOTE: manual while-loop because the EVS branch below may shrink
    // the pool and 'continue' WITHOUT incrementing I.
    i = 0;
    while( i<=Math.Min(nfeatures, nvarsinpool)-1 )
    {
        //
        // select variables from pool:
        // swap a random not-yet-tried pool entry into position I
        //
        j = i+math.randominteger(nvarsinpool-i);
        k = bufs.varpool[i];
        bufs.varpool[i] = bufs.varpool[j];
        bufs.varpool[j] = k;
        varcur = bufs.varpool[i];

        //
        // load variable values to working array
        //
        // apply EVS preprocessing: if all variable values are same,
        // variable is excluded from pool.
        //
        // This is necessary for binary pre-splits (see later) to work.
        //
        for(j=idx1; j<=idx2; j++)
        {
            bufs.tmpbufr[j-idx1] = xy[bufs.idxbuf[j],varcur];
        }
        if( useevs )
        {
            bflag = false;
            v = bufs.tmpbufr[0];
            for(j=0; j<=idx2-idx1; j++)
            {
                if( (double)(bufs.tmpbufr[j])!=(double)(v) )
                {
                    bflag = true;
                    break;
                }
            }
            if( !bflag )
            {
                //
                // exclude variable from pool,
                // go to the next iteration.
                // I is not increased.
                //
                k = bufs.varpool[i];
                bufs.varpool[i] = bufs.varpool[nvarsinpool-1];
                bufs.varpool[nvarsinpool-1] = k;
                nvarsinpool = nvarsinpool-1;
                continue;
            }
        }

        //
        // load labels to working array
        //
        if( nclasses>1 )
        {
            for(j=idx1; j<=idx2; j++)
            {
                bufs.tmpbufi[j-idx1] = (int)Math.Round(xy[bufs.idxbuf[j],nvars]);
            }
        }
        else
        {
            for(j=idx1; j<=idx2; j++)
            {
                bufs.tmpbufr2[j-idx1] = xy[bufs.idxbuf[j],nvars];
            }
        }

        //
        // calculate split
        //
        if( useevs && bufs.evsbin[varcur] )
        {
            //
            // Pre-calculated splits for binary variables.
            // Threshold is already known, just calculate RMS error
            //
            threshold = bufs.evssplits[varcur];
            if( nclasses>1 )
            {
                //
                // classification-specific code:
                // count per-class membership on each side of the threshold
                // (left counts in ClassIBuf[0..NClasses-1], right counts
                // in ClassIBuf[NClasses..2*NClasses-1])
                //
                for(j=0; j<=2*nclasses-1; j++)
                {
                    bufs.classibuf[j] = 0;
                }
                sl = 0;
                sr = 0;
                for(j=0; j<=idx2-idx1; j++)
                {
                    k = bufs.tmpbufi[j];
                    if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
                    {
                        bufs.classibuf[k] = bufs.classibuf[k]+1;
                        sl = sl+1;
                    }
                    else
                    {
                        bufs.classibuf[k+nclasses] = bufs.classibuf[k+nclasses]+1;
                        sr = sr+1;
                    }
                }
                // both sides must be non-empty: EVS guaranteed the variable
                // is binary with both values present in this node
                alglib.ap.assert((double)(sl)!=(double)(0) && (double)(sr)!=(double)(0), "DFBuildTreeRec: something strange!");
                currms = 0;
                for(j=0; j<=nclasses-1; j++)
                {
                    w = bufs.classibuf[j];
                    currms = currms+w*math.sqr(w/sl-1);
                    currms = currms+(sl-w)*math.sqr(w/sl);
                    w = bufs.classibuf[nclasses+j];
                    currms = currms+w*math.sqr(w/sr-1);
                    currms = currms+(sr-w)*math.sqr(w/sr);
                }
                currms = Math.Sqrt(currms/(nclasses*(idx2-idx1+1)));
            }
            else
            {
                //
                // regression-specific code:
                // per-side means, then RMS of deviations from them
                //
                sl = 0;
                sr = 0;
                v1 = 0;
                v2 = 0;
                for(j=0; j<=idx2-idx1; j++)
                {
                    if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
                    {
                        v1 = v1+bufs.tmpbufr2[j];
                        sl = sl+1;
                    }
                    else
                    {
                        v2 = v2+bufs.tmpbufr2[j];
                        sr = sr+1;
                    }
                }
                alglib.ap.assert((double)(sl)!=(double)(0) && (double)(sr)!=(double)(0), "DFBuildTreeRec: something strange!");
                v1 = v1/sl;
                v2 = v2/sr;
                currms = 0;
                for(j=0; j<=idx2-idx1; j++)
                {
                    if( (double)(bufs.tmpbufr[j])<(double)(threshold) )
                    {
                        currms = currms+math.sqr(v1-bufs.tmpbufr2[j]);
                    }
                    else
                    {
                        currms = currms+math.sqr(v2-bufs.tmpbufr2[j]);
                    }
                }
                currms = Math.Sqrt(currms/(idx2-idx1+1));
            }
            info = 1;
        }
        else
        {
            //
            // Generic splits: delegate threshold search to the
            // task-specific split routines (Info>0 on success)
            //
            if( nclasses>1 )
            {
                dfsplitc(ref bufs.tmpbufr, ref bufs.tmpbufi, ref bufs.classibuf, idx2-idx1+1, nclasses, dfusestrongsplits, ref info, ref threshold, ref currms, ref bufs.sortrbuf, ref bufs.sortibuf);
            }
            else
            {
                dfsplitr(ref bufs.tmpbufr, ref bufs.tmpbufr2, idx2-idx1+1, dfusestrongsplits, ref info, ref threshold, ref currms, ref bufs.sortrbuf, ref bufs.sortrbuf2);
            }
        }
        // keep the candidate if it is at least as good as the best so far
        // (<= also accepts splits that merely tie the no-split default)
        if( info>0 )
        {
            if( (double)(currms)<=(double)(ebest) )
            {
                ebest = currms;
                idxbest = varcur;
                tbest = threshold;
            }
        }

        //
        // Next iteration
        //
        i = i+1;
    }

    //
    // to split or not to split
    //
    if( idxbest<0 )
    {
        //
        // All values are same, cannot split.
        //
        bufs.treebuf[numprocessed] = -1;
        if( nclasses>1 )
        {
            //
            // Select random class label (randomness allows us to
            // approximate distribution of the classes)
            //
            bufs.treebuf[numprocessed+1] = (int)Math.Round(xy[bufs.idxbuf[idx1+math.randominteger(idx2-idx1+1)],nvars]);
        }
        else
        {
            //
            // Select average (for regression task).
            //
            v = 0;
            for(i=idx1; i<=idx2; i++)
            {
                v = v+xy[bufs.idxbuf[i],nvars]/(idx2-idx1+1);
            }
            bufs.treebuf[numprocessed+1] = v;
        }
        numprocessed = numprocessed+leafnodewidth;
    }
    else
    {
        //
        // we can split: store variable index and threshold, partition
        // indices in place, then recurse into both halves
        //
        bufs.treebuf[numprocessed] = idxbest;
        bufs.treebuf[numprocessed+1] = tbest;
        i1 = idx1;
        i2 = idx2;
        while( i1<=i2 )
        {
            //
            // Reorder indices so that left partition is in [Idx1..I1-1],
            // and right partition is in [I2+1..Idx2]
            //
            if( (double)(xy[bufs.idxbuf[i1],idxbest])<(double)(tbest) )
            {
                // element at I1 already belongs to the left side
                i1 = i1+1;
                continue;
            }
            if( (double)(xy[bufs.idxbuf[i2],idxbest])>=(double)(tbest) )
            {
                // element at I2 already belongs to the right side
                i2 = i2-1;
                continue;
            }
            // I1 belongs right, I2 belongs left: swap and advance both
            j = bufs.idxbuf[i1];
            bufs.idxbuf[i1] = bufs.idxbuf[i2];
            bufs.idxbuf[i2] = j;
            i1 = i1+1;
            i2 = i2-1;
        }
        oldnp = numprocessed;
        numprocessed = numprocessed+innernodewidth;
        // left subtree is written first; its end offset (= start of the
        // right subtree) is recorded as the ">=" branch index at OldNP+2
        dfbuildtreerec(xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, idx1, i1-1, bufs);
        bufs.treebuf[oldnp+2] = numprocessed;
        dfbuildtreerec(xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref numprocessed, i2+1, idx2, bufs);
    }
}
/*************************************************************************
Builds a random decision forest (internal subroutine).

Parameters:
    XY          -   training set; row = point, column NVars = target
                    (class label 0..NClasses-1, or value for regression)
    NPoints     -   number of training points, >=1
    NVars       -   number of independent variables, >=1
    NClasses    -   number of classes (1 for regression), >=1
    NTrees      -   number of trees to build, >=1
    SampleSize  -   bootstrap sample size, 1<=SampleSize<=NPoints
    NFeatures   -   number of variables tried at each split, >=1
    Flags       -   bit flags (dfuseevs bit enables EVS preprocessing)
    Info        -   out: 1 on success, -1 on bad sizes, -2 on bad labels
    DF          -   out: forest (trees serialized into DF.Trees)
    Rep         -   out: training-set and out-of-bag error estimates
*************************************************************************/
public static void dfbuildinternal(double[,] xy, int npoints, int nvars, int nclasses, int ntrees, int samplesize, int nfeatures, int flags, ref int info, decisionforest df, dfreport rep)
{
    int i = 0;
    int j = 0;
    int k = 0;
    int tmpi = 0;
    int lasttreeoffs = 0;
    int offs = 0;
    int ooboffs = 0;
    int treesize = 0;
    int nvarsinpool = 0;
    bool useevs = new bool();
    dfinternalbuffers bufs = new dfinternalbuffers();
    int[] permbuf = new int[0];
    double[] oobbuf = new double[0];
    int[] oobcntbuf = new int[0];
    double[,] xys = new double[0,0];
    double[] x = new double[0];
    double[] y = new double[0];
    int oobcnt = 0;
    int oobrelcnt = 0;
    double v = 0;
    double vmin = 0;
    double vmax = 0;
    bool bflag = new bool();
    int i_ = 0;
    int i1_ = 0;

    info = 0;

    //
    // Test for inputs
    //
    if( (((((npoints<1 || samplesize<1) || samplesize>npoints) || nvars<1) || nclasses<1) || ntrees<1) || nfeatures<1 )
    {
        info = -1;
        return;
    }
    if( nclasses>1 )
    {
        // classification: every label must round to a valid class index
        for(i=0; i<=npoints-1; i++)
        {
            if( (int)Math.Round(xy[i,nvars])<0 || (int)Math.Round(xy[i,nvars])>=nclasses )
            {
                info = -2;
                return;
            }
        }
    }
    info = 1;

    //
    // Flags
    //
    useevs = flags/dfuseevs%2!=0;

    //
    // Allocate data, prepare header
    //
    // TreeSize is the worst case: a full tree on SampleSize points has
    // at most SampleSize-1 inner nodes and SampleSize leaves, plus one
    // slot for the tree-size header.
    //
    treesize = 1+innernodewidth*(samplesize-1)+leafnodewidth*samplesize;
    permbuf = new int[npoints-1+1];
    bufs.treebuf = new double[treesize-1+1];
    bufs.idxbuf = new int[npoints-1+1];
    bufs.tmpbufr = new double[npoints-1+1];
    bufs.tmpbufr2 = new double[npoints-1+1];
    bufs.tmpbufi = new int[npoints-1+1];
    bufs.sortrbuf = new double[npoints];
    bufs.sortrbuf2 = new double[npoints];
    bufs.sortibuf = new int[npoints];
    bufs.varpool = new int[nvars-1+1];
    bufs.evsbin = new bool[nvars-1+1];
    bufs.evssplits = new double[nvars-1+1];
    bufs.classibuf = new int[2*nclasses-1+1];
    // OOBBuf accumulates per-point, per-class OOB predictions;
    // OOBCntBuf counts how many trees left each point out of the bag
    oobbuf = new double[nclasses*npoints-1+1];
    oobcntbuf = new int[npoints-1+1];
    df.trees = new double[ntrees*treesize-1+1];
    xys = new double[samplesize-1+1, nvars+1];
    x = new double[nvars-1+1];
    y = new double[nclasses-1+1];
    for(i=0; i<=npoints-1; i++)
    {
        permbuf[i] = i;
    }
    for(i=0; i<=npoints*nclasses-1; i++)
    {
        oobbuf[i] = 0;
    }
    for(i=0; i<=npoints-1; i++)
    {
        oobcntbuf[i] = 0;
    }

    //
    // Prepare variable pool and EVS (extended variable selection/splitting) buffers
    // (whether EVS is turned on or not):
    // 1. detect binary variables and pre-calculate splits for them
    // 2. detect variables with non-distinct values and exclude them from pool
    //
    for(i=0; i<=nvars-1; i++)
    {
        bufs.varpool[i] = i;
    }
    nvarsinpool = nvars;
    if( useevs )
    {
        for(j=0; j<=nvars-1; j++)
        {
            vmin = xy[0,j];
            vmax = vmin;
            for(i=0; i<=npoints-1; i++)
            {
                v = xy[i,j];
                vmin = Math.Min(vmin, v);
                vmax = Math.Max(vmax, v);
            }
            if( (double)(vmin)==(double)(vmax) )
            {
                //
                // exclude variable from pool (constant column)
                //
                bufs.varpool[j] = bufs.varpool[nvarsinpool-1];
                bufs.varpool[nvarsinpool-1] = -1;
                nvarsinpool = nvarsinpool-1;
                continue;
            }
            // a variable is binary iff it takes only the values VMin/VMax
            bflag = false;
            for(i=0; i<=npoints-1; i++)
            {
                v = xy[i,j];
                if( (double)(v)!=(double)(vmin) && (double)(v)!=(double)(vmax) )
                {
                    bflag = true;
                    break;
                }
            }
            if( bflag )
            {
                //
                // non-binary variable
                //
                bufs.evsbin[j] = false;
            }
            else
            {
                //
                // Prepare pre-calculated split at the midpoint;
                // guard against midpoint collapsing onto VMin due to
                // floating-point rounding
                //
                bufs.evsbin[j] = true;
                bufs.evssplits[j] = 0.5*(vmin+vmax);
                if( (double)(bufs.evssplits[j])<=(double)(vmin) )
                {
                    bufs.evssplits[j] = vmax;
                }
            }
        }
    }

    //
    // RANDOM FOREST FORMAT
    // W[0]         -   size of array
    // W[1]         -   version number
    // W[2]         -   NVars
    // W[3]         -   NClasses (1 for regression)
    // W[4]         -   NTrees
    // W[5]         -   trees offset
    //
    //
    // TREE FORMAT
    // W[Offs]      -   size of sub-array
    //     node info:
    // W[K+0]       -   variable number        (-1 for leaf mode)
    // W[K+1]       -   threshold              (class/value for leaf node)
    // W[K+2]       -   ">=" branch index      (absent for leaf node)
    //
    //
    df.nvars = nvars;
    df.nclasses = nclasses;
    df.ntrees = ntrees;

    //
    // Build forest
    //
    offs = 0;
    for(i=0; i<=ntrees-1; i++)
    {
        //
        // Prepare sample: partial Fisher-Yates shuffle of PermBuf;
        // the first SampleSize entries become the in-bag sample, the
        // rest remain out-of-bag for this tree
        //
        for(k=0; k<=samplesize-1; k++)
        {
            j = k+math.randominteger(npoints-k);
            tmpi = permbuf[k];
            permbuf[k] = permbuf[j];
            permbuf[j] = tmpi;
            j = permbuf[k];
            for(i_=0; i_<=nvars;i_++)
            {
                xys[k,i_] = xy[j,i_];
            }
        }

        //
        // build tree, copy it into DF.Trees at the current offset
        // (TreeBuf[0] holds the serialized tree size)
        //
        dfbuildtree(xys, samplesize, nvars, nclasses, nfeatures, nvarsinpool, flags, bufs);
        j = (int)Math.Round(bufs.treebuf[0]);
        i1_ = (0) - (offs);
        for(i_=offs; i_<=offs+j-1;i_++)
        {
            df.trees[i_] = bufs.treebuf[i_+i1_];
        }
        lasttreeoffs = offs;
        offs = offs+j;

        //
        // OOB estimates: run the just-built tree on every out-of-bag
        // point and accumulate its predictions
        //
        for(k=samplesize; k<=npoints-1; k++)
        {
            for(j=0; j<=nclasses-1; j++)
            {
                y[j] = 0;
            }
            j = permbuf[k];
            for(i_=0; i_<=nvars-1;i_++)
            {
                x[i_] = xy[j,i_];
            }
            dfprocessinternal(df, lasttreeoffs, x, ref y);
            i1_ = (0) - (j*nclasses);
            for(i_=j*nclasses; i_<=(j+1)*nclasses-1;i_++)
            {
                oobbuf[i_] = oobbuf[i_] + y[i_+i1_];
            }
            oobcntbuf[j] = oobcntbuf[j]+1;
        }
    }
    df.bufsize = offs;

    //
    // Normalize OOB results: convert accumulated sums into averages
    // over the trees that saw each point out-of-bag
    //
    for(i=0; i<=npoints-1; i++)
    {
        if( oobcntbuf[i]!=0 )
        {
            v = (double)1/(double)oobcntbuf[i];
            for(i_=i*nclasses; i_<=i*nclasses+nclasses-1;i_++)
            {
                oobbuf[i_] = v*oobbuf[i_];
            }
        }
    }

    //
    // Calculate training set estimates
    //
    rep.relclserror = dfrelclserror(df, xy, npoints);
    rep.avgce = dfavgce(df, xy, npoints);
    rep.rmserror = dfrmserror(df, xy, npoints);
    rep.avgerror = dfavgerror(df, xy, npoints);
    rep.avgrelerror = dfavgrelerror(df, xy, npoints);

    //
    // Calculate OOB estimates.
    //
    rep.oobrelclserror = 0;
    rep.oobavgce = 0;
    rep.oobrmserror = 0;
    rep.oobavgerror = 0;
    rep.oobavgrelerror = 0;
    oobcnt = 0;
    oobrelcnt = 0;
    for(i=0; i<=npoints-1; i++)
    {
        if( oobcntbuf[i]!=0 )
        {
            ooboffs = i*nclasses;
            if( nclasses>1 )
            {
                //
                // classification-specific code
                //
                k = (int)Math.Round(xy[i,nvars]);
                // TmpI = argmax of the averaged OOB class posteriors
                tmpi = 0;
                for(j=1; j<=nclasses-1; j++)
                {
                    if( (double)(oobbuf[ooboffs+j])>(double)(oobbuf[ooboffs+tmpi]) )
                    {
                        tmpi = j;
                    }
                }
                if( tmpi!=k )
                {
                    rep.oobrelclserror = rep.oobrelclserror+1;
                }
                // cross-entropy: clamp zero posteriors to MinRealNumber
                // so Log() stays finite
                if( (double)(oobbuf[ooboffs+k])!=(double)(0) )
                {
                    rep.oobavgce = rep.oobavgce-Math.Log(oobbuf[ooboffs+k]);
                }
                else
                {
                    rep.oobavgce = rep.oobavgce-Math.Log(math.minrealnumber);
                }
                for(j=0; j<=nclasses-1; j++)
                {
                    if( j==k )
                    {
                        rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]-1);
                        rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]-1);
                        rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs(oobbuf[ooboffs+j]-1);
                        oobrelcnt = oobrelcnt+1;
                    }
                    else
                    {
                        rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs+j]);
                        rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs+j]);
                    }
                }
            }
            else
            {
                //
                // regression-specific code
                //
                rep.oobrmserror = rep.oobrmserror+math.sqr(oobbuf[ooboffs]-xy[i,nvars]);
                rep.oobavgerror = rep.oobavgerror+Math.Abs(oobbuf[ooboffs]-xy[i,nvars]);
                // relative error is only defined for non-zero targets
                if( (double)(xy[i,nvars])!=(double)(0) )
                {
                    rep.oobavgrelerror = rep.oobavgrelerror+Math.Abs((oobbuf[ooboffs]-xy[i,nvars])/xy[i,nvars]);
                    oobrelcnt = oobrelcnt+1;
                }
            }

            //
            // update OOB estimates count.
            //
            oobcnt = oobcnt+1;
        }
    }
    if( oobcnt>0 )
    {
        rep.oobrelclserror = rep.oobrelclserror/oobcnt;
        rep.oobavgce = rep.oobavgce/oobcnt;
        rep.oobrmserror = Math.Sqrt(rep.oobrmserror/(oobcnt*nclasses));
        rep.oobavgerror = rep.oobavgerror/(oobcnt*nclasses);
        if( oobrelcnt>0 )
        {
            rep.oobavgrelerror = rep.oobavgrelerror/oobrelcnt;
        }
    }
}
/*************************************************************************
Builds one decision tree. Just a wrapper for the DFBuildTreeRec.
*************************************************************************/
private static void dfbuildtree(double[,] xy, int npoints, int nvars, int nclasses, int nfeatures, int nvarsinpool, int flags, dfinternalbuffers bufs)
{
    alglib.ap.assert(npoints>0);

    //
    // Initialize IdxBuf with the identity permutation 0..NPoints-1.
    // It stores indices of the training set elements; when the training
    // set is split during recursion, IdxBuf is reordered in place, so
    // every subtree works on a contiguous slice of these indices.
    //
    for(int row=0; row<npoints; row++)
    {
        bufs.idxbuf[row] = row;
    }

    //
    // Recursive procedure. Slot 0 of TreeBuf holds the total size of
    // the serialized tree, so node records start at offset 1; the
    // recursion advances the write cursor and we store the final size.
    //
    int writepos = 1;
    dfbuildtreerec(xy, npoints, nvars, nclasses, nfeatures, nvarsinpool, flags, ref writepos, 0, npoints-1, bufs);
    bufs.treebuf[0] = writepos;
}
public override alglib.apobject make_copy()
{
    //
    // Produce a deep copy of this buffer set: every array is cloned,
    // so the returned object shares no mutable state with this one
    // and can be used independently.
    //
    dfinternalbuffers dup = new dfinternalbuffers();
    dup.treebuf = (double[])treebuf.Clone();
    dup.tmpbufr = (double[])tmpbufr.Clone();
    dup.tmpbufr2 = (double[])tmpbufr2.Clone();
    dup.sortrbuf = (double[])sortrbuf.Clone();
    dup.sortrbuf2 = (double[])sortrbuf2.Clone();
    dup.evssplits = (double[])evssplits.Clone();
    dup.idxbuf = (int[])idxbuf.Clone();
    dup.tmpbufi = (int[])tmpbufi.Clone();
    dup.classibuf = (int[])classibuf.Clone();
    dup.sortibuf = (int[])sortibuf.Clone();
    dup.varpool = (int[])varpool.Clone();
    dup.evsbin = (bool[])evsbin.Clone();
    return dup;
}