/*************************************************************************
This subroutine trains logit model.

INPUT PARAMETERS:
    XY          -   training set, array[0..NPoints-1,0..NVars]
                    First NVars columns store values of independent
                    variables, next column stores number of class (from 0
                    to NClasses-1) which dataset element belongs to.
                    Fractional values are rounded to nearest integer.
    NPoints     -   training set size, NPoints>=1
    NVars       -   number of independent variables, NVars>=1
    NClasses    -   number of classes, NClasses>=2

OUTPUT PARAMETERS:
    Info        -   return code:
                    * -2, if there is a point with class number outside
                          of [0..NClasses-1].
                    * -1, if incorrect parameters was passed
                          (NPoints<NVars+2, NVars<1, NClasses<2).
                    *  1, if task has been solved
    LM          -   model built
    Rep         -   training report

  -- ALGLIB --
     Copyright 10.09.2008 by Bochkanov Sergey
*************************************************************************/
public static void mnltrainh(double[,] xy, int npoints, int nvars, int nclasses, ref int info, logitmodel lm, mnlreport rep)
{
    int i = 0;
    int j = 0;
    int k = 0;
    int ssize = 0;
    bool allsame = new bool();
    int offs = 0;
    double threshold = 0;
    double wminstep = 0;
    double decay = 0;              // L2 regularization coefficient for the weights
    int wdim = 0;                  // number of model weights: (NVars+1)*(NClasses-1)
    int expoffs = 0;
    double v = 0;
    double s = 0;
    // The logit model is trained by reparameterizing it as a softmax-output
    // neural network with no hidden layer (see mlpcreatec0 below).
    mlpbase.multilayerperceptron network = new mlpbase.multilayerperceptron();
    int nin = 0;
    int nout = 0;
    int wcount = 0;
    double e = 0;                  // regularized error value
    double[] g = new double[0];    // gradient
    double[,] h = new double[0,0]; // Hessian
    bool spd = new bool();         // true when H is usable as a Newton matrix
    double[] x = new double[0];
    double[] y = new double[0];
    double[] wbase = new double[0];
    double wstep = 0;              // line-search step length
    double[] wdir = new double[0]; // normalized search direction
    double[] work = new double[0]; // scratch array for the line search
    int mcstage = 0;               // reverse-communication stage of mnlmcsrch
    logitmcstate mcstate = new logitmcstate();
    int mcinfo = 0;
    int mcnfev = 0;
    int solverinfo = 0;
    densesolver.densesolverreport solverrep = new densesolver.densesolverreport();
    int i_ = 0;
    int i1_ = 0;

    info = 0;
    threshold = 1000*math.machineepsilon;
    wminstep = 0.001;
    decay = 0.001;

    //
    // Test for inputs
    //
    if( (npoints<nvars+2 || nvars<1) || nclasses<2 )
    {
        info = -1;
        return;
    }
    // Class labels (last column of XY) must round to [0..NClasses-1]
    for(i=0; i<=npoints-1; i++)
    {
        if( (int)Math.Round(xy[i,nvars])<0 || (int)Math.Round(xy[i,nvars])>=nclasses )
        {
            info = -2;
            return;
        }
    }
    info = 1;

    //
    // Initialize data
    //
    rep.ngrad = 0;
    rep.nhess = 0;

    //
    // Allocate array
    //
    // LM.W layout: 5 header slots (size, version, NVars, NClasses, Offs),
    // then the (NVars+1)*(NClasses-1) weights, then NClasses scratch slots.
    //
    wdim = (nvars+1)*(nclasses-1);
    offs = 5;
    expoffs = offs+wdim;
    ssize = 5+(nvars+1)*(nclasses-1)+nclasses;
    lm.w = new double[ssize-1+1];
    lm.w[0] = ssize;
    lm.w[1] = logitvnum;
    lm.w[2] = nvars;
    lm.w[3] = nclasses;
    lm.w[4] = offs;

    //
    // Degenerate case: all outputs are equal.
    // No optimization is needed; build weights which force the model to
    // always output the single observed class with near-unit probability.
    //
    allsame = true;
    for(i=1; i<=npoints-1; i++)
    {
        if( (int)Math.Round(xy[i,nvars])!=(int)Math.Round(xy[i-1,nvars]) )
        {
            allsame = false;
        }
    }
    if( allsame )
    {
        for(i=0; i<=(nvars+1)*(nclasses-1)-1; i++)
        {
            lm.w[offs+i] = 0;
        }
        // V is a "saturating" bias large enough to drive the softmax to ~1/0
        v = -(2*Math.Log(math.minrealnumber));
        k = (int)Math.Round(xy[0,nvars]);
        if( k==nclasses-1 )
        {
            // The observed class is the implicit reference class: push all
            // explicit class biases down
            for(i=0; i<=nclasses-2; i++)
            {
                lm.w[offs+i*(nvars+1)+nvars] = -v;
            }
        }
        else
        {
            // Push only the observed class's bias up
            for(i=0; i<=nclasses-2; i++)
            {
                if( i==k )
                {
                    lm.w[offs+i*(nvars+1)+nvars] = v;
                }
                else
                {
                    lm.w[offs+i*(nvars+1)+nvars] = 0;
                }
            }
        }
        return;
    }

    //
    // General case.
    // Prepare task and network. Allocate space.
    //
    mlpbase.mlpcreatec0(nvars, nclasses, network);
    mlpbase.mlpinitpreprocessor(network, xy, npoints);
    mlpbase.mlpproperties(network, ref nin, ref nout, ref wcount);
    // Small random initialization, scaled down by the number of inputs
    for(i=0; i<=wcount-1; i++)
    {
        network.weights[i] = (2*math.randomreal()-1)/nvars;
    }
    g = new double[wcount-1+1];
    h = new double[wcount-1+1, wcount-1+1];
    wbase = new double[wcount-1+1];
    wdir = new double[wcount-1+1];
    work = new double[wcount-1+1];

    //
    // First stage: optimize in gradient direction.
    // A fixed number of steepest-descent iterations brings the weights
    // close enough to the minimum for Newton iterations to converge.
    //
    for(k=0; k<=wcount/3+10; k++)
    {
        //
        // Calculate gradient in starting point
        // (error/gradient are regularized with the Decay term)
        //
        mlpbase.mlpgradnbatch(network, xy, npoints, ref e, ref g);
        v = 0.0;
        for(i_=0; i_<=wcount-1;i_++)
        {
            v += network.weights[i_]*network.weights[i_];
        }
        e = e+0.5*decay*v;
        for(i_=0; i_<=wcount-1;i_++)
        {
            g[i_] = g[i_] + decay*network.weights[i_];
        }
        rep.ngrad = rep.ngrad+1;

        //
        // Setup optimization scheme: search along normalized -G,
        // with initial step equal to |G|
        //
        for(i_=0; i_<=wcount-1;i_++)
        {
            wdir[i_] = -g[i_];
        }
        v = 0.0;
        for(i_=0; i_<=wcount-1;i_++)
        {
            v += wdir[i_]*wdir[i_];
        }
        wstep = Math.Sqrt(v);
        v = 1/Math.Sqrt(v);
        for(i_=0; i_<=wcount-1;i_++)
        {
            wdir[i_] = v*wdir[i_];
        }
        // Reverse-communication line search: MCStage!=0 means "evaluate
        // E/G at the updated weights and call again"
        mcstage = 0;
        mnlmcsrch(wcount, ref network.weights, ref e, ref g, wdir, ref wstep, ref mcinfo, ref mcnfev, ref work, mcstate, ref mcstage);
        while( mcstage!=0 )
        {
            mlpbase.mlpgradnbatch(network, xy, npoints, ref e, ref g);
            v = 0.0;
            for(i_=0; i_<=wcount-1;i_++)
            {
                v += network.weights[i_]*network.weights[i_];
            }
            e = e+0.5*decay*v;
            for(i_=0; i_<=wcount-1;i_++)
            {
                g[i_] = g[i_] + decay*network.weights[i_];
            }
            rep.ngrad = rep.ngrad+1;
            mnlmcsrch(wcount, ref network.weights, ref e, ref g, wdir, ref wstep, ref mcinfo, ref mcnfev, ref work, mcstate, ref mcstage);
        }
    }

    //
    // Second stage: use Hessian when we are close to the minimum
    //
    while( true )
    {
        //
        // Calculate and update E/G/H
        // (Decay regularizes the error, the gradient and the Hessian diagonal)
        //
        mlpbase.mlphessiannbatch(network, xy, npoints, ref e, ref g, ref h);
        v = 0.0;
        for(i_=0; i_<=wcount-1;i_++)
        {
            v += network.weights[i_]*network.weights[i_];
        }
        e = e+0.5*decay*v;
        for(i_=0; i_<=wcount-1;i_++)
        {
            g[i_] = g[i_] + decay*network.weights[i_];
        }
        for(k=0; k<=wcount-1; k++)
        {
            h[k,k] = h[k,k]+decay;
        }
        rep.nhess = rep.nhess+1;

        //
        // Select step direction
        // NOTE: it is important to use lower-triangle Cholesky
        // factorization since it is much faster than higher-triangle version.
        //
        // SPD is taken from SolverInfo (not from the Cholesky return value):
        // the solver reports whether the factorized system was solved with
        // acceptable conditioning.
        //
        spd = trfac.spdmatrixcholesky(ref h, wcount, false);
        densesolver.spdmatrixcholeskysolve(h, wcount, false, g, ref solverinfo, solverrep, ref wdir);
        spd = solverinfo>0;
        if( spd )
        {
            //
            // H is positive definite.
            // Step in Newton direction.
            //
            for(i_=0; i_<=wcount-1;i_++)
            {
                wdir[i_] = -1*wdir[i_];
            }
            spd = true;
        }
        else
        {
            //
            // H is indefinite.
            // Step in gradient direction.
            //
            for(i_=0; i_<=wcount-1;i_++)
            {
                wdir[i_] = -g[i_];
            }
            spd = false;
        }

        //
        // Optimize in WDir direction
        // (same normalize-then-line-search scheme as in the first stage)
        //
        v = 0.0;
        for(i_=0; i_<=wcount-1;i_++)
        {
            v += wdir[i_]*wdir[i_];
        }
        wstep = Math.Sqrt(v);
        v = 1/Math.Sqrt(v);
        for(i_=0; i_<=wcount-1;i_++)
        {
            wdir[i_] = v*wdir[i_];
        }
        mcstage = 0;
        mnlmcsrch(wcount, ref network.weights, ref e, ref g, wdir, ref wstep, ref mcinfo, ref mcnfev, ref work, mcstate, ref mcstage);
        while( mcstage!=0 )
        {
            mlpbase.mlpgradnbatch(network, xy, npoints, ref e, ref g);
            v = 0.0;
            for(i_=0; i_<=wcount-1;i_++)
            {
                v += network.weights[i_]*network.weights[i_];
            }
            e = e+0.5*decay*v;
            for(i_=0; i_<=wcount-1;i_++)
            {
                g[i_] = g[i_] + decay*network.weights[i_];
            }
            rep.ngrad = rep.ngrad+1;
            mnlmcsrch(wcount, ref network.weights, ref e, ref g, wdir, ref wstep, ref mcinfo, ref mcnfev, ref work, mcstate, ref mcstage);
        }
        // Stop only after a successful Newton step that terminated the line
        // search for a "converged enough" reason (MCInfo = 2, 4 or 6)
        if( spd && ((mcinfo==2 || mcinfo==4) || mcinfo==6) )
        {
            break;
        }
    }

    //
    // Convert from NN format to MNL format:
    // copy the raw network weights, then fold the preprocessor's per-column
    // mean/sigma normalization into the weights and biases so the model can
    // be applied to raw (unnormalized) inputs.
    //
    i1_ = (0) - (offs);
    for(i_=offs; i_<=offs+wcount-1;i_++)
    {
        lm.w[i_] = network.weights[i_+i1_];
    }
    for(k=0; k<=nvars-1; k++)
    {
        for(i=0; i<=nclasses-2; i++)
        {
            s = network.columnsigmas[k];
            if( (double)(s)==(double)(0) )
            {
                s = 1;
            }
            j = offs+(nvars+1)*i;
            v = lm.w[j+k];
            lm.w[j+k] = v/s;
            lm.w[j+nvars] = lm.w[j+nvars]+v*network.columnmeans[k]/s;
        }
    }
    // Flip the bias sign to match the MNL storage convention
    for(k=0; k<=nclasses-2; k++)
    {
        lm.w[offs+(nvars+1)*k+nvars] = -lm.w[offs+(nvars+1)*k+nvars];
    }
}
/*************************************************************************
THE  PURPOSE  OF  MCSRCH  IS  TO  FIND A STEP WHICH SATISFIES A SUFFICIENT
DECREASE CONDITION AND A CURVATURE CONDITION.

AT EACH STAGE THE SUBROUTINE  UPDATES  AN  INTERVAL  OF  UNCERTAINTY  WITH
ENDPOINTS  STX  AND  STY.  THE INTERVAL OF UNCERTAINTY IS INITIALLY CHOSEN
SO THAT IT CONTAINS A MINIMIZER OF THE MODIFIED FUNCTION

    F(X+STP*S) - F(X) - FTOL*STP*(GRADF(X)'S).

IF A STEP IS OBTAINED FOR WHICH THE MODIFIED FUNCTION HAS  A  NONPOSITIVE
FUNCTION  VALUE  AND  NONNEGATIVE  DERIVATIVE,  THEN  THE  INTERVAL  OF
UNCERTAINTY IS CHOSEN SO THAT IT CONTAINS A MINIMIZER OF F(X+STP*S).

THE  ALGORITHM  IS  DESIGNED TO FIND A STEP WHICH SATISFIES THE SUFFICIENT
DECREASE CONDITION

    F(X+STP*S) .LE. F(X) + FTOL*STP*(GRADF(X)'S),

AND THE CURVATURE CONDITION

    ABS(GRADF(X+STP*S)'S)) .LE. GTOL*ABS(GRADF(X)'S).

IF  FTOL  IS  LESS  THAN GTOL AND IF, FOR EXAMPLE, THE FUNCTION IS BOUNDED
BELOW,  THEN  THERE  IS  ALWAYS  A  STEP  WHICH SATISFIES BOTH CONDITIONS.
IF  NO  STEP  CAN BE FOUND WHICH SATISFIES BOTH CONDITIONS, THEN THE
ALGORITHM  USUALLY STOPS WHEN ROUNDING ERRORS PREVENT FURTHER PROGRESS.
IN THIS CASE STP ONLY SATISFIES THE SUFFICIENT DECREASE CONDITION.

PARAMETERS DESCRIPRION

N IS A POSITIVE INTEGER INPUT VARIABLE SET TO THE NUMBER OF VARIABLES.

X IS AN ARRAY OF LENGTH N. ON INPUT IT MUST CONTAIN THE BASE POINT FOR
THE LINE SEARCH. ON OUTPUT IT CONTAINS X+STP*S.

F IS A VARIABLE. ON INPUT IT MUST CONTAIN THE VALUE OF F AT X. ON OUTPUT
IT CONTAINS THE VALUE OF F AT X + STP*S.

G IS AN ARRAY OF LENGTH N. ON INPUT IT MUST CONTAIN THE GRADIENT OF F AT
X. ON OUTPUT IT CONTAINS THE GRADIENT OF F AT X + STP*S.

S IS AN INPUT ARRAY OF LENGTH N WHICH SPECIFIES THE SEARCH DIRECTION.

STP IS A NONNEGATIVE VARIABLE. ON INPUT STP CONTAINS AN INITIAL ESTIMATE
OF A SATISFACTORY STEP. ON OUTPUT STP CONTAINS THE FINAL ESTIMATE.

FTOL AND GTOL ARE NONNEGATIVE INPUT VARIABLES.  TERMINATION OCCURS WHEN
THE SUFFICIENT DECREASE CONDITION AND THE DIRECTIONAL DERIVATIVE
CONDITION ARE SATISFIED.

XTOL IS A NONNEGATIVE INPUT VARIABLE. TERMINATION OCCURS WHEN THE
RELATIVE WIDTH OF THE INTERVAL OF UNCERTAINTY IS AT MOST XTOL.

STPMIN AND STPMAX ARE NONNEGATIVE INPUT VARIABLES WHICH SPECIFY LOWER AND
UPPER BOUNDS FOR THE STEP.

MAXFEV IS A POSITIVE INTEGER INPUT VARIABLE. TERMINATION OCCURS WHEN THE
NUMBER OF CALLS TO FCN IS AT LEAST MAXFEV BY THE END OF AN ITERATION.

INFO IS AN INTEGER OUTPUT VARIABLE SET AS FOLLOWS:
    INFO = 0  IMPROPER INPUT PARAMETERS.
    INFO = 1  THE SUFFICIENT DECREASE CONDITION AND THE DIRECTIONAL
              DERIVATIVE CONDITION HOLD.
    INFO = 2  RELATIVE WIDTH OF THE INTERVAL OF UNCERTAINTY IS AT MOST
              XTOL.
    INFO = 3  NUMBER OF CALLS TO FCN HAS REACHED MAXFEV.
    INFO = 4  THE STEP IS AT THE LOWER BOUND STPMIN.
    INFO = 5  THE STEP IS AT THE UPPER BOUND STPMAX.
    INFO = 6  ROUNDING ERRORS PREVENT FURTHER PROGRESS. THERE MAY NOT BE
              A STEP WHICH SATISFIES THE SUFFICIENT DECREASE AND
              CURVATURE CONDITIONS. TOLERANCES MAY BE TOO SMALL.

NFEV IS AN INTEGER OUTPUT VARIABLE SET TO THE NUMBER OF CALLS TO FCN.

WA IS A WORK ARRAY OF LENGTH N.

NOTE(review): this is a reverse-communication port of MINPACK's MCSRCH.
Stage protocol: the caller sets Stage=0 and calls repeatedly; while the
routine returns with Stage=4 the caller must evaluate F and G at the
updated X and call again; Stage=0 on return means the search finished
(final status in Info). FTOL/GTOL/XTOL/STPMIN/STPMAX/MAXFEV are
class-level constants defined elsewhere in this file.

ARGONNE NATIONAL LABORATORY. MINPACK PROJECT. JUNE 1983
JORGE J. MORE', DAVID J. THUENTE
*************************************************************************/
private static void mnlmcsrch(int n, ref double[] x, ref double f, ref double[] g, double[] s, ref double stp, ref int info, ref int nfev, ref double[] wa, logitmcstate state, ref int stage)
{
    double v = 0;
    double p5 = 0;
    double p66 = 0;
    double zero = 0;
    int i_ = 0;

    //
    // init
    //
    p5 = 0.5;
    p66 = 0.66;
    state.xtrapf = 4.0;   // extrapolation factor for widening the step interval
    zero = 0;

    //
    // Main cycle: a state machine keyed on Stage.
    // Stage 0 -> start; Stage 2 -> validate inputs and initialize;
    // Stage 3 -> propose a trial step and return for F/G evaluation;
    // Stage 4 -> process the evaluated F/G and update the interval.
    //
    while( true )
    {
        if( stage==0 )
        {
            //
            // NEXT
            //
            stage = 2;
            continue;
        }
        if( stage==2 )
        {
            state.infoc = 1;
            info = 0;

            //
            // CHECK THE INPUT PARAMETERS FOR ERRORS.
            //
            if( ((((((n<=0 || (double)(stp)<=(double)(0)) || (double)(ftol)<(double)(0)) || (double)(gtol)<(double)(zero)) || (double)(xtol)<(double)(zero)) || (double)(stpmin)<(double)(zero)) || (double)(stpmax)<(double)(stpmin)) || maxfev<=0 )
            {
                stage = 0;
                return;
            }

            //
            // COMPUTE THE INITIAL GRADIENT IN THE SEARCH DIRECTION
            // AND CHECK THAT S IS A DESCENT DIRECTION.
            //
            v = 0.0;
            for(i_=0; i_<=n-1;i_++)
            {
                v += g[i_]*s[i_];
            }
            state.dginit = v;
            if( (double)(state.dginit)>=(double)(0) )
            {
                // Not a descent direction - abort with Info left at 0
                stage = 0;
                return;
            }

            //
            // INITIALIZE LOCAL VARIABLES.
            //
            state.brackt = false;
            state.stage1 = true;
            nfev = 0;
            state.finit = f;
            state.dgtest = ftol*state.dginit;
            state.width = stpmax-stpmin;
            state.width1 = state.width/p5;
            // WA keeps the base point X; trial points are WA + STP*S
            for(i_=0; i_<=n-1;i_++)
            {
                wa[i_] = x[i_];
            }

            //
            // THE VARIABLES STX, FX, DGX CONTAIN THE VALUES OF THE STEP,
            // FUNCTION, AND DIRECTIONAL DERIVATIVE AT THE BEST STEP.
            // THE VARIABLES STY, FY, DGY CONTAIN THE VALUE OF THE STEP,
            // FUNCTION, AND DERIVATIVE AT THE OTHER ENDPOINT OF
            // THE INTERVAL OF UNCERTAINTY.
            // THE VARIABLES STP, F, DG CONTAIN THE VALUES OF THE STEP,
            // FUNCTION, AND DERIVATIVE AT THE CURRENT STEP.
            //
            state.stx = 0;
            state.fx = state.finit;
            state.dgx = state.dginit;
            state.sty = 0;
            state.fy = state.finit;
            state.dgy = state.dginit;

            //
            // NEXT
            //
            stage = 3;
            continue;
        }
        if( stage==3 )
        {
            //
            // START OF ITERATION.
            //
            // SET THE MINIMUM AND MAXIMUM STEPS TO CORRESPOND
            // TO THE PRESENT INTERVAL OF UNCERTAINTY.
            //
            if( state.brackt )
            {
                if( (double)(state.stx)<(double)(state.sty) )
                {
                    state.stmin = state.stx;
                    state.stmax = state.sty;
                }
                else
                {
                    state.stmin = state.sty;
                    state.stmax = state.stx;
                }
            }
            else
            {
                // Minimum not yet bracketed: allow extrapolation beyond STP
                state.stmin = state.stx;
                state.stmax = stp+state.xtrapf*(stp-state.stx);
            }

            //
            // FORCE THE STEP TO BE WITHIN THE BOUNDS STPMAX AND STPMIN.
            //
            if( (double)(stp)>(double)(stpmax) )
            {
                stp = stpmax;
            }
            if( (double)(stp)<(double)(stpmin) )
            {
                stp = stpmin;
            }

            //
            // IF AN UNUSUAL TERMINATION IS TO OCCUR THEN LET
            // STP BE THE LOWEST POINT OBTAINED SO FAR.
            //
            if( (((state.brackt && ((double)(stp)<=(double)(state.stmin) || (double)(stp)>=(double)(state.stmax))) || nfev>=maxfev-1) || state.infoc==0) || (state.brackt && (double)(state.stmax-state.stmin)<=(double)(xtol*state.stmax)) )
            {
                stp = state.stx;
            }

            //
            // EVALUATE THE FUNCTION AND GRADIENT AT STP
            // AND COMPUTE THE DIRECTIONAL DERIVATIVE.
            //
            for(i_=0; i_<=n-1;i_++)
            {
                x[i_] = wa[i_];
            }
            for(i_=0; i_<=n-1;i_++)
            {
                x[i_] = x[i_] + stp*s[i_];
            }

            //
            // NEXT
            // (return to the caller so it can evaluate F and G at X)
            //
            stage = 4;
            return;
        }
        if( stage==4 )
        {
            info = 0;
            nfev = nfev+1;
            v = 0.0;
            for(i_=0; i_<=n-1;i_++)
            {
                v += g[i_]*s[i_];
            }
            state.dg = v;
            state.ftest1 = state.finit+stp*state.dgtest;

            //
            // TEST FOR CONVERGENCE.
            // The checks are ordered so that stronger conditions (lower Info
            // codes) overwrite weaker ones.
            //
            if( (state.brackt && ((double)(stp)<=(double)(state.stmin) || (double)(stp)>=(double)(state.stmax))) || state.infoc==0 )
            {
                info = 6;
            }
            if( ((double)(stp)==(double)(stpmax) && (double)(f)<=(double)(state.ftest1)) && (double)(state.dg)<=(double)(state.dgtest) )
            {
                info = 5;
            }
            if( (double)(stp)==(double)(stpmin) && ((double)(f)>(double)(state.ftest1) || (double)(state.dg)>=(double)(state.dgtest)) )
            {
                info = 4;
            }
            if( nfev>=maxfev )
            {
                info = 3;
            }
            if( state.brackt && (double)(state.stmax-state.stmin)<=(double)(xtol*state.stmax) )
            {
                info = 2;
            }
            if( (double)(f)<=(double)(state.ftest1) && (double)(Math.Abs(state.dg))<=(double)(-(gtol*state.dginit)) )
            {
                info = 1;
            }

            //
            // CHECK FOR TERMINATION.
            //
            if( info!=0 )
            {
                stage = 0;
                return;
            }

            //
            // IN THE FIRST STAGE WE SEEK A STEP FOR WHICH THE MODIFIED
            // FUNCTION HAS A NONPOSITIVE VALUE AND NONNEGATIVE DERIVATIVE.
            //
            if( (state.stage1 && (double)(f)<=(double)(state.ftest1)) && (double)(state.dg)>=(double)(Math.Min(ftol, gtol)*state.dginit) )
            {
                state.stage1 = false;
            }

            //
            // A MODIFIED FUNCTION IS USED TO PREDICT THE STEP ONLY IF
            // WE HAVE NOT OBTAINED A STEP FOR WHICH THE MODIFIED
            // FUNCTION HAS A NONPOSITIVE FUNCTION VALUE AND NONNEGATIVE
            // DERIVATIVE, AND IF A LOWER FUNCTION VALUE HAS BEEN
            // OBTAINED BUT THE DECREASE IS NOT SUFFICIENT.
            //
            if( (state.stage1 && (double)(f)<=(double)(state.fx)) && (double)(f)>(double)(state.ftest1) )
            {
                //
                // DEFINE THE MODIFIED FUNCTION AND DERIVATIVE VALUES.
                //
                state.fm = f-stp*state.dgtest;
                state.fxm = state.fx-state.stx*state.dgtest;
                state.fym = state.fy-state.sty*state.dgtest;
                state.dgm = state.dg-state.dgtest;
                state.dgxm = state.dgx-state.dgtest;
                state.dgym = state.dgy-state.dgtest;

                //
                // CALL CSTEP TO UPDATE THE INTERVAL OF UNCERTAINTY
                // AND TO COMPUTE THE NEW STEP.
                //
                mnlmcstep(ref state.stx, ref state.fxm, ref state.dgxm, ref state.sty, ref state.fym, ref state.dgym, ref stp, state.fm, state.dgm, ref state.brackt, state.stmin, state.stmax, ref state.infoc);

                //
                // RESET THE FUNCTION AND GRADIENT VALUES FOR F.
                //
                state.fx = state.fxm+state.stx*state.dgtest;
                state.fy = state.fym+state.sty*state.dgtest;
                state.dgx = state.dgxm+state.dgtest;
                state.dgy = state.dgym+state.dgtest;
            }
            else
            {
                //
                // CALL MCSTEP TO UPDATE THE INTERVAL OF UNCERTAINTY
                // AND TO COMPUTE THE NEW STEP.
                //
                mnlmcstep(ref state.stx, ref state.fx, ref state.dgx, ref state.sty, ref state.fy, ref state.dgy, ref stp, f, state.dg, ref state.brackt, state.stmin, state.stmax, ref state.infoc);
            }

            //
            // FORCE A SUFFICIENT DECREASE IN THE SIZE OF THE
            // INTERVAL OF UNCERTAINTY.
            //
            if( state.brackt )
            {
                // Bisect if the bracket did not shrink by at least ~1/3
                if( (double)(Math.Abs(state.sty-state.stx))>=(double)(p66*state.width1) )
                {
                    stp = state.stx+p5*(state.sty-state.stx);
                }
                state.width1 = state.width;
                state.width = Math.Abs(state.sty-state.stx);
            }

            //
            // NEXT.
            //
            stage = 3;
            continue;
        }
    }
}
public override alglib.apobject make_copy()
{
    // Return an independent duplicate of this line-search state: every
    // field is a value type, so a shallow member-wise copy is a deep copy.
    return new logitmcstate
    {
        brackt = brackt,
        stage1 = stage1,
        infoc = infoc,
        dg = dg,
        dgm = dgm,
        dginit = dginit,
        dgtest = dgtest,
        dgx = dgx,
        dgxm = dgxm,
        dgy = dgy,
        dgym = dgym,
        finit = finit,
        ftest1 = ftest1,
        fm = fm,
        fx = fx,
        fxm = fxm,
        fy = fy,
        fym = fym,
        stx = stx,
        sty = sty,
        stmin = stmin,
        stmax = stmax,
        width = width,
        width1 = width1,
        xtrapf = xtrapf
    };
}