public static bool testminlbfgs(bool silent)
{
    bool waserrors = false;
    bool referror = false;
    bool lin1error = false;
    bool lin2error = false;
    bool eqerror = false;
    bool converror = false;
    int n = 0;
    int m = 0;
    double[] x = new double[0];
    double[] xe = new double[0];
    double[] b = new double[0];
    int i = 0;
    int j = 0;
    double v = 0;
    double[,] a = new double[0, 0];
    lbfgs.lbfgsstate state = new lbfgs.lbfgsstate();
    lbfgs.lbfgsreport rep = new lbfgs.lbfgsreport();
    int i_ = 0;

    //
    // Reference problem:
    // f = (x0-2)^2 + x1^2 + (x2-x0)^2, minimum at (2,0,2)
    //
    x = new double[3];
    n = 3;
    m = 2;
    x[0] = 100 * AP.Math.RandomReal() - 50;
    x[1] = 100 * AP.Math.RandomReal() - 50;
    x[2] = 100 * AP.Math.RandomReal() - 50;
    lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0, 0.0, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = AP.Math.Sqr(state.x[0] - 2) + AP.Math.Sqr(state.x[1]) + AP.Math.Sqr(state.x[2] - state.x[0]);
        state.g[0] = 2 * (state.x[0] - 2) + 2 * (state.x[0] - state.x[2]);
        state.g[1] = 2 * state.x[1];
        state.g[2] = 2 * (state.x[2] - state.x[0]);
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    referror = rep.terminationtype <= 0 || Math.Abs(x[0] - 2) > 0.001 || Math.Abs(x[1]) > 0.001 || Math.Abs(x[2] - 2) > 0.001;

    //
    // 1D problem #1: f = -cos(x), minima at integer multiples of PI
    //
    x = new double[1];
    n = 1;
    m = 1;
    x[0] = 100 * AP.Math.RandomReal() - 50;
    lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0, 0.0, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = -Math.Cos(state.x[0]);
        state.g[0] = Math.Sin(state.x[0]);
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    lin1error = rep.terminationtype <= 0 || Math.Abs(x[0] / Math.PI - (int)Math.Round(x[0] / Math.PI)) > 0.001;

    //
    // 1D problem #2: f = x^2/(1+x^2), minimum at 0
    //
    x = new double[1];
    n = 1;
    m = 1;
    x[0] = 100 * AP.Math.RandomReal() - 50;
    lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0, 0.0, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = AP.Math.Sqr(state.x[0]) / (1 + AP.Math.Sqr(state.x[0]));
        state.g[0] = (2 * state.x[0] * (1 + AP.Math.Sqr(state.x[0])) - AP.Math.Sqr(state.x[0]) * 2 * state.x[0]) / AP.Math.Sqr(1 + AP.Math.Sqr(state.x[0]));
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    lin2error = rep.terminationtype <= 0 || Math.Abs(x[0]) > 0.001;

    //
    // Linear equations: minimize ||A*x - b||^2
    //
    eqerror = false;
    for (n = 1; n <= 10; n++)
    {
        //
        // Prepare task: random A, random solution XE, b = A*XE
        //
        a = new double[n, n];
        x = new double[n];
        xe = new double[n];
        b = new double[n];
        for (i = 0; i <= n - 1; i++) { xe[i] = 2 * AP.Math.RandomReal() - 1; }
        for (i = 0; i <= n - 1; i++)
        {
            for (j = 0; j <= n - 1; j++) { a[i, j] = 2 * AP.Math.RandomReal() - 1; }
        }
        for (i = 0; i <= n - 1; i++)
        {
            v = 0.0;
            for (i_ = 0; i_ <= n - 1; i_++) { v += a[i, i_] * xe[i_]; }
            b[i] = v;
        }

        //
        // Test different M
        //
        for (m = 1; m <= n; m++)
        {
            //
            // Solve task
            //
            for (i = 0; i <= n - 1; i++) { x[i] = 2 * AP.Math.RandomReal() - 1; }
            lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0, 0.0, 0, 0, ref state);
            while (lbfgs.minlbfgsiteration(ref state))
            {
                state.f = 0;
                for (i = 0; i <= n - 1; i++) { state.g[i] = 0; }
                for (i = 0; i <= n - 1; i++)
                {
                    v = 0.0;
                    for (i_ = 0; i_ <= n - 1; i_++) { v += a[i, i_] * state.x[i_]; }
                    state.f = state.f + AP.Math.Sqr(v - b[i]);
                    for (j = 0; j <= n - 1; j++) { state.g[j] = state.g[j] + 2 * (v - b[i]) * a[i, j]; }
                }
            }
            lbfgs.minlbfgsresults(ref state, ref x, ref rep);
            eqerror = eqerror || rep.terminationtype <= 0;
            for (i = 0; i <= n - 1; i++) { eqerror = eqerror || Math.Abs(x[i] - xe[i]) > 0.001; }
        }
    }

    //
    // Testing convergence properties:
    // f = (exp(x0)-2)^2 + x1^2 + (x2-x0)^2, minimum at (ln 2, 0, ln 2).
    // Each run enables exactly one stopping criterion and checks the
    // corresponding termination code.
    //
    converror = false;
    x = new double[3];
    n = 3;
    m = 2;

    // EpsG-based stopping: expect termination type 4
    for (i = 0; i <= 2; i++) { x[i] = 6 * AP.Math.RandomReal() - 3; }
    lbfgs.minlbfgs(n, m, ref x, 0.0001, 0.0, 0.0, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = AP.Math.Sqr(Math.Exp(state.x[0]) - 2) + AP.Math.Sqr(state.x[1]) + AP.Math.Sqr(state.x[2] - state.x[0]);
        state.g[0] = 2 * (Math.Exp(state.x[0]) - 2) * Math.Exp(state.x[0]) + 2 * (state.x[0] - state.x[2]);
        state.g[1] = 2 * state.x[1];
        state.g[2] = 2 * (state.x[2] - state.x[0]);
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    converror = converror || Math.Abs(x[0] - Math.Log(2)) > 0.05;
    converror = converror || Math.Abs(x[1]) > 0.05;
    converror = converror || Math.Abs(x[2] - Math.Log(2)) > 0.05;
    converror = converror || rep.terminationtype != 4;

    // EpsF-based stopping: expect termination type 1
    for (i = 0; i <= 2; i++) { x[i] = 6 * AP.Math.RandomReal() - 3; }
    lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0001, 0.0, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = AP.Math.Sqr(Math.Exp(state.x[0]) - 2) + AP.Math.Sqr(state.x[1]) + AP.Math.Sqr(state.x[2] - state.x[0]);
        state.g[0] = 2 * (Math.Exp(state.x[0]) - 2) * Math.Exp(state.x[0]) + 2 * (state.x[0] - state.x[2]);
        state.g[1] = 2 * state.x[1];
        state.g[2] = 2 * (state.x[2] - state.x[0]);
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    converror = converror || Math.Abs(x[0] - Math.Log(2)) > 0.05;
    converror = converror || Math.Abs(x[1]) > 0.05;
    converror = converror || Math.Abs(x[2] - Math.Log(2)) > 0.05;
    converror = converror || rep.terminationtype != 1;

    // EpsX-based stopping: expect termination type 2
    for (i = 0; i <= 2; i++) { x[i] = 6 * AP.Math.RandomReal() - 3; }
    lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0, 0.0001, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = AP.Math.Sqr(Math.Exp(state.x[0]) - 2) + AP.Math.Sqr(state.x[1]) + AP.Math.Sqr(state.x[2] - state.x[0]);
        state.g[0] = 2 * (Math.Exp(state.x[0]) - 2) * Math.Exp(state.x[0]) + 2 * (state.x[0] - state.x[2]);
        state.g[1] = 2 * state.x[1];
        state.g[2] = 2 * (state.x[2] - state.x[0]);
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    converror = converror || Math.Abs(x[0] - Math.Log(2)) > 0.05;
    converror = converror || Math.Abs(x[1]) > 0.05;
    converror = converror || Math.Abs(x[2] - Math.Log(2)) > 0.05;
    converror = converror || rep.terminationtype != 2;

    // MaxIts-based stopping: expect termination type 5 after exactly 10 iterations
    for (i = 0; i <= 2; i++) { x[i] = 2 * AP.Math.RandomReal() - 1; }
    lbfgs.minlbfgs(n, m, ref x, 0.0, 0.0, 0.0, 10, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        state.f = AP.Math.Sqr(Math.Exp(state.x[0]) - 2) + AP.Math.Sqr(state.x[1]) + AP.Math.Sqr(state.x[2] - state.x[0]);
        state.g[0] = 2 * (Math.Exp(state.x[0]) - 2) * Math.Exp(state.x[0]) + 2 * (state.x[0] - state.x[2]);
        state.g[1] = 2 * state.x[1];
        state.g[2] = 2 * (state.x[2] - state.x[0]);
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);
    converror = converror || rep.terminationtype != 5 || rep.iterationscount != 10;

    //
    // end
    //
    waserrors = referror || lin1error || lin2error || eqerror || converror;
    if (!silent)
    {
        System.Console.WriteLine("TESTING L-BFGS OPTIMIZATION");
        System.Console.WriteLine("REFERENCE PROBLEM: " + (referror ? "FAILED" : "OK"));
        System.Console.WriteLine("1-D PROBLEM #1: " + (lin1error ? "FAILED" : "OK"));
        System.Console.WriteLine("1-D PROBLEM #2: " + (lin2error ? "FAILED" : "OK"));
        System.Console.WriteLine("LINEAR EQUATIONS: " + (eqerror ? "FAILED" : "OK"));
        System.Console.WriteLine("CONVERGENCE PROPERTIES: " + (converror ? "FAILED" : "OK"));
        System.Console.WriteLine(waserrors ? "TEST FAILED" : "TEST PASSED");
        System.Console.WriteLine();
        System.Console.WriteLine();
    }
    return !waserrors;
}
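//
// Usage sketch (not part of the test suite): the reverse-communication
// protocol exercised above is the same one user code follows - create the
// state with minlbfgs(), then fill state.f and state.g for the current
// state.x on every minlbfgsiteration() call. The Rosenbrock objective,
// starting point and EpsG tolerance below are illustrative choices, not
// values mandated by the library.
//
public static void minlbfgsusageexample()
{
    double[] x = new double[2];
    lbfgs.lbfgsstate state = new lbfgs.lbfgsstate();
    lbfgs.lbfgsreport rep = new lbfgs.lbfgsreport();

    // classic starting point for Rosenbrock: f(x,y) = 100*(y-x^2)^2 + (1-x)^2
    x[0] = -1.2;
    x[1] = 1.0;

    // n=2 variables, m=2 correction pairs, stop when the gradient norm <= 1.0E-6
    lbfgs.minlbfgs(2, 2, ref x, 1.0E-6, 0.0, 0.0, 0, 0, ref state);
    while (lbfgs.minlbfgsiteration(ref state))
    {
        double t = state.x[1] - AP.Math.Sqr(state.x[0]);
        state.f = 100 * AP.Math.Sqr(t) + AP.Math.Sqr(1 - state.x[0]);
        state.g[0] = -(400 * t * state.x[0]) - 2 * (1 - state.x[0]);
        state.g[1] = 200 * t;
    }
    lbfgs.minlbfgsresults(ref state, ref x, ref rep);

    // minimum is at (1,1); terminationtype>0 signals successful termination
    System.Console.WriteLine("x=(" + x[0] + "," + x[1] + "), code=" + rep.terminationtype);
}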
/*************************************************************************
 * Neural network training using early stopping (base algorithm - L-BFGS with
 * regularization).
 *
 * INPUT PARAMETERS:
 *     Network  -   neural network with initialized geometry
 *     TrnXY    -   training set
 *     TrnSize  -   training set size
 *     ValXY    -   validation set
 *     ValSize  -   validation set size
 *     Decay    -   weight decay constant, >=0.001
 *                  Decay term 'Decay*||Weights||^2' is added to the error
 *                  function. If you don't know what Decay to choose, use 0.001.
 *     Restarts -   number of restarts from random position, >0.
 *                  If you don't know what Restarts to choose, use 2.
 *
 * OUTPUT PARAMETERS:
 *     Network  -   trained neural network.
 *     Info     -   return code:
 *                  * -2, if there is a point with class number
 *                        outside of [0..NOut-1].
 *                  * -1, if wrong parameters were specified
 *                        (TrnSize<=0, ValSize<=0, Restarts<1, Decay<0).
 *                  *  2, task has been solved, stopping criterion met -
 *                        sufficiently small step size. Not expected (we
 *                        use EARLY stopping) but possible and not an error.
 *                  *  6, task has been solved, stopping criterion met -
 *                        increase of the validation set error.
 *     Rep      -   training report
 *
 * NOTE:
 *
 * The algorithm stops if the validation set error increases for long enough
 * or if the step size becomes small enough (there are tasks where the
 * validation set error may decrease forever). In any case, the returned
 * solution corresponds to the minimum of the validation set error.
 *
 * -- ALGLIB --
 * Copyright 10.03.2009 by Bochkanov Sergey
 *************************************************************************/
public static void mlptraines(ref mlpbase.multilayerperceptron network, ref double[,] trnxy, int trnsize, ref double[,] valxy, int valsize, double decay, int restarts, ref int info, ref mlpreport rep)
{
    int i = 0;
    int pass = 0;
    int nin = 0;
    int nout = 0;
    int wcount = 0;
    double[] w = new double[0];
    double[] wbest = new double[0];
    double e = 0;
    double v = 0;
    double ebest = 0;
    double[] wfinal = new double[0];
    double efinal = 0;
    int itbest = 0;
    int itcnt = 0;
    lbfgs.lbfgsreport internalrep = new lbfgs.lbfgsreport();
    lbfgs.lbfgsstate state = new lbfgs.lbfgsstate();
    double wstep = 0.001;
    int i_ = 0;

    //
    // Test inputs, parse flags, read network geometry
    //
    if (trnsize <= 0 || valsize <= 0 || restarts < 1 || decay < 0)
    {
        info = -1;
        return;
    }
    mlpbase.mlpproperties(ref network, ref nin, ref nout, ref wcount);
    if (mlpbase.mlpissoftmax(ref network))
    {
        for (i = 0; i <= trnsize - 1; i++)
        {
            if ((int)Math.Round(trnxy[i, nin]) < 0 || (int)Math.Round(trnxy[i, nin]) >= nout)
            {
                info = -2;
                return;
            }
        }
        for (i = 0; i <= valsize - 1; i++)
        {
            if ((int)Math.Round(valxy[i, nin]) < 0 || (int)Math.Round(valxy[i, nin]) >= nout)
            {
                info = -2;
                return;
            }
        }
    }
    info = 2;

    //
    // Prepare
    //
    mlpbase.mlpinitpreprocessor(ref network, ref trnxy, trnsize);
    w = new double[wcount];
    wbest = new double[wcount];
    wfinal = new double[wcount];
    efinal = AP.Math.MaxRealNumber;
    for (i = 0; i <= wcount - 1; i++) { wfinal[i] = 0; }

    //
    // Multiple starts
    //
    rep.ncholesky = 0;
    rep.nhess = 0;
    rep.ngrad = 0;
    for (pass = 1; pass <= restarts; pass++)
    {
        //
        // Process
        //
        mlpbase.mlprandomize(ref network);
        ebest = mlpbase.mlperror(ref network, ref valxy, valsize);
        for (i_ = 0; i_ <= wcount - 1; i_++) { wbest[i_] = network.weights[i_]; }
        itbest = 0;
        itcnt = 0;
        for (i_ = 0; i_ <= wcount - 1; i_++) { w[i_] = network.weights[i_]; }
        lbfgs.minlbfgs(wcount, Math.Min(wcount, 50), ref w, 0.0, 0.0, wstep, 0, 0, ref state);
        while (lbfgs.minlbfgsiteration(ref state))
        {
            //
            // Calculate gradient of the regularized training error
            //
            for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = state.x[i_]; }
            mlpbase.mlpgradnbatch(ref network, ref trnxy, trnsize, ref state.f, ref state.g);
            v = 0.0;
            for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
            state.f = state.f + 0.5 * decay * v;
            for (i_ = 0; i_ <= wcount - 1; i_++) { state.g[i_] = state.g[i_] + decay * network.weights[i_]; }
            rep.ngrad = rep.ngrad + 1;

            //
            // Validation set: weights already hold the current iterate
            // (state.x), so evaluate it, track the best point seen, and
            // stop if the validation error has not improved for long enough
            //
            if (state.xupdated)
            {
                e = mlpbase.mlperror(ref network, ref valxy, valsize);
                if (e < ebest)
                {
                    ebest = e;
                    for (i_ = 0; i_ <= wcount - 1; i_++) { wbest[i_] = network.weights[i_]; }
                    itbest = itcnt;
                }
                if (itcnt > 30 && itcnt > 1.5 * itbest)
                {
                    info = 6;
                    break;
                }
                itcnt = itcnt + 1;
            }
        }
        lbfgs.minlbfgsresults(ref state, ref w, ref internalrep);

        //
        // Compare with final answer
        //
        if (ebest < efinal)
        {
            for (i_ = 0; i_ <= wcount - 1; i_++) { wfinal[i_] = wbest[i_]; }
            efinal = ebest;
        }
    }

    //
    // The best network
    //
    for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = wfinal[i_]; }
}
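//
// Usage sketch (illustrative, not part of the library): split a dataset into
// training and validation parts and train with early stopping. The 2/3-1/3
// split, the hidden layer size and the mlpcreate1 constructor from mlpbase
// are assumptions made for this example; it also assumes a regression
// network with row format [inputs..., outputs...].
//
public static void mlptrainesexample(ref double[,] xy, int npoints, int nin, int nout)
{
    mlpbase.multilayerperceptron network = new mlpbase.multilayerperceptron();
    mlpreport rep = new mlpreport();
    int info = 0;

    mlpbase.mlpcreate1(nin, 10, nout, ref network);   // one hidden layer of 10 neurons

    // 2/3 of the rows for training, the rest for validation
    int trnsize = 2 * npoints / 3;
    int valsize = npoints - trnsize;
    double[,] trnxy = new double[trnsize, nin + nout];
    double[,] valxy = new double[valsize, nin + nout];
    for (int i = 0; i < npoints; i++)
    {
        for (int j = 0; j < nin + nout; j++)
        {
            if (i < trnsize)
            {
                trnxy[i, j] = xy[i, j];
            }
            else
            {
                valxy[i - trnsize, j] = xy[i, j];
            }
        }
    }

    mlptraines(ref network, ref trnxy, trnsize, ref valxy, valsize, 0.001, 2, ref info, ref rep);
    // info=6: validation error started to grow (the expected outcome);
    // info=2: the inner step size became small first - also a valid solution.
}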
/*************************************************************************
 * Neural network training using modified Levenberg-Marquardt with exact
 * Hessian calculation and regularization. The subroutine trains the neural
 * network with restarts from random positions. The algorithm is well suited
 * for small and medium scale problems (hundreds of weights).
 *
 * INPUT PARAMETERS:
 *     Network  -   neural network with initialized geometry
 *     XY       -   training set
 *     NPoints  -   training set size
 *     Decay    -   weight decay constant, >=0.001
 *                  Decay term 'Decay*||Weights||^2' is added to the error
 *                  function. If you don't know what Decay to choose, use 0.001.
 *     Restarts -   number of restarts from random position, >0.
 *                  If you don't know what Restarts to choose, use 2.
 *
 * OUTPUT PARAMETERS:
 *     Network  -   trained neural network.
 *     Info     -   return code:
 *                  * -9, if the internal matrix inverse subroutine failed
 *                  * -2, if there is a point with class number
 *                        outside of [0..NOut-1].
 *                  * -1, if wrong parameters were specified
 *                        (NPoints<=0, Restarts<1).
 *                  *  2, if the task has been solved.
 *     Rep      -   training report
 *
 * -- ALGLIB --
 * Copyright 10.03.2009 by Bochkanov Sergey
 *************************************************************************/
public static void mlptrainlm(ref mlpbase.multilayerperceptron network, ref double[,] xy, int npoints, double decay, int restarts, ref int info, ref mlpreport rep)
{
    int nin = 0;
    int nout = 0;
    int wcount = 0;
    double lmsteptol = 0;
    int i = 0;
    int k = 0;
    double v = 0;
    double e = 0;
    double enew = 0;
    double xnorm2 = 0;
    double stepnorm = 0;
    double[] g = new double[0];
    double[,] h = new double[0, 0];
    double[,] hmod = new double[0, 0];
    bool spd = false;
    double nu = 0;
    double lambda = 0;
    double lambdaup = 0;
    double lambdadown = 0;
    lbfgs.lbfgsreport internalrep = new lbfgs.lbfgsreport();
    lbfgs.lbfgsstate state = new lbfgs.lbfgsstate();
    double[] wbase = new double[0];
    double[] wdir = new double[0];
    double[] wt = new double[0];
    int pass = 0;
    double[] wbest = new double[0];
    double ebest = 0;
    int i_ = 0;

    mlpbase.mlpproperties(ref network, ref nin, ref nout, ref wcount);
    lambdaup = 10;
    lambdadown = 0.3;
    lmsteptol = 0.001;

    //
    // Test inputs
    //
    if (npoints <= 0 || restarts < 1)
    {
        info = -1;
        return;
    }
    if (mlpbase.mlpissoftmax(ref network))
    {
        for (i = 0; i <= npoints - 1; i++)
        {
            if ((int)Math.Round(xy[i, nin]) < 0 || (int)Math.Round(xy[i, nin]) >= nout)
            {
                info = -2;
                return;
            }
        }
    }
    decay = Math.Max(decay, mindecay);
    info = 2;

    //
    // Initialize data
    //
    rep.ngrad = 0;
    rep.nhess = 0;
    rep.ncholesky = 0;

    //
    // General case.
    // Prepare task and network. Allocate space.
    //
    mlpbase.mlpinitpreprocessor(ref network, ref xy, npoints);
    g = new double[wcount];
    h = new double[wcount, wcount];
    hmod = new double[wcount, wcount];
    wbase = new double[wcount];
    wdir = new double[wcount];
    wbest = new double[wcount];
    wt = new double[wcount];
    ebest = AP.Math.MaxRealNumber;

    //
    // Multiple passes
    //
    for (pass = 1; pass <= restarts; pass++)
    {
        //
        // Initialize weights
        //
        mlpbase.mlprandomize(ref network);

        //
        // First stage of the hybrid algorithm: L-BFGS
        //
        for (i_ = 0; i_ <= wcount - 1; i_++) { wbase[i_] = network.weights[i_]; }
        lbfgs.minlbfgs(wcount, Math.Min(wcount, 5), ref wbase, 0.0, 0.0, 0.0, Math.Max(25, wcount), 0, ref state);
        while (lbfgs.minlbfgsiteration(ref state))
        {
            //
            // gradient
            //
            for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = state.x[i_]; }
            mlpbase.mlpgradbatch(ref network, ref xy, npoints, ref state.f, ref state.g);

            //
            // weight decay
            //
            v = 0.0;
            for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
            state.f = state.f + 0.5 * decay * v;
            for (i_ = 0; i_ <= wcount - 1; i_++) { state.g[i_] = state.g[i_] + decay * network.weights[i_]; }

            //
            // next iteration
            //
            rep.ngrad = rep.ngrad + 1;
        }
        lbfgs.minlbfgsresults(ref state, ref wbase, ref internalrep);
        for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = wbase[i_]; }

        //
        // Second stage of the hybrid algorithm: LM
        //
        // Initialize H with the exact (regularized) Hessian,
        // G with the gradient,
        // E with the regularized error.
        //
        mlpbase.mlphessianbatch(ref network, ref xy, npoints, ref e, ref g, ref h);
        v = 0.0;
        for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
        e = e + 0.5 * decay * v;
        for (i_ = 0; i_ <= wcount - 1; i_++) { g[i_] = g[i_] + decay * network.weights[i_]; }
        for (k = 0; k <= wcount - 1; k++) { h[k, k] = h[k, k] + decay; }
        rep.nhess = rep.nhess + 1;
        lambda = 0.001;
        nu = 2;
        while (true)
        {
            //
            // 1. HMod = H + lambda*I
            // 2. Try to solve (H + lambda*I)*dx = -g.
            //    Increase lambda if the left part is not positive definite.
            //
            for (i = 0; i <= wcount - 1; i++)
            {
                for (i_ = 0; i_ <= wcount - 1; i_++) { hmod[i, i_] = h[i, i_]; }
                hmod[i, i] = hmod[i, i] + lambda;
            }
            spd = cholesky.spdmatrixcholesky(ref hmod, wcount, true);
            rep.ncholesky = rep.ncholesky + 1;
            if (!spd)
            {
                lambda = lambda * lambdaup * nu;
                nu = nu * 2;
                continue;
            }
            if (!spdsolve.spdmatrixcholeskysolve(ref hmod, g, wcount, true, ref wdir))
            {
                lambda = lambda * lambdaup * nu;
                nu = nu * 2;
                continue;
            }
            for (i_ = 0; i_ <= wcount - 1; i_++) { wdir[i_] = -1 * wdir[i_]; }

            //
            // Lambda found:
            // 1. Take the step W := W + WDir
            // 2. Test the step-size stopping criterion
            // 3. If error(W+WDir) > error(W), increase lambda
            //
            for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = network.weights[i_] + wdir[i_]; }
            xnorm2 = 0.0;
            for (i_ = 0; i_ <= wcount - 1; i_++) { xnorm2 += network.weights[i_] * network.weights[i_]; }
            stepnorm = 0.0;
            for (i_ = 0; i_ <= wcount - 1; i_++) { stepnorm += wdir[i_] * wdir[i_]; }
            stepnorm = Math.Sqrt(stepnorm);
            enew = mlpbase.mlperror(ref network, ref xy, npoints) + 0.5 * decay * xnorm2;
            if (stepnorm < lmsteptol * (1 + Math.Sqrt(xnorm2)))
            {
                break;
            }
            if (enew > e)
            {
                lambda = lambda * lambdaup * nu;
                nu = nu * 2;
                continue;
            }

            //
            // Optimize using inv(cholesky(H)) as preconditioner
            //
            if (!trinverse.rmatrixtrinverse(ref hmod, wcount, true, false))
            {
                //
                // if the matrix can't be inverted, exit with an error
                // TODO: make WCount steps in direction suggested by HMod
                //
                info = -9;
                return;
            }
            for (i_ = 0; i_ <= wcount - 1; i_++) { wbase[i_] = network.weights[i_]; }
            for (i = 0; i <= wcount - 1; i++) { wt[i] = 0; }
            lbfgs.minlbfgs(wcount, wcount, ref wt, 0.0, 0.0, 0.0, 5, 0, ref state);
            while (lbfgs.minlbfgsiteration(ref state))
            {
                //
                // gradient: weights are WBase + U*t, where U is the
                // (inverted) upper Cholesky factor stored in HMod
                //
                for (i = 0; i <= wcount - 1; i++)
                {
                    v = 0.0;
                    for (i_ = i; i_ <= wcount - 1; i_++) { v += state.x[i_] * hmod[i, i_]; }
                    network.weights[i] = wbase[i] + v;
                }
                mlpbase.mlpgradbatch(ref network, ref xy, npoints, ref state.f, ref g);
                for (i = 0; i <= wcount - 1; i++) { state.g[i] = 0; }
                for (i = 0; i <= wcount - 1; i++)
                {
                    v = g[i];
                    for (i_ = i; i_ <= wcount - 1; i_++) { state.g[i_] = state.g[i_] + v * hmod[i, i_]; }
                }

                //
                // weight decay
                // grad(x'*x) = A'*(x0+A*t)
                //
                v = 0.0;
                for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
                state.f = state.f + 0.5 * decay * v;
                for (i = 0; i <= wcount - 1; i++)
                {
                    v = decay * network.weights[i];
                    for (i_ = i; i_ <= wcount - 1; i_++) { state.g[i_] = state.g[i_] + v * hmod[i, i_]; }
                }

                //
                // next iteration
                //
                rep.ngrad = rep.ngrad + 1;
            }
            lbfgs.minlbfgsresults(ref state, ref wt, ref internalrep);

            //
            // Accept the new position.
            // Calculate the Hessian.
            //
            for (i = 0; i <= wcount - 1; i++)
            {
                v = 0.0;
                for (i_ = i; i_ <= wcount - 1; i_++) { v += wt[i_] * hmod[i, i_]; }
                network.weights[i] = wbase[i] + v;
            }
            mlpbase.mlphessianbatch(ref network, ref xy, npoints, ref e, ref g, ref h);
            v = 0.0;
            for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
            e = e + 0.5 * decay * v;
            for (i_ = 0; i_ <= wcount - 1; i_++) { g[i_] = g[i_] + decay * network.weights[i_]; }
            for (k = 0; k <= wcount - 1; k++) { h[k, k] = h[k, k] + decay; }
            rep.nhess = rep.nhess + 1;

            //
            // Update lambda
            //
            lambda = lambda * lambdadown;
            nu = 2;
        }

        //
        // update WBest
        //
        v = 0.0;
        for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
        e = 0.5 * decay * v + mlpbase.mlperror(ref network, ref xy, npoints);
        if (e < ebest)
        {
            ebest = e;
            for (i_ = 0; i_ <= wcount - 1; i_++) { wbest[i_] = network.weights[i_]; }
        }
    }

    //
    // copy WBest to output
    //
    for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = wbest[i_]; }
}
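//
// Usage sketch (illustrative): the LM trainer builds an exact WCount*WCount
// Hessian and factorizes it, so memory grows quadratically and each Cholesky
// is cubic in the weight count - keep the network small. The geometry below
// and the mlpcreate1 constructor from mlpbase are assumptions made for the
// example.
//
public static void mlptrainlmexample(ref double[,] xy, int npoints, int nin, int nout)
{
    mlpbase.multilayerperceptron network = new mlpbase.multilayerperceptron();
    mlpreport rep = new mlpreport();
    int info = 0;

    mlpbase.mlpcreate1(nin, 5, nout, ref network);    // small hidden layer
    mlptrainlm(ref network, ref xy, npoints, 0.001, 2, ref info, ref rep);
    if (info == 2)
    {
        // rep.nhess and rep.ncholesky show how much second-order work was done
        System.Console.WriteLine("ngrad=" + rep.ngrad + " nhess=" + rep.nhess + " nchol=" + rep.ncholesky);
    }
}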
/*************************************************************************
 * Neural network training using the L-BFGS algorithm with regularization.
 * The subroutine trains the neural network with restarts from random
 * positions. The algorithm is well suited for problems of any
 * dimensionality (memory requirements and per-step complexity are linear
 * in the number of weights).
 *
 * INPUT PARAMETERS:
 *     Network  -   neural network with initialized geometry
 *     XY       -   training set
 *     NPoints  -   training set size
 *     Decay    -   weight decay constant, >=0.001
 *                  Decay term 'Decay*||Weights||^2' is added to the error
 *                  function. If you don't know what Decay to choose, use 0.001.
 *     Restarts -   number of restarts from random position, >0.
 *                  If you don't know what Restarts to choose, use 2.
 *     WStep    -   stopping criterion. Algorithm stops if the step size is
 *                  less than WStep. Recommended value - 0.01. Zero step
 *                  size means stopping after MaxIts iterations.
 *     MaxIts   -   stopping criterion. Algorithm stops after MaxIts
 *                  iterations (NOT gradient calculations). Zero MaxIts
 *                  means stopping when the step is sufficiently small.
 *
 * OUTPUT PARAMETERS:
 *     Network  -   trained neural network.
 *     Info     -   return code:
 *                  * -8, if both WStep=0 and MaxIts=0
 *                  * -2, if there is a point with class number
 *                        outside of [0..NOut-1].
 *                  * -1, if wrong parameters were specified
 *                        (NPoints<=0, Restarts<1, WStep<0, MaxIts<0).
 *                  *  2, if the task has been solved.
 *     Rep      -   training report
 *
 * -- ALGLIB --
 * Copyright 09.12.2007 by Bochkanov Sergey
 *************************************************************************/
public static void mlptrainlbfgs(ref mlpbase.multilayerperceptron network, ref double[,] xy, int npoints, double decay, int restarts, double wstep, int maxits, ref int info, ref mlpreport rep)
{
    int i = 0;
    int pass = 0;
    int nin = 0;
    int nout = 0;
    int wcount = 0;
    double[] w = new double[0];
    double[] wbest = new double[0];
    double e = 0;
    double v = 0;
    double ebest = 0;
    lbfgs.lbfgsreport internalrep = new lbfgs.lbfgsreport();
    lbfgs.lbfgsstate state = new lbfgs.lbfgsstate();
    int i_ = 0;

    //
    // Test inputs, parse flags, read network geometry
    //
    if (wstep == 0 && maxits == 0)
    {
        info = -8;
        return;
    }
    if (npoints <= 0 || restarts < 1 || wstep < 0 || maxits < 0)
    {
        info = -1;
        return;
    }
    mlpbase.mlpproperties(ref network, ref nin, ref nout, ref wcount);
    if (mlpbase.mlpissoftmax(ref network))
    {
        for (i = 0; i <= npoints - 1; i++)
        {
            if ((int)Math.Round(xy[i, nin]) < 0 || (int)Math.Round(xy[i, nin]) >= nout)
            {
                info = -2;
                return;
            }
        }
    }
    decay = Math.Max(decay, mindecay);
    info = 2;

    //
    // Prepare
    //
    mlpbase.mlpinitpreprocessor(ref network, ref xy, npoints);
    w = new double[wcount];
    wbest = new double[wcount];
    ebest = AP.Math.MaxRealNumber;

    //
    // Multiple starts
    //
    rep.ncholesky = 0;
    rep.nhess = 0;
    rep.ngrad = 0;
    for (pass = 1; pass <= restarts; pass++)
    {
        //
        // Process
        //
        mlpbase.mlprandomize(ref network);
        for (i_ = 0; i_ <= wcount - 1; i_++) { w[i_] = network.weights[i_]; }
        lbfgs.minlbfgs(wcount, Math.Min(wcount, 50), ref w, 0.0, 0.0, wstep, maxits, 0, ref state);
        while (lbfgs.minlbfgsiteration(ref state))
        {
            for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = state.x[i_]; }
            mlpbase.mlpgradnbatch(ref network, ref xy, npoints, ref state.f, ref state.g);
            v = 0.0;
            for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
            state.f = state.f + 0.5 * decay * v;
            for (i_ = 0; i_ <= wcount - 1; i_++) { state.g[i_] = state.g[i_] + decay * network.weights[i_]; }
            rep.ngrad = rep.ngrad + 1;
        }
        lbfgs.minlbfgsresults(ref state, ref w, ref internalrep);
        for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = w[i_]; }

        //
        // Compare with best
        //
        v = 0.0;
        for (i_ = 0; i_ <= wcount - 1; i_++) { v += network.weights[i_] * network.weights[i_]; }
        e = mlpbase.mlperrorn(ref network, ref xy, npoints) + 0.5 * decay * v;
        if (e < ebest)
        {
            for (i_ = 0; i_ <= wcount - 1; i_++) { wbest[i_] = network.weights[i_]; }
            ebest = e;
        }
    }

    //
    // The best network
    //
    for (i_ = 0; i_ <= wcount - 1; i_++) { network.weights[i_] = wbest[i_]; }
}
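//
// Usage sketch (illustrative): the L-BFGS trainer scales to large networks.
// The network geometry and the mlpcreate1 constructor from mlpbase are
// assumptions made for the example; WStep=0.01 is the value recommended in
// the comment above, and MaxIts=0 means "run until the step becomes small".
//
public static void mlptrainlbfgsexample(ref double[,] xy, int npoints, int nin, int nout)
{
    mlpbase.multilayerperceptron network = new mlpbase.multilayerperceptron();
    mlpreport rep = new mlpreport();
    int info = 0;

    mlpbase.mlpcreate1(nin, 25, nout, ref network);
    mlptrainlbfgs(ref network, ref xy, npoints, 0.001, 2, 0.01, 0, ref info, ref rep);
    if (info < 0)
    {
        System.Console.WriteLine("training failed, code " + info);
    }
}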