/* Secant variant of the Levenberg-Marquardt minimizer: no analytic jacobian
 * is required. J is approximated with forward (or central) finite differences
 * and kept current between full re-evaluations via Broyden rank-one updates
 * (see the "update jac" step inside the iteration loop).
 * Returns the number of iterations (>=0) if successful, -1 if failed
 * (i.e. termination reason 4: singular augmented normal equations).
 */
static public int LEVMAR_DIF(
      FitFunction func, /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
      double [] p,         /* I/O: initial parameter estimates. On output has the estimated solution */
      double [] x,         /* I: measurement vector */
 
      int itmax,          /* I: maximum number of iterations */
      double[] opts,    /* I: opts[0-4] = minim. options [\mu, \epsilon1, \epsilon2, \epsilon3, \delta]. Respectively the
                       * scale factor for initial \mu, stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2 and
                       * the step used in difference approximation to the jacobian. Set to NULL for defaults to be used.
                       * If \delta<0, the jacobian is approximated with central differences which are more accurate
                       * (but slower!) compared to the forward differences employed by default. 
                       */
      double[] info,
      /* O: information regarding the minimization. Set to NULL if don't care
                      * info[0]= ||e||_2 at initial p.
                      * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
                      * info[5]= # iterations,
                      * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
                      *                                 2 - stopped by small Dp
                      *                                 3 - stopped by itmax
                      *                                 4 - singular matrix. Restart from current p with increased mu 
                      *                                 5 - no further error reduction is possible. Restart with increased mu
                      *                                 6 - stopped by small ||e||_2
                      * info[7]= # function evaluations
                      * info[8]= # jacobian evaluations
                      */
      ref object workingmemory,     /* working memory, allocate if NULL */
      double[] covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
      object adata)       /* pointer to possibly additional data, passed uninterpreted to func.
                      * Set to NULL if not needed
                      */
    {

      int m = p.Length;              /* I: parameter vector dimension (i.e. #unknowns) */
      int n = x.Length;              /* I: measurement vector dimension */
      int i, j, k, l;

      bool issolved;
      /* temp work arrays */
      double[] e;          /* nx1 */
      double[] hx;         /* \hat{x}_i, nx1 */

      double[] jacTe;      /* J^T e_i mx1 */

      double[] jac;        /* nxm */

      double[] jacTjac;    /* mxm */

      double[] Dp;         /* mx1 */

      double[] diag_jacTjac;   /* diagonal of J^T J, mx1 */

      double[] pDp;        /* p + Dp, mx1 */

      double[] wrk;        /* nx1 */

      bool using_ffdif=true;
      double[] wrk2=null; /* nx1, used for differentiating with central differences only */

      double mu,  /* damping constant */
        tmp; /* mainly used in matrix & vector multiplications */
      double p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */
      double p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
      double tau, eps1, eps2, eps2_sq, eps3, delta;
      double init_p_eL2;
      /* K = max(m, 10): at most K consecutive Broyden updates are applied to
       * the jacobian approximation before a fresh finite-difference jacobian
       * is forced (see the updjac==K condition below) */
      int nu, nu2, stop, nfev, njap=0, K=(m>=10)? m: 10, updjac;
      bool updp=true;
      bool  newjac;
      int nm=n*m;

      mu=jacTe_inf=p_L2=0.0; /* -Wall */
      stop = updjac = 0;  newjac = false; /* -Wall */

      if(n<m)
      {
        throw new ArithmeticException(string.Format("Cannot solve a problem with fewer measurements {0} than unknowns {1}", n, m));
      }

      if(opts!=null)
      {
        tau=opts[0];
        eps1=opts[1];
        eps2=opts[2];
        eps2_sq=opts[2]*opts[2];
        eps3=opts[3];
        delta=opts[4];
        if(delta<0.0)
        {
          delta=-delta; /* make positive */
          using_ffdif=false; /* use central differencing */
          wrk2 = new double[n];
        }
      }
      else
      { // use default values
        tau=LM_INIT_MU;
        eps1=LM_STOP_THRESH;
        eps2=LM_STOP_THRESH;
        eps2_sq=LM_STOP_THRESH*LM_STOP_THRESH;
        eps3=LM_STOP_THRESH;
        delta=LM_DIFF_DELTA;
      }

      /* reuse caller-supplied working memory when possible, allocate otherwise */
      WorkArrays work = workingmemory as WorkArrays;
      if(null==work)
      {
        work = new WorkArrays(n, m);
        workingmemory = work;
      }

      /* set up work arrays */
      e=work.e;
      hx = work.hx;
      jacTe = work.jacTe;
      jac = work.jac;
      jacTjac = work.jacTjac;
      Dp = work.Dp;
      diag_jacTjac = work.diag_jacTjac;
      pDp = work.pDp;
      wrk = work.wrk;
  
      /* compute e=x - f(p) and its L2 norm */
      func(p, hx, adata); nfev=1;
      for(i=0, p_eL2=0.0; i<n; ++i)
      {
        e[i]=tmp=x[i]-hx[i];
        p_eL2+=tmp*tmp;
      }
      init_p_eL2=p_eL2;

      /* a fresh jacobian is computed when (updp && nu>16), so nu=20 forces a
       * finite-difference jacobian on the very first iteration */
      nu=20; /* force computation of J */

      for(k=0; k<itmax; ++k)
      {
        /* Note that p and e have been updated at a previous iteration */

        if(p_eL2<=eps3)
        { /* error is small */
          stop=6;
          break;
        }

        /* Compute the jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
         * The symmetry of J^T J is again exploited for speed
         */

        if((updp && nu>16) || updjac==K)
        { /* compute difference approximation to J */
          if(using_ffdif)
          { /* use forward differences */
            FDIF_FORW_JAC_APPROX(func, p, hx, wrk, delta, jac, m, n, adata);
            ++njap; nfev+=m;
          }
          else
          { /* use central differences */
            FDIF_CENT_JAC_APPROX(func, p, wrk, wrk2, delta, jac, m, n, adata);
            ++njap; nfev+=2*m;
          }
          nu=2; updjac=0; updp=false; newjac=true;
        }

        if(newjac)
        { /* jacobian has changed, recompute J^T J, J^t e, etc */
          newjac=false;

          /* J^T J, J^T e */
          if(nm<=__BLOCKSZ__SQ)
          { // this is a small problem
            /* This is the straightforward way to compute J^T J, J^T e. However, due to
             * its noncontinuous memory access pattern, it incures many cache misses when
             * applied to large minimization problems (i.e. problems involving a large
             * number of free variables and measurements), in which J is too large to
             * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
             * is preferable.
             *
             * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
             * performance problem.
             *
             * On the other hand, the straightforward algorithm is faster on small
             * problems since in this case it avoids the overheads of blocking. 
             */
      
            for(i=0; i<m; ++i)
            {
              for(j=i; j<m; ++j)
              {
                int lm;

                for(l=0, tmp=0.0; l<n; ++l)
                {
                  lm=l*m;
                  tmp+=jac[lm+i]*jac[lm+j];
                }

                jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
              }

              /* J^T e */
              for(l=0, tmp=0.0; l<n; ++l)
                tmp+=jac[l*m+i]*e[l];
              jacTe[i]=tmp;
            }
          }
          else
          { // this is a large problem
            /* Cache efficient computation of J^T J based on blocking
             */
            TRANS_MAT_MAT_MULT(jac, jacTjac, n, m, __BLOCKSZ__,null);

            /* cache efficient computation of J^T e */
            for(i=0; i<m; ++i)
              jacTe[i]=0.0;

            for(i=0; i<n; ++i)
            {
              int jacrow;

              for(l=0, jacrow=i*m, tmp=e[i]; l<m; ++l)
                jacTe[l]+=jac[l+jacrow]*tmp;
            }
          }
      
          /* Compute ||J^T e||_inf and ||p||^2 */
          for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i)
          {
            if(jacTe_inf < (tmp=Math.Abs(jacTe[i]))) jacTe_inf=tmp;

            diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
            p_L2+=p[i]*p[i];
          }
          //p_L2=sqrt(p_L2);
        }

#if false
if(!(k%100)){
  printf("Current estimate: ");
  for(i=0; i<m; ++i)
    printf("%.9g ", p[i]);
  printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
}
#endif

        /* check for convergence */
        if((jacTe_inf <= eps1))
        {
          Dp_L2=0.0; /* no increment for p in this case */
          stop=1;
          break;
        }

        /* compute initial damping factor */
        if(k==0)
        {
          for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
            if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
          mu=tau*tmp;
        }

        /* determine increment using adaptive damping */

        /* augment normal equations */
        for(i=0; i<m; ++i)
          jacTjac[i*m+i]+=mu;

        /* solve augmented equations */
#if HAVE_LAPACK
    /* 5 alternatives are available: LU, Cholesky, 2 variants of QR decomposition and SVD.
     * Cholesky is the fastest but might be inaccurate; QR is slower but more accurate;
     * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
     */

    issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m);
    //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m);
    //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m);
    //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m);
    //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m);
#else
        /* use the LU included with levmar */
        issolved=0!=AX_EQ_B_LU(jacTjac, jacTe, Dp, m);
#endif // HAVE_LAPACK 

        if(issolved)
        {
          /* compute p's new estimate and ||Dp||^2 */
          for(i=0, Dp_L2=0.0; i<m; ++i)
          {
            pDp[i]=p[i] + (tmp=Dp[i]);
            Dp_L2+=tmp*tmp;
          }
          //Dp_L2=sqrt(Dp_L2);

          if(Dp_L2<=eps2_sq*p_L2)
          { /* relative change in p is small, stop */
            //if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */
            stop=2;
            break;
          }

          if(Dp_L2>=(p_L2+eps2)/(EPSILON*EPSILON))
          { /* almost singular */
            //if(Dp_L2>=(p_L2+eps2)/CNST(EPSILON)){ /* almost singular */
            stop=4;
            break;
          }

          func(pDp, wrk, adata); ++nfev; /* evaluate function at p + Dp */
          for(i=0, pDp_eL2=0.0; i<n; ++i)
          { /* compute ||e(pDp)||_2 */
            tmp=x[i]-wrk[i];
            pDp_eL2+=tmp*tmp;
          }

          dF=p_eL2-pDp_eL2;
          /* Broyden's rank-one secant update of the jacobian approximation:
           * J += ((f(p+Dp) - f(p) - J*Dp) / ||Dp||^2) * Dp^T.
           * Applied whenever p was updated or the error decreased, so that J
           * stays a secant approximation without extra function evaluations. */
          if(updp || dF>0)
          { /* update jac */
            for(i=0; i<n; ++i)
            {
              for(l=0, tmp=0.0; l<m; ++l)
                tmp+=jac[i*m+l]*Dp[l]; /* (J * Dp)[i] */
              tmp=(wrk[i] - hx[i] - tmp)/Dp_L2; /* (f(p+dp)[i] - f(p)[i] - (J * Dp)[i])/(dp^T*dp) */
              for(j=0; j<m; ++j)
                jac[i*m+j]+=tmp*Dp[j];
            }
            ++updjac;
            newjac=true;
          }

          for(i=0, dL=0.0; i<m; ++i)
            dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);

          if(dL>0.0 && dF>0.0)
          { /* reduction in error, increment is accepted */
            /* Nielsen's adaptive damping: mu *= max(1/3, factor).
             * NOTE(review): as written, the factor evaluates to
             * 1-(2*dF/dL-1)^7, while LEVMAR_DER below uses the classical
             * 1-(2*dF/dL-1)^3 -- confirm against the reference levmar
             * implementation that the 7th power is intended here. */
            dF=(2.0*dF/dL-1.0);
            tmp=dF*dF*dF;
            tmp=1.0-tmp*tmp*dF;
            mu=mu*( (tmp>=ONE_THIRD)? tmp : ONE_THIRD );
            nu=2;

            for(i=0 ; i<m; ++i) /* update p's estimate */
              p[i]=pDp[i];

            for(i=0; i<n; ++i)
            { /* update e, hx and ||e||_2 */
              e[i]=x[i]-wrk[i];
              hx[i]=wrk[i];
            }
            p_eL2=pDp_eL2;
            updp=true;
            continue;
          }
        }

        /* if this point is reached, either the linear system could not be solved or
         * the error did not reduce; in any case, the increment must be rejected
         */

        mu*=nu;
        nu2=nu<<1; // 2*nu;
        if(nu2<=nu)
        { /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
          stop=5;
          break;
        }
        nu=nu2;

        for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
          jacTjac[i*m+i]=diag_jacTjac[i];
      }

      if(k>=itmax) stop=3;

      for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
        jacTjac[i*m+i]=diag_jacTjac[i];

      if(info!=null)
      {
        info[0]=init_p_eL2;
        info[1]=p_eL2;
        info[2]=jacTe_inf;
        info[3]=Dp_L2;
        for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
          if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
        info[4]=mu/tmp;
        info[5]=(double)k;
        info[6]=(double)stop;
        info[7]=(double)nfev;
        info[8]=(double)njap;
      }

      /* covariance matrix */
      if(covar!=null)
      {
        LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
      }

      return (stop!=4)?  k : -1;
    }
    /* 
 * This function seeks the parameter vector p that best describes the measurements vector x.
 * More precisely, given a vector function  func : R^m --> R^n with n>=m,
 * it finds p s.t. func(p) ~= x, i.e. the squared second order (i.e. L2) norm of
 * e=x-func(p) is minimized.
 *
 * This function requires an analytic jacobian. In case the latter is unavailable,
 * use LEVMAR_DIF() above
 *
 * Returns the number of iterations (>=0) if successful, -1 if failed
 *
 * For more details, see H.B. Nielsen's (http://www.imm.dtu.dk/~hbn) IMM/DTU
 * tutorial at http://www.imm.dtu.dk/courses/02611/nllsq.pdf
 */

    /* Levenberg-Marquardt minimization with an analytic jacobian supplied by
     * jacf. When a weights vector is given, the residuals are scaled per
     * measurement, i.e. the quantity minimized is ||W (x - func(p))||^2.
     * Returns the number of iterations (>=0) if successful, -1 if failed
     * (i.e. termination reason 4: singular augmented normal equations).
     */
    public static int LEVMAR_DER(
      FitFunction  func, /* functional relation describing measurements. A p \in R^m yields a \hat{x} \in  R^n */
      JacobianFunction jacf,  /* function to evaluate the jacobian \part x / \part p */ 
      double []p,         /* I/O: initial parameter estimates. On output has the estimated solution */
      double []x,         /* I: measurement vector */
      double []weights,   /* vector of the weights used to scale the fit differences (length n), can be null */
         
      int itmax,          /* I: maximum number of iterations */
      double[] opts,    /* I: minim. options [\mu, \epsilon1, \epsilon2, \epsilon3]. Respectively the scale factor for initial \mu,
                       * stopping thresholds for ||J^T e||_inf, ||Dp||_2 and ||e||_2. Set to NULL for defaults to be used
                       */
      double[] info,
      /* O: information regarding the minimization. Set to NULL if don't care
                      * info[0]= ||e||_2 at initial p.
                      * info[1-4]=[ ||e||_2, ||J^T e||_inf,  ||Dp||_2, mu/max[J^T J]_ii ], all computed at estimated p.
                      * info[5]= # iterations,
                      * info[6]=reason for terminating: 1 - stopped by small gradient J^T e
                      *                                 2 - stopped by small Dp
                      *                                 3 - stopped by itmax
                      *                                 4 - singular matrix. Restart from current p with increased mu 
                      *                                 5 - no further error reduction is possible. Restart with increased mu
                      *                                 6 - stopped by small ||e||_2
                      * info[7]= # function evaluations
                      * info[8]= # jacobian evaluations
                      */
      ref object workingmemory,     /* working memory, allocate if NULL */
      double[] covar,    /* O: Covariance matrix corresponding to LS solution; mxm. Set to NULL if not needed. */
      object adata)       /* pointer to possibly additional data, passed uninterpreted to func & jacf.
                      * Set to NULL if not needed
                      */
    {
      int m = p.Length;  /* I: parameter vector dimension (i.e. #unknowns) */
      int n = x.Length;  /* I: measurement vector dimension */

      int i, j, k, l;
      int issolved;
      /* temp work arrays */
      double[] e,          /* nx1 */
        hx,         /* \hat{x}_i, nx1 */
        jacTe,      /* J^T e_i mx1 */
        jac,        /* nxm */
        jacTjac,    /* mxm */
        Dp,         /* mx1 */
        diag_jacTjac,   /* diagonal of J^T J, mx1 */
        pDp;        /* p + Dp, mx1 */

      double mu,  /* damping constant */
        tmp; /* mainly used in matrix & vector multiplications */
      double p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */
      double p_L2, Dp_L2=LM_REAL_MAX, dF, dL;
      double tau, eps1, eps2, eps2_sq, eps3;
      double init_p_eL2;
      int nu=2, nu2, stop, nfev, njev=0;
      int nm=n*m;

      mu=jacTe_inf=0.0; /* -Wall */

      if(n<m)
      {
        throw new ArithmeticException(string.Format("Cannot solve a problem with fewer measurements {0} than unknowns {1}", n, m));
      }

      if(null==jacf)
      {
        throw new ArgumentException("No function specified for computing the jacobian. If no such function is available, use LEVMAR_DIF instead");
      }

      if(null!=opts)
      {
        tau=opts[0];
        eps1=opts[1];
        eps2=opts[2];
        eps2_sq=opts[2]*opts[2];
        eps3=opts[3];
      }
      else
      { // use default values
        tau=(LM_INIT_MU);
        eps1=(LM_STOP_THRESH);
        eps2=(LM_STOP_THRESH);
        eps2_sq=(LM_STOP_THRESH)*(LM_STOP_THRESH);
        eps3=(LM_STOP_THRESH);
      }

  

      /* reuse caller-supplied working memory when possible, allocate otherwise */
      WorkArrays work = workingmemory as WorkArrays;
      if(null==work)
      {
        work = new WorkArrays(n, m);
        workingmemory = work;
      }

      /* set up work arrays */
      e=work.e;
      hx = work.hx;
      jacTe = work.jacTe;
      jac = work.jac;
      jacTjac = work.jacTjac;
      Dp = work.Dp;
      diag_jacTjac = work.diag_jacTjac;
      pDp = work.pDp;
  

      /* compute e=x - f(p) (weighted, if requested) and its L2 norm */
      func(p, hx, adata); nfev=1;
      if (weights == null)
      {
        for (i = 0, p_eL2 = 0.0; i < n; ++i)
        {
          e[i] = tmp = x[i] - hx[i];
          p_eL2 += tmp * tmp;
        }
      }
      else
      {
        for (i = 0, p_eL2 = 0.0; i < n; ++i)
        {
          e[i] = tmp = (x[i] - hx[i])*weights[i];
          p_eL2 += tmp * tmp;
        }
      }
      init_p_eL2=p_eL2;

      for(k=stop=0; k<itmax && 0==stop; ++k)
      {
        /* Note that p and e have been updated at a previous iteration */

        if(p_eL2<=eps3)
        { /* error is small */
          stop=6;
          break;
        }

        /* Compute the jacobian J at p,  J^T J,  J^T e,  ||J^T e||_inf and ||p||^2.
         * Since J^T J is symmetric, its computation can be speeded up by computing
         * only its upper triangular part and copying it to the lower part
         */

        jacf(p, jac, adata); ++njev;

        /* J^T J, J^T e */
        if(nm<__BLOCKSZ__SQ)
        { // this is a small problem
          /* This is the straightforward way to compute J^T J, J^T e. However, due to
           * its noncontinuous memory access pattern, it incures many cache misses when
           * applied to large minimization problems (i.e. problems involving a large
           * number of free variables and measurements), in which J is too large to
           * fit in the L1 cache. For such problems, a cache-efficient blocking scheme
           * is preferable.
           *
           * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this
           * performance problem.
           *
           * On the other hand, the straightforward algorithm is faster on small
           * problems since in this case it avoids the overheads of blocking. 
           */

          for(i=0; i<m; ++i)
          {
            for(j=i; j<m; ++j)
            {
              int lm;

              if (weights == null)
              {
                for (l = 0, tmp = 0.0; l < n; ++l)
                {
                  lm = l * m;
                  tmp += jac[lm + i] * jac[lm + j];
                }
              }
              else
              {
                for (l = 0, tmp = 0.0; l < n; ++l)
                {
                  lm = l * m;
                  /* weighted normal matrix (WJ)^T(WJ): the weight belongs to
                   * measurement l (BUGFIX: was erroneously indexed by the
                   * parameter index i, which is out of sync with the n-length
                   * weights vector) */
                  tmp += jac[lm + i] * jac[lm + j] * weights[l] * weights[l];
                }
              }

              /* store tmp in the corresponding upper and lower part elements */
              jacTjac[i*m+j]=jacTjac[j*m+i]=tmp;
            }

            /* J^T e; e already holds the weighted residuals W*r, so one more
             * weight factor yields (WJ)^T (W r) = J^T W^2 r, consistent with
             * the weighted normal matrix computed above */
            if (weights == null)
            {
              for(l=0, tmp=0.0; l<n; ++l)
                tmp+=jac[l*m+i]*e[l];
            }
            else
            {
              for(l=0, tmp=0.0; l<n; ++l)
                tmp+=jac[l*m+i]*weights[l]*e[l];
            }
            jacTe[i]=tmp;
          }
        }
        else
        { // this is a large problem
          /* Cache efficient computation of J^T J based on blocking.
           * NOTE(review): assumed to apply weights[l]^2 per measurement row,
           * consistent with the small-problem branch above -- verify against
           * the implementation of TRANS_MAT_MAT_MULT.
           */
          TRANS_MAT_MAT_MULT(jac, jacTjac, n, m, __BLOCKSZ__,weights);

          /* cache efficient computation of J^T e */
          for(i=0; i<m; ++i)
            jacTe[i]=0.0;

          for(i=0; i<n; ++i)
          {
            int jacrow;

            /* here i indexes measurements, so weights[i] is the correct
             * per-measurement factor (e[i] is already weighted once) */
            for(l=0, jacrow=i*m, tmp=(weights==null)? e[i] : e[i]*weights[i]; l<m; ++l)
              jacTe[l]+=jac[jacrow+l]*tmp;
          }
        }

        /* Compute ||J^T e||_inf and ||p||^2 */
        for(i=0, p_L2=jacTe_inf=0.0; i<m; ++i)
        {
          if(jacTe_inf < (tmp=Math.Abs(jacTe[i]))) jacTe_inf=tmp;

          diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */
          p_L2+=p[i]*p[i];
        }
        //p_L2=sqrt(p_L2);

#if false
if(!(k%100)){
  printf("Current estimate: ");
  for(i=0; i<m; ++i)
    printf("%.9g ", p[i]);
  printf("-- errors %.9g %0.9g\n", jacTe_inf, p_eL2);
}
#endif

        /* check for convergence */
        if((jacTe_inf <= eps1))
        {
          Dp_L2=0.0; /* no increment for p in this case */
          stop=1;
          break;
        }

        /* compute initial damping factor */
        if(k==0)
        {
          for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
            if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */
          mu=tau*tmp;
        }

        /* determine increment using adaptive damping */
        while(true)
        {
          /* augment normal equations */
          for(i=0; i<m; ++i)
            jacTjac[i*m+i]+=mu;

          /* solve augmented equations */
#if HAVE_LAPACK
      /* 5 alternatives are available: LU, Cholesky, 2 variants of QR decomposition and SVD.
       * Cholesky is the fastest but might be inaccurate; QR is slower but more accurate;
       * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed
       */

      issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m);
      //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m);
      //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m);
      //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m);
      //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m);

#else
    
          /* use the LU included with levmar */
          issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m);
#endif // HAVE_LAPACK 

          if(0!=issolved)
          {
            /* compute p's new estimate and ||Dp||^2 */
            for(i=0, Dp_L2=0.0; i<m; ++i)
            {
              pDp[i]=p[i] + (tmp=Dp[i]);
              Dp_L2+=tmp*tmp;
            }
            //Dp_L2=sqrt(Dp_L2);

            if(Dp_L2<=eps2_sq*p_L2)
            { /* relative change in p is small, stop */
              //if(Dp_L2<=eps2*(p_L2 + eps2)){ /* relative change in p is small, stop */
              stop=2;
              break;
            }

            if(Dp_L2>=(p_L2+eps2)/((EPSILON)*(EPSILON)))
            { /* almost singular */
              //if(Dp_L2>=(p_L2+eps2)/CNST(EPSILON)){ /* almost singular */
              stop=4;
              break;
            }

            /* evaluate function at p + Dp; hx is reused in place to hold the
             * (weighted) residual vector e(pDp) afterwards */
            func(pDp, hx, adata); ++nfev;
            if (weights == null)
            {
              for (i = 0, pDp_eL2 = 0.0; i < n; ++i)
              { /* compute ||e(pDp)||_2 */
                hx[i] = tmp = x[i] - hx[i];
                pDp_eL2 += tmp * tmp;
              }
            }
            else // use weights
            {
              for (i = 0, pDp_eL2 = 0.0; i < n; ++i)
              { /* compute ||e(pDp)||_2 */
                hx[i] = tmp = (x[i] - hx[i])*weights[i];
                pDp_eL2 += tmp * tmp;
              }
            }

            /* predicted reduction dL = Dp^T (mu*Dp + J^T e) */
            for(i=0, dL=0.0; i<m; ++i)
              dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);

            dF=p_eL2-pDp_eL2; /* actual reduction of the squared error */

            if(dL>0.0 && dF>0.0)
            { /* reduction in error, increment is accepted */
              /* Nielsen's damping update: mu *= max(1/3, 1-(2*dF/dL-1)^3) */
              tmp=((2.0)*dF/dL-(1.0));
              tmp=(1.0)-tmp*tmp*tmp;
              mu=mu*( (tmp>=(ONE_THIRD))? tmp : (ONE_THIRD) );
              nu=2;

              for(i=0 ; i<m; ++i) /* update p's estimate */
                p[i]=pDp[i];

              for(i=0; i<n; ++i) /* update e and ||e||_2 */
                e[i]=hx[i];
              p_eL2=pDp_eL2;
              break;
            }
          }

          /* if this point is reached, either the linear system could not be solved or
           * the error did not reduce; in any case, the increment must be rejected
           */

          mu*=nu;
          nu2=nu<<1; // 2*nu;
          if(nu2<=nu)
          { /* nu has wrapped around (overflown). Thanks to Frank Jordan for spotting this case */
            stop=5;
            break;
          }
          nu=nu2;

          for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
            jacTjac[i*m+i]=diag_jacTjac[i];
        } /* inner loop */
      }

      if(k>=itmax) stop=3;

      for(i=0; i<m; ++i) /* restore diagonal J^T J entries */
        jacTjac[i*m+i]=diag_jacTjac[i];

      if(null!=info)
      {
        info[0]=init_p_eL2;
        info[1]=p_eL2;
        info[2]=jacTe_inf;
        info[3]=Dp_L2;
        for(i=0, tmp=LM_REAL_MIN; i<m; ++i)
          if(tmp<jacTjac[i*m+i]) tmp=jacTjac[i*m+i];
        info[4]=mu/tmp;
        info[5]=(double)k;
        info[6]=(double)stop;
        info[7]=(double)nfev;
        info[8]=(double)njev;
      }

      /* covariance matrix */
      if(null!=covar)
      {
        LEVMAR_COVAR(jacTjac, covar, p_eL2, m, n);
      }

      return (stop!=4)?  k : -1;
    }