/// <summary>
/// Measures the quality of the given parameters as the accuracy reached on the indexed events.
/// </summary>
/// <param name="parameters">The model parameters to evaluate.</param>
/// <returns>
/// The number of correctly predicted events divided by the total number of events, where each
/// event is weighted by the number of times it was seen; <c>0</c> when no events were counted.
/// </returns>
public double Evaluate(double[] parameters) {
    var contexts = indexer.GetContexts();
    var values = indexer.Values;
    var nEventsSeen = indexer.GetNumTimesEventsSeen();
    var outcomeList = indexer.GetOutcomeList();

    // The outcome list is indexed by event (see outcomeList[ei] below), so its length is the
    // number of events. The number of distinct outcomes has to come from the outcome labels;
    // using the event count here would mis-size the probability buffer and make the model
    // evaluation read parameters for outcomes that do not exist.
    var nOutcomes = indexer.GetOutcomeLabels().Length;
    var nPredLabels = indexer.GetPredLabels().Length;

    var nCorrect = 0;
    var nTotalEvents = 0;

    for (var ei = 0; ei < contexts.Length; ei++) {
        var context = contexts[ei];
        var value = values == null ? null : values[ei];

        // QNModel.Eval accumulates into the buffer it is given, so every event needs a fresh zeroed one.
        var probs = new double[nOutcomes];
        QNModel.Eval(context, value, probs, nOutcomes, nPredLabels, parameters);

        var outcome = ArrayMath.MaxId(probs);
        if (outcome == outcomeList[ei]) {
            nCorrect += nEventsSeen[ei];
        }
        nTotalEvents += nEventsSeen[ei];
    }

    // Guard against a division by zero when there is nothing to evaluate.
    return nTotalEvents == 0 ? 0 : (double)nCorrect / nTotalEvents;
}
/// <summary>
/// Computes the negative log-likelihood at the given point.
/// </summary>
/// <param name="x">The point at which the function is evaluated.</param>
/// <returns>The negative log-likelihood at <paramref name="x"/>.</returns>
/// <exception cref="ArgumentException">
/// The length of <paramref name="x"/> is not equal to the domain dimension.
/// </exception>
public virtual double ValueAt(double[] x) {
    if (x.Length != dimension) {
        throw new ArgumentException("x is invalid, its dimension is not equal to domain dimension.", nameof(x));
    }

    var result = 0d;

    for (var eventIndex = 0; eventIndex < numContexts; eventIndex++) {
        // Fill the shared scratch buffer with one weighted sum per outcome for this event.
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            tempSums[outcome] = 0;

            var predicates = contexts[eventIndex];
            for (var p = 0; p < predicates.Length; p++) {
                var position = IndexOf(outcome, predicates[p]);
                // Without explicit values every predicate counts with weight 1.
                var weight = values != null ? values[eventIndex][p] : 1.0;
                tempSums[outcome] += weight * x[position];
            }
        }

        var normalizer = ArrayMath.LogSumOfExps(tempSums);
        var observed = outcomeList[eventIndex];

        result -= (tempSums[observed] - normalizer) * numTimesEventsSeen[eventIndex];
    }

    return result;
}
/// <summary>
/// Accumulates the negative log-likelihood contribution of a range of events into the slot
/// of <c>negLogLikelihoodThread</c> that belongs to the given thread index.
/// </summary>
/// <param name="threadIndex">Index of the result slot to write to.</param>
/// <param name="startIndex">Index of the first event to process.</param>
/// <param name="length">Number of events to process.</param>
/// <param name="x">The point at which the function is evaluated.</param>
private void NegLLCompute(int threadIndex, int startIndex, int length, double[] x) {
    negLogLikelihoodThread[threadIndex] = 0;

    // A private scratch buffer is used here instead of the shared tempSums field,
    // which cannot be used when running in parallel.
    var sums = new double[numOutcomes];

    var end = startIndex + length;
    for (var eventIndex = startIndex; eventIndex < end; eventIndex++) {
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            sums[outcome] = 0;

            var predicates = contexts[eventIndex];
            for (var p = 0; p < predicates.Length; p++) {
                var position = IndexOf(outcome, predicates[p]);
                // Without explicit values every predicate counts with weight 1.
                var weight = values != null ? values[eventIndex][p] : 1.0;
                sums[outcome] += weight * x[position];
            }
        }

        var normalizer = ArrayMath.LogSumOfExps(sums);
        var observed = outcomeList[eventIndex];

        negLogLikelihoodThread[threadIndex] -= (sums[observed] - normalizer) * numTimesEventsSeen[eventIndex];
    }
}
/// <summary>
/// L-BFGS two-loop recursion (see Nocedal and Wright 2006, Numerical Optimization, p. 178).
/// </summary>
/// <param name="direction">
/// On entry the (pseudo-)gradient; on exit it is overwritten in place with the negated result
/// of the two-loop recursion.
/// </param>
private void ComputeDirection(double[] direction) {
    var historySize = updateInfo.kCounter;
    var rho = updateInfo.rho;
    var alpha = updateInfo.alpha; // reused buffer, so it is not reallocated on every call
    var S = updateInfo.S;
    var Y = updateInfo.Y;

    // First loop: walk the stored updates from the last index down to the first.
    for (var i = historySize - 1; i >= 0; i--) {
        alpha[i] = rho[i] * ArrayMath.InnerProduct(S[i], direction);

        var y = Y[i];
        for (var j = 0; j < dimension; j++) {
            direction[j] -= alpha[i] * y[j];
        }
    }

    // Second loop: walk the stored updates from the first index up to the last.
    for (var i = 0; i < historySize; i++) {
        var beta = rho[i] * ArrayMath.InnerProduct(Y[i], direction);

        var s = S[i];
        for (var j = 0; j < dimension; j++) {
            direction[j] += s[j] * (alpha[i] - beta);
        }
    }

    // Flip the sign of every component.
    for (var j = 0; j < dimension; j++) {
        direction[j] = -direction[j];
    }
}
/// <summary>
/// Evaluates the wrapped function at the given point and, when the L2 cost is positive,
/// adds the L2 penalty <c>l2Cost * (x . x)</c>.
/// </summary>
/// <param name="x">The point at which the function is evaluated.</param>
/// <returns>The (possibly penalized) function value.</returns>
public double ValueAt(double[] x) {
    CheckDimension(x);

    var plain = func.ValueAt(x);

    // No penalty unless the L2 cost is strictly positive.
    if (!(l2Cost > 0)) {
        return plain;
    }

    return plain + l2Cost * ArrayMath.InnerProduct(x, x);
}
/// <summary>
/// Computes the gradient of the negative log-likelihood at the given point.
/// </summary>
/// <param name="x">The point at which the gradient is evaluated.</param>
/// <returns>
/// The gradient. The returned array is the instance's internal buffer, which is cleared and
/// refilled on every call.
/// </returns>
/// <exception cref="ArgumentException">
/// The length of <paramref name="x"/> is not equal to the domain dimension.
/// </exception>
public virtual double[] GradientAt(double[] x) {
    if (x.Length != dimension) {
        throw new ArgumentException("x is invalid, its dimension is not equal to domain dimension.", nameof(x));
    }

    // Start from a zeroed gradient buffer.
    Array.Clear(gradient, 0, gradient.Length);

    for (var eventIndex = 0; eventIndex < numContexts; eventIndex++) {
        // Step 1: one weighted sum per outcome.
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            expectation[outcome] = 0;

            var predicates = contexts[eventIndex];
            for (var p = 0; p < predicates.Length; p++) {
                var position = IndexOf(outcome, predicates[p]);
                var weight = values != null ? values[eventIndex][p] : 1.0;
                expectation[outcome] += weight * x[position];
            }
        }

        // Step 2: turn the sums into normalized values exp(sum - logSumOfExps).
        var normalizer = ArrayMath.LogSumOfExps(expectation);
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            expectation[outcome] = Math.Exp(expectation[outcome] - normalizer);
        }

        // Step 3: add (expected - empirical), weighted by predicate value and event count.
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            var empirical = outcomeList[eventIndex] == outcome ? 1 : 0;

            var predicates = contexts[eventIndex];
            for (var p = 0; p < predicates.Length; p++) {
                var position = IndexOf(outcome, predicates[p]);
                var weight = values != null ? values[eventIndex][p] : 1.0;
                gradient[position] += weight * (expectation[outcome] - empirical) * numTimesEventsSeen[eventIndex];
            }
        }
    }

    return gradient;
}
// Factor by which the step size is multiplied after a rejected step (must be from 0 to 1).
private static readonly double RHO = 0.5;

/// <summary>
/// Backtracking line search (see Nocedal and Wright 2006, Numerical Optimization, p. 37).
/// </summary>
/// <param name="function">The function to evaluate.</param>
/// <param name="direction">The search direction.</param>
/// <param name="lsr">The line search result; read as the starting state and updated with the outcome.</param>
/// <param name="initialStepSize">The first step size to try.</param>
public static void DoLineSearch(
    IFunction function,
    double[] direction,
    LineSearchResult lsr,
    double initialStepSize) {

    var step = initialStepSize;
    var evalCount = lsr.FctEvalCount;

    // The "next" state of the previous search is the starting point of this one.
    var x = lsr.NextPoint;
    var gradAtX = lsr.GradAtNext;
    var valueAtX = lsr.ValueAtNext;
    var dimension = x.Length;

    // The previous "current" arrays are reused as output buffers.
    var nextPoint = lsr.CurrPoint;
    var gradAtNextPoint = lsr.GradAtCurr;
    double valueAtNextPoint;

    // Loop-invariant part of the Armijo condition.
    var cachedProd = C * ArrayMath.InnerProduct(direction, gradAtX);

    bool accepted;
    do {
        // Candidate point for the current step size.
        for (var i = 0; i < dimension; i++) {
            nextPoint[i] = x[i] + direction[i] * step;
        }

        valueAtNextPoint = function.ValueAt(nextPoint);
        evalCount++;

        // Armijo condition.
        accepted = valueAtNextPoint <= valueAtX + cachedProd * step;
        if (!accepted) {
            step *= RHO;
        }
    } while (!accepted);

    // Compute and store the gradient at the accepted point.
    var newGradient = function.GradientAt(nextPoint);
    Array.Copy(newGradient, 0, gradAtNextPoint, 0, gradAtNextPoint.Length);

    lsr.SetAll(step, valueAtX, valueAtNextPoint, gradAtX, gradAtNextPoint, x, nextPoint, evalCount);
}
/// <summary>
/// Checks the stopping criteria against the latest line search result and, when a monitor
/// is attached, reports which criterion fired.
/// </summary>
/// <param name="lsr">The latest line search result.</param>
/// <returns><c>true</c> if any stopping criterion is met; otherwise <c>false</c>.</returns>
private bool IsConverged(LineSearchResult lsr) {
    // Criterion 1: change rate of the function value.
    if (lsr.FuncChangeRate < ConvergeTolerance) {
        if (monitor != null) {
            Display("Function change rate is smaller than the threshold " + ConvergeTolerance + ".\nTraining will stop.\n\n");
        }
        return true;
    }

    // Criterion 2: relative gradient norm, ||g(x)|| / max(1, ||x||) < threshold.
    var pointScale = Math.Max(1, ArrayMath.L2Norm(lsr.NextPoint));

    double gradientSize;
    if (l1Cost > 0) {
        // With L1 regularization the pseudo-gradient is used instead of the gradient.
        gradientSize = ArrayMath.L2Norm(lsr.PseudoGradAtNext);
    } else {
        gradientSize = ArrayMath.L2Norm(lsr.GradAtNext);
    }

    if (gradientSize / pointScale < RelGradNormTol) {
        if (monitor != null) {
            Display("Relative L2-norm of the gradient is smaller than the threshold " + RelGradNormTol + ".\nTraining will stop.\n\n");
        }
        return true;
    }

    // Criterion 3: step size.
    if (lsr.StepSize < MinStepSize) {
        if (monitor != null) {
            Display("Step size is smaller than the minimum step size " + MinStepSize + ".\nTraining will stop.\n\n");
        }
        return true;
    }

    // Criterion 4: number of function evaluations.
    if (lsr.FctEvalCount > maxFctEval) {
        if (monitor != null) {
            Display("Maximum number of function evaluations has exceeded the threshold " + maxFctEval + ".\nTraining will stop.\n\n");
        }
        return true;
    }

    return false;
}
/// <summary>
/// Accumulates the gradient contribution of a range of events into the buffer of
/// <c>gradientThread</c> that belongs to the given thread index.
/// </summary>
/// <param name="threadIndex">Index of the gradient buffer to write to.</param>
/// <param name="startIndex">Index of the first event to process.</param>
/// <param name="length">Number of events to process.</param>
/// <param name="x">The point at which the gradient is evaluated.</param>
private void GradientCompute(int threadIndex, int startIndex, int length, double[] x) {
    // Scratch buffer local to this call.
    var expected = new double[numOutcomes];

    // Start from a zeroed buffer for this thread index.
    var target = gradientThread[threadIndex];
    Array.Clear(target, 0, target.Length);

    var end = startIndex + length;
    for (var eventIndex = startIndex; eventIndex < end; eventIndex++) {
        // Step 1: one weighted sum per outcome.
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            expected[outcome] = 0;

            var predicates = contexts[eventIndex];
            for (var p = 0; p < predicates.Length; p++) {
                var position = IndexOf(outcome, predicates[p]);
                var weight = values != null ? values[eventIndex][p] : 1.0;
                expected[outcome] += weight * x[position];
            }
        }

        // Step 2: turn the sums into normalized values exp(sum - logSumOfExps).
        var normalizer = ArrayMath.LogSumOfExps(expected);
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            expected[outcome] = Math.Exp(expected[outcome] - normalizer);
        }

        // Step 3: add (expected - empirical), weighted by predicate value and event count.
        for (var outcome = 0; outcome < numOutcomes; outcome++) {
            var empirical = outcomeList[eventIndex] == outcome ? 1 : 0;

            var predicates = contexts[eventIndex];
            for (var p = 0; p < predicates.Length; p++) {
                var position = IndexOf(outcome, predicates[p]);
                var weight = values != null ? values[eventIndex][p] : 1.0;
                target[position] += weight * (expected[outcome] - empirical) * numTimesEventsSeen[eventIndex];
            }
        }
    }
}
/// <summary>
/// Model evaluation which should be used during training to report model accuracy.
/// </summary>
/// <param name="context">Indices of the predicates observed at the present decision point.</param>
/// <param name="values">Weights of the observed predicates, or <c>null</c> to weight every predicate with 1.</param>
/// <param name="probs">
/// Buffer for the outcome scores. Contributions are added to whatever it already contains,
/// and it is then normalized in place.
/// </param>
/// <param name="nOutcomes">The number of outcomes.</param>
/// <param name="nPredLabels">The number of unique predicates.</param>
/// <param name="parameters">The model parameters, addressed as <c>outcome * nPredLabels + predicate</c>.</param>
/// <returns>The <paramref name="probs"/> array holding the normalized values.</returns>
public static double[] Eval(int[] context, float[] values, double[] probs, int nOutcomes, int nPredLabels, double[] parameters) {
    // Add the weighted parameter of every observed predicate to every outcome score.
    for (var position = 0; position < context.Length; position++) {
        var predicate = context[position];
        var weight = values != null ? values[position] : 1d;

        for (var outcome = 0; outcome < nOutcomes; outcome++) {
            probs[outcome] += weight * parameters[outcome * nPredLabels + predicate];
        }
    }

    // Normalize in place: exp(score - logSumOfExps(scores)).
    var normalizer = ArrayMath.LogSumOfExps(probs);
    for (var outcome = 0; outcome < nOutcomes; outcome++) {
        probs[outcome] = Math.Exp(probs[outcome] - normalizer);
    }

    return probs;
}
/// <summary>
/// Model evaluation which should be used during inference.
/// </summary>
/// <param name="context">The predicates observed at the present decision point.</param>
/// <param name="values">Weights of the observed predicates, or <c>null</c> to weight every predicate with 1.</param>
/// <param name="probs">
/// Buffer for the outcome scores. Contributions are added to whatever it already contains,
/// and it is then normalized in place.
/// </param>
/// <returns>The <paramref name="probs"/> array holding the normalized values.</returns>
public double[] Eval(string[] context, float[] values, double[] probs) {
    var table = evalParameters.Parameters;

    for (var position = 0; position < context.Length; position++) {
        var predicate = GetPredIndex(context[position]);

        // Predicates with a negative index contribute nothing.
        if (predicate >= 0) {
            var weight = values != null ? values[position] : 1d;

            var entry = table[predicate];
            var outcomes = entry.Outcomes;
            var parameters = entry.Parameters;

            for (var k = 0; k < outcomes.Length; k++) {
                probs[outcomes[k]] += weight * parameters[k];
            }
        }
    }

    // Normalize in place: exp(score - logSumOfExps(scores)).
    var normalizer = ArrayMath.LogSumOfExps(probs);
    for (var outcome = 0; outcome < outcomeNames.Length; outcome++) {
        probs[outcome] = Math.Exp(probs[outcome] - normalizer);
    }

    return probs;
}
/// <summary>
/// Constrained line search (see section 3.2 in the paper "Scalable Training of
/// L1-Regularized Log-Linear Models", Andrew et al. 2007).
/// </summary>
/// <param name="function">The function to evaluate.</param>
/// <param name="direction">The search direction.</param>
/// <param name="lsr">The line search result; read as the starting state and updated with the outcome.</param>
/// <param name="l1Cost">The L1 cost.</param>
/// <param name="initialStepSize">The first step size to try.</param>
public static void DoConstrainedLineSearch(
    IFunction function,
    double[] direction,
    LineSearchResult lsr,
    double l1Cost,
    double initialStepSize) {

    var step = initialStepSize;
    var evalCount = lsr.FctEvalCount;

    // The "next" state of the previous search is the starting point of this one.
    var x = lsr.NextPoint;
    var signX = lsr.SignVector; // existing sign vector, overwritten below
    var gradAtX = lsr.GradAtNext;
    var pseudoGradAtX = lsr.PseudoGradAtNext;
    var valueAtX = lsr.ValueAtNext;
    var dimension = x.Length;

    // The previous "current" arrays are reused as output buffers.
    var nextPoint = lsr.CurrPoint;
    var gradAtNextPoint = lsr.GradAtCurr;
    double valueAtNextPoint;

    // New sign vector: the component itself, or the negated pseudo-gradient where the component is zero.
    for (var i = 0; i < dimension; i++) {
        if (x[i].Equals(0d)) {
            signX[i] = -pseudoGradAtX[i];
        } else {
            signX[i] = x[i];
        }
    }

    bool accepted;
    do {
        // Candidate point for the current step size.
        for (var i = 0; i < dimension; i++) {
            nextPoint[i] = x[i] + direction[i] * step;
        }

        // Projection: zero every component whose product with the sign vector is not positive.
        for (var i = 0; i < dimension; i++) {
            if (nextPoint[i] * signX[i] <= 0) {
                nextPoint[i] = 0;
            }
        }

        // Objective including the L1 penalty.
        valueAtNextPoint = function.ValueAt(nextPoint) + l1Cost * ArrayMath.L1Norm(nextPoint);
        evalCount++;

        // Inner product of the actual (projected) displacement with the pseudo-gradient.
        double dirGradientAtX = 0;
        for (var i = 0; i < dimension; i++) {
            dirGradientAtX += (nextPoint[i] - x[i]) * pseudoGradAtX[i];
        }

        // Sufficient decrease condition.
        accepted = valueAtNextPoint <= valueAtX + C * dirGradientAtX;
        if (!accepted) {
            step *= RHO;
        }
    } while (!accepted);

    // Compute and store the gradient at the accepted point.
    var newGradient = function.GradientAt(nextPoint);
    Array.Copy(newGradient, 0, gradAtNextPoint, 0, gradAtNextPoint.Length);

    lsr.SetAll(step, valueAtX, valueAtNextPoint, gradAtX, gradAtNextPoint, pseudoGradAtX, x, nextPoint, signX, evalCount);
}
/// <summary>
/// Finds the parameters that minimize the objective function.
/// </summary>
/// <param name="function">The objective function.</param>
/// <returns>
/// The minimizing parameters. The returned array is the final point held by the internal
/// line search result; no copy is made.
/// </returns>
/// <exception cref="OperationCanceledException">Occurs when the evaluation monitor cancels the operation.</exception>
public double[] Minimize(IFunction function) {
    var l2RegFunction = new L2RegFunction(function, l2Cost);
    dimension = l2RegFunction.Dimension;
    updateInfo = new UpdateInfo(updates, dimension);

    // Current point is at the origin
    var currPoint = new double[dimension];
    var currValue = l2RegFunction.ValueAt(currPoint);

    // Gradient at the current point (copied, because the function may hand out a buffer it owns)
    var currGrad = new double[dimension];
    Array.Copy(l2RegFunction.GradientAt(currPoint), 0, currGrad, 0, dimension);

    // Pseudo-gradient - only used when L1-regularization is enabled
    double[] pseudoGrad = null;
    if (l1Cost > 0) {
        currValue += l1Cost * ArrayMath.L1Norm(currPoint);
        pseudoGrad = new double[dimension];
        ComputePseudoGrad(currPoint, currGrad, pseudoGrad);
    }

    var lsr = l1Cost > 0
        ? LineSearchResult.GetInitialObjectForL1(currValue, currGrad, pseudoGrad, currPoint)
        : LineSearchResult.GetInitialObject(currValue, currGrad, currPoint);

    if (monitor != null) {
        Display("\nSolving convex optimization problem.");
        Display("\nObjective function has " + dimension + " variable(s).");
        Display("\n\nPerforming " + iterations + " iterations with " + "L1Cost=" + l1Cost + " and L2Cost=" + l2Cost + "\n");
    }

    var direction = new double[dimension];

    // Elapsed time is measured with a stopwatch rather than by subtracting wall-clock
    // timestamps, so the reported running time is unaffected by DST or clock adjustments.
    var stopwatch = System.Diagnostics.Stopwatch.StartNew();

    var token = monitor != null ? monitor.Token : CancellationToken.None;

    // Initial step size for the 1st iteration
    var initialStepSize = l1Cost > 0
        ? ArrayMath.InvL2Norm(lsr.PseudoGradAtNext)
        : ArrayMath.InvL2Norm(lsr.GradAtNext);

    for (var iteration = 1; iteration <= iterations; iteration++) {
        // Cancel if requested
        token.ThrowIfCancellationRequested();

        // Find direction, starting from the pseudo-gradient (L1) or the gradient
        Array.Copy(l1Cost > 0 ? lsr.PseudoGradAtNext : lsr.GradAtNext, 0, direction, 0, direction.Length);
        ComputeDirection(direction);

        // Line search
        if (l1Cost > 0) {
            // Constrain the search direction: drop every component whose product
            // with the pseudo-gradient is not negative.
            pseudoGrad = lsr.PseudoGradAtNext;
            for (var i = 0; i < dimension; i++) {
                if (direction[i] * pseudoGrad[i] >= 0) {
                    direction[i] = 0;
                }
            }

            LineSearch.DoConstrainedLineSearch(l2RegFunction, direction, lsr, l1Cost, initialStepSize);

            // Refresh the pseudo-gradient for the point the line search settled on
            ComputePseudoGrad(lsr.NextPoint, lsr.GradAtNext, pseudoGrad);
            lsr.PseudoGradAtNext = pseudoGrad;
        } else {
            LineSearch.DoLineSearch(l2RegFunction, direction, lsr, initialStepSize);
        }

        // Save Hessian updates
        updateInfo.Update(lsr);

        if (monitor != null) {
            if (iteration < 10) {
                Display(" " + iteration + ": ");
            } else if (iteration < 100) {
                Display(" " + iteration + ": ");
            } else {
                Display(iteration + ": ");
            }

            if (Evaluator != null) {
                Display("\t" + lsr.ValueAtNext + "\t" + lsr.FuncChangeRate + "\t" + Evaluator.Evaluate(lsr.NextPoint) + "\n");
            } else {
                Display("\t " + lsr.ValueAtNext + "\t" + lsr.FuncChangeRate + "\n");
            }
        }

        if (IsConverged(lsr)) {
            break;
        }

        // Every iteration after the first starts from the configured step size
        initialStepSize = InitialStepSize;
    }

    // Undo L2-shrinkage if Elastic Net is used (since in that case, the shrinkage is done twice).
    // The final point is rescaled in place, directly on the line search result's array.
    if (l1Cost > 0 && l2Cost > 0) {
        for (var i = 0; i < dimension; i++) {
            lsr.NextPoint[i] = Math.Sqrt(1 + l2Cost) * lsr.NextPoint[i];
        }
    }

    if (monitor != null) {
        Display("Running time: " + stopwatch.Elapsed.TotalSeconds + "s\n");
    }

    // Release memory
    updateInfo = null;

    // The line search result's own array is returned; no defensive copy is made.
    return lsr.NextPoint;
}