/// <summary>
/// Checks, for every model in <paramref name="dataset"/>, that
/// <c>LogLikelihoodDifferentiableFunction.GetSummaryForInstance</c> agrees with the reference value from
/// <c>LogLikelihood</c> and the reference gradient from <c>DefinitionOfDerivative</c>.
/// </summary>
/// <param name="dataset">the models to evaluate</param>
/// <param name="weights">the weights at which the value and gradient are evaluated</param>
public virtual void TestGetSummaryForInstance(GraphicalModel[] dataset, ConcatVector weights)
{
    LogLikelihoodDifferentiableFunction fn = new LogLikelihoodDifferentiableFunction();
    foreach (GraphicalModel model in dataset)
    {
        double goldLogLikelihood = LogLikelihood(model, weights);
        ConcatVector goldGradient = DefinitionOfDerivative(model, weights);

        ConcatVector gradient = new ConcatVector(0);
        double logLikelihood = fn.GetSummaryForInstance(model, weights, gradient);

        // NUnit's argument order is (expected, actual, delta): the gold value is the expectation and the
        // Math.Max(...) term is the tolerance.
        // NOTE(review): goldLogLikelihood * 1.0e-2 is negative whenever the gold value is negative, in which
        // case the tolerance is always the 1.0e-3 floor - confirm whether Math.Abs was intended.
        NUnit.Framework.Assert.AreEqual(goldLogLikelihood, logLikelihood, Math.Max(1.0e-3, goldLogLikelihood * 1.0e-2));

        // Our check for gradient similarity involves distance between endpoints of vectors, instead of
        // elementwise similarity, b/c it can be controlled as a percentage
        ConcatVector difference = goldGradient.DeepClone();
        difference.AddVectorInPlace(gradient, -1);
        double distance = Math.Sqrt(difference.DotProduct(difference));

        // The tolerance here is pretty large, since the gold gradient is computed approximately
        // 5% still tells us whether everything is working or not though
        if (distance > 5.0e-2)
        {
            System.Console.Error.WriteLine("Definitional and calculated gradient differ!");
            System.Console.Error.WriteLine("Definition approx: " + goldGradient);
            System.Console.Error.WriteLine("Calculated: " + gradient);
        }

        // The distance must be within 5.0e-2 of zero (expected = 0.0, actual = distance, delta = 5.0e-2).
        NUnit.Framework.Assert.AreEqual(0.0, distance, 5.0e-2);
    }
}
/*
 * @Theory
 * public void testOptimizeLogLikelihoodWithConstraints(AbstractBatchOptimizer optimizer,
 *     @ForAll(sampleSize = 5) @From(LogLikelihoodFunctionTest.GraphicalModelDatasetGenerator.class) GraphicalModel[] dataset,
 *     @ForAll(sampleSize = 2) @From(LogLikelihoodFunctionTest.WeightsGenerator.class) ConcatVector initialWeights,
 *     @ForAll(sampleSize = 2) @InRange(minDouble = 0.0, maxDouble = 5.0) double l2regularization) throws Exception {
 *   Random r = new Random(42);
 *
 *   int constraintComponent = r.nextInt(initialWeights.getNumberOfComponents());
 *   double constraintValue = r.nextDouble();
 *
 *   if (r.nextBoolean()) {
 *     optimizer.addSparseConstraint(constraintComponent, 0, constraintValue);
 *   } else {
 *     optimizer.addDenseConstraint(constraintComponent, new double[]{constraintValue});
 *   }
 *
 *   // Put in some constraints
 *
 *   AbstractDifferentiableFunction<GraphicalModel> ll = new LogLikelihoodDifferentiableFunction();
 *   ConcatVector finalWeights = optimizer.optimize(dataset, ll, initialWeights, l2regularization, 1.0e-9, false);
 *   System.err.println("Finished optimizing");
 *
 *   assertEquals(constraintValue, finalWeights.getValueAt(constraintComponent, 0), 1.0e-9);
 *
 *   double logLikelihood = getValueSum(dataset, finalWeights, ll, l2regularization);
 *
 *   // Check in a whole bunch of random directions really nearby that there is no nearby point with a higher log
 *   // likelihood
 *   for (int i = 0; i < 1000; i++) {
 *     int size = finalWeights.getNumberOfComponents();
 *     ConcatVector randomDirection = new ConcatVector(size);
 *     for (int j = 0; j < size; j++) {
 *       if (j == constraintComponent) continue;
 *       double[] dense = new double[finalWeights.isComponentSparse(j) ? finalWeights.getSparseIndex(j) + 1 : finalWeights.getDenseComponent(j).length];
 *       for (int k = 0; k < dense.length; k++) {
 *         dense[k] = (r.nextDouble() - 0.5) * 1.0e-3;
 *       }
 *       randomDirection.setDenseComponent(j, dense);
 *     }
 *
 *     ConcatVector randomPerturbation = finalWeights.deepClone();
 *     randomPerturbation.addVectorInPlace(randomDirection, 1.0);
 *
 *     double randomPerturbedLogLikelihood = getValueSum(dataset, randomPerturbation, ll, l2regularization);
 *
 *     // Check that we're within a very small margin of error (around 3 decimal places) of the randomly
 *     // discovered value
 *
 *     if (logLikelihood < randomPerturbedLogLikelihood - (1.0e-3 * Math.max(1.0, Math.abs(logLikelihood)))) {
 *       System.err.println("Thought optimal point was: " + logLikelihood);
 *       System.err.println("Discovered better point: " + randomPerturbedLogLikelihood);
 *     }
 *
 *     assertTrue(logLikelihood >= randomPerturbedLogLikelihood - (1.0e-3 * Math.max(1.0, Math.abs(logLikelihood))));
 *   }
 * }
 */

/// <summary>
/// Averages <c>fn.GetSummaryForInstance</c> over every element of <paramref name="dataset"/> and subtracts
/// the L2 penalty, i.e. <paramref name="l2regularization"/> times the dot product of
/// <paramref name="weights"/> with itself.
/// </summary>
/// <param name="dataset">the instances to evaluate</param>
/// <param name="weights">the weights at which each instance is evaluated</param>
/// <param name="fn">the function providing the per-instance value</param>
/// <param name="l2regularization">multiplier applied to the squared weights term</param>
/// <returns>the mean per-instance value minus the regularization term</returns>
private double GetValueSum<T>(T[] dataset, ConcatVector weights, AbstractDifferentiableFunction<T> fn, double l2regularization)
{
    double total = 0.0;
    for (int idx = 0; idx < dataset.Length; idx++)
    {
        // Only the returned value is used here; the gradient goes into a throwaway vector.
        ConcatVector scratchGradient = new ConcatVector(0);
        total += fn.GetSummaryForInstance(dataset[idx], weights, scratchGradient);
    }

    double mean = total / dataset.Length;
    double penalty = weights.DotProduct(weights) * l2regularization;
    return mean - penalty;
}
/// <summary>
/// Runs the optimization loop: repeatedly computes the averaged log-likelihood and gradient over
/// <c>dataset</c>, applies L2 regularization and the enclosing optimizer's constraints, and hands the result
/// to <c>UpdateWeights</c>, until the derivative norm drops below <c>convergenceDerivativeNorm</c>,
/// <c>UpdateWeights</c> reports convergence, or <c>isFinished</c> is set.
/// </summary>
/// <remarks>
/// When <c>useThreads</c> is set, the data is split into one queue per available processor, balanced by
/// <c>EstimateRelativeRuntime</c>, and after each round some items are moved from the slowest worker's queue
/// to the fastest worker's queue. If <c>isFinished</c> is observed mid-round the method returns immediately,
/// without notifying <c>naturalTerminationBarrier</c>.
/// </remarks>
public virtual void Run()
{
    // Multithreading stuff
    int numThreads = Math.Max(1, Runtime.GetRuntime().AvailableProcessors());
    // Allocate the generic array directly: casting a non-generic IList[] to IList<T>[] fails at runtime,
    // because the array's runtime element type would still be IList.
    IList<T>[] queues = new IList<T>[numThreads];
    Random r = new Random();

    // Allocate work to make estimated cost of work per thread as even as possible
    if (this.useThreads)
    {
        for (int q = 0; q < numThreads; q++)
        {
            queues[q] = new List<T>();
        }
        int[] queueEstimatedTotalCost = new int[numThreads];
        foreach (T item in this.dataset)
        {
            int itemEstimatedCost = this.EstimateRelativeRuntime(item);
            // Greedy assignment: each item goes to the queue with the lowest estimated total so far.
            int minCostQueue = 0;
            for (int q = 0; q < numThreads; q++)
            {
                if (queueEstimatedTotalCost[q] < queueEstimatedTotalCost[minCostQueue])
                {
                    minCostQueue = q;
                }
            }
            queueEstimatedTotalCost[minCostQueue] += itemEstimatedCost;
            queues[minCostQueue].Add(item);
        }
    }

    while (!this.isFinished)
    {
        // Collect log-likelihood and derivatives
        long startTime = Runtime.CurrentTimeMillis();
        long threadWaiting = 0;
        ConcatVector derivative = this.weights.NewEmptyClone();
        double logLikelihood = 0.0;

        if (this.useThreads)
        {
            AbstractBatchOptimizer.GradientWorker[] workers = new AbstractBatchOptimizer.GradientWorker[numThreads];
            Thread[] threads = new Thread[numThreads];
            for (int w = 0; w < workers.Length; w++)
            {
                workers[w] = new AbstractBatchOptimizer.GradientWorker(this, w, numThreads, queues[w], this.fn, this.weights);
                threads[w] = new Thread(workers[w]);
                workers[w].jvmThreadId = threads[w].GetId();
                threads[w].Start();
            }

            // This is for logging
            long minFinishTime = long.MaxValue;
            long maxFinishTime = long.MinValue;
            // This is for re-balancing
            long minCPUTime = long.MaxValue;
            long maxCPUTime = long.MinValue;
            int slowestWorker = 0;
            int fastestWorker = 0;

            for (int w = 0; w < workers.Length; w++)
            {
                try
                {
                    threads[w].Join();
                }
                catch (Exception e)
                {
                    throw new RuntimeInterruptedException(e);
                }

                logLikelihood += workers[w].localLogLikelihood;
                derivative.AddVectorInPlace(workers[w].localDerivative, 1.0);

                if (workers[w].finishedAtTime < minFinishTime)
                {
                    minFinishTime = workers[w].finishedAtTime;
                }
                if (workers[w].finishedAtTime > maxFinishTime)
                {
                    maxFinishTime = workers[w].finishedAtTime;
                }
                if (workers[w].cpuTimeRequired < minCPUTime)
                {
                    fastestWorker = w;
                    minCPUTime = workers[w].cpuTimeRequired;
                }
                if (workers[w].cpuTimeRequired > maxCPUTime)
                {
                    slowestWorker = w;
                    maxCPUTime = workers[w].cpuTimeRequired;
                }
            }
            threadWaiting = maxFinishTime - minFinishTime;

            // Try to reallocate work dynamically to minimize waiting on subsequent rounds.
            // Skipped when maxCPUTime is not positive: the ratio below would be 0/0 (NaN), and converting
            // NaN to int is unspecified in C#.
            if (maxCPUTime > 0)
            {
                // Figure out the percentage of work represented by the waiting
                double waitingPercentage = (double)(maxCPUTime - minCPUTime) / (double)maxCPUTime;
                int needTransferItems = (int)Math.Floor(queues[slowestWorker].Count * waitingPercentage * 0.5);
                for (int moved = 0; moved < needTransferItems; moved++)
                {
                    int toTransfer = r.NextInt(queues[slowestWorker].Count);
                    T transferred = queues[slowestWorker][toTransfer];
                    // toTransfer is a position, so remove by index; Remove(...) would remove by value.
                    queues[slowestWorker].RemoveAt(toTransfer);
                    queues[fastestWorker].Add(transferred);
                }
            }

            // Check for user interrupt
            if (this.isFinished)
            {
                return;
            }
        }
        else
        {
            foreach (T item in this.dataset)
            {
                System.Diagnostics.Debug.Assert((item != null));
                logLikelihood += this.fn.GetSummaryForInstance(item, this.weights, derivative);
                // Check for user interrupt
                if (this.isFinished)
                {
                    return;
                }
            }
        }

        // Average both the value and the gradient over the dataset so they stay consistent.
        // NOTE(review): assumes MapInPlace accepts a double -> double delegate - confirm against ConcatVector.
        int datasetSize = this.dataset.Length;
        logLikelihood /= datasetSize;
        derivative.MapInPlace(d => d / datasetSize);

        long gradientComputationTime = Runtime.CurrentTimeMillis() - startTime;

        // Regularization
        logLikelihood = logLikelihood - (this.l2regularization * this.weights.DotProduct(this.weights));
        derivative.AddVectorInPlace(this.weights, -2 * this.l2regularization);

        // Zero out the derivative on the components we're holding fixed
        foreach (AbstractBatchOptimizer.Constraint constraint in this._enclosing.constraints)
        {
            constraint.ApplyToDerivative(derivative);
        }

        // If our derivative is sufficiently small, we've converged
        // (the quantity compared is the dot product of the derivative with itself)
        double derivativeNorm = derivative.DotProduct(derivative);
        if (derivativeNorm < this.convergenceDerivativeNorm)
        {
            if (!this.quiet)
            {
                AbstractBatchOptimizer.log.Info("Derivative norm " + derivativeNorm + " < " + this.convergenceDerivativeNorm + ": quitting");
            }
            break;
        }

        // Do the actual computation
        if (!this.quiet)
        {
            AbstractBatchOptimizer.log.Info("[" + gradientComputationTime + " ms, threads waiting " + threadWaiting + " ms]");
        }
        bool converged = this._enclosing.UpdateWeights(this.weights, derivative, logLikelihood, this.optimizationState, this.quiet);

        // Apply constraints to the weights vector
        foreach (AbstractBatchOptimizer.Constraint constraint in this._enclosing.constraints)
        {
            constraint.ApplyToWeights(this.weights);
        }

        if (converged)
        {
            break;
        }
    }

    // Reached only on natural termination (loop exit), not on the early returns above.
    lock (this.naturalTerminationBarrier)
    {
        Sharpen.Runtime.NotifyAll(this.naturalTerminationBarrier);
    }
    this.isFinished = true;
}
// This sets a gold observation for a model to use as training gold data

/// <summary>
/// Gets the log-likelihood of a single model at a point, and adds that model's gradient contribution to
/// <paramref name="gradient"/>.
/// </summary>
/// <remarks>
/// <para>
/// For each variable of each factor, the assignment used is the value returned by
/// <c>GetDeterministicAssignment</c> on the variable's marginal when that is not -1; otherwise it is the
/// training observation read from the variable's metadata under
/// <c>LogLikelihoodDifferentiableFunction.VariableTrainingValue</c>.
/// </para>
/// <para>
/// The gradient receives the feature vector of each factor's chosen assignment, minus the feature vectors
/// of all assignments weighted by their joint marginal probability.
/// </para>
/// </remarks>
/// <param name="model">the model to find the log-likelihood of</param>
/// <param name="weights">the weights to use</param>
/// <param name="gradient">the vector that this model's gradient contribution is added to, in place</param>
/// <returns>
/// the log-likelihood at that point, or 0.0 (with <paramref name="gradient"/> untouched) when the log of the
/// partition function is infinite
/// </returns>
public override double GetSummaryForInstance(GraphicalModel model, ConcatVector weights, ConcatVector gradient)
{
    double logLikelihood = 0.0;
    CliqueTree.MarginalResult result = new CliqueTree(model, weights).CalculateMarginals();

    // Subtract log partition function
    logLikelihood -= Math.Log(result.partitionFunction);

    // Quit if we have an infinite partition function. This is checked before any vectors are cached so
    // that the early exit cannot leave the feature tables cached.
    if (double.IsInfinite(logLikelihood))
    {
        return 0.0;
    }

    // Cache everything in preparation for multiple redundant requests for feature vectors
    foreach (GraphicalModel.Factor factor in model.factors)
    {
        factor.featuresTable.CacheVectors();
    }

    try
    {
        // Add the determined assignment by training values
        foreach (GraphicalModel.Factor factor in model.factors)
        {
            // Find the assignment, taking both fixed and training observed variables into account
            int[] assignment = new int[factor.neigborIndices.Length];
            for (int i = 0; i < assignment.Length; i++)
            {
                int deterministicValue = GetDeterministicAssignment(result.marginals[factor.neigborIndices[i]]);
                if (deterministicValue != -1)
                {
                    assignment[i] = deterministicValue;
                }
                else
                {
                    int trainingObservation = System.Convert.ToInt32(model.GetVariableMetaDataByReference(factor.neigborIndices[i])[LogLikelihoodDifferentiableFunction.VariableTrainingValue]);
                    assignment[i] = trainingObservation;
                }
            }

            ConcatVector features = factor.featuresTable.GetAssignmentValue(assignment).Get();
            // Add the log-likelihood from this observation to the log-likelihood
            logLikelihood += features.DotProduct(weights);
            // Add the vector from this observation to the gradient
            gradient.AddVectorInPlace(features, 1.0);
        }

        // Take expectations over features given marginals
        // NOTE: This is extremely expensive. Not sure what to do about that
        foreach (GraphicalModel.Factor factor in model.factors)
        {
            // The joint marginal table does not change while iterating, so look it up once per factor.
            var jointMarginal = result.jointMarginals[factor];

            // OPTIMIZATION:
            // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which
            // need to be GC'd, we use the fast version that just mutates one array. Since this is read once
            // for us here, this is ideal.
            using (IEnumerator<int[]> fastPassByReferenceIterator = factor.featuresTable.FastPassByReferenceIterator())
            {
                // IEnumerator.Current is only defined after a successful MoveNext(), so advance first and
                // re-read Current on every step.
                while (fastPassByReferenceIterator.MoveNext())
                {
                    int[] assignment = fastPassByReferenceIterator.Current;

                    // calculate assignment prob
                    double assignmentProb = jointMarginal.GetAssignmentValue(assignment);

                    // subtract this feature set, weighted by the probability of the assignment
                    if (assignmentProb > 0)
                    {
                        gradient.AddVectorInPlace(factor.featuresTable.GetAssignmentValue(assignment).Get(), -assignmentProb);
                    }
                }
            }
        }
    }
    finally
    {
        // Uncache everything, now that the computations have completed (or failed)
        foreach (GraphicalModel.Factor factor in model.factors)
        {
            factor.featuresTable.ReleaseCache();
        }
    }

    return logLikelihood;
}