private LassoFit GetLassoFit(IChannel ch, int maxAllowedFeaturesPerModel)
{
    Stopwatch stopWatch = Stopwatch.StartNew();

    if (maxAllowedFeaturesPerModel < 0)
    {
        maxAllowedFeaturesPerModel = _numFeatures;
    }

    int numberOfLambdas = DefaultNumberOFLambdas;
    int maxAllowedFeaturesAlongPath = (int)Math.Min(maxAllowedFeaturesPerModel * 1.2, _numFeatures);

    ch.Info("Lasso Compression uses {0} observations.", _numObservations);

    // lambdaMin = flmin * lambdaMax
    double flmin = (_numObservations < _numFeatures ? 5e-2 : 1e-4);

    /********************************
     * Standardize predictors and target:
     * Center the target and features (mean 0) and normalize their vectors to have the same
     * standard deviation
     */
    double[] featureMeans = new double[_numFeatures];
    double[] featureStds = new double[_numFeatures];
    double[] feature2residualCorrelations = new double[_numFeatures];

    float factor = (float)(1.0 / Math.Sqrt(_numObservations));
    for (int j = 0; j < _numFeatures; j++)
    {
        double mean = VectorUtils.GetMean(_observations[j]);
        featureMeans[j] = mean;
        unsafe
        {
            fixed (float* pVector = _observations[j])
            {
                for (int i = 0; i < _numObservations; i++)
                {
                    pVector[i] = (float)(factor * (pVector[i] - mean));
                }
            }
        }
        featureStds[j] = Math.Sqrt(VectorUtils.GetDotProduct(_observations[j], _observations[j]));
        VectorUtils.DivideInPlace(_observations[j], (float)featureStds[j]);
    }

    float targetMean = (float)VectorUtils.GetMean(_targets);
    unsafe
    {
        fixed (float* pVector = _targets)
        {
            for (int i = 0; i < _numObservations; i++)
            {
                pVector[i] = factor * (pVector[i] - targetMean);
            }
        }
    }
    float targetStd = (float)Math.Sqrt(VectorUtils.GetDotProduct(_targets, _targets));
    VectorUtils.DivideInPlace(_targets, targetStd);

    for (int j = 0; j < _numFeatures; j++)
    {
        feature2residualCorrelations[j] = VectorUtils.GetDotProduct(_targets, _observations[j]);
    }

    double[][] feature2featureCorrelations = VectorUtils.AllocateDoubleMatrix(_numFeatures, maxAllowedFeaturesAlongPath);
    double[] activeWeights = new double[_numFeatures];
    int[] correlationCacheIndices = new int[_numFeatures];
    double[] denseActiveSet = new double[_numFeatures];

    LassoFit fit = new LassoFit(numberOfLambdas, maxAllowedFeaturesAlongPath, _numFeatures);
    fit.NumberOfLambdas = 0;

    double alf = Math.Pow(Math.Max(Epsilon, flmin), 1.0 / (numberOfLambdas - 1));
    double rsquared = 0.0;
    fit.NumberOfPasses = 0;
    int numberOfInputs = 0;
    int minimumNumberOfLambdas = Math.Min(MinNumberOFLambdas, numberOfLambdas);

    double curLambda = 0;
    double maxDelta;
    for (int iteration = 1; iteration <= numberOfLambdas; iteration++)
    {
        ch.Info("Starting iteration {0}: R2={1}", iteration, rsquared);

        /**********
         * Compute lambda for this round
         */
        if (iteration == 1)
        {
            curLambda = Double.MaxValue; // first lambda is infinity
        }
        else if (iteration == 2)
        {
            curLambda = 0.0;
            for (int j = 0; j < _numFeatures; j++)
            {
                curLambda = Math.Max(curLambda, Math.Abs(feature2residualCorrelations[j]));
            }
            curLambda = alf * curLambda;
        }
        else
        {
            curLambda = curLambda * alf;
        }

        double prevRsq = rsquared;
        double v;
        unsafe
        {
            fixed (double* pActiveWeights = activeWeights)
            fixed (double* pFeature2residualCorrelations = feature2residualCorrelations)
            fixed (int* pIndices = fit.Indices)
            fixed (int* pCorrelationCacheIndices = correlationCacheIndices)
            {
                while (true)
                {
                    fit.NumberOfPasses++;
                    maxDelta = 0.0;
                    for (int k = 0; k < _numFeatures; k++)
                    {
                        double prevWeight = pActiveWeights[k];
                        double u = pFeature2residualCorrelations[k] + prevWeight;
                        v = (u >= 0 ? u : -u) - curLambda;
                        // Computes sign(u)(|u| - curLambda)+
                        pActiveWeights[k] = (v > 0 ? (u >= 0 ? v : -v) : 0.0);

                        // Has the weight of this variable changed?
                        // If not, we go to the next one.
                        if (pActiveWeights[k] == prevWeight)
                        {
                            continue;
                        }

                        // If we have not computed the correlations of this
                        // variable with other variables, we do this now and
                        // cache the result.
                        if (pCorrelationCacheIndices[k] == 0)
                        {
                            numberOfInputs++;
                            if (numberOfInputs > maxAllowedFeaturesAlongPath)
                            {
                                // We have reached the maximum.
                                break;
                            }

                            for (int j = 0; j < _numFeatures; j++)
                            {
                                // If we have already computed correlations for
                                // the jth variable, we will reuse it here.
                                if (pCorrelationCacheIndices[j] != 0)
                                {
                                    feature2featureCorrelations[j][numberOfInputs - 1] =
                                        feature2featureCorrelations[k][pCorrelationCacheIndices[j] - 1];
                                }
                                else
                                {
                                    // Correlation of the variable with itself is one.
                                    if (j == k)
                                    {
                                        feature2featureCorrelations[j][numberOfInputs - 1] = 1.0;
                                    }
                                    else
                                    {
                                        feature2featureCorrelations[j][numberOfInputs - 1] =
                                            VectorUtils.GetDotProduct(_observations[j], _observations[k]);
                                    }
                                }
                            }

                            pCorrelationCacheIndices[k] = numberOfInputs;
                            pIndices[numberOfInputs - 1] = k;
                        }

                        // How much has the weight changed?
                        double delta = pActiveWeights[k] - prevWeight;
                        rsquared += delta * (2.0 * pFeature2residualCorrelations[k] - delta);
                        maxDelta = Math.Max((delta >= 0 ? delta : -delta), maxDelta);

                        for (int j = 0; j < _numFeatures; j++)
                        {
                            pFeature2residualCorrelations[j] -=
                                feature2featureCorrelations[j][pCorrelationCacheIndices[k] - 1] * delta;
                        }
                    }

                    if (maxDelta < ConvergenceThreshold || numberOfInputs > maxAllowedFeaturesAlongPath)
                    {
                        break;
                    }

                    for (int ii = 0; ii < numberOfInputs; ii++)
                    {
                        denseActiveSet[ii] = activeWeights[pIndices[ii]];
                    }

                    do
                    {
                        fit.NumberOfPasses++;
                        maxDelta = 0.0;
                        for (int l = 0; l < numberOfInputs; l++)
                        {
                            int k = pIndices[l];
                            double prevWeight = pActiveWeights[k];
                            double u = pFeature2residualCorrelations[k] + prevWeight;
                            v = (u >= 0 ? u : -u) - curLambda;
                            pActiveWeights[k] = (v > 0 ? (u >= 0 ? v : -v) : 0.0);
                            if (activeWeights[k] == prevWeight)
                            {
                                continue;
                            }
                            double delta = pActiveWeights[k] - prevWeight;
                            rsquared += delta * (2.0 * pFeature2residualCorrelations[k] - delta);
                            maxDelta = Math.Max((delta >= 0 ? delta : -delta), maxDelta);
                            for (int j = 0; j < numberOfInputs; j++)
                            {
                                pFeature2residualCorrelations[pIndices[j]] -=
                                    feature2featureCorrelations[pIndices[j]][pCorrelationCacheIndices[k] - 1] * delta;
                            }
                        }
                    } while (maxDelta >= ConvergenceThreshold);

                    for (int ii = 0; ii < numberOfInputs; ii++)
                    {
                        denseActiveSet[ii] = pActiveWeights[pIndices[ii]] - denseActiveSet[ii];
                    }
                    for (int j = 0; j < _numFeatures; j++)
                    {
                        if (pCorrelationCacheIndices[j] == 0)
                        {
                            pFeature2residualCorrelations[j] -=
                                VectorUtils.GetDotProduct(denseActiveSet, feature2featureCorrelations[j], numberOfInputs);
                        }
                    }
                }

                if (numberOfInputs > maxAllowedFeaturesAlongPath)
                {
                    break;
                }
                if (numberOfInputs > 0)
                {
                    for (int ii = 0; ii < numberOfInputs; ii++)
                    {
                        fit.CompressedWeights[iteration - 1][ii] = pActiveWeights[pIndices[ii]];
                    }
                }
                fit.NumberOfWeights[iteration - 1] = numberOfInputs;
                fit.Rsquared[iteration - 1] = rsquared;
                fit.Lambdas[iteration - 1] = curLambda;
                fit.NumberOfLambdas = iteration;

                if (iteration < minimumNumberOfLambdas)
                {
                    continue;
                }

                int me = 0;
                for (int j = 0; j < numberOfInputs; j++)
                {
                    if (fit.CompressedWeights[iteration - 1][j] != 0.0)
                    {
                        me++;
                    }
                }
                if (me > maxAllowedFeaturesPerModel || ((rsquared - prevRsq) < (Small * rsquared)) || rsquared > MaxRSquared)
                {
                    break;
                }
            }
        }
    }

    // Undo the standardization: rescale lambdas and weights back to the
    // original units and recover the intercept for each model on the path.
    for (int k = 0; k < fit.NumberOfLambdas; k++)
    {
        fit.Lambdas[k] = targetStd * fit.Lambdas[k];
        int nk = fit.NumberOfWeights[k];
        for (int l = 0; l < nk; l++)
        {
            fit.CompressedWeights[k][l] = targetStd * fit.CompressedWeights[k][l] / featureStds[fit.Indices[l]];
            if (fit.CompressedWeights[k][l] != 0)
            {
                fit.NonZeroWeights[k]++;
            }
        }
        double product = 0;
        for (int i = 0; i < nk; i++)
        {
            product += fit.CompressedWeights[k][i] * featureMeans[fit.Indices[i]];
        }
        fit.Intercepts[k] = targetMean - product;
    }

    // First lambda was infinity; fixing it
    fit.Lambdas[0] = Math.Exp(2 * Math.Log(fit.Lambdas[1]) - Math.Log(fit.Lambdas[2]));

    stopWatch.Stop();
    ch.Info("Elapsed time for compression: {0}", stopWatch.Elapsed);

    return fit;
}
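// ---------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not part of this class): the
// inner loops of GetLassoFit above are cyclic coordinate descent with
// soft-thresholding, S(u, lambda) = sign(u) * max(|u| - lambda, 0),
// applied to u = <feature, residual> + currentWeight on standardized
// (zero-mean, unit-norm) data. The names below (LassoMathSketch,
// SoftThreshold, CoordinateDescentPass, gram) are assumptions made for
// this example only.
internal static class LassoMathSketch
{
    // Soft-thresholding operator: shrinks u toward zero by lambda,
    // clipping at zero. This is the closed-form minimizer of the
    // one-dimensional lasso subproblem for a single coordinate.
    public static double SoftThreshold(double u, double lambda)
    {
        double v = Math.Abs(u) - lambda;
        return v > 0 ? (u >= 0 ? v : -v) : 0.0;
    }

    // One cyclic pass over all coordinates, mirroring the covariance-update
    // trick above: gram[j][k] caches <x_j, x_k>, and featureDotResidual is
    // kept consistent with the weights instead of recomputing residuals.
    // Returns the largest absolute weight change, so the caller can loop
    // until it falls below a convergence threshold.
    public static double CoordinateDescentPass(
        double[] weights, double[] featureDotResidual, double[][] gram, double lambda)
    {
        double maxDelta = 0.0;
        for (int k = 0; k < weights.Length; k++)
        {
            double prev = weights[k];
            weights[k] = SoftThreshold(featureDotResidual[k] + prev, lambda);
            double delta = weights[k] - prev;
            if (delta == 0.0)
            {
                continue;
            }
            maxDelta = Math.Max(Math.Abs(delta), maxDelta);
            for (int j = 0; j < weights.Length; j++)
            {
                featureDotResidual[j] -= gram[j][k] * delta;
            }
        }
        return maxDelta;
    }
}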
protected override double[] GetGradient(IChannel ch)
{
    Contracts.AssertValue(ch);
    _previousGradient = _currentGradient;

    _currentGradient = ObjectiveFunction.GetGradient(ch, TrainingScores.Scores);
    // We need to make a copy of the gradient because the returned reference points
    // into a private structure of ObjectiveFunctionBase that is only valid until
    // the next GetGradient call.
    _currentGradient = (double[])_currentGradient.Clone();

    double[] previousDk = _currentDk;

    // First iteration
    if (_previousGradient == null)
    {
        _previousGradient = _currentGradient;
    }

#if !POLAK_RIBIERE_STEP
    // Compute Beta[k] = curG[k] * (curG[k] - prevG[k]) / (prevG[k] * prevG[k])
    // TODO: this can be optimized for speed. Keeping it slow but simple for now
    double beta = VectorUtils.GetDotProduct(_currentGradient, VectorUtils.Subtract(_currentGradient, _previousGradient)) /
        VectorUtils.GetDotProduct(_previousGradient, _previousGradient);
#else // Fletcher-Reeves step
    // Compute Beta[k] = (curG[k] * curG[k]) / (prevG[k] * prevG[k])
    double beta = VectorUtils.GetDotProduct(_currentGradient, _currentGradient) /
        VectorUtils.GetDotProduct(_previousGradient, _previousGradient);
#endif
    if (beta < 0)
    {
        beta = 0;
    }
    ch.Info("beta: {0}", beta);

    // d[k] = g[k] + beta * d[k-1]
    VectorUtils.MutiplyInPlace(previousDk, beta);
    VectorUtils.AddInPlace(previousDk, _currentGradient);
    _currentDk = previousDk; // Really a no-op assignment

    // We know that LeastSquaresRegressionTreeLearner does not destroy gradients,
    // so we can return our reference, which we will need in the next iteration.
    if (TreeLearner is LeastSquaresRegressionTreeLearner)
    {
        return _currentDk;
    }
    // Assume that other tree learners destroy the gradient array, so return a copy.
    else
    {
        return (double[])_currentDk.Clone();
    }
}
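// ---------------------------------------------------------------------
// Illustrative sketch (hypothetical helper, not part of this class): the
// direction update in GetGradient above is the conjugate-gradient
// recurrence d[k] = g[k] + beta * d[k-1], with beta floored at zero,
// which amounts to restarting with the plain gradient direction. The
// default (#if !POLAK_RIBIERE_STEP) branch computes the Polak-Ribiere
// coefficient shown here. The names below (ConjugateGradientSketch,
// PolakRibiereBeta, UpdateDirection) are assumptions made for this
// example only.
internal static class ConjugateGradientSketch
{
    // Polak-Ribiere coefficient: beta = <g, g - gPrev> / <gPrev, gPrev>,
    // clamped to be non-negative.
    public static double PolakRibiereBeta(double[] g, double[] gPrev)
    {
        double num = 0.0;
        double den = 0.0;
        for (int i = 0; i < g.Length; i++)
        {
            num += g[i] * (g[i] - gPrev[i]);
            den += gPrev[i] * gPrev[i];
        }
        double beta = num / den;
        return beta < 0.0 ? 0.0 : beta;
    }

    // In-place direction update: d <- g + beta * d.
    public static void UpdateDirection(double[] d, double[] g, double beta)
    {
        for (int i = 0; i < d.Length; i++)
        {
            d[i] = g[i] + beta * d[i];
        }
    }
}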