//Projects the covariance matrix A onto Omega: Y <- A * Omega, where A = X' * X / n
//and X = data - mean. The covariance matrix itself is never materialized: each row's
//contribution is streamed in as a rank-one update.
private static void Project(IHost host, FeatureFloatVectorCursor.Factory cursorFactory, ref VBuffer <Float> mean, VBuffer <Float>[] omega, VBuffer <Float>[] y, out long numBad)
{
    Contracts.AssertValue(host, "host");
    host.AssertNonEmpty(omega);
    // Y and Omega must agree in shape: dimension x oversampled rank.
    host.Assert(Utils.Size(y) == omega.Length);

    int oversampledRank = omega.Length;

    // Start every projected column from a zeroed accumulator.
    for (int col = 0; col < y.Length; ++col)
    {
        VBufferUtils.Clear(ref y[col]);
    }

    // A dense mean buffer is the caller's signal that centering was requested.
    bool subtractMean = mean.IsDense;

    Float weightSum = 0;
    long rowCount = 0;
    using (var progress = host.StartProgressChannel("Project covariance matrix"))
    using (var cursor = cursorFactory.Create())
    {
        progress.SetHeader(new ProgressHeader(new[] { "rows" }), pe => pe.SetProgress(0, rowCount));
        while (cursor.MoveNext())
        {
            // Accumulate the weighted sum of the features; it is rescaled to the mean below.
            if (subtractMean)
            {
                VectorUtils.AddMult(ref cursor.Features, cursor.Weight, ref mean);
            }

            // y[col] += w * (omega[col] . x) * x  — one rank-one update of X' * X * Omega.
            for (int col = 0; col < oversampledRank; col++)
            {
                VectorUtils.AddMult(
                    ref cursor.Features,
                    cursor.Weight * VectorUtils.DotProduct(ref omega[col], ref cursor.Features),
                    ref y[col]);
            }

            weightSum += cursor.Weight;
            rowCount++;
        }
        progress.Checkpoint(rowCount);
        numBad = cursor.SkippedRowCount;
    }

    Contracts.Check(weightSum > 0, "Empty training data");

    // Divide the accumulated sums by the total weight to turn them into averages.
    Float scale = 1 / weightSum;
    for (int col = 0; col < oversampledRank; ++col)
    {
        VectorUtils.ScaleBy(ref y[col], scale);
    }

    if (subtractMean)
    {
        VectorUtils.ScaleBy(ref mean, scale);
        // Centering correction: A * Omega = (X'X/n) * Omega - mean * (mean' * Omega),
        // so subtract (omega[col] . mean) * mean from each projected column.
        for (int col = 0; col < oversampledRank; col++)
        {
            VectorUtils.AddMult(ref mean, -VectorUtils.DotProduct(ref omega[col], ref mean), ref y[col]);
        }
    }
}
/// <summary>
/// Trains a PCA model via randomized projection: projects the implicit covariance matrix
/// onto a random Gaussian matrix (Omega), orthonormalizes the projection (Q of a QR
/// decomposition), projects again to get B, and eigen-decomposes the small
/// oversampledRank x oversampledRank matrix B'B to recover the top components.
/// </summary>
/// <param name="ch">Channel used for info/warning messages; asserted non-null.</param>
/// <param name="data">Training data providing features and (optional) weights.</param>
/// <param name="dimension">Dimensionality of the feature vectors.</param>
/// <returns>A predictor holding the top _rank components and the feature mean.</returns>
private PcaPredictor TrainCore(IChannel ch, RoleMappedData data, int dimension)
{
    Host.AssertValue(ch);
    ch.AssertValue(data);

    // The requested rank cannot exceed the ambient dimension.
    if (_rank > dimension)
    {
        throw ch.Except("Rank ({0}) cannot be larger than the original dimension ({1})", _rank, dimension);
    }
    int oversampledRank = Math.Min(_rank + _oversampling, dimension);

    // Rough memory estimate in GB: the two big dimension x oversampledRank matrices
    // (Y/Q and Omega/B) dominate; minor allocations are ignored.
    Double memoryUsageEstimate = 2.0 * dimension * oversampledRank * sizeof(Float) / 1e9;
    if (memoryUsageEstimate > 2)
    {
        ch.Info("Estimate memory usage: {0:G2} GB. If running out of memory, reduce rank and oversampling factor.", memoryUsageEstimate);
    }

    var y = Zeros(oversampledRank, dimension);
    // A dense mean buffer tells Project to center the data; an empty one disables centering.
    var mean = _center ? VBufferUtils.CreateDense <Float>(dimension) : VBufferUtils.CreateEmpty <Float>(dimension);

    var omega = GaussianMatrix(oversampledRank, dimension, _seed);
    var cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Weight);
    long numBad;

    // First pass: Y <- A * Omega, also accumulating the mean when centering.
    Project(Host, cursorFactory, ref mean, omega, y, out numBad);
    if (numBad > 0)
    {
        ch.Warning("Skipped {0} instances with missing features/weights during training", numBad);
    }

    // Orthonormalize Y in-place using the stabilized (modified) Gram-Schmidt algorithm.
    // Ref: https://en.wikipedia.org/wiki/Gram-Schmidt#Algorithm
    for (var i = 0; i < oversampledRank; ++i)
    {
        // NOTE(review): v is a struct copy of y[i]; normalizing through `ref v` appears to
        // rely on the copy sharing y[i]'s underlying buffers — confirm VBuffer copy semantics.
        var v = y[i];
        // NOTE(review): divides by zero if y[i] has zero norm (degenerate projection) —
        // confirm upstream guarantees this cannot happen.
        VectorUtils.ScaleBy(ref v, 1 / VectorUtils.Norm(y[i]));

        // Make the next vectors in the queue orthogonal to the orthonormalized vectors.
        for (var j = i + 1; j < oversampledRank; ++j) //subtract the projection of y[j] on v.
        {
            VectorUtils.AddMult(ref v, -VectorUtils.DotProduct(ref v, ref y[j]), ref y[j]);
        }
    }
    var q = y;     // q in QR decomposition.
    var b = omega; // reuse the memory allocated by Omega.

    // Second pass: B <- A * Q.
    // NOTE(review): mean already holds the data mean from the first pass and is accumulated
    // into (and rescaled) again inside this call — confirm that this is intended.
    Project(Host, cursorFactory, ref mean, q, b, out numBad);

    // Compute B2 = B' * B (symmetric, so only the upper triangle is computed and mirrored).
    var b2 = new Float[oversampledRank * oversampledRank];
    for (var i = 0; i < oversampledRank; ++i)
    {
        for (var j = i; j < oversampledRank; ++j)
        {
            b2[i * oversampledRank + j] = b2[j * oversampledRank + i] = VectorUtils.DotProduct(ref b[i], ref b[j]);
        }
    }

    Float[] smallEigenvalues;  // eigenvalues of the small matrix B2.
    Float[] smallEigenvectors; // eigenvectors of the small matrix B2.
    EigenUtils.EigenDecomposition(b2, out smallEigenvalues, out smallEigenvectors);
    // Lift the small eigenvectors back to the original space (in-place in b).
    PostProcess(b, smallEigenvalues, smallEigenvectors, dimension, oversampledRank);

    return(new PcaPredictor(Host, _rank, b, ref mean));
}