/// <summary>
/// Fills <c>PBuf</c> on the GPU, going through a cached pairwise-distance buffer
/// (<c>distanceBuf</c>).
/// </summary>
/// <remarks>
/// Sequence of dispatches:
/// 1. "CreateDistanceCache" is run block by block over all points.
/// 2. "CalculatePFromCache" writes the squared distance matrix into P.
/// 3. "CalculateP" is run with cmd = 4 (normalize/symmetrize the distance matrix),
///    cmd = 2 (convert to affinities, block by block) and cmd = 3
///    (normalize/symmetrize the affinity matrix).
/// The meaning of the cmd codes is taken from the accompanying comments; the shaders
/// themselves are not visible here.
/// </remarks>
void CalculateP()
{
    int pointCount = cc.c.N;

    // (N*N - N) / 2 is the number of unordered pairs of distinct points;
    // presumably one 4-byte entry per pair -- confirm against CreateBufferRW.
    distanceBuf = gpu.CreateBufferRW((pointCount * pointCount - pointCount) / 2, 4, 0);

    using (var cacheShader = gpu.LoadShader("TsneDx.CreateDistanceCache.cso"))
    {
        int blockStep = 256;
        gpu.SetShader(cacheShader);

        int blockStart = 0;
        while (blockStart < pointCount)
        {
            cc.c.blockIdx = blockStart;
            cc.Upload();
            gpu.Run(blockStep);
            blockStart += blockStep;
        }
    }

    PBuf = gpu.CreateBufferRW(pointCount * pointCount, 4, 1);
    cc.c.chacedP = true;

    using (var pShader = gpu.LoadShader("TsneDx.CalculateP.cso"))
    {
        // Write the squared distance matrix into P from the distance cache.
        using (var fromCacheShader = gpu.LoadShader("TsneDx.CalculatePFromCache.cso"))
        {
            gpu.SetShader(fromCacheShader);
            gpu.Run(64);
        }

        // Normalize and symmetrize the distance matrix.
        gpu.SetShader(pShader);
        cc.c.cmd = 4;
        cc.Upload();
        gpu.Run();

        // Convert the matrix to affinities, one block of points per dispatch.
        cc.c.cmd = 2;
        cc.c.groupNumber = 4;
        int affinityStart = 0;
        while (affinityStart < pointCount)
        {
            cc.c.blockIdx = affinityStart;
            cc.Upload();
            gpu.Run(cc.c.groupNumber);
            affinityStart += cc.c.groupNumber * GpuGroupSize;
        }

        // Normalize and symmetrize the affinity matrix.
        gpu.SetShader(pShader);
        cc.c.cmd = 3;
        cc.Upload();
        gpu.Run();
    }
}
/// <summary>
/// Reduces the given matrix to its top principal components on the GPU.
/// </summary>
/// <param name="A">
/// Input table with one sample per row; every row is expected to have
/// <c>A[0].Length</c> entries. Must be non-null and non-empty.
/// NOTE: when <c>A</c> has more rows than columns, its rows are mean-centered
/// in place before the final projection (the caller's array is modified).
/// </param>
/// <param name="eigenCount">Maximum number of components to extract; at least 1.</param>
/// <returns>
/// <c>A.Length</c> rows, each with one value per retained component. Fewer than
/// <paramref name="eigenCount"/> components are retained when an eigenvalue becomes
/// negligible relative to the first one (ratio below <c>stopEpsilon</c>).
/// Returns <c>null</c> when the final projection raises a
/// <c>SharpDX.SharpDXException</c> (a message is written to the console).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="A"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="A"/> has no rows or no columns.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="eigenCount"/> is below 1.</exception>
public float[][] DoPca(float[][] A, int eigenCount)
{
    // Validate before any GPU resource is created, so bad input cannot leak a device.
    if (A == null)
    {
        throw new ArgumentNullException("A");
    }
    if (A.Length == 0 || A[0] == null || A[0].Length == 0)
    {
        throw new ArgumentException("The input matrix must have at least one row and one column.", "A");
    }
    if (eigenCount < 1)
    {
        throw new ArgumentOutOfRangeException("eigenCount", "At least one component must be requested.");
    }

    GpuDevice gpu = new GpuDevice();
    var cc = gpu.CreateConstBuffer<PcaConstants>(0);

    // All GPU resources are declared up front so the finally block can release
    // whatever has been created, even when a step in the middle throws.
    Buffer resultBuf = null;
    Buffer resultStaging = null;
    Buffer tableBuf = null;
    Buffer covBuf = null;
    Buffer eVectorBuf = null;
    Buffer eVectorStaging = null;
    Buffer eVector2Buf = null;
    Buffer eigenTable = null;
    ComputeShader sdInit = null;
    ComputeShader sdStep = null;
    ComputeShader sdNorm = null;
    ComputeShader sdAdjCov = null;
    float[][] B = null;

    try
    {
        int inputRows = A.Length;
        int inputColumns = A[0].Length;

        // Work on the smaller of the two possible covariance matrices: when there
        // are more samples than features the table is uploaded transposed.
        bool transposing = (inputRows > inputColumns);
        cc.c.eigenCount = eigenCount;
        cc.c.rows = transposing ? inputColumns : inputRows;
        cc.c.columns = (!transposing) ? inputColumns : inputRows;

        resultBuf = gpu.CreateBufferRW(3, 4, 1); // to receive the total changes.
        resultStaging = gpu.CreateStagingBuffer(resultBuf);
        tableBuf = gpu.CreateBufferRO(cc.c.rows * cc.c.columns, 4, 0);

        // Per-column mean of the input; accumulated in a local to avoid repeated
        // writes to the shared array from parallel workers.
        double[] colMean = new double[inputColumns];
        Parallel.For(0, inputColumns, col =>
        {
            double sum = 0.0;
            for (int row = 0; row < inputRows; row++)
            {
                sum += A[row][col];
            }
            colMean[col] = sum / inputRows;
        });

        // Upload the mean-centered table in the layout selected above.
        using (var ds = gpu.NewWriteStream(tableBuf))
        {
            float[] buf = new float[cc.c.rows * cc.c.columns];
            if (transposing)
            {
                Parallel.For(0, cc.c.columns, col =>
                {
                    int offset = col * cc.c.rows;
                    for (int row = 0; row < cc.c.rows; row++)
                    {
                        buf[offset + row] = (float)(A[col][row] - colMean[row]);
                    }
                });
            }
            else
            {
                Parallel.For(0, cc.c.columns, col =>
                {
                    int offset = col * cc.c.rows;
                    for (int row = 0; row < cc.c.rows; row++)
                    {
                        buf[offset + row] = (float)(A[row][col] - colMean[col]);
                    }
                });
            }
            ds.WriteRange(buf);
        }

        cc.c.covFactor = transposing ? 1.0f / (cc.c.columns - 1) : 1.0f / (cc.c.rows - 1);
        covBuf = gpu.CreateBufferRW(cc.c.rows * cc.c.rows, 4, 0);
        using (var shader = gpu.LoadShader("TsneDx.PcaCreateCovMatrix.cso"))
        {
            gpu.SetShader(shader);
            cc.c.groupNumber = 256;
            for (int iBlock = 0; iBlock < cc.c.rows; iBlock += cc.c.groupNumber)
            {
                cc.c.iBlock = iBlock;
                cc.Upload();
                gpu.Run(cc.c.groupNumber);
            }
        }

        eVectorBuf = gpu.CreateBufferRW(cc.c.rows, 4, 2);
        eVectorStaging = gpu.CreateStagingBuffer(eVectorBuf);
        eVector2Buf = gpu.CreateBufferRW(cc.c.rows, 4, 3);

        sdInit = gpu.LoadShader("TsneDx.PcaInitIteration.cso");
        sdStep = gpu.LoadShader("TsneDx.PcaIterateOneStep.cso");
        sdNorm = gpu.LoadShader("TsneDx.PcaCalculateNormal.cso");
        sdAdjCov = gpu.LoadShader("TsneDx.PcaAdjustCovMatrix.cso");

        gpu.SetShader(sdInit);
        cc.c.eigenIdx = 0;
        cc.Upload();
        gpu.Run();

        float preEigen = 1e30f;
        float newEigen = 0;
        float[][] eVectors = new float[eigenCount][];
        double[] eValues = new double[eigenCount];

        // Extract one eigenvector at a time (presumably power iteration with
        // deflation -- the shaders are not visible here).
        for (int eigenIdx = 0; eigenIdx < eigenCount; eigenIdx++)
        {
            cc.c.groupNumber = 256;
            cc.Upload();

            // Iterate until the relative change of the eigenvalue estimate drops
            // below epsilon, or MAX_ITERATION is reached.
            for (int repeat = 0; repeat < MAX_ITERATION; repeat++)
            {
                gpu.SetShader(sdStep);
                gpu.Run(cc.c.groupNumber);
                gpu.SetShader(sdNorm);
                gpu.Run(1);
                newEigen = gpu.ReadFloat(resultStaging, resultBuf);
                double delta = Math.Abs((newEigen - preEigen) / preEigen);
                if (delta < epsilon)
                {
                    break;
                }
                preEigen = newEigen;
            }
            eValues[eigenIdx] = (double)newEigen;

            // An eigenvector with an extremely small eigenvalue (i.e. 0.0) is
            // ignored and ends the extraction.
            if (Math.Abs(eValues[eigenIdx] / eValues[0]) < stopEpsilon)
            {
                Array.Resize(ref eValues, eigenIdx);
                Array.Resize(ref eVectors, eigenIdx);
                break;
            }

            eVectors[eigenIdx] = new float[cc.c.rows];
            Array.Copy(gpu.ReadRange<float>(eVectorStaging, eVectorBuf, cc.c.rows), eVectors[eigenIdx], cc.c.rows);

            if (eigenIdx == (eigenCount - 1))
            {
                break;
            }

            // Adjust the covariance matrix.
            gpu.SetShader(sdAdjCov);
            cc.c.groupNumber = 128;
            cc.Upload();
            gpu.Run(cc.c.groupNumber);

            // Initialize the iteration loop for the next eigenvector.
            gpu.SetShader(sdInit);
            cc.c.eigenIdx = eigenIdx + 1;
            cc.Upload();
            gpu.Run();
        }

        if (!transposing)
        {
            // The eigenvectors found above have cc.c.rows entries; convert them
            // into vectors with cc.c.columns entries on the GPU.
            using (var shader = gpu.LoadShader("TsneDx.PcaTransposeEigenvectors.cso"))
            {
                Buffer eigenList1 = null;
                Buffer eigenList2 = null;
                try
                {
                    int eRows = eVectors.Length;
                    int eColumns = eVectors[0].Length;
                    eigenList1 = gpu.CreateBufferRO(eRows * eColumns, 4, 1);

                    // Per-eigenvector scale: 1 / sqrt(|eigenvalue * (length - 1)|).
                    double[] S = eValues.Select(x => 1.0 / Math.Sqrt(Math.Abs(x * (eVectors[0].Length - 1)))).ToArray();
                    float[] eVector1 = new float[eRows * eColumns];
                    for (int row = 0; row < eRows; row++)
                    {
                        for (int col = 0; col < eColumns; col++)
                        {
                            eVector1[row * eColumns + col] = (float)(S[row] * eVectors[row][col]);
                        }
                    }
                    using (var ds = gpu.NewWriteStream(eigenList1))
                    {
                        ds.WriteRange(eVector1);
                    }

                    eigenList2 = gpu.CreateBufferRW(eVectors.Length * cc.c.columns, 4, 4);
                    gpu.SetShader(shader);
                    cc.c.groupNumber = 128;
                    cc.c.eigenCount = eVectors.Length;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);

                    float[] eVectors2 = gpu.ReadRange<float>(eigenList2, eVectors.Length * cc.c.columns);
                    eVectors = new float[eVectors.Length][];
                    for (int row = 0; row < eVectors.Length; row++)
                    {
                        eVectors[row] = new float[cc.c.columns];
                    }
                    Parallel.For(0, eVectors.Length, row =>
                    {
                        Array.Copy(eVectors2, row * cc.c.columns, eVectors[row], 0, cc.c.columns);
                    });
                }
                finally
                {
                    TsneDx.SafeDispose(eigenList1, eigenList2);
                }
            }
        }

        // From here on the constants describe the original (untransposed) table.
        cc.c.rows = A.Length;
        cc.c.columns = A[0].Length;
        cc.c.eigenCount = eVectors.Length;
        cc.Upload();

        if (transposing)
        {
            // The tableBuf on the GPU is in the wrong matrix order, so the table
            // is uploaded again in the needed order.
            TsneDx.SafeDispose(tableBuf);
            tableBuf = null; // avoid a second dispose if the allocation below throws
            tableBuf = gpu.CreateBufferRO(cc.c.rows * cc.c.columns, 4, 0);

            // NOTE: this centers the caller's matrix in place.
            Parallel.For(0, cc.c.rows, row =>
            {
                for (int col = 0; col < cc.c.columns; col++)
                {
                    A[row][col] -= (float)colMean[col];
                }
            });
            gpu.WriteMarix(tableBuf, A);
        }

        eigenTable = gpu.CreateBufferRO(cc.c.eigenCount * cc.c.columns, 4, 1);
        gpu.WriteMarix(eigenTable, eVectors);

        // Replace the small result buffer with one large enough for the projection.
        TsneDx.SafeDispose(resultBuf);
        resultBuf = null; // avoid a second dispose if the allocation below throws
        resultBuf = gpu.CreateBufferRW(cc.c.rows * cc.c.eigenCount, 4, 1);

        using (var shader = gpu.LoadShader("TsneDx.PcaReduceMatrix.cso"))
        {
            try
            {
                gpu.SetShader(shader);
                const int GROUP_NR = 256;
                gpu.Run(GROUP_NR);

                float[] buf = gpu.ReadRange<float>(resultBuf, cc.c.rows * cc.c.eigenCount);
                B = new float[cc.c.rows][];
                for (int row = 0; row < cc.c.rows; row++)
                {
                    B[row] = new float[cc.c.eigenCount];
                }
                Parallel.For(0, cc.c.rows, row =>
                {
                    Array.Copy(buf, row * cc.c.eigenCount, B[row], 0, cc.c.eigenCount);
                });
            }
            catch (SharpDX.SharpDXException)
            {
                // Best effort: report and fall through; the method then returns
                // whatever B holds (normally null).
                Console.WriteLine("GPU operation timeouted: Please try to enlarge the TDR value");
            }
        }
    }
    finally
    {
        TsneDx.SafeDispose(eigenTable, sdInit, sdStep, sdNorm, sdAdjCov, eVectorBuf, eVectorStaging, eVector2Buf, resultBuf, resultStaging, covBuf, tableBuf, cc, gpu);
    }

    return B;
}
/// <summary>
/// Runs the GPU t-SNE optimization on the rows of <paramref name="X"/> and returns
/// the embedded coordinates.
/// </summary>
/// <param name="X">
/// Input table with one data point per row (<c>N = X.Length</c>, column count taken
/// from <c>X[0].Length</c>). Must be non-null and non-empty. When <c>MetricType == 1</c>
/// the table is passed to <c>NormalizeTable</c> before upload (presumably modifying
/// it in place -- confirm against that method).
/// </param>
/// <returns>
/// <c>N</c> rows of output coordinates: 3 values per row when <c>OutDim</c> is 3,
/// 1 value when it is 1, otherwise 2. The result is passed through
/// <c>PcaNormalize.DoNormalize</c> when <c>AutoNormalize</c> is set.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="X"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="X"/> has no rows or no columns.</exception>
public float[][] Fit(float[][] X)
{
    // Validate before any GPU resource is created.
    if (X == null)
    {
        throw new ArgumentNullException("X");
    }
    if (X.Length == 0 || X[0] == null || X[0].Length == 0)
    {
        throw new ArgumentException("The input table must have at least one row and one column.", "X");
    }

    int exaggerationLength = (int)(MaxEpochs * ExaggerationRatio);
    gpu = new GpuDevice();
    cc = gpu.CreateConstantBuffer<TsneMapConstants>(0);

    // Declared outside the try block so the finally block can release them.
    Buffer Y2Buf = null;
    Buffer Y3Buf = null;
    Buffer Y3StagingBuf = null;
    Buffer Y2StagingBuf = null;
    Buffer v2Buf = null;
    Buffer v3Buf = null;
    ComputeShader csOneStep = null;
    ComputeShader csSumUp = null;
    float[][] Y = null;

    try
    {
        int N = X.Length;
        cc.c.columns = X[0].Length;
        cc.c.N = N;
        cc.c.outDim = OutDim;
        cc.c.metricType = MetricType;

        #region Initialize Y
        // 1-D and 2-D outputs share the two-float layout; everything else uses
        // the three-float layout.
        if (cc.c.outDim <= 2)
        {
            Y2Buf = gpu.CreateBufferRW(N, 8, 3);
            Y2StagingBuf = gpu.CreateStagingBuffer(Y2Buf);
            v2Buf = gpu.CreateBufferRW(N, 2 * 8, 5);
        }
        else
        {
            Y3Buf = gpu.CreateBufferRW(N, 12, 4);
            Y3StagingBuf = gpu.CreateStagingBuffer(Y3Buf);
            v3Buf = gpu.CreateBufferRW(N, 2 * 12, 6);
        }

        // Start positions are drawn uniformly from [-rang/2, rang/2) with a fixed
        // seed, so repeated runs start from the same layout.
        float rang = 0.05f;
        Random rGenerator = new Random(435243);
        if (cc.c.outDim <= 2)
        {
            using (var ws = gpu.NewWriteStream(v2Buf))
            {
                for (int row = 0; row < N; row++)
                {
                    ws.Write<float>(0, 1, 0, 1);
                }
            }
            using (var ws = gpu.NewWriteStream(Y2Buf))
            {
                for (int row = 0; row < N; row++)
                {
                    for (int col = 0; col < cc.c.outDim; col++)
                    {
                        ws.Write((float)(rang * rGenerator.NextDouble() - rang / 2));
                    }
                    if (cc.c.outDim == 1)
                    {
                        // Pad the unused second coordinate of the two-float layout.
                        ws.Write(0.0f);
                    }
                }
            }
        }
        else
        {
            using (var ws = gpu.NewWriteStream(v3Buf))
            {
                for (int row = 0; row < N; row++)
                {
                    ws.Write<float>(0, 1, 0, 1, 0, 1);
                }
            }
            using (var ws = gpu.NewWriteStream(Y3Buf))
            {
                for (int row = 0; row < N; row++)
                {
                    for (int col = 0; col < cc.c.outDim; col++)
                    {
                        ws.Write((float)(rang * rGenerator.NextDouble() - rang / 2));
                    }
                }
            }
        }
        #endregion

        #region Upload data table and initialize the distance matrix
        // groupMaxBuf is used to aggregate values created by parallel threads.
        // Its size must be large enough to hold a float value for each thread
        // started in parallel.
        // Notice: gpu.Run(k) will start k*GROUP_SIZE threads.
        int gpSize = Math.Max(GpuGroupSize, MaxGroupNumber * GroupSize);
        gpSize = Math.Max(gpSize, MaxGroupNumberHyp * GroupSizeHyp);
        groupMaxBuf = gpu.CreateBufferRW(gpSize, 4, 7);
        resultBuf = gpu.CreateBufferRW(3, 4, 2); // to receive the total changes.
        resultStaging = gpu.CreateStagingBuffer(resultBuf);
        tableBuf = gpu.CreateBufferRO(N * cc.c.columns, 4, 0);
        if (MetricType == 1)
        {
            NormalizeTable(X);
        }
        gpu.WriteMarix(tableBuf, X, true);

        const int MinCpuDimension = 100; // minimal dimension to trigger CPU caching.
        const int MaxDimension = 64;     // maximal dimension (table columns) for the fast EuclideanNoCache shader. Must be the same as MAX_DIMENSION.
        const int MaxDimensionS = 32;    // maximal dimension (table columns) for the small fast shader. Must be the same as MAX_DIMENSIONs.

        // Pick where the P matrix lives: on the GPU for small N, on the CPU for
        // wide tables that fit the CPU cache budget, otherwise recomputed on the fly.
        if (N <= CacheLimit)
        {
            cachingMode = CachingMode.OnGpu;
        }
        else if ((cc.c.columns > MinCpuDimension) && ((double)N * N * 4) < ((double)MaxCpuCacheSize * 1024.0 * 1024.0))
        {
            cachingMode = CachingMode.OnCpu;
        }
        else if (cc.c.columns < MaxDimensionS)
        {
            cachingMode = CachingMode.OnFlySmS;
        }
        else if (cc.c.columns < MaxDimension)
        {
            cachingMode = CachingMode.OnFlySm;
        }
        else
        {
            cachingMode = CachingMode.OnFly;
        }
        #endregion

        cc.c.targetH = (float)Math.Log(PerplexityRatio * N);
        if (cachingMode == CachingMode.OnGpu)
        {
            CalculateP();
        }
        else if (cachingMode == CachingMode.OnCpu)
        {
            InitializePCpu();
        }
        else // (cachingMode == CachingMode.OnFly[Sm,SmS])
        {
            InitializeP();
        }

        using (var sd = gpu.LoadShader("TsneDx.CalculateSumQ.cso"))
        {
            gpu.SetShader(sd);
            cc.c.groupNumber = 256;
            for (int i = 0; i < N; i += cc.c.groupNumber)
            {
                cc.c.blockIdx = i;
                cc.Upload();
                gpu.Run(cc.c.groupNumber);
            }
            // blockIdx = -1 triggers one extra single dispatch after the block loop.
            cc.c.blockIdx = -1;
            cc.Upload();
            gpu.Run();
        }

        var sdNames = new Dictionary<CachingMode, string>()
        {
            { CachingMode.OnGpu, "TsneDx.OneStep.cso" },
            { CachingMode.OnCpu, "TsneDx.OneStepCpuCache.cso" },
            { CachingMode.OnFly, "TsneDx.OneStepNoCache.cso" },
            { CachingMode.OnFlySm, "TsneDx.FastStep.cso" },
            { CachingMode.OnFlySmS, "TsneDx.FastStepS.cso" },
        };
        csOneStep = gpu.LoadShader(sdNames[cachingMode]);
        csSumUp = gpu.LoadShader("TsneDx.OneStepSumUp.cso");

        int stepCounter = 0;
        while (true)
        {
            // Exaggeration factor applied to P during the first exaggerationLength
            // steps; optionally faded towards 1.
            if (stepCounter < exaggerationLength)
            {
                if (ExaggerationSmoothen)
                {
                    int len = (int)(0.9 * MaxEpochs);
                    if (stepCounter < len)
                    {
                        double t = (double)stepCounter / len;
                        t = Math.Sqrt(Math.Sqrt(t));
                        cc.c.PFactor = (float)((1 - t) * ExaggerationFactor + t);
                    }
                    else
                    {
                        cc.c.PFactor = 1.0f;
                    }
                }
                else
                {
                    cc.c.PFactor = (float)ExaggerationFactor;
                }
            }
            else
            {
                cc.c.PFactor = 1.0f;
            }

            gpu.SetShader(csOneStep);
            if (cachingMode == CachingMode.OnGpu)
            {
                cc.c.groupNumber = MaxGroupNumber;
                // Notice: cc.c.groupNumber*GroupSize must fit into groupMax[].
                for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber * GroupSize)
                {
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                cc.c.groupNumber = MaxGroupNumber * GroupSize;
            }
            else if (cachingMode == CachingMode.OnCpu)
            {
                int bSize = MaxGroupNumberHyp * GroupSizeHyp;
                cc.c.groupNumber = MaxGroupNumberHyp;
                for (int bIdx = 0; bIdx < N; bIdx += bSize)
                {
                    // Stream the block of the CPU-side P cache needed by this dispatch.
                    gpu.WriteArray(cpuP, bIdx, Math.Min(N, bIdx + bSize), P2Buf);
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                cc.c.groupNumber = Math.Min(N, bSize);
            }
            else if ((cachingMode == CachingMode.OnFlySm) || (cachingMode == CachingMode.OnFlySmS))
            {
                const int GrSize = 64; // This value must match that of GR_SIZE in TsneMap.hlsl.
                cc.c.groupNumber = MaxGroupNumber;
                for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber * GrSize)
                {
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                cc.c.groupNumber = cc.c.groupNumber * GrSize;
            }
            else // cachingMode == CachingMode.OnFly
            {
                cc.c.groupNumber = 128;
                for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber)
                {
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
            }

            // Notice: cc.c.groupNumber must be the number of partial sumQ_next
            // values, which add up to sumQ for the next step.
            gpu.SetShader(csSumUp);
            cc.Upload();
            gpu.Run();

            currentVariation = gpu.ReadRange<float>(resultStaging, resultBuf, 3)[2] / N;

            // Takes effect with the next upload, i.e. for the following step.
            cc.c.mom = (float)((stepCounter < (MaxEpochs * momentumSwitch)) ? momentum : finalMomentum);
            stepCounter++;

            // Progress output: a dot every 10 steps, a line break every 500.
            if (stepCounter % 10 == 0)
            {
                Console.Write('.');
            }
            if (stepCounter % 500 == 0)
            {
                Console.WriteLine();
            }

            // Stop at the epoch limit, or once past the exaggeration phase when
            // the variation falls below the threshold.
            if ((stepCounter >= MaxEpochs) || ((stepCounter >= (2 + exaggerationLength)) && (currentVariation < stopVariation)))
            {
                break;
            }
        }
        Console.WriteLine();

        // Read the final coordinates back from the GPU.
        Y = new float[N][];
        using (var rs = gpu.NewReadStream((cc.c.outDim == 3) ? Y3StagingBuf : Y2StagingBuf, (cc.c.outDim == 3) ? Y3Buf : Y2Buf))
        {
            int outVDim = (cc.c.outDim == 3) ? 3 : 2;
            for (int row = 0; row < N; row++)
            {
                Y[row] = rs.ReadRange<float>(outVDim);
            }
        }

        // 1-D output is stored in the two-float layout; keep the first value only.
        if (cc.c.outDim == 1)
        {
            for (int i = 0; i < N; i++)
            {
                Y[i] = new float[] { Y[i][0] };
            }
        }
    }
    finally
    {
        TsneDx.SafeDispose(csSumUp, csOneStep, PBuf, P2Buf, distanceBuf, tableBuf, resultBuf, resultStaging, groupMaxBuf, Y3Buf, Y3StagingBuf, v3Buf, Y2Buf, Y2StagingBuf, v2Buf, cc, gpu);
    }

    return AutoNormalize ? PcaNormalize.DoNormalize(Y) : Y;
}