/// <summary>
/// Computes the low-dimensional embedding of the data table <paramref name="X"/> on the GPU.
/// </summary>
/// <param name="X">
/// Input table, one row per data point. The column count is taken from the first row.
/// When <c>MetricType == 1</c> the table is passed to <c>NormalizeTable</c> before upload.
/// </param>
/// <returns>
/// One row per input row holding <c>OutDim</c> coordinates; passed through
/// <c>PcaNormalize.DoNormalize</c> when <c>AutoNormalize</c> is set.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="X"/> or its first row is null.</exception>
/// <exception cref="ArgumentException"><paramref name="X"/> has no rows.</exception>
/// <remarks>
/// Progress dots are written to the console while iterating. All GPU resources created during
/// the run are released before returning, including when the run fails with an exception.
/// </remarks>
public float[][] Fit(float[][] X) {
    // Validate before any GPU resource is created: the column count is read from X[0].
    if (X == null) {
        throw new ArgumentNullException("X");
    }
    if (X.Length == 0) {
        throw new ArgumentException("The input table must contain at least one row.", "X");
    }
    if (X[0] == null) {
        throw new ArgumentNullException("X", "The first row of the input table is null.");
    }

    int exaggerationLength = (int)(MaxEpochs * ExaggerationRatio);
    int N = X.Length;

    // Declared outside the try block so that the finally block can release whatever was created.
    Buffer Y2Buf = null;
    Buffer Y3Buf = null;
    Buffer Y3StagingBuf = null;
    Buffer Y2StagingBuf = null;
    Buffer v2Buf = null;
    Buffer v3Buf = null;
    ComputeShader csOneStep = null;
    ComputeShader csSumUp = null;
    float[][] Y = null;

    // If device creation itself fails there is nothing new to release yet.
    gpu = new GpuDevice();
    try {
        cc = gpu.CreateConstantBuffer<TsneMapConstants>(0);
        cc.c.columns = X[0].Length;
        cc.c.N = N;
        cc.c.outDim = OutDim;
        cc.c.metricType = MetricType;

        #region Initialize Y
        // 1-D and 2-D outputs share the 8-byte (two float) layout; otherwise the 12-byte layout is used.
        if (cc.c.outDim <= 2) {
            Y2Buf = gpu.CreateBufferRW(N, 8, 3);
            Y2StagingBuf = gpu.CreateStagingBuffer(Y2Buf);
            v2Buf = gpu.CreateBufferRW(N, 2 * 8, 5);
        } else {
            Y3Buf = gpu.CreateBufferRW(N, 12, 4);
            Y3StagingBuf = gpu.CreateStagingBuffer(Y3Buf);
            v3Buf = gpu.CreateBufferRW(N, 2 * 12, 6);
        }

        float rang = 0.05f;
        Random rGenerator = new Random(435243); // fixed seed: the initial layout is reproducible.

        if (cc.c.outDim <= 2) {
            using (var ws = gpu.NewWriteStream(v2Buf)) {
                for (int row = 0; row < N; row++) {
                    ws.Write<float>(0, 1, 0, 1);
                }
            }

            using (var ws = gpu.NewWriteStream(Y2Buf)) {
                for (int row = 0; row < N; row++) {
                    for (int col = 0; col < cc.c.outDim; col++) {
                        ws.Write((float)(rang * rGenerator.NextDouble() - rang / 2));
                    }
                    if (cc.c.outDim == 1) {
                        // Pad the unused second component so every row still fills 8 bytes.
                        ws.Write(0.0f);
                    }
                }
            }
        } else {
            using (var ws = gpu.NewWriteStream(v3Buf)) {
                for (int row = 0; row < N; row++) {
                    ws.Write<float>(0, 1, 0, 1, 0, 1);
                }
            }

            using (var ws = gpu.NewWriteStream(Y3Buf)) {
                for (int row = 0; row < N; row++) {
                    for (int col = 0; col < cc.c.outDim; col++) {
                        ws.Write((float)(rang * rGenerator.NextDouble() - rang / 2));
                    }
                }
            }
        }
        #endregion

        #region Upload data table and initialize the distance matrix
        // Used to aggregate values created by parallel threads.
        // The size of groupMaxBuf must be large enough to hold a float value for each thread started in parallel.
        // Notice: gpu.Run(k) will start k*GROUP_SIZE threads.
        int gpSize = Math.Max(GpuGroupSize, MaxGroupNumber * GroupSize);
        gpSize = Math.Max(gpSize, MaxGroupNumberHyp * GroupSizeHyp);
        groupMaxBuf = gpu.CreateBufferRW(gpSize, 4, 7);

        resultBuf = gpu.CreateBufferRW(3, 4, 2); // to receive the total changes.
        resultStaging = gpu.CreateStagingBuffer(resultBuf);

        tableBuf = gpu.CreateBufferRO(N * cc.c.columns, 4, 0);
        if (MetricType == 1) {
            NormalizeTable(X);
        }
        gpu.WriteMarix(tableBuf, X, true);

        const int MinCpuDimension = 100; // minimal dimension to trigger CPU caching.
        const int MaxDimension = 64;     // maximal dimension (table columns) for fast EuclideanNoCache shader. Must be the same as MAX_DIMENSION.
        const int MaxDimensionS = 32;    // maximal dimension (table columns) for fast EuclideanNoCache shader. Must be the same as MAX_DIMENSIONs.

        if (N <= CacheLimit) {
            cachingMode = CachingMode.OnGpu;
        } else if ((cc.c.columns > MinCpuDimension)
                   && ((double)N * N * 4) < ((double)MaxCpuCacheSize * 1024.0 * 1024.0)) {
            // The full N x N float matrix fits into the configured CPU cache size (in MB).
            cachingMode = CachingMode.OnCpu;
        } else if (cc.c.columns < MaxDimensionS) {
            cachingMode = CachingMode.OnFlySmS;
        } else if (cc.c.columns < MaxDimension) {
            cachingMode = CachingMode.OnFlySm;
        } else {
            cachingMode = CachingMode.OnFly;
        }
        #endregion

        cc.c.targetH = (float)Math.Log(PerplexityRatio * N);

        if (cachingMode == CachingMode.OnGpu) {
            CalculateP();
        } else if (cachingMode == CachingMode.OnCpu) {
            InitializePCpu();
        } else { // (cachingMode == CachingMode.OnFly[Sm,SmS])
            InitializeP();
        }

        using (var sd = gpu.LoadShader("TsneDx.CalculateSumQ.cso")) {
            gpu.SetShader(sd);
            cc.c.groupNumber = 256;
            for (int i = 0; i < N; i += cc.c.groupNumber) {
                cc.c.blockIdx = i;
                cc.Upload();
                gpu.Run(cc.c.groupNumber);
            }
            // A final dispatch with blockIdx = -1 follows the per-block passes.
            cc.c.blockIdx = -1;
            cc.Upload();
            gpu.Run();
        }

        var sdNames = new Dictionary<CachingMode, string>() {
            { CachingMode.OnGpu, "TsneDx.OneStep.cso" },
            { CachingMode.OnCpu, "TsneDx.OneStepCpuCache.cso" },
            { CachingMode.OnFly, "TsneDx.OneStepNoCache.cso" },
            { CachingMode.OnFlySm, "TsneDx.FastStep.cso" },
            { CachingMode.OnFlySmS, "TsneDx.FastStepS.cso" },
        };
        csOneStep = gpu.LoadShader(sdNames[cachingMode]);
        csSumUp = gpu.LoadShader("TsneDx.OneStepSumUp.cso");

        int stepCounter = 0;
        while (true) {
            // Choose the exaggeration factor applied to P for this step.
            if (stepCounter < exaggerationLength) {
                if (ExaggerationSmoothen) {
                    int len = (int)(0.9 * MaxEpochs);
                    if (stepCounter < len) {
                        // Blend from ExaggerationFactor towards 1 along t^(1/4).
                        double t = (double)stepCounter / len;
                        t = Math.Sqrt(Math.Sqrt(t));
                        cc.c.PFactor = (float)((1 - t) * ExaggerationFactor + t);
                    } else {
                        cc.c.PFactor = 1.0f;
                    }
                } else {
                    cc.c.PFactor = (float)ExaggerationFactor;
                }
            } else {
                cc.c.PFactor = 1.0f;
            }

            gpu.SetShader(csOneStep);

            if (cachingMode == CachingMode.OnGpu) {
                cc.c.groupNumber = MaxGroupNumber;
                // Notice: cc.c.groupNumber*GroupSize must fit into groupMax[].
                for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber * GroupSize) {
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                cc.c.groupNumber = MaxGroupNumber * GroupSize;
            } else if (cachingMode == CachingMode.OnCpu) {
                int bSize = MaxGroupNumberHyp * GroupSizeHyp;
                cc.c.groupNumber = MaxGroupNumberHyp;
                for (int bIdx = 0; bIdx < N; bIdx += bSize) {
                    // Upload the band of cached P rows handled by this dispatch.
                    gpu.WriteArray(cpuP, bIdx, Math.Min(N, bIdx + bSize), P2Buf);
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                cc.c.groupNumber = Math.Min(N, bSize);
            } else if ((cachingMode == CachingMode.OnFlySm) || (cachingMode == CachingMode.OnFlySmS)) {
                const int GrSize = 64; // This value must match that of GR_SIZE in TsneMap.hlsl.
                cc.c.groupNumber = MaxGroupNumber;
                for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber * GrSize) {
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                cc.c.groupNumber = cc.c.groupNumber * GrSize;
            } else { // cachingMode == CachingMode.OnFly
                cc.c.groupNumber = 128;
                for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber) {
                    cc.c.blockIdx = bIdx;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
            }

            // Notice: cc.c.groupNumber must be the number of partial sumQ_next values, which add up to sumQ for the next step.
            gpu.SetShader(csSumUp);
            cc.Upload();
            gpu.Run();

            currentVariation = gpu.ReadRange<float>(resultStaging, resultBuf, 3)[2] / N;

            // Assigned after this step's dispatches, so it reaches the GPU with the next cc.Upload().
            cc.c.mom = (float)((stepCounter < (MaxEpochs * momentumSwitch)) ? momentum : finalMomentum);
            stepCounter++;

            if (stepCounter % 10 == 0) {
                Console.Write('.');
            }
            if (stepCounter % 500 == 0) {
                Console.WriteLine();
            }

            // Stop at the epoch limit, or once past the exaggeration phase when the variation is small enough.
            if ((stepCounter >= MaxEpochs)
                || ((stepCounter >= (2 + exaggerationLength)) && (currentVariation < stopVariation))) {
                break;
            }
        }
        Console.WriteLine();

        Y = new float[N][];
        using (var rs = gpu.NewReadStream(
                   (cc.c.outDim == 3) ? Y3StagingBuf : Y2StagingBuf,
                   (cc.c.outDim == 3) ? Y3Buf : Y2Buf)) {
            int outVDim = (cc.c.outDim == 3) ? 3 : 2;
            for (int row = 0; row < N; row++) {
                Y[row] = rs.ReadRange<float>(outVDim);
            }
        }

        if (cc.c.outDim == 1) {
            // Drop the padding component that was stored alongside each 1-D coordinate.
            for (int i = 0; i < N; i++) {
                Y[i] = new float[] { Y[i][0] };
            }
        }
    } finally {
        // Release everything on both the success and the failure path. Buffers that were never
        // created for this output dimension are passed as null, exactly as before.
        TsneDx.SafeDispose(csSumUp, csOneStep, PBuf, P2Buf, distanceBuf, tableBuf, resultBuf, resultStaging,
            groupMaxBuf, Y3Buf, Y3StagingBuf, v3Buf, Y2Buf, Y2StagingBuf, v2Buf, cc, gpu);
    }

    // Normalization runs after the GPU resources have been released.
    return AutoNormalize ? PcaNormalize.DoNormalize(Y) : Y;
}
/// <summary>
/// Builds the N x N matrix <c>cpuP</c> in CPU memory for the CPU-cached mode and allocates
/// the GPU buffers <c>PBuf</c> and <c>P2Buf</c> used to stream bands of it to the shaders.
/// </summary>
/// <remarks>
/// Steps: (1) fetch the lower-triangular distances block by block from the
/// "PartialDistance2" shader; (2) rescale them so the largest value is 100 and mirror them
/// into a full matrix; (3) run the "Dist2Affinity" shader over bands of rows;
/// (4) symmetrize and normalize the result, clamping entries from below by <c>eps</c>.
/// </remarks>
/// <exception cref="System.Exception">
/// All distances are zero, or the affinities sum to zero.
/// </exception>
void InitializePCpu() {
    int N = cc.c.N;
    const float DistanceScale = 100.0f;
    const float eps = 2.22e-16f;

    // Private gate for the parallel reductions below; avoids locking on the publicly reachable 'this'.
    object gate = new object();

    int bandSize = Math.Min(N, MaxGroupNumberHyp * GroupSizeHyp);
    PBuf = gpu.CreateBufferRW(bandSize * N, 4, 1);
    P2Buf = gpu.CreateBufferDynamic(bandSize * N, 4, 7); // dynamic buffer for fast uploading. Linked to Pcpu[] on HLSL.

    int blockSize = 128; // Calculate so many rows per dispatch.

    cpuP = new float[N][];
    for (int i = 0; i < N; i++) {
        cpuP[i] = new float[N];
    }

    // Step 1: read the lower triangle (row r contributes r values) block by block.
    using (var blockDistanceBuf = gpu.CreateBufferRW(blockSize * N, 4, 0))
    using (var stagingBuf = gpu.CreateStagingBuffer(blockDistanceBuf))
    using (var sd = gpu.LoadShader("TsneDx.PartialDistance2.cso")) {
        gpu.SetShader(sd);
        for (int iBlock = 0; iBlock < N; iBlock += blockSize) {
            cc.c.blockIdx = iBlock;
            cc.Upload();
            gpu.Run(blockSize);

            int iBlock2 = Math.Min(iBlock + blockSize, N);
            // Sum of the row lengths iBlock .. iBlock2-1. Algebraically equal to
            // (iBlock2*(iBlock2-1) - iBlock*(iBlock-1)) / 2, but without the large
            // intermediate products that exceed Int32 once N passes 46341.
            int blockLen = (iBlock2 - iBlock) * (iBlock + iBlock2 - 1) / 2;
            float[] ret = gpu.ReadRange<float>(stagingBuf, blockDistanceBuf, blockLen);

            int idx = 0;
            for (int row = iBlock; row < iBlock2; row++) {
                Array.Copy(ret, idx, cpuP[row], 0, row);
                idx += row;
            }
        }
    }

    // Step 2a: find the largest distance. Row 0 has no lower-triangle entries, so start at 1.
    double distanceFactor = double.MinValue;
    MT.For(1, N, i => {
        float maxV = cpuP[i].Max();
        lock (gate) {
            distanceFactor = Math.Max(distanceFactor, maxV);
        }
    });

    if (distanceFactor == 0) {
        throw new System.Exception("Distance metric degenerated: all components are zero.");
    }

    // Step 2b: scale the distances to the manageable range [0, 100.0] to avoid degradation
    // with the exp function.
    distanceFactor = DistanceScale / distanceFactor;
    MT.For(1, N, i => {
        for (int j = 0; j < i; j++) {
            cpuP[i][j] = (float)(cpuP[i][j] * distanceFactor);
        }
    });

    // Step 2c: mirror the lower triangle into the upper one and clear the diagonal.
    MT.For(0, N, i => {
        for (int j = 0; j < i; j++) {
            cpuP[j][i] = cpuP[i][j];
        }
        cpuP[i][i] = 0;
    });

    // Step 3: push bands of rows through the Dist2Affinity shader and read them back in place.
    int bSize = MaxGroupNumberHyp * GroupSizeHyp;
    using (var sd = gpu.LoadShader("TsneDx.Dist2Affinity.cso"))
    using (var stagingBuf = gpu.CreateStagingBuffer(PBuf)) {
        gpu.SetShader(sd);
        for (int iBlock = 0; iBlock < N; iBlock += bSize) {
            cc.c.blockIdx = iBlock;
            cc.Upload();

            int iBlock2 = Math.Min(N, iBlock + bSize);
            using (var ws = gpu.NewWriteStream(PBuf)) {
                for (int row = iBlock; row < iBlock2; row++) {
                    ws.WriteRange(cpuP[row]);
                }
            }

            gpu.Run(MaxGroupNumberHyp);

            using (var rs = gpu.NewReadStream(stagingBuf, PBuf)) {
                for (int row = iBlock; row < iBlock2; row++) {
                    rs.ReadRange(cpuP[row], 0, N);
                }
            }
        }
    }

    // Step 4a: fold each (i, j) / (j, i) pair into the upper triangle and total the result.
    double sum = 0;
    MT.For(0, N, i => {
        double sum2 = 0.0;
        for (int j = i + 1; j < N; j++) {
            cpuP[i][j] += cpuP[j][i];
            sum2 += cpuP[i][j];
        }
        lock (gate) {
            sum += sum2;
        }
    });

    if (sum == 0) {
        throw new System.Exception("Perplexity too small!");
    }

    // Step 4b: normalize by twice the upper-triangle total, clamp from below by eps,
    // mirror into the lower triangle and set the diagonal to 1.
    sum *= 2;
    MT.For(0, N, i => {
        for (int j = i + 1; j < N; j++) {
            cpuP[i][j] = (float)Math.Max(cpuP[i][j] / sum, eps);
            cpuP[j][i] = cpuP[i][j];
        }
        cpuP[i][i] = 1.0f;
    });
}