Exemple #1
0
        /// <summary>
        /// Visits every shader stage in <paramref name="Keys"/> and, for each constant buffer that
        /// the stage's shader reports using, uploads the buffer data when <c>QueryKeyUpload</c>
        /// returns true and stores the buffer key in <paramref name="State"/>.
        /// </summary>
        /// <param name="Vmm">Memory manager used to translate buffer positions and obtain host pointers.</param>
        /// <param name="State">Pipeline state whose <c>ConstBufferKeys</c> table receives the keys.</param>
        /// <param name="Keys">Shader keys; the array index is used as the stage number.</param>
        private void UploadConstBuffers(NvGpuVmm Vmm, GalPipelineState State, long[] Keys)
        {
            int StageIndex = 0;

            foreach (long ShaderKey in Keys)
            {
                foreach (ShaderDeclInfo Usage in Gpu.Renderer.Shader.GetConstBufferUsage(ShaderKey))
                {
                    ConstBuffer Bound = ConstBuffers[StageIndex][Usage.Cbuf];

                    //Disabled slots are neither uploaded nor recorded in the state.
                    if (Bound.Enabled)
                    {
                        long PhysicalKey = Vmm.GetPhysicalAddress(Bound.Position);

                        bool NeedsUpload = QueryKeyUpload(Vmm, PhysicalKey, Bound.Size, NvGpuBufferType.ConstBuffer);

                        if (NeedsUpload)
                        {
                            IntPtr HostPointer = Vmm.GetHostAddress(Bound.Position, Bound.Size);

                            Gpu.Renderer.Buffer.SetData(PhysicalKey, Bound.Size, HostPointer);
                        }

                        State.ConstBufferKeys[StageIndex][Usage.Cbuf] = PhysicalKey;
                    }
                }

                StageIndex++;
            }
        }
Exemple #2
0
        /// <summary>
        /// Handles a constant buffer bind method call: decodes the stage, slot and enable flag,
        /// makes sure the renderer has a buffer for the currently selected address/size, and
        /// records the binding in <c>ConstBuffers</c>.
        /// </summary>
        /// <param name="Vmm">Memory manager used to translate the buffer position into a key.</param>
        /// <param name="PBEntry">Entry whose method number selects the stage and whose first argument holds the slot and enable bit.</param>
        private void CbBind(NvGpuVmm Vmm, NvGpuPBEntry PBEntry)
        {
            //Bind methods start at 0x904 and are 8 method numbers apart.
            int Stage = (PBEntry.Method - 0x904) >> 3;

            int Argument = PBEntry.Arguments[0];

            //Bit 0 is the enable flag, bits 4..8 are the slot number.
            bool Enabled = (Argument & 1) != 0;
            int  Slot    = (Argument >> 4) & 0x1f;

            long Position = MakeInt64From2xInt32(NvGpuEngine3dReg.ConstBufferAddress);
            long CbKey    = Vmm.GetPhysicalAddress(Position);
            int  Size     = ReadRegister(NvGpuEngine3dReg.ConstBufferSize);

            bool AlreadyCached = Gpu.Renderer.Buffer.IsCached(CbKey, Size);

            if (!AlreadyCached)
            {
                Gpu.Renderer.Buffer.Create(CbKey, Size);
            }

            ConstBuffer Current = ConstBuffers[Stage][Slot];

            bool Unchanged = Current.Position == Position &&
                             Current.Enabled  == Enabled  &&
                             Current.Size     == Size;

            if (Unchanged)
            {
                return;
            }

            ConstBuffers[Stage][Slot].Position = Position;
            ConstBuffers[Stage][Slot].Enabled  = Enabled;
            ConstBuffers[Stage][Slot].Size     = Size;
        }
        /// <summary>
        /// For each of the five shader program slots that is enabled, reads every enabled constant
        /// buffer from <paramref name="Memory"/> and hands the bytes to the renderer.
        /// </summary>
        /// <param name="Memory">Memory the constant buffer contents are read from.</param>
        private void UploadUniforms(AMemory Memory)
        {
            long BasePosition = MakeInt64From2xInt32(NvGpuEngine3dReg.ShaderAddress);

            for (int Index = 0; Index < 5; Index++)
            {
                //Register blocks are 0x10 apart; the loop starts at the second block.
                int RegisterOffset = (Index + 1) * 0x10;

                int Control = ReadRegister(NvGpuEngine3dReg.ShaderNControl + RegisterOffset);
                int Offset  = ReadRegister(NvGpuEngine3dReg.ShaderNOffset + RegisterOffset);

                //Note: Vertex Program (B) is always enabled.
                bool Enable = Index == 0 || (Control & 1) != 0;

                if (Enable)
                {
                    long ProgramPosition = BasePosition + (uint)Offset;

                    for (int Cbuf = 0; Cbuf < ConstBuffers.Length; Cbuf++)
                    {
                        ConstBuffer Cb = ConstBuffers[Cbuf];

                        if (!Cb.Enabled)
                        {
                            continue;
                        }

                        //Each program slot reads its own Cb.Size-sized window after Cb.Position.
                        long CbPosition = Cb.Position + Index * Cb.Size;

                        byte[] Data = AMemoryHelper.ReadBytes(Memory, CbPosition, (uint)Cb.Size);

                        Gpu.Renderer.SetConstBuffer(ProgramPosition, Cbuf, Data);
                    }
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Uploads the constant buffers used by each stage's shader when the resource manager
        /// reports their memory region as modified, and stores each buffer key in
        /// <paramref name="state"/>. The whole call is wrapped in a profiling begin/end pair.
        /// </summary>
        /// <param name="vmm">Memory manager used to translate positions and read buffer contents.</param>
        /// <param name="state">Pipeline state whose <c>ConstBufferKeys</c> table receives the keys.</param>
        /// <param name="keys">Shader keys; the array index is used as the stage number.</param>
        private void UploadConstBuffers(NvGpuVmm vmm, GalPipelineState state, long[] keys)
        {
            Profile.Begin(Profiles.GPU.Engine3d.UploadConstBuffers);

            for (int stage = 0; stage < keys.Length; stage++)
            {
                foreach (CBufferDescriptor desc in _gpu.Renderer.Shader.GetConstBufferUsage(keys[stage]))
                {
                    ConstBuffer bound = _constBuffers[stage][desc.Slot];

                    // Disabled slots are neither uploaded nor recorded in the state.
                    if (bound.Enabled)
                    {
                        long physicalKey = vmm.GetPhysicalAddress(bound.Position);

                        bool modified = _gpu.ResourceManager.MemoryRegionModified(
                            vmm,
                            physicalKey,
                            bound.Size,
                            NvGpuBufferType.ConstBuffer);

                        if (modified)
                        {
                            bool hasHostPointer = vmm.TryGetHostAddress(bound.Position, bound.Size, out IntPtr hostPointer);

                            if (!hasHostPointer)
                            {
                                // No host pointer for the range: upload a byte copy instead.
                                _gpu.Renderer.Buffer.SetData(physicalKey, vmm.ReadBytes(bound.Position, bound.Size));
                            }
                            else
                            {
                                _gpu.Renderer.Buffer.SetData(physicalKey, bound.Size, hostPointer);
                            }
                        }

                        state.ConstBufferKeys[stage][desc.Slot] = physicalKey;
                    }
                }
            }

            Profile.End(Profiles.GPU.Engine3d.UploadConstBuffers);
        }
Exemple #5
0
        /// <summary>
        /// Handles a constant buffer bind method call: decodes the stage, slot and enable flag,
        /// makes sure the renderer has a buffer for the currently selected address/size, and
        /// records the binding in <c>_constBuffers</c>.
        /// </summary>
        /// <param name="vmm">Memory manager used to translate the buffer position into a key.</param>
        /// <param name="methCall">Call whose method number selects the stage and whose argument holds the slot and enable bit.</param>
        private void CbBind(NvGpuVmm vmm, GpuMethodCall methCall)
        {
            // Bind methods start at 0x904 and are 8 method numbers apart.
            int stage = (methCall.Method - 0x904) >> 3;

            int argument = methCall.Argument;

            // Bit 0 is the enable flag, bits 4..8 are the slot number.
            bool enabled = (argument & 1) != 0;
            int  slot    = (argument >> 4) & 0x1f;

            long position = MakeInt64From2xInt32(NvGpuEngine3dReg.ConstBufferAddress);
            long cbKey    = vmm.GetPhysicalAddress(position);
            int  size     = ReadRegister(NvGpuEngine3dReg.ConstBufferSize);

            bool alreadyCached = _gpu.Renderer.Buffer.IsCached(cbKey, size);

            if (!alreadyCached)
            {
                _gpu.Renderer.Buffer.Create(cbKey, size);
            }

            ConstBuffer current = _constBuffers[stage][slot];

            bool unchanged = current.Position == position &&
                             current.Enabled  == enabled  &&
                             current.Size     == size;

            if (unchanged)
            {
                return;
            }

            _constBuffers[stage][slot].Position = position;
            _constBuffers[stage][slot].Enabled  = enabled;
            _constBuffers[stage][slot].Size     = size;
        }
        /// <summary>
        /// Creates the engine: allocates the register array, registers the method handlers,
        /// allocates the 6 x 18 constant buffer table and creates the frame buffer set.
        /// </summary>
        /// <param name="Gpu">Owning GPU instance, stored for later use.</param>
        public NvGpuEngine3d(NvGpu Gpu)
        {
            this.Gpu = Gpu;

            Registers = new int[0xe00];

            Methods = new Dictionary<int, NvGpuMethod>();

            //Registers the same handler for Count method numbers, each Stride apart.
            void AddMethod(int Meth, int Count, int Stride, NvGpuMethod Method)
            {
                for (int Remaining = Count; Remaining > 0; Remaining--, Meth += Stride)
                {
                    Methods.Add(Meth, Method);
                }
            }

            AddMethod(0x585,  1, 1, VertexEndGl);
            AddMethod(0x674,  1, 1, ClearBuffers);
            AddMethod(0x6c3,  1, 1, QueryControl);
            AddMethod(0x8e4, 16, 1, CbData);
            AddMethod(0x904,  5, 8, CbBind);

            ConstBuffers = new ConstBuffer[6][];

            int Row = 0;

            while (Row < ConstBuffers.Length)
            {
                ConstBuffers[Row] = new ConstBuffer[18];

                Row++;
            }

            FrameBuffers = new HashSet<long>();
        }
Exemple #7
0
        /// <summary>
        /// Uploads the constant buffers used by each stage's shader when the resource manager
        /// reports their memory region as modified, and stores each buffer key in
        /// <paramref name="State"/>.
        /// </summary>
        /// <param name="Vmm">Memory manager used to translate positions and read buffer contents.</param>
        /// <param name="State">Pipeline state whose <c>ConstBufferKeys</c> table receives the keys.</param>
        /// <param name="Keys">Shader keys; the array index is used as the stage number.</param>
        private void UploadConstBuffers(NvGpuVmm Vmm, GalPipelineState State, long[] Keys)
        {
            int StageIndex = 0;

            foreach (long ShaderKey in Keys)
            {
                foreach (ShaderDeclInfo Usage in Gpu.Renderer.Shader.GetConstBufferUsage(ShaderKey))
                {
                    ConstBuffer Bound = ConstBuffers[StageIndex][Usage.Cbuf];

                    //Disabled slots are neither uploaded nor recorded in the state.
                    if (Bound.Enabled)
                    {
                        long PhysicalKey = Vmm.GetPhysicalAddress(Bound.Position);

                        bool Modified = Gpu.ResourceManager.MemoryRegionModified(
                            Vmm,
                            PhysicalKey,
                            Bound.Size,
                            NvGpuBufferType.ConstBuffer);

                        if (Modified)
                        {
                            bool HasHostPointer = Vmm.TryGetHostAddress(Bound.Position, Bound.Size, out IntPtr HostPointer);

                            if (!HasHostPointer)
                            {
                                //No host pointer for the range: upload a byte copy instead.
                                Gpu.Renderer.Buffer.SetData(PhysicalKey, Vmm.ReadBytes(Bound.Position, Bound.Size));
                            }
                            else
                            {
                                Gpu.Renderer.Buffer.SetData(PhysicalKey, Bound.Size, HostPointer);
                            }
                        }

                        State.ConstBufferKeys[StageIndex][Usage.Cbuf] = PhysicalKey;
                    }
                }

                StageIndex++;
            }
        }
Exemple #8
0
        /// <summary>
        /// For each of the five shader program slots that is enabled, passes every enabled
        /// constant buffer of that slot to the renderer's shader interface using a host pointer.
        /// </summary>
        /// <param name="Vmm">Memory manager used to obtain host addresses for the buffers.</param>
        private void UploadUniforms(NvGpuVmm Vmm)
        {
            long BasePosition = MakeInt64From2xInt32(NvGpuEngine3dReg.ShaderAddress);

            for (int Index = 0; Index < 5; Index++)
            {
                //Register blocks are 0x10 apart; the loop starts at the second block.
                int RegisterOffset = (Index + 1) * 0x10;

                int Control = ReadRegister(NvGpuEngine3dReg.ShaderNControl + RegisterOffset);
                int Offset  = ReadRegister(NvGpuEngine3dReg.ShaderNOffset + RegisterOffset);

                //Note: Vertex Program (B) is always enabled.
                bool Enable = Index == 0 || (Control & 1) != 0;

                if (Enable)
                {
                    long ProgramPosition = BasePosition + (uint)Offset;

                    for (int Cbuf = 0; Cbuf < ConstBuffers[Index].Length; Cbuf++)
                    {
                        ConstBuffer Cb = ConstBuffers[Index][Cbuf];

                        if (!Cb.Enabled)
                        {
                            continue;
                        }

                        IntPtr DataAddress = Vmm.GetHostAddress(Cb.Position, Cb.Size);

                        Gpu.Renderer.Shader.SetConstBuffer(ProgramPosition, Cbuf, Cb.Size, DataAddress);
                    }
                }
            }
        }
Exemple #9
0
        /// <summary>
        /// Creates the engine: allocates the register array, registers the method handlers,
        /// allocates the 6 x 18 constant buffer table and one uploaded-key list per buffer type,
        /// then writes default color mask and blend register values.
        /// </summary>
        /// <param name="Gpu">Owning GPU instance, stored for later use.</param>
        public NvGpuEngine3d(NvGpu Gpu)
        {
            this.Gpu = Gpu;

            Registers = new int[0xe00];

            Methods = new Dictionary<int, NvGpuMethod>();

            //Registers the same handler for Count method numbers, each Stride apart.
            void AddMethod(int Meth, int Count, int Stride, NvGpuMethod Method)
            {
                for (int Remaining = Count; Remaining > 0; Remaining--, Meth += Stride)
                {
                    Methods.Add(Meth, Method);
                }
            }

            AddMethod(0x585,  1, 1, VertexEndGl);
            AddMethod(0x674,  1, 1, ClearBuffers);
            AddMethod(0x6c3,  1, 1, QueryControl);
            AddMethod(0x8e4, 16, 1, CbData);
            AddMethod(0x904,  5, 8, CbBind);

            ConstBuffers = new ConstBuffer[6][];

            int Row = 0;

            while (Row < ConstBuffers.Length)
            {
                ConstBuffers[Row] = new ConstBuffer[18];

                Row++;
            }

            UploadedKeys = new List<long>[(int)NvGpuBufferType.Count];

            for (int Type = 0; Type < UploadedKeys.Length; Type++)
            {
                UploadedKeys[Type] = new List<long>();
            }

            //Ensure that all components are enabled by default.
            //FIXME: Is this correct?
            WriteRegister(NvGpuEngine3dReg.ColorMaskN, 0x1111);

            //Default every render target to Add equations with One/Zero source/destination factors.
            for (int Target = 0; Target < GalPipelineState.RenderTargetsCount; Target++)
            {
                int RegisterOffset = Target * 8;

                int Equation  = (int)GalBlendEquation.FuncAdd;
                int SrcFactor = (int)GalBlendFactor.One;
                int DstFactor = (int)GalBlendFactor.Zero;

                WriteRegister(NvGpuEngine3dReg.IBlendNEquationRgb   + RegisterOffset, Equation);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncSrcRgb    + RegisterOffset, SrcFactor);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncDstRgb    + RegisterOffset, DstFactor);
                WriteRegister(NvGpuEngine3dReg.IBlendNEquationAlpha + RegisterOffset, Equation);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncSrcAlpha  + RegisterOffset, SrcFactor);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncDstAlpha  + RegisterOffset, DstFactor);
            }
        }
Exemple #10
0
        /// <summary>
        /// Creates the engine: allocates the register array, registers the method handlers,
        /// allocates the 6 x 18 constant buffer table, then writes default values for the color
        /// mask, sRGB frame buffer, front face and blend registers.
        /// </summary>
        /// <param name="gpu">Owning GPU instance, stored for later use.</param>
        public NvGpuEngine3d(NvGpu gpu)
        {
            _gpu = gpu;

            Registers = new int[0xe00];

            _methods = new Dictionary<int, NvGpuMethod>();

            // Registers the same handler for count method numbers, each stride apart.
            void AddMethod(int meth, int count, int stride, NvGpuMethod method)
            {
                for (int remaining = count; remaining > 0; remaining--, meth += stride)
                {
                    _methods.Add(meth, method);
                }
            }

            AddMethod(0x585,  1, 1, VertexEndGl);
            AddMethod(0x674,  1, 1, ClearBuffers);
            AddMethod(0x6c3,  1, 1, QueryControl);
            AddMethod(0x8e4, 16, 1, CbData);
            AddMethod(0x904,  5, 8, CbBind);

            _constBuffers = new ConstBuffer[6][];

            int row = 0;

            while (row < _constBuffers.Length)
            {
                _constBuffers[row] = new ConstBuffer[18];

                row++;
            }

            // Ensure that all components are enabled by default.
            // FIXME: Is this correct?
            WriteRegister(NvGpuEngine3dReg.ColorMaskN, 0x1111);

            WriteRegister(NvGpuEngine3dReg.FrameBufferSrgb, 1);

            WriteRegister(NvGpuEngine3dReg.FrontFace, (int)GalFrontFace.Cw);

            // Default every render target to Add equations with One/Zero source/destination factors.
            for (int target = 0; target < GalPipelineState.RenderTargetsCount; target++)
            {
                int registerOffset = target * 8;

                int equation  = (int)GalBlendEquation.FuncAdd;
                int srcFactor = (int)GalBlendFactor.One;
                int dstFactor = (int)GalBlendFactor.Zero;

                WriteRegister(NvGpuEngine3dReg.IBlendNEquationRgb   + registerOffset, equation);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncSrcRgb    + registerOffset, srcFactor);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncDstRgb    + registerOffset, dstFactor);
                WriteRegister(NvGpuEngine3dReg.IBlendNEquationAlpha + registerOffset, equation);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncSrcAlpha  + registerOffset, srcFactor);
                WriteRegister(NvGpuEngine3dReg.IBlendNFuncDstAlpha  + registerOffset, dstFactor);
            }
        }
        /// <summary>
        /// Creates the engine: allocates the register array, registers the method handlers,
        /// allocates the 6 x 18 constant buffer table and one uploaded-key list per buffer type,
        /// then writes the default color mask.
        /// </summary>
        /// <param name="Gpu">Owning GPU instance, stored for later use.</param>
        public NvGpuEngine3d(NvGpu Gpu)
        {
            this.Gpu = Gpu;

            Registers = new int[0xe00];

            Methods = new Dictionary<int, NvGpuMethod>();

            //Registers the same handler for Count method numbers, each Stride apart.
            void AddMethod(int Meth, int Count, int Stride, NvGpuMethod Method)
            {
                for (int Remaining = Count; Remaining > 0; Remaining--, Meth += Stride)
                {
                    Methods.Add(Meth, Method);
                }
            }

            AddMethod(0x585,  1, 1, VertexEndGl);
            AddMethod(0x674,  1, 1, ClearBuffers);
            AddMethod(0x6c3,  1, 1, QueryControl);
            AddMethod(0x8e4, 16, 1, CbData);
            AddMethod(0x904,  5, 8, CbBind);

            ConstBuffers = new ConstBuffer[6][];

            int Row = 0;

            while (Row < ConstBuffers.Length)
            {
                ConstBuffers[Row] = new ConstBuffer[18];

                Row++;
            }

            UploadedKeys = new List<long>[(int)NvGpuBufferType.Count];

            for (int Type = 0; Type < UploadedKeys.Length; Type++)
            {
                UploadedKeys[Type] = new List<long>();
            }

            //Ensure that all components are enabled by default.
            //FIXME: Is this correct?
            WriteRegister(NvGpuEngine3dReg.ColorMaskN, 0x1111);
        }
        /// <summary>
        /// Walks every stage/slot pair of <c>State.ConstBufferKeys</c>, uploads the matching
        /// constant buffer when it is enabled and <c>QueryKeyUpload</c> returns true, and stores
        /// the buffer position as the key for that slot (for disabled buffers as well).
        /// </summary>
        /// <param name="Vmm">Memory manager used to obtain host pointers for the buffers.</param>
        /// <param name="State">Pipeline state whose <c>ConstBufferKeys</c> table is overwritten.</param>
        private void UploadConstBuffers(NvGpuVmm Vmm, GalPipelineState State)
        {
            int StageCount = State.ConstBufferKeys.Length;

            for (int Stage = 0; Stage < StageCount; Stage++)
            {
                for (int Slot = 0; Slot < State.ConstBufferKeys[Stage].Length; Slot++)
                {
                    ConstBuffer Bound = ConstBuffers[Stage][Slot];

                    //The buffer position itself is used as the key here.
                    long Key = Bound.Position;

                    if (Bound.Enabled)
                    {
                        if (QueryKeyUpload(Vmm, Key, Bound.Size, NvGpuBufferType.ConstBuffer))
                        {
                            IntPtr HostPointer = Vmm.GetHostAddress(Key, Bound.Size);

                            Gpu.Renderer.Buffer.SetData(Key, Bound.Size, HostPointer);
                        }
                    }

                    State.ConstBufferKeys[Stage][Slot] = Key;
                }
            }
        }
Exemple #13
0
        /// <summary>
        /// Runs the GPU embedding on the data table <paramref name="X"/> and returns one output
        /// row per input row.
        /// </summary>
        /// <remarks>
        /// The method creates the GPU device and buffers, uploads the table, picks a caching mode
        /// from the table size, initializes P through the matching helper, and then iterates the
        /// "one step" and "sum up" shaders until <c>MaxEpochs</c> is reached or the measured
        /// variation drops below <c>stopVariation</c>. All GPU resources are released through
        /// <c>TsneDx.SafeDispose</c> before returning.
        /// </remarks>
        /// <param name="X">
        /// Input table, one row per data point. Must contain at least one row because the column
        /// count is taken from <c>X[0]</c>. When <c>MetricType</c> is 1 the table is passed to
        /// <c>NormalizeTable</c> before upload.
        /// </param>
        /// <returns>
        /// An array with <c>X.Length</c> rows. Each row has 3 values when the output dimension
        /// is 3, 1 value when it is 1, and 2 values otherwise. When <c>AutoNormalize</c> is set
        /// the result of <c>PcaNormalize.DoNormalize</c> is returned instead.
        /// </returns>
        public float[][] Fit(float[][] X)
        {
            int exaggerationLength = (int)(MaxEpochs * ExaggerationRatio);

            gpu = new GpuDevice();
            cc  = gpu.CreateConstantBuffer <TsneMapConstants>(0);

            int N = X.Length;

            cc.c.columns    = X[0].Length;
            cc.c.N          = N;
            cc.c.outDim     = OutDim;
            cc.c.metricType = MetricType;

            #region Initialize Y
            Buffer Y2Buf        = null;
            Buffer Y3Buf        = null;
            Buffer Y3StagingBuf = null;
            Buffer Y2StagingBuf = null;
            Buffer v2Buf        = null;
            Buffer v3Buf        = null;

            // Only one family of buffers is created: the 2-float one for outDim <= 2,
            // the 3-float one otherwise. The other family stays null.
            if (cc.c.outDim <= 2)
            {
                Y2Buf        = gpu.CreateBufferRW(N, 8, 3);
                Y2StagingBuf = gpu.CreateStagingBuffer(Y2Buf);
                v2Buf        = gpu.CreateBufferRW(N, 2 * 8, 5);
            }
            else
            {
                Y3Buf        = gpu.CreateBufferRW(N, 12, 4);
                Y3StagingBuf = gpu.CreateStagingBuffer(Y3Buf);
                v3Buf        = gpu.CreateBufferRW(N, 2 * 12, 6);
            }

            float  rang       = 0.05f;
            // Fixed seed: the initial positions are the same on every call.
            Random rGenerator = new Random(435243);

            if (cc.c.outDim <= 2)
            {
                using (var ws = gpu.NewWriteStream(v2Buf)) {
                    for (int row = 0; row < N; row++)
                    {
                        ws.Write <float>(0, 1, 0, 1);
                    }
                }

                using (var ws = gpu.NewWriteStream(Y2Buf)) {
                    for (int row = 0; row < N; row++)
                    {
                        for (int col = 0; col < cc.c.outDim; col++)
                        {
                            ws.Write((float)(rang * rGenerator.NextDouble() - rang / 2));
                        }
                        // Rows of Y2Buf always hold two floats; pad the second one for 1D output.
                        if (cc.c.outDim == 1)
                        {
                            ws.Write(0.0f);
                        }
                    }
                }
            }
            else
            {
                using (var ws = gpu.NewWriteStream(v3Buf)) {
                    for (int row = 0; row < N; row++)
                    {
                        ws.Write <float>(0, 1, 0, 1, 0, 1);
                    }
                }
                using (var ws = gpu.NewWriteStream(Y3Buf)) {
                    for (int row = 0; row < N; row++)
                    {
                        for (int col = 0; col < cc.c.outDim; col++)
                        {
                            ws.Write((float)(rang * rGenerator.NextDouble() - rang / 2));
                        }
                    }
                }
            }
            #endregion

            #region Upload data table and initialize the distance matrix

            // Used to aggregate values created by parallel threads.
            // The size of groupMaxBuf must be large enough to hold a float value for each thread started in parallel.
            // Notice: gpu.Run(k) will start k*GROUP_SIZE threads.
            int gpSize = Math.Max(GpuGroupSize, MaxGroupNumber * GroupSize);
            gpSize      = Math.Max(gpSize, MaxGroupNumberHyp * GroupSizeHyp);
            groupMaxBuf = gpu.CreateBufferRW(gpSize, 4, 7);

            resultBuf     = gpu.CreateBufferRW(3, 4, 2); // to receive the total changes.
            resultStaging = gpu.CreateStagingBuffer(resultBuf);

            tableBuf = gpu.CreateBufferRO(N * cc.c.columns, 4, 0);
            if (MetricType == 1)
            {
                NormalizeTable(X);
            }
            gpu.WriteMarix(tableBuf, X, true);

            const int MinCpuDimension = 100; // minimal dimension to trigger CPU caching.
            const int MaxDimension    = 64;  // maximal dimension (table columns) for fast EuclideanNoCache shader. Must be the same as MAX_DIMENSION.
            const int MaxDimensionS   = 32;  // maximal dimension (table columns) for fast EuclideanNoCache shader. Must be the same as MAX_DIMENSIONs.
            if (N <= CacheLimit)
            {
                cachingMode = CachingMode.OnGpu;
            }
            else
            {
                // CPU caching needs N*N 4-byte values to fit into MaxCpuCacheSize megabytes.
                if ((cc.c.columns > MinCpuDimension) && ((double)N * N * 4) < ((double)MaxCpuCacheSize * 1024.0 * 1024.0))
                {
                    cachingMode = CachingMode.OnCpu;
                }
                else
                {
                    if (cc.c.columns < MaxDimensionS)
                    {
                        cachingMode = CachingMode.OnFlySmS;
                    }
                    else if (cc.c.columns < MaxDimension)
                    {
                        cachingMode = CachingMode.OnFlySm;
                    }
                    else
                    {
                        cachingMode = CachingMode.OnFly;
                    }
                }
            }
            #endregion

            cc.c.targetH = (float)Math.Log(PerplexityRatio * N);
            if (cachingMode == CachingMode.OnGpu)
            {
                CalculateP();
            }
            else if (cachingMode == CachingMode.OnCpu)
            {
                InitializePCpu();
            }
            else     // (cachingMode == CachingMode.OnFly[Sm,SmS])
            {
                InitializeP();
            }

            using (var sd = gpu.LoadShader("TsneDx.CalculateSumQ.cso")) {
                gpu.SetShader(sd);
                cc.c.groupNumber = 256;
                for (int i = 0; i < N; i += cc.c.groupNumber)
                {
                    cc.c.blockIdx = i;
                    cc.Upload();
                    gpu.Run(cc.c.groupNumber);
                }
                // A final single run with blockIdx == -1 follows the per-block runs.
                cc.c.blockIdx = -1;
                cc.Upload();
                gpu.Run();
            }

            var sdNames = new Dictionary <CachingMode, string>()
            {
                { CachingMode.OnGpu, "TsneDx.OneStep.cso" },
                { CachingMode.OnCpu, "TsneDx.OneStepCpuCache.cso" },
                { CachingMode.OnFly, "TsneDx.OneStepNoCache.cso" },
                { CachingMode.OnFlySm, "TsneDx.FastStep.cso" },
                { CachingMode.OnFlySmS, "TsneDx.FastStepS.cso" },
            };

            ComputeShader csOneStep   = gpu.LoadShader(sdNames[cachingMode]);
            ComputeShader csSumUp     = gpu.LoadShader("TsneDx.OneStepSumUp.cso");
            int           stepCounter = 0;

            while (true)
            {
                // PFactor is ExaggerationFactor (optionally faded towards 1) during the
                // exaggeration phase and 1 afterwards.
                if (stepCounter < exaggerationLength)
                {
                    if (ExaggerationSmoothen)
                    {
                        int len = (int)(0.9 * MaxEpochs);
                        if (stepCounter < len)
                        {
                            double t = (double)stepCounter / len;
                            t            = Math.Sqrt(Math.Sqrt(t));
                            cc.c.PFactor = (float)((1 - t) * ExaggerationFactor + t);
                        }
                        else
                        {
                            cc.c.PFactor = 1.0f;
                        }
                    }
                    else
                    {
                        cc.c.PFactor = (float)ExaggerationFactor;
                    }
                }
                else
                {
                    cc.c.PFactor = 1.0f;
                }

                gpu.SetShader(csOneStep);

                if (cachingMode == CachingMode.OnGpu)
                {
                    cc.c.groupNumber = MaxGroupNumber;
                    // Notice: cc.c.groupNumber*GroupSize must fit into groupMax[].
                    for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber * GroupSize)
                    {
                        cc.c.blockIdx = bIdx;
                        cc.Upload();
                        gpu.Run(cc.c.groupNumber);
                    }
                    cc.c.groupNumber = MaxGroupNumber * GroupSize;
                }
                else if (cachingMode == CachingMode.OnCpu)
                {
                    int bSize = MaxGroupNumberHyp * GroupSizeHyp;
                    cc.c.groupNumber = MaxGroupNumberHyp;
                    for (int bIdx = 0; bIdx < N; bIdx += bSize)
                    {
                        // Upload the current block of the CPU-side cache before running the shader on it.
                        gpu.WriteArray(cpuP, bIdx, Math.Min(N, bIdx + bSize), P2Buf);
                        cc.c.blockIdx = bIdx;
                        cc.Upload();
                        gpu.Run(cc.c.groupNumber);
                    }
                    cc.c.groupNumber = Math.Min(N, bSize);
                }
                else if ((cachingMode == CachingMode.OnFlySm) || (cachingMode == CachingMode.OnFlySmS))
                {
                    const int GrSize = 64;  // This value must match that of GR_SIZE in TsneMap.hlsl.
                    cc.c.groupNumber = MaxGroupNumber;
                    for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber * GrSize)
                    {
                        cc.c.blockIdx = bIdx;
                        cc.Upload();
                        gpu.Run(cc.c.groupNumber);
                    }
                    cc.c.groupNumber = cc.c.groupNumber * GrSize;
                }
                else     // cachingMode==CachingMode.OnFly
                {
                    cc.c.groupNumber = 128;
                    for (int bIdx = 0; bIdx < N; bIdx += cc.c.groupNumber)
                    {
                        cc.c.blockIdx = bIdx;
                        cc.Upload();
                        gpu.Run(cc.c.groupNumber);
                    }
                }

                //Notice: cc.c.groupNumber must be number of partial sumQ_next, which add up to sumQ for the next step.
                gpu.SetShader(csSumUp);
                cc.Upload();
                gpu.Run();

                // The third value of the result buffer, divided by N, is the per-point variation of this step.
                currentVariation = gpu.ReadRange <float>(resultStaging, resultBuf, 3)[2] / N;

                cc.c.mom = (float)((stepCounter < (MaxEpochs * momentumSwitch)) ? momentum : finalMomentum);
                stepCounter++;
                // Progress output: one dot every 10 steps, a line break every 500.
                if (stepCounter % 10 == 0)
                {
                    Console.Write('.');
                }
                if (stepCounter % 500 == 0)
                {
                    Console.WriteLine();
                }
                // Stop at MaxEpochs, or once past the exaggeration phase (plus two steps)
                // when the variation has dropped below the threshold.
                if ((stepCounter >= MaxEpochs) || ((stepCounter >= (2 + exaggerationLength)) && (currentVariation < stopVariation)))
                {
                    break;
                }
            }
            Console.WriteLine();

            float[][] Y = new float[N][];
            using (var rs = gpu.NewReadStream((cc.c.outDim == 3) ? Y3StagingBuf : Y2StagingBuf, (cc.c.outDim == 3) ? Y3Buf : Y2Buf)) {
                int outVDim = (cc.c.outDim == 3) ? 3 : 2;
                for (int row = 0; row < N; row++)
                {
                    Y[row] = rs.ReadRange <float>(outVDim);
                }
            }

            // 1D output was read as two floats per row; keep only the first one.
            if (cc.c.outDim == 1)
            {
                for (int i = 0; i < N; i++)
                {
                    Y[i] = new float[] { Y[i][0] };
                }
            }

            TsneDx.SafeDispose(csSumUp, csOneStep, PBuf, P2Buf, distanceBuf, tableBuf, resultBuf,
                               resultStaging, groupMaxBuf, Y3Buf, Y3StagingBuf, v3Buf, Y2Buf, Y2StagingBuf, v2Buf, cc, gpu);

            return(AutoNormalize ? PcaNormalize.DoNormalize(Y) : Y);
        }