CudaDeviceVariable.CopyToHost C# (CSharp)代码示例

示例#1

0

显示文件

        public static float2[] calculateCudaFFT(float[] h_dataIn)
        {
            CudaContext cntxt = new CudaContext();
            //Caution: Array sizes matter! Based on CUFFFT-Documentation...
            int size_real    = h_dataIn.Length;
            int size_complex = (int)Math.Floor(size_real / 2.0) + 1;

            //Crating FFT Plan
            CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C);

            //Size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real
            CudaDeviceVariable <float> d_data = new CudaDeviceVariable <float>(size_complex * 2);

            //device allocation and host have different sizes, why the amount of data must be given explicitly for copying:
            d_data.CopyToDevice(h_dataIn, 0, 0, size_real * sizeof(float));

            //executa plan
            fftPlan.Exec(d_data.DevicePointer, TransformDirection.Forward);

            //Output to host, either as float2 or float, but array sizes must be right!
            float2[] h_dataOut  = new float2[size_complex];
            float[]  h_dataOut2 = new float[size_complex * 2];
            d_data.CopyToHost(h_dataOut);
            d_data.CopyToHost(h_dataOut2);
            fftPlan.Dispose();
            return(h_dataOut);
        }

示例#2

0

显示文件

文件： MyDistanceOps.cs 项目： xeronith/BrainSimulator

        public void Run(DistanceOperation operation,
                        CudaDeviceVariable <float> A, int sizeA,
                        CudaDeviceVariable <float> B, int sizeB,
                        CudaDeviceVariable <float> result, int sizeRes)
        {
            if (!ValidateAtRun(operation))
            {
                return;
            }

            switch (operation)
            {
            case DistanceOperation.DotProd:
                //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0);
                m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA);
                break;

            case DistanceOperation.CosDist:
                //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0);
                m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA);
                break;

            case DistanceOperation.EuclidDist:
                float res = RunReturn(operation, A, sizeA, B, sizeB);
                result.CopyToDevice(res);
                break;

            case DistanceOperation.EuclidDistSquared:
                m_combineVecsKernel.SetupExecution(sizeA);
                m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA);
                //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0);
                m_dotKernel.Run(result.DevicePointer, m_temp, m_temp);
                break;

            case DistanceOperation.HammingDist:
                m_combineVecsKernel.SetupExecution(sizeA);
                m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA);
                //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number
                m_reduceSumKernel.Run(result.DevicePointer, m_temp);
                float fDist = 0;     // to transform number of matches to a number of differences
                result.CopyToHost(ref fDist);
                fDist = m_temp.Count - fDist;
                result.CopyToDevice(fDist);
                break;

            case DistanceOperation.HammingSim:
                m_combineVecsKernel.SetupExecution(sizeA);
                m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA);
                //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number
                m_reduceSumKernel.Run(result.DevicePointer, m_temp);
                // take the single number (number of different bits) and convert it to Hamming Similarity:
                // a number in range <0,1> that says how much the vectors are similar
                float fSim = 0;
                result.CopyToHost(ref fSim);
                fSim = fSim / m_temp.Count;
                result.CopyToDevice(fSim);
                break;
            }
        }

示例#3

0

显示文件

文件： CudaProvider.cs 项目： fcmai/brightwire

 internal MinMax FindMinAndMax(CudaDeviceVariable <float> a, int size)
 {
     if (size > 0)
     {
         var ptr = a;
         while (size > BLOCK_DIM2)
         {
             var bufferSize = (size / BLOCK_DIM2) + 1;
             using (var minBlock = new CudaDeviceVariable <float>(bufferSize))
                 using (var maxBlock = new CudaDeviceVariable <float>(bufferSize)) {
                     minBlock.Memset(0);
                     maxBlock.Memset(0);
                     _Use(_findMinAndMax, size, k => k.Run(BLOCK_DIM2, ptr.DevicePointer, size, minBlock.DevicePointer, maxBlock.DevicePointer));
                     if (ptr != a)
                     {
                         ptr.Dispose();
                     }
                     var minTest = new float[bufferSize];
                     var maxText = new float[bufferSize];
                     minBlock.CopyToHost(minTest);
                     maxBlock.CopyToHost(maxText);
                     size = bufferSize * 2;
                     ptr  = new CudaDeviceVariable <float>(size);
                     ptr.CopyToDevice(minBlock, 0, 0, bufferSize * sizeof(float));
                     ptr.CopyToDevice(maxBlock, 0, bufferSize * sizeof(float), bufferSize * sizeof(float));
                     var test = new float[size];
                     ptr.CopyToHost(test);
                 }
         }
         var data = new float[size];
         ptr.CopyToHost(data);
         float min = float.MaxValue, max = float.MinValue;
         for (var i = 0; i < size; i++)
         {
             var val = data[i];
             if (val > max)
             {
                 max = val;
             }
             if (val < min)
             {
                 min = val;
             }
         }
         return(new MinMax(min, max));
     }
     return(MinMax.Empty);
 }

示例#4

0

显示文件

文件： PBF.cs 项目： ariacat3366/Unity-PBF-simulation-using-managedCUDA

    // Update is called once per frame
    void Update()
    {
        if (wave)
        {
            _wall_x = (float)(wall_x + 8 * Math.Sin(t * 1.2f));
            t      += dt;
        }

        for (int i = 0; i < iter; ++i)
        {
            // rho
            cudaKernel[0].Run(d_pPBF_particle.DevicePointer, d_np.DevicePointer, h, NUM_OF_P);
            // lambda
            cudaKernel[1].Run(d_pPBF_particle.DevicePointer, d_np.DevicePointer, h, NUM_OF_P);
            // update x
            cudaKernel[2].Run(d_pPBF_particle.DevicePointer, d_np.DevicePointer, h, NUM_OF_P, _wall_x, wall_z);
        }
        // apply v and x  :: F_ext
        cudaKernel[3].Run(d_pPBF_particle.DevicePointer, d_pPBF_pos.DevicePointer, d_pPBF_col.DevicePointer, g, dt, NUM_OF_P);

        d_pPBF_pos.CopyToHost(h_pPBF_pos);
        d_pPBF_col.CopyToHost(h_pPBF_col);

        //Debug.Log("Pos:" + h_pPBF_pos[10]);


        posBuf.SetData(h_pPBF_pos);
        colBuf.SetData(h_pPBF_col);
    }

示例#5

0

显示文件

        public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode,
                             CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData,
                             CudnnTensorDescriptor destDiffTensor, float[] destDiffData)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable <float>(srcData.Length))
                using (var srcDiffDataGpu = new CudaDeviceVariable <float>(srcDiffData.Length))
                    using (var destDiffDataGpu = new CudaDeviceVariable <float>(destDiffData.Length))
                    {
                        srcDataGpu.CopyToDevice(srcData);
                        srcDiffDataGpu.CopyToDevice(srcDiffData);

                        Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode,
                                                                             srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                             destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                        destDiffDataGpu.CopyToHost(destDiffData);
                    }
        }

示例#6

0

显示文件

文件： CudnnContext.Pooling.cs 项目： redknightlois/cudalearn

        public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData,
                             CudnnTensorDescriptor destTensor, double[] destData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData)
        {
            Contract.Requires(pooling != null);
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable <double>(srcData.Length))
                using (var srcDiffDataGpu = new CudaDeviceVariable <double>(srcDiffData.Length))
                    using (var destDataGpu = new CudaDeviceVariable <double>(destData.Length))
                        using (var destDiffDataGpu = new CudaDeviceVariable <double>(destDiffData.Length))
                        {
                            srcDataGpu.CopyToDevice(srcData);
                            srcDiffDataGpu.CopyToDevice(srcDiffData);
                            destDataGpu.CopyToDevice(destData);

                            Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle,
                                                                                 srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                                 destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                            destDiffDataGpu.CopyToHost(destDiffData);
                        }
        }

示例#7

0

显示文件

文件： CudnnContext.Convolution.cs 项目： redknightlois/cudalearn

        public void BackwardData(CudnnFilterDescriptor filter, double[] filterData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(filter != null);
            Contract.Requires(filterData != null);
            Contract.Requires(diffTensor != null);
            Contract.Requires(diffData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(gradient != null);
            Contract.Requires(gradientData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, filter, diffTensor, gradient);

            using (var filterDataGpu = new CudaDeviceVariable <double>(filterData.Length))
                using (var diffDataGpu = new CudaDeviceVariable <double>(diffData.Length))
                    using (var gradientDataGpu = new CudaDeviceVariable <double>(gradientData.Length))
                    {
                        filterDataGpu.CopyToDevice(filterData);
                        diffDataGpu.CopyToDevice(diffData);

                        Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardData(handle, filter.Handle, filterDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate));

                        gradientDataGpu.CopyToHost(gradientData);
                    }
        }

示例#8

0

显示文件

文件： MySoundWorld.cs 项目： xeronith/BrainSimulator

            // perform Fast Fourier transform
            private float[] PerformFFT(float[] input)
            {
                int size_real    = input.Length;
                int size_complex = (int)Math.Floor(size_real / 2.0) + 1;

                CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C);

                // size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real
                CudaDeviceVariable <float> fft = new CudaDeviceVariable <float>(size_complex * 2);

                // device allocation and host have different sizes, why the amount of data must be given explicitly for copying:
                fft.CopyToDevice(input, 0, 0, size_real * sizeof(float));

                // executa plan
                fftPlan.Exec(fft.DevicePointer, TransformDirection.Forward);

                // output to host as float2
                float2[] output = new float2[size_complex];
                fft.CopyToHost(output);

                // cleanup
                fft.Dispose();
                fftPlan.Dispose();

                // squared magnitude of complex output as a result
                float[] result = new float[output.Length];
                for (int i = 0; i < output.Length; i++)
                {
                    result[i] = (float)Math.Sqrt((output[i].x * output[i].x) + (output[i].y * output[i].y));
                }

                return(result);
            }

示例#9

0

显示文件

文件： UnitTests.cs 项目： zhouzx-zt/CudaSharp

        private T[] RunKernel <T>(Action <T[]> method, T[] parameters) where T : struct
        {
            var methodInfo = method.Method;

            string[] kernels;
            string   llvmIr, ptxIr;
            var      ptx = CudaSharp.CudaSharp.Translate(out kernels, out llvmIr, out ptxIr, "sm_20", methodInfo);

            Console.WriteLine(llvmIr);
            Console.WriteLine(ptxIr);
            var kernel     = _context.LoadKernelPTX(ptx, kernels[0]);
            var maxThreads = kernel.MaxThreadsPerBlock;

            if (parameters.Length <= maxThreads)
            {
                kernel.BlockDimensions = parameters.Length;
                kernel.GridDimensions  = 1;
            }
            else
            {
                kernel.BlockDimensions = maxThreads;
                kernel.GridDimensions  = parameters.Length / maxThreads;
                if ((kernel.BlockDimensions * kernel.GridDimensions) != parameters.Length)
                {
                    throw new Exception(string.Format("Invalid parameters size (must be <= {0} or a multiple of {0}", maxThreads));
                }
            }
            var gpuMem = new CudaDeviceVariable <T>(parameters.Length);

            gpuMem.CopyToDevice(parameters);
            kernel.Run(gpuMem.DevicePointer);
            gpuMem.CopyToHost(parameters);
            gpuMem.Dispose();
            return(parameters);
        }

示例#10

0

显示文件

文件： GPUMatrix.cs 项目： kozzion/KozzionCSharp

        public static void blaa()
        {
            int num = 10;
            //NewContext creation
            CudaContext cntxt = new CudaContext();

            //Module loading from precompiled .ptx in a project output folder
            CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

            //_Z9addKernelPf - function name, can be found in *.ptx file
            CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

            //Create device array for data
            CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num);

            //Create arrays with data
            float[] vec1 = new float[num];

            //Copy data to device
            vec1_device.CopyToDevice(vec1);

            //Set grid and block dimensions
            addWithCuda.GridDimensions  = new dim3(8, 1, 1);
            addWithCuda.BlockDimensions = new dim3(512, 1, 1);

            //Run the kernel
            addWithCuda.Run(
                vec1_device.DevicePointer);

            //Copy data from device
            vec1_device.CopyToHost(vec1);
        }

示例#11

0

显示文件

文件： CudnnContext.Softmax.cs 项目： redknightlois/cudalearn

        public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode,
                             CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData,
                             CudnnTensorDescriptor destDiffTensor, float[] destDiffData)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length))
            using (var srcDiffDataGpu = new CudaDeviceVariable<float>(srcDiffData.Length))
            using (var destDiffDataGpu = new CudaDeviceVariable<float>(destDiffData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                srcDiffDataGpu.CopyToDevice(srcDiffData);

                Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode,
                                                                     srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                     destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                destDiffDataGpu.CopyToHost(destDiffData);
            }
        }

示例#12

0

显示文件

文件： GpuVector.cs 项目： fcmai/brightwire

        public IIndexableVector AsIndexable()
        {
            Debug.Assert(IsValid);
            var data = new float[_size];

            _data.CopyToHost(data);
            return(_cuda.NumericsProvider.Create(_size, i => data[i]).AsIndexable());
        }

示例#13

0

显示文件

文件： GpuMatrix.cs 项目： fcmai/brightwire

        public IIndexableMatrix AsIndexable()
        {
            Debug.Assert(IsValid);
            var data = new float[_rows * _columns];

            _data.CopyToHost(data);
            return(_cuda.NumericsProvider.Create(_rows, _columns, (j, k) => data[k * _rows + j]).AsIndexable());
        }

示例#14

0

显示文件

        internal void CopyFromDeviceToHost()
        {
            if (!_initialisedInContext)
            {
                InitialiseCudaBuffer(copyHostToDevice: false);
            }

            _cudaBuffer.CopyToHost(_data, _cudaZero, _cudaOffsetBytes, _cudaLengthBytes);
        }

示例#15

0

显示文件

文件： Program.cs 项目： mikclark/MolecularSimulationCUDA

        static void Main(string[] args)
        {
            InitKernels();
            int n = 0x1000000;

            float[] xx = new float[n];
            float[] yy = new float[n];
            float[] zz = new float[n];
            int[]   tt = new int[n];

            float L = 512;
            float Lx = L, Ly = L, Lz = L;
            CudaDeviceVariable <float> gpu_lengths  = new float[] { Lx, Ly, Lz };

            for (int i = 0; i < n; i++)
            {
                xx[i] = ((float)(i & 0x0000FF) / (float)0x000100 - 0.5F) * Lx;
                yy[i] = ((float)(i & 0x00FF00) / (float)0x010000 - 0.5F) * Ly;
                zz[i] = ((float)(i & 0xFF0000) / (float)0x1000000 - 0.5F) * Lz;
                tt[i] = 1;
            }
            SimulationMolecules simulationMolecules = new SimulationMolecules(xx, yy, zz, tt);

            for (int i = 0; i < 10; i++)
            {
                Console.WriteLine($"{i}  =  ({simulationMolecules.x[i]},{simulationMolecules.y[i]},{simulationMolecules.z[i]})");
            }
            int ii = 0x808080;

            Console.WriteLine($"{ii}  =  ({simulationMolecules.x[ii]},{simulationMolecules.y[ii]},{simulationMolecules.z[ii]})");

            Interactions interactions = new Interactions();

            interactions.SetLJParameters(1, 1, 1.5F, 1.5F);

            float[] cachedEnergies = new float[n / THREADS_PER_BLOCK + 1];
            CudaDeviceVariable <float> gpu_energies = cachedEnergies;

            energyOfExistingMolecule.Run(n,
                                         simulationMolecules.gpu_x.DevicePointer,
                                         simulationMolecules.gpu_y.DevicePointer,
                                         simulationMolecules.gpu_z.DevicePointer,
                                         simulationMolecules.gpu_types.DevicePointer,
                                         interactions.SigmaPointer(),
                                         interactions.EpsilonPointer(),
                                         gpu_lengths.DevicePointer,
                                         2.5F,
                                         0x808080,
                                         gpu_energies.DevicePointer
                                         );

            gpu_energies.CopyToHost(cachedEnergies);
            double e = cachedEnergies.Sum();

            Console.WriteLine($"Energy = {e}");
            Console.Read();
        }

示例#16

0

显示文件

        private static void CopyCudaVariableToHost <T>(
            CudaDeviceVariable <T> variable, BufferTarget bufferTarget, uint buffer)
            where T : struct
        {
            GL.BindBuffer(bufferTarget, buffer);
            IntPtr ptr = GL.MapBuffer(bufferTarget, BufferAccess.WriteOnly);

            variable.CopyToHost(ptr);

            GL.UnmapBuffer(bufferTarget);
            GL.BindBuffer(bufferTarget, 0);
        }

示例#17

0

显示文件

文件： GPU_Functionality.cs 项目： my81656581/icFlow

        public void TransferUpdatedStateToHost()
        {
            if (cz_stride != 0)
            {
                // transfer to host dcz and icz
                g_dcz.CopyToHost(c_dcz, 0, 0, sizeof(double) * cz_stride * FP_DATA_SIZE_CZ); // pmax, tmax
                g_icz.CopyToHost(c_icz, 0, 0, sizeof(int) * cz_stride * 4);                  // cz_failed

                // infer failed state, pmax[] and tmax[]
                Parallel.For(0, mc.nonFailedCZs.Length, i =>
                {
                    CZ cz = mc.nonFailedCZs[i];
                    if (!cz.failed)
                    {
                        bool tentative_fail = c_icz[i + cz_stride * (TENTATIVE_FAILED_OFFSET_CZ)] == 0 ? false : true;
                        if (tentative_fail)
                        {
                            cz.failed = true;
                        }
                        for (int j = 0; j < 3; j++)
                        {
                            cz.pmax[j] = c_dcz[i + cz_stride * (TENTATIVE_PMAX_OFFSET_CZ + j)];
                            cz.tmax[j] = c_dcz[i + cz_stride * (TENTATIVE_TMAX_OFFSET_CZ + j)];
                        }
                        cz.avgDn = c_dcz[i + cz_stride * DELTA_N_OFFSET_CZ];
                        cz.avgDt = c_dcz[i + cz_stride * DELTA_T_OFFSET_CZ];
                        cz.avgTn = c_dcz[i + cz_stride * T_N_OFFSET_CZ];
                        cz.avgTt = c_dcz[i + cz_stride * T_T_OFFSET_CZ];

                        if (cz.maxAvgDn < cz.avgDn)
                        {
                            cz.maxAvgDn = cz.avgDn;
                        }
                        if (cz.maxAvgDt < cz.avgDt)
                        {
                            cz.maxAvgDt = cz.avgDt;
                        }
                    }
                });
            }

            // copy back elastic forces (should be zero on rigid objects)
            g_dn.CopyToHost(c_dn, sizeof(double) * nd_stride * F_OFFSET, sizeof(double) * nd_stride * F_OFFSET, sizeof(double) * nd_stride * 3);
            Parallel.For(0, mc.allNodes.Length, i =>
            {
                Node nd = mc.allNodes[i];
                nd.fx   = c_dn[i + nd_stride * (F_OFFSET + 0)];
                nd.fy   = c_dn[i + nd_stride * (F_OFFSET + 1)];
                nd.fz   = c_dn[i + nd_stride * (F_OFFSET + 2)];
            });
        }

示例#18

0

显示文件

        public cuDoubleComplex[] PerformFFT(cuDoubleComplex[] data, int n, TransformDirection direction)
        {
            f_plan = new CudaFFTPlan2D(n, n, cufftType.Z2Z);

            CudaDeviceVariable <cuDoubleComplex> d_signal = new CudaDeviceVariable <cuDoubleComplex>(n * n);
            CudaDeviceVariable <cuDoubleComplex> o_signal = new CudaDeviceVariable <cuDoubleComplex>(n * n);

            d_signal.CopyToDevice(data);
            f_plan.Exec(d_signal.DevicePointer, o_signal.DevicePointer, direction);


            cuDoubleComplex[] result = new cuDoubleComplex[n * n];

            o_signal.CopyToHost(result);
            d_signal.Dispose();
            return(result);
        }

示例#19

0

显示文件

文件： Program.cs 项目： shaunstanislaus/CudaSharp

 static void Test(byte[] ptxFile)
 {
     const int size = 16;
     var context = new CudaContext();
     var kernel = context.LoadKernelPTX(ptxFile, "kernel");
     var memory = context.AllocateMemory(4 * size);
     var gpuMemory = new CudaDeviceVariable<int>(memory);
     var cpuMemory = new int[size];
     for (var i = 0; i < size; i++)
         cpuMemory[i] = i - 2;
     gpuMemory.CopyToDevice(cpuMemory);
     kernel.BlockDimensions = 4;
     kernel.GridDimensions = 4;
     kernel.Run(memory);
     gpuMemory.CopyToHost(cpuMemory);
     for (var i = 0; i < size; i++)
         Console.WriteLine("{0} = {1}", i, cpuMemory[i]);
 }

示例#20

0

显示文件

文件： CudnnContext.Convolution.cs 项目： redknightlois/cudalearn

        public void BackwardBias(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor destTensor, double[] destData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, destTensor);

            using (var srcDataGpu = new CudaDeviceVariable <double>(srcData.Length))
                using (var destDataGpu = new CudaDeviceVariable <double>(destData.Length))
                {
                    srcDataGpu.CopyToDevice(srcData);

                    Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardBias(handle, srcTensor.Handle, srcDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, accumulate));
                    destDataGpu.CopyToHost(destData);
                }
        }

示例#21

0

显示文件

文件： Impl_ManagedCuda.cs 项目： pavlexander/gpu_tests

        public override void Proccess()
        {
            // run CUDA method
            myKernel.Run(
                result_dev.DevicePointer,
                resultCalc_dev.DevicePointer,
                input1_dev.DevicePointer,
                input2_dev.DevicePointer,
                input3_dev.DevicePointer,
                input4_dev.DevicePointer,
                DataGenerator.InputCount,
                DataGenerator.Width,
                DataGenerator.Height
                );

            // copy return to host
            result_dev.CopyToHost(resultsBytes);
            resultCalc_dev.CopyToHost(calculatables);
        }

示例#22

0

显示文件

文件： CudnnContext.Convolution.cs 项目： redknightlois/cudalearn

        public void BackwardBias(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor destTensor, double[] destData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, destTensor);

            using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length))
            using (var destDataGpu = new CudaDeviceVariable<double>(destData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);

                Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardBias(handle, srcTensor.Handle, srcDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, accumulate));
                destDataGpu.CopyToHost(destData);
            }
        }

示例#23

0

显示文件

        public ResultPoint[] FindPoints(byte[] baseImage, byte[] nextImage, ResultPoint[] points, int searchDelta, int subsetDelta, int BitmapWidth, int BitmapHeight, int PointsinX, int PointsinY)
        {
            baseImageBuffer = new CudaDeviceVariable <byte>(BitmapWidth * BitmapHeight);
            baseImageBuffer.CopyToDevice(baseImage);

            nextImageBuffer = new CudaDeviceVariable <byte>(BitmapWidth * BitmapHeight);
            nextImageBuffer.CopyToDevice(nextImage);

            pointsBuffer = new CudaDeviceVariable <ResultPoint>(PointsinX * PointsinY);
            pointsBuffer.CopyToDevice(points);

            kernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(PointsinX, 1, 1);
            kernel.GridDimensions  = new ManagedCuda.VectorTypes.dim3(PointsinY, 1, 1);
            kernel.Run(new object[] {
                baseImageBuffer.DevicePointer, nextImageBuffer.DevicePointer, pointsBuffer.DevicePointer, searchDelta, subsetDelta, BitmapWidth, BitmapHeight, PointsinX, PointsinY
            });
            var result = new ResultPoint[PointsinX * PointsinY];

            pointsBuffer.CopyToHost(result);
            return(result);
        }

示例#24

0

显示文件

文件： Manager.cs 项目： tsliumeng/lego-game-and-Unity-like-engine

        /*
         * int length,
         *      float3* currentPositionH, float3* currentVelocityH, float3* currentAccelerationH, float* currentLifeTimeH, float4* startColorH, float4* endColorH, float* startSizeH, float* endSizeH, float* startLifeTimeH,
         *      int* realCount, int* desiredCount, float3 aroundPosition, float4 startColor, float4 endColor, float startSize, float endSize, float startLifeTime
         */
        public void GenerateParticles(int desiredCount, Vector3 aroundPosition, Vector4 startColor, Vector4 endColor, float startSize, float endSize, float startLifeTime)
        {
            List <object> parameters = new List <object>();

            parameters.Add(randomIndex_D.DevicePointer);
            parameters.Add(renderer.particleMesh.maxParticles);

            foreach (var r in resources)
            {
                r.Map();
                parameters.Add(r.GetMappedPointer());
            }

            CudaDeviceVariable <int> realCount_D = 0;

            parameters.Add(realCount_D.DevicePointer);
            CudaDeviceVariable <int> desiredCount_D = desiredCount;

            parameters.Add(desiredCount_D.DevicePointer);
            parameters.Add(Conv(aroundPosition));
            parameters.Add(Conv(startColor));
            parameters.Add(Conv(endColor));
            parameters.Add(startSize);
            parameters.Add(endSize);
            parameters.Add(startLifeTime);

            object[] arrParams = parameters.ToArray();
            generateParticles.Run(arrParams);

            foreach (var r in resources)
            {
                r.UnMap();
            }

            int realCount = 0;

            realCount_D.CopyToHost(ref realCount);
            //Debug.Info("desiredCount=" + desiredCount + " realCount=" + realCount);
        }

示例#25

0

显示文件

    // Update is called once per frame
    void Update()
    {
        // cudaKernel[0].Run(d_pos.DevicePointer, d_M_index.DevicePointer, d_M.DevicePointer, h, NUM_OF_P, NP);

        for (int i = 0; i < iter; ++i)
        {
            cudaKernel[1].Run(d_pos.DevicePointer, d_delta_p.DevicePointer, d_M_index.DevicePointer, d_M.DevicePointer, NUM_OF_P, NP);
            cudaKernel[2].Run(d_pos.DevicePointer, d_delta_p.DevicePointer, NUM_OF_P, NP);
        }
        cudaKernel[3].Run(d_pos.DevicePointer, d_ppos.DevicePointer, S_flyAway, S_floorFliction, dt, NUM_OF_P);

        d_ppos.CopyToHost(h_pos);

        posBuf.SetData(h_pos);
        colBuf.SetData(h_col);

        // Print Screen
        if (S_saveImages)
        {
            ScreenCapture.CaptureScreenshot("images/" + string.Format("{0:00000}", capture) + ".png");
        }
        ++capture;
    }

示例#26

0

显示文件

文件： CudnnContext.Convolution.cs 项目： redknightlois/cudalearn

        public void Forward(CudnnTensorDescriptor srcTensor, float[] srcData, CudnnFilterDescriptor filter, float[] filterData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor destTensor, float[] destData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(filter != null);
            Contract.Requires(filterData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, destTensor, filter);

            using (var srcDataGpu = new CudaDeviceVariable <float>(srcData.Length))
                using (var filterDataGpu = new CudaDeviceVariable <float>(filterData.Length))
                    using (var destDataGpu = new CudaDeviceVariable <float>(destData.Length))
                    {
                        srcDataGpu.CopyToDevice(srcData);
                        filterDataGpu.CopyToDevice(filterData);

                        Invoke(() => CudnnNativeMethods.cudnnConvolutionForward(handle, srcTensor.Handle, srcDataGpu.DevicePointer, filter.Handle, filterDataGpu.DevicePointer, convolution.Handle, destTensor.Handle, destDataGpu.DevicePointer, accumulate));
                        destDataGpu.CopyToHost(destData);
                    }
        }

示例#27

0

显示文件

文件： MyDistanceOps.cs 项目： sschocke/BrainSimulator

        public void Run(DistanceOperation operation,
            CudaDeviceVariable<float> A, int sizeA,
            CudaDeviceVariable<float> B, int sizeB,
            CudaDeviceVariable<float> result, int sizeRes)
        {
            if (!ValidateAtRun(operation))
                return;

            switch (operation)
            {
                case DistanceOperation.DotProd:
                    //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0);
                    m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA);
                    break;

                case DistanceOperation.CosDist:
                    //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0);
                    m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA);
                    break;

                case DistanceOperation.EuclidDist:
                    float res = RunReturn(operation, A, sizeA, B, sizeB);
                    result.CopyToDevice(res);
                    break;

                case DistanceOperation.EuclidDistSquared:
                    m_combineVecsKernel.SetupExecution(sizeA);
                    m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA);
                    //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0);
                    m_dotKernel.Run(result.DevicePointer, m_temp, m_temp);
                    break;

                case DistanceOperation.HammingDist:
                    m_combineVecsKernel.SetupExecution(sizeA);
                    m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA);
                    //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number
                    m_reduceSumKernel.Run(result.DevicePointer, m_temp);
                    float fDist = 0; // to transform number of matches to a number of differences
                    result.CopyToHost(ref fDist);
                    fDist = m_temp.Count - fDist;
                    result.CopyToDevice(fDist);
                    break;

                case DistanceOperation.HammingSim:
                    m_combineVecsKernel.SetupExecution(sizeA);
                    m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA);
                    //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number
                    m_reduceSumKernel.Run(result.DevicePointer, m_temp);
                    // take the single number (number of different bits) and convert it to Hamming Similarity:
                    // a number in range <0,1> that says how much the vectors are similar
                    float fSim = 0;
                    result.CopyToHost(ref fSim);
                    fSim = fSim / m_temp.Count;
                    result.CopyToDevice(fSim);
                    break;
            }
        }

示例#28

0

显示文件

文件： 1260103-33806701-104501761-2.cs 项目： shendongnian/download4

        public void cuFFTreconstruct()
        {
            CudaContext ctx = new CudaContext(0);

            ManagedCuda.BasicTypes.CUmodule cumodule = ctx.LoadModule("kernel.ptx");
            CudaKernel cuKernel = new CudaKernel("cu_ArrayInversion", cumodule, ctx);

            float2[] fData  = new float2[Resolution * Resolution];
            float2[] result = new float2[Resolution * Resolution];
            FFTData2D = new float[Resolution, Resolution, 2];
            CudaDeviceVariable <float2> devData      = new CudaDeviceVariable <float2>(Resolution * Resolution);
            CudaDeviceVariable <float2> copy_devData = new CudaDeviceVariable <float2>(Resolution * Resolution);

            int    i, j;
            Random rnd  = new Random();
            double avrg = 0.0;

            for (i = 0; i < Resolution; i++)
            {
                for (j = 0; j < Resolution; j++)
                {
                    fData[i * Resolution + j].x = i + j * 2;
                    avrg += fData[i * Resolution + j].x;
                    fData[i * Resolution + j].y = 0.0f;
                }
            }

            avrg = avrg / (double)(Resolution * Resolution);

            for (i = 0; i < Resolution; i++)
            {
                for (j = 0; j < Resolution; j++)
                {
                    fData[(i * Resolution + j)].x = fData[(i * Resolution + j)].x - (float)avrg;
                }
            }

            devData.CopyToDevice(fData);

            CudaFFTPlan1D plan1D = new CudaFFTPlan1D(Resolution, cufftType.C2C, Resolution);

            plan1D.Exec(devData.DevicePointer, TransformDirection.Forward);

            cuKernel.GridDimensions  = new ManagedCuda.VectorTypes.dim3(Resolution / cuda_blockNum, Resolution, 1);
            cuKernel.BlockDimensions = new ManagedCuda.VectorTypes.dim3(cuda_blockNum, 1, 1);

            cuKernel.Run(devData.DevicePointer, copy_devData.DevicePointer, Resolution);

            copy_devData.CopyToHost(result);

            for (i = 0; i < Resolution; i++)
            {
                for (j = 0; j < Resolution; j++)
                {
                    FFTData2D[i, j, 0] = result[i * Resolution + j].x;
                    FFTData2D[i, j, 1] = result[i * Resolution + j].y;
                }
            }

            //Clean up
            devData.Dispose();
            copy_devData.Dispose();
            plan1D.Dispose();
            CudaContext.ProfilerStop();
            ctx.Dispose();
        }

示例#29

0

显示文件

    void SolveInteractions()
    {
        ComputeBuffer positionBuffer = new ComputeBuffer(Pedestrians.Length + 1, 8);
        ComputeBuffer velocityBuffer = new ComputeBuffer(Pedestrians.Length + 1, 8);
        int           j = 0;

        for (j = 0; j < Pedestrians.Length; ++j)
        {
            Positions[j]  = Pedestrians[j].GetComponent <PedestrianController>().position;
            Velocities[j] = Pedestrians[j].GetComponent <PedestrianController>().velocity;
        }
        Positions[j]  = new Vector2(0, 0);
        Velocities[j] = new Vector2(0, 0);

        positionBuffer.SetData(Positions);
        AgentViewMaterial.SetBuffer("positionBuffer", positionBuffer);
        velocityBuffer.SetData(Velocities);
        AgentViewMaterial.SetBuffer("velocityBuffer", velocityBuffer);


        //1. First Render From Every Pedestrians PoV
        for (int i = 0; i < Pedestrians.Length; ++i)
        {
            Pedestrians[i].GetComponent <PedestrianController>().SolveInteraction();
        }

        //2. Now Process the Resultant Texture
        if (RenderTargetTex && RenderTarget.IsCreated())
        {
            RenderTexture.active = RenderTarget;
            RenderTargetTex.ReadPixels(new Rect(0, 0, RenderTarget.width, RenderTarget.height), 0, 0);
            RenderTargetTex.Apply();

            // Launch CUDA Kernel
            uint  textureSize        = (uint)(RenderTargetTex.width * agentViewTexHeight);
            int   threadsPerBlock    = 1024;
            float threadsPerBlockInv = 1.0f / (float)threadsPerBlock;
            int   blocksPerGrid      = (int)((textureSize + threadsPerBlock - 1) * threadsPerBlockInv);

            /*******************************************************************/
            /************************copyReductionKernel************************/
            /*******************************************************************/
            dim3 block = new dim3(threadsPerBlock, 1, 1);
            dim3 grid  = new dim3(blocksPerGrid, 1, 1);

            uint shMemeSize = (uint)(block.x * 5 * sizeof(float));  // size of shared memory
            uint offset, currentNumData, currentNumBlocks;

            h_idata = RenderTargetTex.GetPixels();
            for (int i = 0; i < h_idata.Length; i++)
            {
                h_idata_float4[i] = new float4(h_idata[i].r,
                                               h_idata[i].g,
                                               h_idata[i].b,
                                               h_idata[i].a);
            }

            d_idata.CopyToDevice(h_idata_float4);

            for (int i = pedStart; i < pedStop; i++)
            {
                offset           = (uint)(i - pedStart) * (textureSize);
                currentNumData   = textureSize;
                currentNumBlocks = (uint)blocksPerGrid;
                grid.x           = currentNumBlocks;

                cudaKernel[0].BlockDimensions     = block;
                cudaKernel[0].GridDimensions      = grid;
                cudaKernel[0].DynamicSharedMemory = shMemeSize;
                cudaKernel[0].Run(d_idata.DevicePointer, d_odata.DevicePointer, 5, textureSize, offset);
                //Debug.Log("1: CUDA kernel launch with " + blocksPerGrid + " blocks of " + threadsPerBlock + " threads\n");

                /*******************************************************************/
                /**************************reductionKernel**************************/
                /*******************************************************************/
                currentNumData   = currentNumBlocks;
                currentNumBlocks = (uint)((currentNumData + threadsPerBlock - 1) * threadsPerBlockInv);

                for (; currentNumData > 1;)
                {
                    // perform reduction to get one pixel
                    grid.x = currentNumBlocks;
                    cudaKernel[1].BlockDimensions     = block;
                    cudaKernel[1].GridDimensions      = grid;
                    cudaKernel[1].DynamicSharedMemory = shMemeSize;
                    cudaKernel[1].Run(d_odata.DevicePointer, 5, currentNumData);
                    //Debug.Log("2: CUDA kernel launch with " + blocksPerGrid + " blocks of " + threadsPerBlock + " threads\n");

                    currentNumData   = currentNumBlocks;
                    currentNumBlocks = (uint)((currentNumData + threadsPerBlock - 1) * threadsPerBlockInv);
                }

                d_result_data.CopyToDevice(d_odata, 0, 5 * i * sizeof(float), 5 * sizeof(float));
            }

            d_result_data.CopyToHost(h_result_data);
            RenderTexture.active = null;
        }

        //3. Now Calculate Agent Veclocity Based on Results from Step 2.
        for (int i = 0; i < Pedestrians.Length; ++i)
        {
            float thetaMax   = 0;
            float thetaMin   = 0;
            float ttcMin     = 0;
            float dttcMin    = 0;
            bool  goFirstMin = true;
            bool  goFirstMax = true;

            computeParams(i, ref thetaMax, ref thetaMin, ref ttcMin, ref dttcMin, ref goFirstMin, ref goFirstMax);

            // compute new velocity
            Pedestrians[i].GetComponent <PedestrianController>().updateVelocity(thetaMin, thetaMax, ttcMin, dttcMin, goFirstMin, goFirstMax);
        }
    }

示例#30

0

显示文件

文件： Program.cs 项目： SharpCoder/MachineLearning

        static void Main(string[] args)
        {
            // NOTE: You need to change this location to match your own machine.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
            Console.ResetColor();
            System.Threading.Thread.Sleep(500);

            string      path   = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
            CudaContext ctx    = new CudaContext();
            CUmodule    module = ctx.LoadModule(path);

            kernel = new CudaKernel("kernel", module, ctx);

            // This tells the kernel to allocate a lot of threads for the Gpu.
            kernel.BlockDimensions = THREADS_PER_BLOCK;
            kernel.GridDimensions  = VECTOR_SIZE / THREADS_PER_BLOCK + 1;;

            // Now let's load the kernel!
            // Create the topology.
            int[] topology = new int[] { 1, 200, 200, 100, 1 };

            int height = topology.Length;
            int width  = 0;

            for (int i = 0; i < topology.Length; i++)
            {
                if (width < topology[i])
                {
                    width = topology[i];
                }
            }

            // Launch!
            float[] res = new float[height * width];
            for (int i = 0; i < 10; i++)
            {
                float[] matrix  = new float[height * width];
                float[] weights = new float[height * width];
                Random  rand    = new Random(424242);
                for (int y = 0; y < height; y++)
                {
                    for (int x = 0; x < width; x++)
                    {
                        matrix[y * width + x]  = (y == 0 && x < topology[y]) ? 1.0f : 0;
                        weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
                    }
                }

                // Load the kernel with some variables.
                CudaDeviceVariable <int>   cuda_topology = topology;
                CudaDeviceVariable <float> cuda_membank  = matrix;
                CudaDeviceVariable <float> cuda_weights  = weights;

                Stopwatch sw = new Stopwatch();
                sw.Start();
                kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
                cuda_membank.CopyToHost(res);
                sw.Stop();

                Console.ForegroundColor = ConsoleColor.Green;
                Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
                Console.ResetColor();
            }

            Console.ReadKey();
        }

示例#31

0

显示文件

 public void CopyToHost(float[] target)
 {
     _data.CopyToHost(target);
 }

示例#32

0

显示文件

文件： CudnnContext.Convolution.cs 项目： redknightlois/cudalearn

        public void BackwardFilter(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnFilterDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(diffTensor != null);
            Contract.Requires(diffData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(gradient != null);
            Contract.Requires(gradientData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, diffTensor, gradient);

            using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length))
            using (var diffDataGpu = new CudaDeviceVariable<double>(diffData.Length))
            using (var gradientDataGpu = new CudaDeviceVariable<double>(gradientData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                diffDataGpu.CopyToDevice(diffData);

                Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardFilter(handle, srcTensor.Handle, srcDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate));
                gradientDataGpu.CopyToHost(gradientData);
            }
        }

示例#33

0

显示文件

文件： CudnnContext.Convolution.cs 项目： redknightlois/cudalearn

        public void Forward(CudnnTensorDescriptor srcTensor, float[] srcData, CudnnFilterDescriptor filter, float[] filterData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor destTensor, float[] destData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(filter != null);
            Contract.Requires(filterData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, destTensor, filter);

            using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length))
            using (var filterDataGpu = new CudaDeviceVariable<float>(filterData.Length))
            using (var destDataGpu = new CudaDeviceVariable<float>(destData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                filterDataGpu.CopyToDevice(filterData);

                Invoke(() => CudnnNativeMethods.cudnnConvolutionForward(handle, srcTensor.Handle, srcDataGpu.DevicePointer, filter.Handle, filterDataGpu.DevicePointer, convolution.Handle, destTensor.Handle, destDataGpu.DevicePointer, accumulate));
                destDataGpu.CopyToHost(destData);
            }
        }

示例#34

0

显示文件

文件： CudnnContext.Pooling.cs 项目： redknightlois/cudalearn

        public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData,
                                                             CudnnTensorDescriptor destTensor, double[] destData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData)
        {
            Contract.Requires(pooling != null);
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length))
            using (var srcDiffDataGpu = new CudaDeviceVariable<double>(srcDiffData.Length))
            using (var destDataGpu = new CudaDeviceVariable<double>(destData.Length))
            using (var destDiffDataGpu = new CudaDeviceVariable<double>(destDiffData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                srcDiffDataGpu.CopyToDevice(srcDiffData);
                destDataGpu.CopyToDevice(destData);

                Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle,
                                                                     srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                     destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                destDiffDataGpu.CopyToHost(destDiffData);
            }
        }

示例#35

0

显示文件

文件： JpegNPP.cs 项目： kunzmi/managedCuda

        public static void SaveJpeg(string aFilename, int aQuality, Bitmap aImage)
        {
            if (aImage.PixelFormat != System.Drawing.Imaging.PixelFormat.Format24bppRgb)
            {
                throw new ArgumentException("Only three channel color images are supported.");
            }

            if (aImage.Width % 16 != 0 || aImage.Height % 16 != 0)
            {
                throw new ArgumentException("The provided bitmap must have a height and width of a multiple of 16.");
            }

            JPEGCompression compression = new JPEGCompression();

            NPPImage_8uC3 src = new NPPImage_8uC3(aImage.Width, aImage.Height);
            NPPImage_8uC1 srcY = new NPPImage_8uC1(aImage.Width, aImage.Height);
            NPPImage_8uC1 srcCb = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2);
            NPPImage_8uC1 srcCr = new NPPImage_8uC1(aImage.Width / 2, aImage.Height / 2);
            src.CopyToDevice(aImage);

            //System.Drawing.Bitmap is ordered BGR not RGB
            //The NPP routine BGR to YCbCR outputs the values in clamped range, following the YCbCr standard.
            //But JPEG uses unclamped values ranging all from [0..255], thus use our own color matrix:
            float[,] BgrToYCbCr = new float[3, 4]
            {{0.114f,     0.587f,    0.299f,   0},
             {0.5f,      -0.33126f, -0.16874f, 128},
             {-0.08131f, -0.41869f,  0.5f,     128}};

            src.ColorTwist(BgrToYCbCr);

            //Reduce size of of Cb and Cr channel
            src.Copy(srcY, 2);
            srcY.Resize(srcCr, 0.5, 0.5, InterpolationMode.SuperSampling);
            src.Copy(srcY, 1);
            srcY.Resize(srcCb, 0.5, 0.5, InterpolationMode.SuperSampling);
            src.Copy(srcY, 0);

            FrameHeader oFrameHeader = new FrameHeader();
            oFrameHeader.nComponents = 3;
            oFrameHeader.nHeight = (ushort)aImage.Height;
            oFrameHeader.nSamplePrecision = 8;
            oFrameHeader.nWidth = (ushort)aImage.Width;
            oFrameHeader.aComponentIdentifier = new byte[] { 1, 2, 3 };
            oFrameHeader.aSamplingFactors = new byte[] { 34, 17, 17 }; //Y channel is twice the sice of Cb/Cr channel
            oFrameHeader.aQuantizationTableSelector = new byte[] { 0, 1, 1 };

            //Get quantization tables from JPEG standard with quality scaling
            QuantizationTable[] aQuantizationTables = new QuantizationTable[2];
            aQuantizationTables[0] = new QuantizationTable(QuantizationTable.QuantizationType.Luminance, aQuality);
            aQuantizationTables[1] = new QuantizationTable(QuantizationTable.QuantizationType.Chroma, aQuality);

            CudaDeviceVariable<byte>[] pdQuantizationTables = new CudaDeviceVariable<byte>[2];
            pdQuantizationTables[0] = aQuantizationTables[0].aTable;
            pdQuantizationTables[1] = aQuantizationTables[1].aTable;

            //Get Huffman tables from JPEG standard
            HuffmanTable[] aHuffmanTables = new HuffmanTable[4];
            aHuffmanTables[0] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceDC);
            aHuffmanTables[1] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaDC);
            aHuffmanTables[2] = new HuffmanTable(HuffmanTable.HuffmanType.LuminanceAC);
            aHuffmanTables[3] = new HuffmanTable(HuffmanTable.HuffmanType.ChromaAC);

            //Set header
            ScanHeader oScanHeader = new ScanHeader();
            oScanHeader.nA = 0;
            oScanHeader.nComponents = 3;
            oScanHeader.nSe = 63;
            oScanHeader.nSs = 0;
            oScanHeader.aComponentSelector = new byte[] { 1, 2, 3 };
            oScanHeader.aHuffmanTablesSelector = new byte[] { 0, 17, 17 };

            NPPImage_16sC1[] apdDCT = new NPPImage_16sC1[3];

            NPPImage_8uC1[] apDstImage = new NPPImage_8uC1[3];
            NppiSize[] aDstSize = new NppiSize[3];
            aDstSize[0] = new NppiSize(srcY.Width, srcY.Height);
            aDstSize[1] = new NppiSize(srcCb.Width, srcCb.Height);
            aDstSize[2] = new NppiSize(srcCr.Width, srcCr.Height);

            // Compute channel sizes as stored in the output JPEG (8x8 blocks & MCU block layout)
            NppiSize oDstImageSize = new NppiSize();
            float frameWidth = (float)Math.Floor((float)oFrameHeader.nWidth);
            float frameHeight = (float)Math.Floor((float)oFrameHeader.nHeight);

            oDstImageSize.width = (int)Math.Max(1.0f, frameWidth);
            oDstImageSize.height = (int)Math.Max(1.0f, frameHeight);

            //Console.WriteLine("Output Size: " + oDstImageSize.width + "x" + oDstImageSize.height + "x" + (int)(oFrameHeader.nComponents));

            apDstImage[0] = srcY;
            apDstImage[1] = srcCb;
            apDstImage[2] = srcCr;

            int nMCUBlocksH = 0;
            int nMCUBlocksV = 0;

            // Compute channel sizes as stored in the JPEG (8x8 blocks & MCU block layout)
            for (int i = 0; i < oFrameHeader.nComponents; ++i)
            {
                nMCUBlocksV = Math.Max(nMCUBlocksV, oFrameHeader.aSamplingFactors[i] >> 4);
                nMCUBlocksH = Math.Max(nMCUBlocksH, oFrameHeader.aSamplingFactors[i] & 0x0f);
            }

            for (int i = 0; i < oFrameHeader.nComponents; ++i)
            {
                NppiSize oBlocks = new NppiSize();
                NppiSize oBlocksPerMCU = new NppiSize(oFrameHeader.aSamplingFactors[i] & 0x0f, oFrameHeader.aSamplingFactors[i] >> 4);

                oBlocks.width = (int)Math.Ceiling((oFrameHeader.nWidth + 7) / 8 *
                                          (float)(oBlocksPerMCU.width) / nMCUBlocksH);
                oBlocks.width = DivUp(oBlocks.width, oBlocksPerMCU.width) * oBlocksPerMCU.width;

                oBlocks.height = (int)Math.Ceiling((oFrameHeader.nHeight + 7) / 8 *
                                           (float)(oBlocksPerMCU.height) / nMCUBlocksV);
                oBlocks.height = DivUp(oBlocks.height, oBlocksPerMCU.height) * oBlocksPerMCU.height;

                // Allocate Memory
                apdDCT[i] = new NPPImage_16sC1(oBlocks.width * 64, oBlocks.height);

            }

            /***************************
            *
            *   Output
            *
            ***************************/

            // Forward DCT
            for (int i = 0; i < 3; ++i)
            {
                compression.DCTQuantFwd8x8LS(apDstImage[i], apdDCT[i], aDstSize[i], pdQuantizationTables[oFrameHeader.aQuantizationTableSelector[i]]);
            }

            // Huffman Encoding
            CudaDeviceVariable<byte> pdScan = new CudaDeviceVariable<byte>(BUFFER_SIZE);
            int nScanLength = 0;

            int nTempSize = JPEGCompression.EncodeHuffmanGetSize(aDstSize[0], 3);
            CudaDeviceVariable<byte> pJpegEncoderTemp = new CudaDeviceVariable<byte>(nTempSize);

            NppiEncodeHuffmanSpec[] apHuffmanDCTableEnc = new NppiEncodeHuffmanSpec[3];
            NppiEncodeHuffmanSpec[] apHuffmanACTableEnc = new NppiEncodeHuffmanSpec[3];

            for (int i = 0; i < 3; ++i)
            {
                apHuffmanDCTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] >> 4)].aCodes, NppiHuffmanTableType.nppiDCTable);
                apHuffmanACTableEnc[i] = JPEGCompression.EncodeHuffmanSpecInitAlloc(aHuffmanTables[(oScanHeader.aHuffmanTablesSelector[i] & 0x0f) + 2].aCodes, NppiHuffmanTableType.nppiACTable);
            }

            JPEGCompression.EncodeHuffmanScan(apdDCT, 0, oScanHeader.nSs, oScanHeader.nSe, oScanHeader.nA >> 4, oScanHeader.nA & 0x0f, pdScan, ref nScanLength, apHuffmanDCTableEnc, apHuffmanACTableEnc, aDstSize, pJpegEncoderTemp);

            for (int i = 0; i < 3; ++i)
            {
                JPEGCompression.EncodeHuffmanSpecFree(apHuffmanDCTableEnc[i]);
                JPEGCompression.EncodeHuffmanSpecFree(apHuffmanACTableEnc[i]);
            }

            // Write JPEG to byte array, as in original sample code
            byte[] pDstOutput = new byte[BUFFER_SIZE];
            int pos = 0;

            oFrameHeader.nWidth = (ushort)oDstImageSize.width;
            oFrameHeader.nHeight = (ushort)oDstImageSize.height;

            writeMarker(0x0D8, pDstOutput, ref pos);
            writeJFIFTag(pDstOutput, ref pos);
            writeQuantizationTable(aQuantizationTables[0], pDstOutput, ref pos);
            writeQuantizationTable(aQuantizationTables[1], pDstOutput, ref pos);
            writeFrameHeader(oFrameHeader, pDstOutput, ref pos);
            writeHuffmanTable(aHuffmanTables[0], pDstOutput, ref pos);
            writeHuffmanTable(aHuffmanTables[1], pDstOutput, ref pos);
            writeHuffmanTable(aHuffmanTables[2], pDstOutput, ref pos);
            writeHuffmanTable(aHuffmanTables[3], pDstOutput, ref pos);
            writeScanHeader(oScanHeader, pDstOutput, ref pos);

            pdScan.CopyToHost(pDstOutput, 0, pos, nScanLength);

            pos += nScanLength;
            writeMarker(0x0D9, pDstOutput, ref pos);

            FileStream fs = new FileStream(aFilename, FileMode.Create, FileAccess.Write);
            fs.Write(pDstOutput, 0, pos);
            fs.Close();

            //cleanup:
            fs.Dispose();
            pJpegEncoderTemp.Dispose();
            pdScan.Dispose();
            apdDCT[2].Dispose();
            apdDCT[1].Dispose();
            apdDCT[0].Dispose();
            pdQuantizationTables[1].Dispose();
            pdQuantizationTables[0].Dispose();

            srcCr.Dispose();
            srcCb.Dispose();
            srcY.Dispose();
            src.Dispose();
            compression.Dispose();
        }

示例#36

0

显示文件

文件： CUDAGenerator.cs 项目： barograf/VoxelTerrain

        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count = width * height * depth;
            int widthD = width - 1;
            int heightD = height - 1;
            int depthD = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            dim3 blockDimensions = new dim3(8, 8, 8);
            dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            CUDANoiseCube noiseCube = new CUDANoiseCube();

            CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            int nearestW = NearestPowerOfTwo(widthD);
            int nearestH = NearestPowerOfTwo(heightD);
            int nearestD = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
            trisCountDevice.Memset(0);
            CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            int lastTrisCount = 0;
            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;
            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                if (container.Geometry != null)
                    container.Geometry.Dispose();

                container.VertexCount = totalVerticesCount;

                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage = ResourceUsage.Default
                });

                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
                
                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                container.VertexCount = 0;

                if (container.Geometry != null)
                    container.Geometry.Dispose();
            }

            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }

示例#37

0

显示文件

文件： Program.cs 项目： xmvdev/MoneroVMiner

        static void Main(string[] args)
        {
            try
            {
                if (args.Length == 1 && args[0].ToLower().Contains("fidelity"))
                {
                    string[] fseg = args[0].Split(':');
                    deviceID = int.Parse(fseg[1]);
                    nonce    = Int64.Parse(fseg[2]) - 1;
                    range    = int.Parse(fseg[3]);
                    QTEST    = true;
                }
                else
                {
                    if (args.Length > 0)
                    {
                        deviceID = int.Parse(args[0]);
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error: " + ex.Message);
            }

            try
            {
                if (args.Length > 0)
                {
                    deviceID = int.Parse(args[0]);
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error");
            }

            try
            {
                if (args.Length > 1)
                {
                    port = int.Parse(args[1]);
                    Comms.ConnectToMaster(port);
                }
                else
                {
                    TEST = true;
                    Logger.CopyToConsole = true;
                    CGraph.ShowCycles    = true;
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Master connection error");
            }

            try
            {
                if (args.Length > 3)
                {
                    gpuCount = int.Parse(args[3]);
                    fastCuda = gpuCount <= (Environment.ProcessorCount / 2);
                    if (fastCuda)
                    {
                        Logger.Log(LogLevel.Info, "Using single GPU blocking mode");
                    }
                }
            }
            catch
            {
            }

            if (TEST)
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };
            }
            else
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };

                if (!Comms.IsConnected())
                {
                    Console.WriteLine("Master connection failed, aborting");
                    Logger.Log(LogLevel.Error, "No master connection, exitting!");
                    return;
                }

                if (deviceID < 0)
                {
                    int devCnt             = CudaContext.GetDeviceCount();
                    GpuDevicesMessage gpum = new GpuDevicesMessage()
                    {
                        devices = new List <GpuDevice>(devCnt)
                    };
                    for (int i = 0; i < devCnt; i++)
                    {
                        string name = CudaContext.GetDeviceName(i);
                        var    info = CudaContext.GetDeviceInfo(i);
                        gpum.devices.Add(new GpuDevice()
                        {
                            deviceID = i, name = name, memory = info.TotalGlobalMemory
                        });
                    }
                    //Console.WriteLine(devCnt);
                    Comms.gpuMsg = gpum;
                    Comms.SetEvent();
                    //Console.WriteLine("event fired");
                    Task.Delay(1000).Wait();
                    //Console.WriteLine("closing");
                    Comms.Close();
                    return;
                }
            }

            try
            {
                var assembly       = Assembly.GetEntryAssembly();
                var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx");
                ctx = new CudaContext(deviceID, /*!fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) :*/ CUCtxFlags.MapHost);
                string pow = new StreamReader(resourceStream).ReadToEnd();

                //pow = File.ReadAllText(@"kernel_x64.ptx");

                Turing = ctx.GetDeviceInfo().MaxSharedMemoryPerMultiprocessor == 65536;

                using (var s = GenerateStreamFromString(pow))
                {
                    if (!Turing)
                    {
                        meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 });
                        meanSeedA.BlockDimensions = 512;
                        meanSeedA.GridDimensions  = 1024;
                        meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound = ctx.LoadKernelPTX(s, "FluffyRound_A2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 });
                        meanRound.BlockDimensions = 512;
                        meanRound.GridDimensions  = 4096;
                        meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_A1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRound_4.BlockDimensions = 1024;
                        meanRound_4.GridDimensions  = 1024;
                        meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_A3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRoundJoin.BlockDimensions = 1024;
                        meanRoundJoin.GridDimensions  = 4096;
                        meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanTail = ctx.LoadKernelPTX(s, "FluffyTail");
                        meanTail.BlockDimensions = 1024;
                        meanTail.GridDimensions  = 4096;
                        meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                        meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery");
                        meanRecover.BlockDimensions = 256;
                        meanRecover.GridDimensions  = 2048;
                        meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
                    }
                    else
                    {
                        meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 });
                        meanSeedA.BlockDimensions = 512;
                        meanSeedA.GridDimensions  = 1024;
                        meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound = ctx.LoadKernelPTX(s, "FluffyRound_C2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRound.BlockDimensions = 1024;
                        meanRound.GridDimensions  = 4096;
                        meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_C1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 });
                        meanRound_4.BlockDimensions = 1024;
                        meanRound_4.GridDimensions  = 1024;
                        meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_C3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 });
                        meanRoundJoin.BlockDimensions = 1024;
                        meanRoundJoin.GridDimensions  = 4096;
                        meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                        meanTail = ctx.LoadKernelPTX(s, "FluffyTail");
                        meanTail.BlockDimensions = 1024;
                        meanTail.GridDimensions  = 4096;
                        meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                        meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery");
                        meanRecover.BlockDimensions = 256;
                        meanRecover.GridDimensions  = 2048;
                        meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
                    }
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message);
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                d_buffer    = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32 * (temp ? 8 : 1));
                d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 2));
                d_bufferB   = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8));

                d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE);
                d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE);
                d_aux      = new CudaDeviceVariable <uint>(512);

                Array.Clear(h_indexesA, 0, h_indexesA.Length);
                Array.Clear(h_indexesB, 0, h_indexesA.Length);

                d_indexesA = h_indexesA;
                d_indexesB = h_indexesB;

                streamPrimary = new CudaStream(CUStreamFlags.NonBlocking);
            }
            catch (Exception ex)
            {
                Task.Delay(200).Wait();
                Logger.Log(LogLevel.Error, $"Mem alloc exception. Out of video memory? {ctx.GetFreeDeviceMemorySize()} free");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32);
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create pinned memory.");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            int loopCnt = 0;

            while (!Comms.IsTerminated)
            {
                try
                {
                    if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow))
                    {
                        Logger.Log(LogLevel.Info, string.Format("Waiting for job...."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                    {
                        currentJob           = Comms.nextJob;
                        currentJob.timestamp = DateTime.Now;
                    }

                    if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now)
                    {
                        Logger.Log(LogLevel.Info, string.Format("Job too old..."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    // test runs only once
                    if (TEST && ++loopCnt >= range)
                    {
                        Comms.IsTerminated = true;
                    }

                    Solution s;
                    while (graphSolutions.TryDequeue(out s))
                    {
                        meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges());
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer);
                        streamPrimary.Synchronize();
                        s.nonces = new uint[32];
                        d_indexesB.CopyToHost(s.nonces, 0, 0, 32 * 4);
                        s.nonces = s.nonces.OrderBy(n => n).ToArray();
                        //fidelity = (32-cycles_found / graphs_searched) * 32
                        solutions++;
                        s.fidelity = ((double)solutions / (double)trims) * 32.0;
                        //Console.WriteLine(s.fidelity.ToString("0.000"));
                        if (Comms.IsConnected())
                        {
                            Comms.graphSolutionsOut.Enqueue(s);
                            Comms.SetEvent();
                        }
                        if (QTEST)
                        {
                            Console.ForegroundColor = ConsoleColor.Red;
                            Console.WriteLine($"Solution for nonce {s.job.nonce}: {string.Join(' ', s.nonces)}");
                            Console.ResetColor();
                        }
                    }

                    if (QTEST)
                    {
                        currentJob = currentJob.NextSequential(ref nonce);
                        Console.WriteLine($"Nonce: {nonce} K0: {currentJob.k0:X} K1: {currentJob.k1:X} K2: {currentJob.k2:X} K3: {currentJob.k3:X}");
                    }
                    else
                    {
                        currentJob = currentJob.Next();
                    }

                    Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID));

                    timer.Restart();

                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    d_aux.MemsetAsync(0, streamPrimary.Stream);

                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer, 0);
                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 1, d_indexesB.DevicePointer + (4096 * 4), EDGE_SEG);
                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 2, d_indexesB.DevicePointer + (4096 * 8), EDGE_SEG * 2);
                    meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 3, d_indexesB.DevicePointer + (4096 * 12), EDGE_SEG * 3);

                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 0);
                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 1024);
                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 2048);
                    meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 3072);


                    //streamPrimary.Synchronize();
                    //h_indexesA = d_indexesA;
                    //h_indexesB = d_indexesB;
                    //var sumA = h_indexesA.Sum(e => e);
                    //var sumB = h_indexesB.Sum(e => e);
                    //streamPrimary.Synchronize();

                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRoundJoin.RunAsync(streamPrimary.Stream,
                                           d_buffer.DevicePointer,
                                           d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1,
                                           d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2,
                                           d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3,
                                           d_bufferB.DevicePointer,
                                           d_indexesA.DevicePointer,
                                           d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 2);

                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 0, d_aux.DevicePointer);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 1, d_aux.DevicePointer);
                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 2, d_aux.DevicePointer);
                    d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                    meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4, 3, d_aux.DevicePointer);

                    for (int i = 0; i < (TEST ? 80 : trimRounds); i++)
                    //for (int i = 0; i < 85; i++)
                    {
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 4, d_aux.DevicePointer);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 5, d_aux.DevicePointer);
                    }

                    d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                    meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer);

                    Task.Delay((int)lastTrimMs).Wait();

                    streamPrimary.Synchronize();

                    uint[] count = new uint[2];
                    d_indexesA.CopyToHost(count, 0, 0, 8);

                    if (count[0] > 131071)
                    {
                        // trouble
                        count[0] = 131071;
                        // log
                    }

                    hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream);
                    streamPrimary.Synchronize();
                    System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int));

                    trims++;
                    timer.Stop();
                    lastTrimMs          = (long)Math.Min(Math.Max((float)timer.ElapsedMilliseconds * 0.9f, 50), 500);
                    currentJob.solvedAt = DateTime.Now;
                    currentJob.trimTime = timer.ElapsedMilliseconds;

                    //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                    Logger.Log(LogLevel.Info, string.Format("GPU NV{2}:     Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0], deviceID));


                    FinderBag.RunFinder(TEST, ref trims, count[0], h_a, currentJob, graphSolutions, timer);

                    if (trims % 50 == 0 && TEST)
                    {
                        Console.ForegroundColor = ConsoleColor.Green;
                        Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions);
                        Console.ResetColor();
                    }

                    /*
                     * if (TEST)
                     * {
                     *  //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                     *
                     *  CGraph cg = FinderBag.GetFinder();
                     *  cg.SetEdges(h_a, (int)count[0]);
                     *  cg.SetHeader(currentJob);
                     *
                     *  //currentJob = currentJob.Next();
                     *
                     *  Task.Factory.StartNew(() =>
                     *     {
                     *         Stopwatch sw = new Stopwatch();
                     *         sw.Start();
                     *
                     *         if (count[0] < 131071)
                     *         {
                     *             try
                     *             {
                     *                 if (findersInFlight++ < 3)
                     *                 {
                     *                     Stopwatch cycleTime = new Stopwatch();
                     *                     cycleTime.Start();
                     *                     cg.FindSolutions(graphSolutions);
                     *                     cycleTime.Stop();
                     *                     AdjustTrims(cycleTime.ElapsedMilliseconds);
                     *                     //if (graphSolutions.Count > 0) solutions++;
                     *                 }
                     *                 else
                     *                     Logger.Log(LogLevel.Warning, "CPU overloaded!");
                     *             }
                     *             catch (Exception ex)
                     *             {
                     *                 Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message);
                     *             }
                     *             finally
                     *             {
                     *                 FinderBag.ReturnFinder(cg);
                     *                 findersInFlight--;
                     *             }
                     *         }
                     *
                     *         sw.Stop();
                     *
                     *         if (trims % 50 == 0)
                     *         {
                     *             Console.ForegroundColor = ConsoleColor.Green;
                     *             Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims/solutions );
                     *             Console.ResetColor();
                     *         }
                     *         //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count);
                     *         //Console.WriteLine("Duped edges: {0}", cg.dupes);
                     *         if (!QTEST)
                     *          Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes));
                     *     });
                     *
                     *  //h_indexesA = d_indexesA;
                     *  //h_indexesB = d_indexesB;
                     *
                     *  //var sumA = h_indexesA.Sum(e => e);
                     *  //var sumB = h_indexesB.Sum(e => e);
                     *
                     *  ;
                     * }
                     * else
                     * {
                     *  CGraph cg = FinderBag.GetFinder();
                     *  cg.SetEdges(h_a, (int)count[0]);
                     *  cg.SetHeader(currentJob);
                     *
                     *  Task.Factory.StartNew(() =>
                     *  {
                     *      if (count[0] < 131071)
                     *      {
                     *          try
                     *          {
                     *              if (findersInFlight++ < 3)
                     *              {
                     *                  Stopwatch cycleTime = new Stopwatch();
                     *                  cycleTime.Start();
                     *                  cg.FindSolutions(graphSolutions);
                     *                  cycleTime.Stop();
                     *                  AdjustTrims(cycleTime.ElapsedMilliseconds);
                     *              }
                     *              else
                     *                  Logger.Log(LogLevel.Warning, "CPU overloaded!");
                     *          }
                     *          catch (Exception ex)
                     *          {
                     *              Logger.Log(LogLevel.Warning, "Cycle finder crashed: " + ex.Message);
                     *          }
                     *          finally
                     *          {
                     *              FinderBag.ReturnFinder(cg);
                     *              findersInFlight--;
                     *          }
                     *      }
                     *  });
                     * }
                     *
                     */
                }
                catch (Exception ex)
                {
                    Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message);
                    Task.Delay(500).Wait();
                    break;
                }
            }

            // clean up
            try
            {
                Task.Delay(500).Wait();

                Comms.Close();

                d_buffer.Dispose();
                d_indexesA.Dispose();
                d_indexesB.Dispose();
                d_aux.Dispose();

                streamPrimary.Dispose();
                streamSecondary.Dispose();

                hAligned_a.Dispose();

                if (ctx != null)
                {
                    ctx.Dispose();
                }
            }
            catch { }
        }

示例#38

0

显示文件

        protected override void Execute()
        {
            if (!initializedFlag)
            {
                m_d_Force = new CudaDeviceVariable <float>(Target.MAX_CELLS * 3);

                m_d_ActiveConnectionsCount = new CudaDeviceVariable <int>(1);
                m_d_CenterOfGravity        = new CudaDeviceVariable <float>(3);

                //initialize vertices position
                InitCoordinatesAndVelocity();
                initializedFlag = true;
                ViewMode        = ViewMethod.Orbit_3D;

                float translationValue = 0.50f * (COORDINATES_MAX - COORDINATES_MIN);
                m_Translation = new Vector3(-translationValue, 0, -translationValue);

                m_zeroTextureKernel.SetupExecution(TextureHeight * TextureWidth);

                m_zeroTextureKernel.Run(
                    VBODevicePointer,
                    TextureHeight * TextureWidth
                    );
            }


            if (TRANSLATE_TO_CENTER == Option.True)
            {
                m_centerOfGravityKernel.SetupExecution(1);

                m_centerOfGravityKernel.Run(
                    m_d_PointsCoordinates.DevicePointer,
                    m_d_CenterOfGravity.DevicePointer,
                    Target.ActivityFlag,
                    Target.MAX_CELLS
                    );

                float[] m_h_centerOfGravity = new float[3];
                m_d_CenterOfGravity.CopyToHost(m_h_centerOfGravity);

                m_Translation = new Vector3(-m_h_centerOfGravity[0], -m_h_centerOfGravity[1], -m_h_centerOfGravity[2]);

                m_Connections.Translation     = m_Translation;
                m_ReferenceFields.Translation = m_Translation;
                m_WinnerOne.Translation       = m_Translation;
                m_WinnerTwo.Translation       = m_Translation;
            }

            // PHYSICS PART
            // set forces to zero
            m_setForcesToZeroKernel.SetupExecution(Target.MAX_CELLS * 3);

            m_setForcesToZeroKernel.Run(
                m_d_Force.DevicePointer,
                Target.MAX_CELLS
                );

            // spring force computation
            m_springKernel.SetupExecution(Target.MAX_CELLS);

            m_springKernel.Run(
                Target.ActivityFlag,
                Target.ConnectionMatrix,
                m_d_PointsCoordinates.DevicePointer,
                SPRING_STRENGTH,
                m_d_Force.DevicePointer,
                Target.MAX_CELLS
                );


            // repulsion force computation
            m_repulsionKernel.SetupExecution(Target.MAX_CELLS);

            m_repulsionKernel.Run(
                REPULSION,
                REPULSION_DISTANCE,
                m_d_Force.DevicePointer,
                m_d_PointsCoordinates.DevicePointer,
                Target.ActivityFlag,
                Target.MAX_CELLS
                );


            // applying forces to the points
            m_useForceKernel.SetupExecution(Target.MAX_CELLS * 3);

            m_useForceKernel.Run(
                m_d_Force.DevicePointer,
                FORCE_FACTOR,
                m_d_PointsCoordinates.DevicePointer,
                Target.MAX_CELLS
                );



            // GRAPHICS PART
            // COPY AND PROCESS TEXTURE
            m_copyAndProcessTextureKernel.SetupExecution(Target.ReferenceVector.Count);

            m_copyAndProcessTextureKernel.Run(
                Target.ReferenceVector,
                Target.INPUT_SIZE,
                Target.Input.ColumnHint,
                TextureWidth,
                VBODevicePointer,
                Target.MAX_CELLS,
                Target.ReferenceVector.Count
                );



            // CONNECTIONS
            m_d_ActiveConnectionsCount.CopyToDevice(0);

            m_copyConnectionsCoordinatesKernel.SetupExecution(Target.MAX_CELLS * Target.MAX_CELLS);

            m_copyConnectionsCoordinatesKernel.Run(
                Target.ConnectionMatrix,
                m_d_PointsCoordinates.DevicePointer,
                VertexVBODevicePointer,
                m_d_ActiveConnectionsCount.DevicePointer,
                Target.MAX_CELLS
                );

            m_d_ActiveConnectionsCount.CopyToHost(m_h_ActiveConnectionsCount);
            m_Connections.VertexCount = 2 * m_h_ActiveConnectionsCount[0];

            // REFERENCE VECTORS (CUBES)

            /*
             * m_computeCubesKernel.m_kernel.SetupExecution(Target.MAX_CELLS
             *  );
             *
             *
             * .Run(
             *  m_computeCubesKernel,
             *  m_d_PointsCoordinates.DevicePointer,
             *  VertexVBODevicePointer,
             *  m_ReferenceFields.VertexOffset,
             *  TEXTURE_SIDE,
             *  Target.ActivityFlag,
             *  Target.Input.ColumnHint,
             *  Target.MAX_CELLS
             *  );
             */

            /*
             * m_cubeCoordinatesKernel.m_kernel.SetupExecution(Target.MAX_CELLS * 72
             *  );
             *
             * .Run(
             *  m_cubeCoordinatesKernel,
             *  VertexVBODevicePointer,
             *  m_d_CubeOperation.DevicePointer,
             *  m_ReferenceFields.VertexOffset,
             *  Target.ActivityFlag,
             *  TEXTURE_SIDE,
             *  m_d_PointsCoordinates.DevicePointer,
             *  Target.MAX_CELLS
             *  );
             *
             * m_cubeTextureKernel.m_kernel.SetupExecution(Target.MAX_CELLS * 48
             *  );
             *
             * .Run(
             *  m_cubeTextureKernel,
             *  VertexVBODevicePointer,
             *  m_ReferenceFields.TexCoordOffset,
             *  m_d_CubeTexCoordinates.DevicePointer,
             *  TEXTURE_SIDE,
             *  Target.Input.ColumnHint,
             *  Target.ActivityFlag,
             *  Target.MAX_CELLS
             *  );
             */

            m_computeCubes2Kernel.SetupExecution(Target.MAX_CELLS * 6);

            m_computeCubes2Kernel.Run(
                m_d_PointsCoordinates.DevicePointer,
                VertexVBODevicePointer,
                m_ReferenceFields.VertexOffset,
                TEXTURE_SIDE,
                m_d_CubeOperation.DevicePointer,
                m_d_CubeTexCoordinates.DevicePointer,
                Target.ActivityFlag,
                (float)Target.Input.ColumnHint,
                Target.MAX_CELLS
                );



            /*
             * m_computeQuadsKernel.m_kernel.SetupExecution(
             *  Target.MAX_CELLS
             *  );
             *
             * m_computeQuadsKernel.Run(
             *  m_d_PointsCoordinates.DevicePointer,
             *  VertexVBODevicePointer,
             *  m_ReferenceFields.VertexOffset,
             *  TEXTURE_SIDE,
             *  Target.ActivityFlag,
             *  Target.Input.ColumnHint,
             *  Target.MAX_CELLS
             *  );
             */

            m_winnersKernel.SetupExecution(Target.MAX_CELLS);

            m_winnersKernel.Run(
                Target.WinnerOne,
                VertexVBODevicePointer,
                m_WinnerOne.VertexOffset,
                m_d_PointsCoordinates.DevicePointer,
                TEXTURE_SIDE,
                Target.MAX_CELLS
                );

            m_winnersKernel.SetupExecution(Target.MAX_CELLS);

            m_winnersKernel.Run(
                Target.WinnerTwo,
                VertexVBODevicePointer,
                m_WinnerTwo.VertexOffset,
                m_d_PointsCoordinates.DevicePointer,
                TEXTURE_SIDE,
                Target.MAX_CELLS
                );

            if (ONE_SHOT_RESTART == Option.True)
            {
                initializedFlag = false;
                TriggerReset();
                ONE_SHOT_RESTART = Option.False;
            }
        }

示例#39

0

显示文件

文件： Program.cs 项目： tubedev2000/tube4referenceMiner

        static void Main(string[] args)
        {
            try
            {
                if (args.Length > 0)
                {
                    deviceID = int.Parse(args[0]);
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Device ID parse error");
            }

            try
            {
                if (args.Length > 1)
                {
                    port = int.Parse(args[1]);
                    Comms.ConnectToMaster(port);
                }
                else
                {
                    TEST = true;
                    Logger.CopyToConsole = true;
                    CGraph.ShowCycles    = true;
                }
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Master connection error");
            }

            try
            {
                if (args.Length > 3)
                {
                    gpuCount = int.Parse(args[3]);
                    fastCuda = gpuCount <= (Environment.ProcessorCount / 2);
                    if (fastCuda)
                    {
                        Logger.Log(LogLevel.Info, "Using single GPU blocking mode");
                    }
                }
            }
            catch
            {
            }

            if (TEST)
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };
            }
            else
            {
                currentJob = nextJob = new Job()
                {
                    jobID     = 0,
                    k0        = 0xf4956dc403730b01L,
                    k1        = 0xe6d45de39c2a5a3eL,
                    k2        = 0xcbf626a8afee35f6L,
                    k3        = 0x4307b94b1a0c9980L,
                    pre_pow   = TestPrePow,
                    timestamp = DateTime.Now
                };

                if (!Comms.IsConnected())
                {
                    Console.WriteLine("Master connection failed, aborting");
                    Logger.Log(LogLevel.Error, "No master connection, exitting!");
                    return;
                }

                if (deviceID < 0)
                {
                    int devCnt             = CudaContext.GetDeviceCount();
                    GpuDevicesMessage gpum = new GpuDevicesMessage()
                    {
                        devices = new List <GpuDevice>(devCnt)
                    };
                    for (int i = 0; i < devCnt; i++)
                    {
                        string name = CudaContext.GetDeviceName(i);
                        var    info = CudaContext.GetDeviceInfo(i);
                        gpum.devices.Add(new GpuDevice()
                        {
                            deviceID = i, name = name, memory = info.TotalGlobalMemory
                        });
                    }
                    //Console.WriteLine(devCnt);
                    Comms.gpuMsg = gpum;
                    Comms.SetEvent();
                    //Console.WriteLine("event fired");
                    Task.Delay(1000).Wait();
                    //Console.WriteLine("closing");
                    Comms.Close();
                    return;
                }
            }


            try
            {
                var assembly       = Assembly.GetEntryAssembly();
                var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx");
                ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost);

                meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A");
                meanSeedA.BlockDimensions = 128;
                meanSeedA.GridDimensions  = 2048;
                meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
                meanSeedB.BlockDimensions = 128;
                meanSeedB.GridDimensions  = 2048;
                meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B");
                meanSeedB_4.BlockDimensions = 128;
                meanSeedB_4.GridDimensions  = 1024;
                meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
                meanRound.BlockDimensions = 512;
                meanRound.GridDimensions  = 4096;
                meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound");
                meanRound_2.BlockDimensions = 512;
                meanRound_2.GridDimensions  = 2048;
                meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J");
                meanRoundJoin.BlockDimensions = 512;
                meanRoundJoin.GridDimensions  = 4096;
                meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared;

                meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail");
                meanTail.BlockDimensions = 1024;
                meanTail.GridDimensions  = 4096;
                meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;

                meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery");
                meanRecover.BlockDimensions = 256;
                meanRecover.GridDimensions  = 2048;
                meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1;
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message);
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                d_buffer    = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32);
                d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8));
                d_bufferB   = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8));

                d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2);
                d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2);

                Array.Clear(h_indexesA, 0, h_indexesA.Length);
                Array.Clear(h_indexesB, 0, h_indexesA.Length);

                d_indexesA = h_indexesA;
                d_indexesB = h_indexesB;

                streamPrimary   = new CudaStream(CUStreamFlags.NonBlocking);
                streamSecondary = new CudaStream(CUStreamFlags.NonBlocking);
            }
            catch (Exception ex)
            {
                Task.Delay(200).Wait();
                Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            try
            {
                AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32);
            }
            catch (Exception ex)
            {
                Logger.Log(LogLevel.Error, "Unable to create pinned memory.");
                Task.Delay(500).Wait();
                Comms.Close();
                return;
            }

            int loopCnt = 0;

            while (!Comms.IsTerminated)
            {
                try
                {
                    if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow))
                    {
                        Logger.Log(LogLevel.Info, string.Format("Waiting for job...."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                    {
                        currentJob           = Comms.nextJob;
                        currentJob.timestamp = DateTime.Now;
                    }

                    if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now)
                    {
                        Logger.Log(LogLevel.Info, string.Format("Job too old..."));
                        Task.Delay(1000).Wait();
                        continue;
                    }

                    // test runs only once
                    if (TEST && loopCnt++ > 100)
                    {
                        Comms.IsTerminated = true;
                    }

                    Solution s;
                    while (graphSolutions.TryDequeue(out s))
                    {
                        meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges());
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer);
                        streamPrimary.Synchronize();
                        s.nonces = new uint[40];
                        d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4);
                        s.nonces = s.nonces.OrderBy(n => n).ToArray();
                        lock (Comms.graphSolutionsOut)
                        {
                            Comms.graphSolutionsOut.Enqueue(s);
                        }
                        Comms.SetEvent();
                    }
                    uint[] count;
                    do
                    {
                        if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin)))
                        {
                            currentJob           = Comms.nextJob;
                            currentJob.timestamp = DateTime.Now;
                        }
                        currentJob = currentJob.Next();

                        Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID));

                        timer.Restart();

                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);

                        meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32);
                        meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48);

                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2);
                        meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2);
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);

                        //d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2);
                        d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                        meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4);

                        for (int i = 0; i < trimRounds; i++)
                        {
                            d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                            meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                            d_indexesB.MemsetAsync(0, streamPrimary.Stream);
                            meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4);
                        }

                        d_indexesA.MemsetAsync(0, streamPrimary.Stream);
                        meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer);

                        ctx.Synchronize();
                        streamPrimary.Synchronize();

                        count = new uint[2];
                        d_indexesA.CopyToHost(count, 0, 0, 8);

                        if (count[0] > 4194304)
                        {
                            // trouble
                            count[0] = 4194304;
                            // log
                        }

                        hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream);
                        streamPrimary.Synchronize();
                        System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int));

                        timer.Stop();
                        currentJob.solvedAt = DateTime.Now;
                        currentJob.trimTime = timer.ElapsedMilliseconds;

                        //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);
                        Logger.Log(LogLevel.Info, string.Format("GPU NV{2}:     Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height));
                    }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST));

                    if (TEST)
                    {
                        //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]);

                        CGraph cg = FinderBag.GetFinder();
                        if (cg == null)
                        {
                            continue;
                        }

                        cg.SetEdges(h_a, (int)count[0]);
                        cg.SetHeader(currentJob);

                        //currentJob = currentJob.Next();

                        Task.Factory.StartNew(() =>
                        {
                            Stopwatch sw = new Stopwatch();
                            sw.Start();

                            if (count[0] < 200000)
                            {
                                try
                                {
                                    if (findersInFlight++ < 3)
                                    {
                                        Stopwatch cycleTime = new Stopwatch();
                                        cycleTime.Start();
                                        cg.FindSolutions(graphSolutions);
                                        cycleTime.Stop();
                                        AdjustTrims(cycleTime.ElapsedMilliseconds);
                                        if (graphSolutions.Count > 0)
                                        {
                                            solutions++;
                                        }
                                    }
                                    else
                                    {
                                        Logger.Log(LogLevel.Warning, "CPU overloaded!");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message);
                                }
                                finally
                                {
                                    findersInFlight--;
                                    FinderBag.ReturnFinder(cg);
                                }
                            }

                            sw.Stop();

                            if (++trims % 50 == 0)
                            {
                                Console.ForegroundColor = ConsoleColor.Green;
                                Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions);
                                Console.ResetColor();
                            }
                            //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count);
                            //Console.WriteLine("Duped edges: {0}", cg.dupes);
                            Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes));
                        });

                        //h_indexesA = d_indexesA;
                        //h_indexesB = d_indexesB;

                        //var sumA = h_indexesA.Sum(e => e);
                        //var sumB = h_indexesB.Sum(e => e);

                        ;
                    }
                    else
                    {
                        CGraph cg = FinderBag.GetFinder();
                        cg.SetEdges(h_a, (int)count[0]);
                        cg.SetHeader(currentJob);

                        Task.Factory.StartNew(() =>
                        {
                            if (count[0] < 200000)
                            {
                                try
                                {
                                    if (findersInFlight++ < 3)
                                    {
                                        Stopwatch cycleTime = new Stopwatch();
                                        cycleTime.Start();
                                        cg.FindSolutions(graphSolutions);
                                        cycleTime.Stop();
                                        AdjustTrims(cycleTime.ElapsedMilliseconds);
                                        if (graphSolutions.Count > 0)
                                        {
                                            solutions++;
                                        }
                                    }
                                    else
                                    {
                                        Logger.Log(LogLevel.Warning, "CPU overloaded!");
                                    }
                                }
                                catch (Exception ex)
                                {
                                    Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message);
                                }
                                finally
                                {
                                    findersInFlight--;
                                    FinderBag.ReturnFinder(cg);
                                }
                            }
                        });
                    }
                }
                catch (Exception ex)
                {
                    Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message);
                    Task.Delay(5000).Wait();
                }
            }

            // clean up
            try
            {
                Task.Delay(500).Wait();

                Comms.Close();

                d_buffer.Dispose();
                d_indexesA.Dispose();
                d_indexesB.Dispose();

                streamPrimary.Dispose();
                streamSecondary.Dispose();

                hAligned_a.Dispose();

                if (ctx != null)
                {
                    ctx.Dispose();
                }
            }
            catch { }
        }

示例#40

0

显示文件

文件： Program.cs 项目： kunzmi/managedCuda

        static void Main(string[] args)
        {
            string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
            string fileToCompile = File.ReadAllText(filename);

            CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel");

            rtc.Compile(args);

            string log = rtc.GetLogAsString();

            Console.WriteLine(log);

            byte[] ptx = rtc.GetPTX();

            rtc.Dispose();

            CudaContext ctx = new CudaContext(0);

            CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

            // Print the vector length to be used, and compute its size
            int numElements = 50000;
            SizeT size = numElements * sizeof(float);
            Console.WriteLine("[Vector addition of {0} elements]", numElements);

            // Allocate the host input vector A
            float[] h_A = new float[numElements];
            // Allocate the host input vector B
            float[] h_B = new float[numElements];
            // Allocate the host output vector C
            float[] h_C = new float[numElements];

            Random rand = new Random(0);

            // Initialize the host input vectors
            for (int i = 0; i < numElements; ++i)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
            }

            Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");
            // Allocate the device input vector A and copy to device
            CudaDeviceVariable<float> d_A = h_A;

            // Allocate the device input vector B and copy to device
            CudaDeviceVariable<float> d_B = h_B;

            // Allocate the device output vector C
            CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements);

            // Launch the Vector Add CUDA Kernel
            int threadsPerBlock = 256;
            int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
            Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
            vectorAdd.BlockDimensions = new dim3(threadsPerBlock,1, 1);
            vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1);

            vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

            // Copy the device result vector in device memory to the host result vector
            // in host memory.
            Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
            d_C.CopyToHost(h_C);

            // Verify that the result vector is correct
            for (int i = 0; i < numElements; ++i)
            {
                if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                {
                    Console.WriteLine("Result verification failed at element {0}!\n", i);
                    return;
                }
            }

            Console.WriteLine("Test PASSED\n");

            // Free device global memory
            d_A.Dispose();
            d_B.Dispose();
            d_C.Dispose();

            ctx.Dispose();
            Console.WriteLine("Done\n");
        }

C# (CSharp) CudaDeviceVariable.CopyToHost示例