Example #1
        public override float InferenceTraining(CudaDeviceVariable <float> input)
        {
            float error = 0;

            switch (_norm)
            {
            case Norm.L1:
                //derivative of the cost function, here the L1 norm:

                _res.CopyToDevice(input);
                _groundTrouthData.Sub(input, _dx);
                _dx.Threshold_GTVal(0, 1);
                _dx.Threshold_LTVal(0, -1);
                _dx.DivC(_batch * _inChannels * _inWidth * _inHeight);

                _groundTrouthData.Sub(input, _temp);
                _temp.Abs();
                _temp.Sum(_summedError, _buffer);

                error = _summedError;
                error = error / _batch / _inChannels / _inWidth / _inHeight;
                break;

            case Norm.L2:
                //derivative of the cost function, here the L2 norm:
                _res.CopyToDevice(input);
                _groundTrouthData.Sub(input, _dx);
                _dx.DivC(_batch * _inChannels * _inWidth * _inHeight);

                _groundTrouthData.Sub(input, _temp);
                _temp.Sqr();
                _temp.Sum(_summedError, _buffer);

                error = _summedError;
                error = error / _batch / _inChannels / _inWidth / _inHeight;
                break;

            case Norm.MSSSIM:
                _res.CopyToDevice(input);
                _kernelMSSSIML1.RunSafe(input, _groundTrouthData, _msssiml1, _dx, _inChannels, _batch, 1.0f);

                _msssiml1.Sum(_summedError, _buffer);
                error = _summedError;
                error = error / _batch / _inChannels;
                break;

            case Norm.Mix:
                _res.CopyToDevice(input);
                _kernelMSSSIML1.RunSafe(input, _groundTrouthData, _msssiml1, _dx, _inChannels, _batch, 0.84f);
                _msssiml1.Sum(_summedError, _buffer);
                error = _summedError;
                break;

            default:
                break;
            }


            return(error);
        }
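For reference, a small host-side sketch that mirrors the arithmetic of the L2 branch above (gradient buffer and averaged error), assuming the tensors are flattened into one float array of length batch * channels * width * height; the method and parameter names are illustrative only, not part of the layer API:

        // Host-side sketch of the L2 branch: dx = (groundTruth - prediction) / n, error = sum(diff^2) / n
        static float L2LossAndGradient(float[] groundTruth, float[] prediction, float[] dx)
        {
            int n = prediction.Length;      // batch * channels * width * height
            float summedError = 0;
            for (int i = 0; i < n; i++)
            {
                float diff = groundTruth[i] - prediction[i];
                dx[i] = diff / n;           // same scaling as _dx.DivC(...) above
                summedError += diff * diff; // same quantity as _temp.Sqr() followed by the sum
            }
            return summedError / n;         // matches error / _batch / _inChannels / _inWidth / _inHeight
        }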
        public override void InitRandomWeight(Random rand)
        {
            // Xavier weight filling based on fan-in (_filterX * _filterY * _inChannels); a fan-out variant would also include _outChannels
            float wconv1 = (float)Math.Sqrt(3.0f / (_filterX * _filterY * _inChannels));

            float[] w = new float[_weights.Size];
            float[] b = new float[_bias.Size];

            // Randomize network
            for (int i = 0; i < _weights.Size; i++)
            {
                w[i] = (float)((rand.NextDouble() * 2.0 - 1.0) * wconv1);
            }
            for (int i = 0; i < _bias.Size; i++)
            {
                b[i] = (float)((rand.NextDouble() * 2.0 - 1.0) * wconv1);
            }
            _weights.CopyToDevice(w);
            _bias.CopyToDevice(b);

            switch (_activation)
            {
            case Activation.PRelu:
                _aRelu.Set(0.25f);
                break;

            case Activation.LeakyRelu:
                _aRelu.Set(0.25f);
                break;

            default:
                break;
            }
            base.InitRandomWeight(rand);
        }
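The bound used above, sqrt(3 / fanIn) with fanIn = _filterX * _filterY * _inChannels, makes a uniform distribution on [-w, w] have variance 1 / fanIn (fan-in-only Xavier/Glorot initialization). A minimal standalone sketch of the same filling scheme, with illustrative names:

        // Illustrative helper, not part of the original layer class:
        static float[] XavierUniform(Random rand, int count, int fanIn)
        {
            float bound = (float)Math.Sqrt(3.0 / fanIn);    // Var(U(-b, b)) = b^2 / 3 = 1 / fanIn
            var values = new float[count];
            for (int i = 0; i < count; i++)
            {
                values[i] = (float)((rand.NextDouble() * 2.0 - 1.0) * bound);
            }
            return values;
        }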
Example #3
        public void Run(DistanceOperation operation,
                        CudaDeviceVariable <float> A, int sizeA,
                        CudaDeviceVariable <float> B, int sizeB,
                        CudaDeviceVariable <float> result, int sizeRes)
        {
            if (!ValidateAtRun(operation))
            {
                return;
            }

            switch (operation)
            {
            case DistanceOperation.DotProd:
                //ZXC m_dotKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0);
                m_dotKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA);
                break;

            case DistanceOperation.CosDist:
                //ZXC m_cosKernel.Run(result.DevicePointer, 0, A.DevicePointer, B.DevicePointer, sizeA, 0);
                m_cosKernel.Run(result.DevicePointer, A.DevicePointer, B.DevicePointer, sizeA);
                break;

            case DistanceOperation.EuclidDist:
                float res = RunReturn(operation, A, sizeA, B, sizeB);
                result.CopyToDevice(res);
                break;

            case DistanceOperation.EuclidDistSquared:
                m_combineVecsKernel.SetupExecution(sizeA);
                m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Subtraction, sizeA);
                //ZXC m_dotKernel.Run(result.DevicePointer, 0, m_temp, m_temp, m_temp.Count, 0);
                m_dotKernel.Run(result.DevicePointer, m_temp, m_temp);
                break;

            case DistanceOperation.HammingDist:
                m_combineVecsKernel.SetupExecution(sizeA);
                m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA);
                //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number
                m_reduceSumKernel.Run(result.DevicePointer, m_temp);
                float fDist = 0;     // to transform number of matches to a number of differences
                result.CopyToHost(ref fDist);
                fDist = m_temp.Count - fDist;
                result.CopyToDevice(fDist);
                break;

            case DistanceOperation.HammingSim:
                m_combineVecsKernel.SetupExecution(sizeA);
                m_combineVecsKernel.Run(A.DevicePointer, B.DevicePointer, m_temp, (int)MyJoin.MyJoinOperation.Equal, sizeA);
                //ZXC m_reduceSumKernel.Run(result.DevicePointer, m_temp, m_temp.Count, 0, 0, 1, /*distributed = false*/0); // reduction to a single number
                m_reduceSumKernel.Run(result.DevicePointer, m_temp);
                // take the single number (the count of matching elements) and convert it to Hamming similarity:
                // a number in the range <0,1> that says how similar the vectors are
                float fSim = 0;
                result.CopyToHost(ref fSim);
                fSim = fSim / m_temp.Count;
                result.CopyToDevice(fSim);
                break;
            }
        }
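A hedged usage sketch of the Run method above for the HammingSim branch; it assumes an instance of the surrounding class (called distOps here for illustration) whose m_temp buffer holds at least sizeA elements, and host arrays hostA and hostB prepared elsewhere:

            // Hypothetical caller (distOps, hostA and hostB are illustrative names):
            int n = 1024;
            var A = new CudaDeviceVariable<float>(n);
            var B = new CudaDeviceVariable<float>(n);
            var result = new CudaDeviceVariable<float>(1);
            A.CopyToDevice(hostA);
            B.CopyToDevice(hostB);
            distOps.Run(DistanceOperation.HammingSim, A, n, B, n, result, 1);
            float similarity = 0;
            result.CopyToHost(ref similarity);  // fraction of equal elements, in the range <0,1>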
        /*
         * -----------------------
         * METHOD CYCLE
         * -----------------------
         */
        private void runMethodCycle(bool mustBeUpdated)
        {
            m_cycleKernel.SetConstantVariable("D_NB_CURVES", Count);
            m_cycleKernel.SetConstantVariable("D_TEXTURE_WIDTH", TextureWidth);
            m_cycleKernel.SetConstantVariable("D_PLOTAREA_WIDTH", m_plotAreaWidth);
            m_cycleKernel.SetConstantVariable("D_PLOTAREA_HEIGHT", m_plotAreaHeight);
            m_cycleKernel.SetConstantVariable("D_PLOTAREA_OFFSET_X", m_plotAreaOffsetX);
            m_cycleKernel.SetConstantVariable("D_MIN_VALUE", m_plotCurrentValueMin);
            m_cycleKernel.SetConstantVariable("D_MAX_VALUE", m_plotCurrentValueMax);

            int currentColumn = m_currentSamplingTimeStep % m_plotAreaWidth;

            if (Stride == 1)
            {
                m_valuesHistory.CopyToDevice(Target.GetDevicePtr(this), Offset * sizeof(float), currentColumn * Count * sizeof(float), Count * sizeof(float));
            }
            else
            {
                //not really happy with this one
                for (int i = 0; i < Count; i++)
                {
                    m_valuesHistory.CopyToDevice(Target.GetDevicePtr(this), (i * Stride + Offset) * sizeof(float), (currentColumn * Count + i) * sizeof(float), sizeof(float));
                }
            }

            if (mustBeUpdated)
            {
                // Draw curves
                int nbColumnsToDraw = Math.Min(m_currentSamplingTimeStep + 1, m_plotAreaWidth);
                m_cycleKernel.SetupExecution(nbColumnsToDraw * m_plotAreaHeight);
                m_cycleKernel.Run(
                    VBODevicePointer,
                    5,
                    nbColumnsToDraw,
                    m_valuesHistory.DevicePointer
                    );
            }
            else
            {
                // Draw only the needed columns
                m_cycleKernel.SetupExecution(m_plotAreaHeight);
                m_cycleKernel.Run(
                    VBODevicePointer,
                    currentColumn,
                    1,
                    m_valuesHistory.DevicePointer
                    );
            }

            // Draw a vertical grey line
            m_verticalLineKernel.SetupExecution(m_plotAreaHeight);
            m_verticalLineKernel.Run(
                VBODevicePointer,
                m_plotAreaOffsetX + (currentColumn + 1) % m_plotAreaWidth,
                TextureWidth,
                m_plotAreaHeight
                );
        }
Example #5
 internal MinMax FindMinAndMax(CudaDeviceVariable <float> a, int size)
 {
     if (size > 0)
     {
         var ptr = a;
         while (size > BLOCK_DIM2)
         {
             var bufferSize = (size / BLOCK_DIM2) + 1;
             using (var minBlock = new CudaDeviceVariable <float>(bufferSize))
                 using (var maxBlock = new CudaDeviceVariable <float>(bufferSize)) {
                     minBlock.Memset(0);
                     maxBlock.Memset(0);
                     _Use(_findMinAndMax, size, k => k.Run(BLOCK_DIM2, ptr.DevicePointer, size, minBlock.DevicePointer, maxBlock.DevicePointer));
                     if (ptr != a)
                     {
                         ptr.Dispose();
                     }
                      var minTest = new float[bufferSize];
                      var maxTest = new float[bufferSize];
                      minBlock.CopyToHost(minTest);
                      maxBlock.CopyToHost(maxTest);
                     size = bufferSize * 2;
                     ptr  = new CudaDeviceVariable <float>(size);
                     ptr.CopyToDevice(minBlock, 0, 0, bufferSize * sizeof(float));
                     ptr.CopyToDevice(maxBlock, 0, bufferSize * sizeof(float), bufferSize * sizeof(float));
                     var test = new float[size];
                     ptr.CopyToHost(test);
                 }
         }
         var data = new float[size];
         ptr.CopyToHost(data);
         float min = float.MaxValue, max = float.MinValue;
         for (var i = 0; i < size; i++)
         {
             var val = data[i];
             if (val > max)
             {
                 max = val;
             }
             if (val < min)
             {
                 min = val;
             }
         }
         return(new MinMax(min, max));
     }
     return(MinMax.Empty);
 }
Example #6
        public static float2[] calculateCudaFFT(float[] h_dataIn)
        {
            CudaContext cntxt = new CudaContext();
            //Caution: array sizes matter! Based on the CUFFT documentation...
            int size_real    = h_dataIn.Length;
            int size_complex = (int)Math.Floor(size_real / 2.0) + 1;

            //Creating the FFT plan
            CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C);

            //Size of d_data must be padded for in-place R2C transforms: size_complex * 2 elements, not size_real
            CudaDeviceVariable <float> d_data = new CudaDeviceVariable <float>(size_complex * 2);

            //device allocation and host array have different sizes, which is why the amount of data to copy must be given explicitly:
            d_data.CopyToDevice(h_dataIn, 0, 0, size_real * sizeof(float));

            //execute the plan
            fftPlan.Exec(d_data.DevicePointer, TransformDirection.Forward);

            //Output to host, either as float2 or float, but the array sizes must match!
            float2[] h_dataOut  = new float2[size_complex];
            float[]  h_dataOut2 = new float[size_complex * 2];
            d_data.CopyToHost(h_dataOut);
            d_data.CopyToHost(h_dataOut2);
            fftPlan.Dispose();
            return(h_dataOut);
        }
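A short hedged usage sketch for calculateCudaFFT above, converting the complex output to a magnitude spectrum; it assumes nothing beyond the method itself and the float2 type already used in the snippet:

            // Hypothetical caller: FFT of a simple sine wave
            float[] samples = new float[1024];
            for (int i = 0; i < samples.Length; i++)
            {
                samples[i] = (float)Math.Sin(2.0 * Math.PI * 50.0 * i / samples.Length);
            }
            float2[] spectrum = calculateCudaFFT(samples);
            float[] magnitude = new float[spectrum.Length];
            for (int i = 0; i < spectrum.Length; i++)
            {
                magnitude[i] = (float)Math.Sqrt(spectrum[i].x * spectrum[i].x + spectrum[i].y * spectrum[i].y);
            }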
Example #7
        private T[] RunKernel <T>(Action <T[]> method, T[] parameters) where T : struct
        {
            var methodInfo = method.Method;

            string[] kernels;
            string   llvmIr, ptxIr;
            var      ptx = CudaSharp.CudaSharp.Translate(out kernels, out llvmIr, out ptxIr, "sm_20", methodInfo);

            Console.WriteLine(llvmIr);
            Console.WriteLine(ptxIr);
            var kernel     = _context.LoadKernelPTX(ptx, kernels[0]);
            var maxThreads = kernel.MaxThreadsPerBlock;

            if (parameters.Length <= maxThreads)
            {
                kernel.BlockDimensions = parameters.Length;
                kernel.GridDimensions  = 1;
            }
            else
            {
                kernel.BlockDimensions = maxThreads;
                kernel.GridDimensions  = parameters.Length / maxThreads;
                if ((kernel.BlockDimensions * kernel.GridDimensions) != parameters.Length)
                {
                    throw new Exception(string.Format("Invalid parameters size (must be <= {0} or a multiple of {0}", maxThreads));
                }
            }
            var gpuMem = new CudaDeviceVariable <T>(parameters.Length);

            gpuMem.CopyToDevice(parameters);
            kernel.Run(gpuMem.DevicePointer);
            gpuMem.CopyToHost(parameters);
            gpuMem.Dispose();
            return(parameters);
        }
Example #8
        protected override void Execute()
        {
            if (m_qMatrix != null && MatrixSizeOK())
            {
                m_kernel.SetupExecution(TextureWidth * TextureHeight);
                m_kernel.Run(
                    m_plotValues.DevicePointer, m_actionIndices.DevicePointer, m_actionLabels.DevicePointer,
                    numOfActions, LABEL_PIXEL_WIDTH, LABEL_PIXEL_WIDTH, m_minUtilityValue, MaxUtilityValue,
                    m_qMatrix.GetLength(0), m_qMatrix.GetLength(1), VBODevicePointer);

                if (ViewMode == ViewMethod.Orbit_3D)
                {
                    m_vertexKernel.SetupExecution(m_qMatrix.Length);
                    m_vertexKernel.Run(
                        m_plotValues.DevicePointer, 0.1f, m_qMatrix.GetLength(0), m_qMatrix.GetLength(1),
                        MaxUtilityValue, VertexVBODevicePointer);
                }
            }

            float[,] lastQMatrix = m_qMatrix;
            Target.ReadTwoDimensions(ref m_qMatrix, ref m_qMatrixActions, XAxisVariableIndex, YAxisVariableIndex, ApplyInnerScaling);

            if (lastQMatrix != m_qMatrix)
            {
                TriggerReset();
            }
            else if (m_qMatrix != null && MatrixSizeOK())
            {
                m_plotValues.CopyToDevice(m_qMatrix);
                m_actionIndices.CopyToDevice(m_qMatrixActions);
            }
        }
        public override void RestoreValues(Stream stream)
        {
            BinaryReader br = new BinaryReader(stream);

            float[] w = new float[_weights.Size];
            float[] b = new float[_bias.Size];

            for (int i = 0; i < _weights.Size; i++)
            {
                w[i] = br.ReadSingle();
            }
            for (int i = 0; i < _bias.Size; i++)
            {
                b[i] = br.ReadSingle();
            }
            if (_activation == Activation.PRelu || _activation == Activation.LeakyRelu)
            {
                float[] a = new float[_aRelu.Size];
                for (int i = 0; i < _aRelu.Size; i++)
                {
                    a[i] = br.ReadSingle();
                }
                _aRelu.CopyToDevice(a);
            }
            _weights.CopyToDevice(w);
            _bias.CopyToDevice(b);
            base.RestoreValues(stream);
        }
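RestoreValues above reads the weights, the bias, and (for PRelu/LeakyRelu) the slope values, in that order; the matching writer is not part of the snippet. A hedged sketch of what such a SaveValues counterpart could look like, assuming the base class exposes a corresponding method and that CopyToHost is used to read the device buffers back:

        // Hypothetical counterpart to RestoreValues; the write order must match the read order above
        public override void SaveValues(Stream stream)
        {
            BinaryWriter bw = new BinaryWriter(stream);

            float[] w = new float[_weights.Size];
            float[] b = new float[_bias.Size];
            _weights.CopyToHost(w);
            _bias.CopyToHost(b);

            for (int i = 0; i < w.Length; i++)
            {
                bw.Write(w[i]);
            }
            for (int i = 0; i < b.Length; i++)
            {
                bw.Write(b[i]);
            }
            if (_activation == Activation.PRelu || _activation == Activation.LeakyRelu)
            {
                float[] a = new float[_aRelu.Size];
                _aRelu.CopyToHost(a);
                for (int i = 0; i < a.Length; i++)
                {
                    bw.Write(a[i]);
                }
            }
            base.SaveValues(stream);
        }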
Example #10
            // perform Fast Fourier transform
            private float[] PerformFFT(float[] input)
            {
                int size_real    = input.Length;
                int size_complex = (int)Math.Floor(size_real / 2.0) + 1;

                CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C);

                // size of the device buffer must be padded for in-place R2C transforms: size_complex * 2 elements, not size_real
                CudaDeviceVariable <float> fft = new CudaDeviceVariable <float>(size_complex * 2);

                // device allocation and host array have different sizes, which is why the amount of data to copy must be given explicitly:
                fft.CopyToDevice(input, 0, 0, size_real * sizeof(float));

                // execute the plan
                fftPlan.Exec(fft.DevicePointer, TransformDirection.Forward);

                // output to host as float2
                float2[] output = new float2[size_complex];
                fft.CopyToHost(output);

                // cleanup
                fft.Dispose();
                fftPlan.Dispose();

                // magnitude of the complex output as the result
                float[] result = new float[output.Length];
                for (int i = 0; i < output.Length; i++)
                {
                    result[i] = (float)Math.Sqrt((output[i].x * output[i].x) + (output[i].y * output[i].y));
                }

                return(result);
            }
Example #11
        public SingularValueDecomposition Svd()
        {
            Debug.Assert(IsValid);
            var solver = _cuda.Solver;

            // find the size of the required buffer
            var bufferSize = solver.GesvdBufferSizeFloat(_rows, _columns);
            var mn         = Math.Min(_rows, _columns);

            // allocate output buffers
            var s  = new CudaDeviceVariable <float>(mn);
            var u  = new CudaDeviceVariable <float>(_rows * _rows);
            var vt = new CudaDeviceVariable <float>(_columns * _columns);

            // call cusolver to find the SVD
            try {
                using (var buffer = new CudaDeviceVariable <float>(bufferSize))
                    using (var devInfo = new CudaDeviceVariable <int>(1))
                        using (var rwork = new CudaDeviceVariable <float>(mn))
                            using (var a = new CudaDeviceVariable <float>(_rows * _columns)) {
                                a.CopyToDevice(_data);
                                solver.Gesvd('A', 'A', _rows, _columns, a, _rows, s, u, _rows, vt, _columns, buffer, bufferSize, rwork, devInfo);
                                return(new SingularValueDecomposition(
                                           new GpuMatrix(_cuda, _rows, _rows, u),
                                           new GpuVector(_cuda, mn, s),
                                           new GpuMatrix(_cuda, _columns, _columns, vt)
                                           ));
                            }
            }catch {
                s.Dispose();
                u.Dispose();
                vt.Dispose();
                throw;
            }
        }
Example #12
        public GpuMatrix(CudaProvider cuda, int rows, int columns, Func <int, int, float> init)
        {
            _cuda    = cuda;
            _rows    = rows;
            _columns = columns;
            cuda.Register(this);

            var count = rows * columns;
            var data  = new float[count];

            for (var j = 0; j < columns; j++)
            {
                for (var i = 0; i < rows; i++)
                {
                    data[j * rows + i] = init(i, j);
                }
            }
            _data = new CudaDeviceVariable <float>(count);
            _data.CopyToDevice(data);

#if DEBUG
            if (_id == _badAlloc)
            {
                Debugger.Break();
            }
#endif
        }
Example #13
        public static void blaa()
        {
            int num = 10;
            //New context creation
            CudaContext cntxt = new CudaContext();

            //Module loading from precompiled .ptx in a project output folder
            CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

            //_Z9addKernelPf - mangled kernel name (addKernel(float*)); it can be found in the *.ptx file
            CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

            //Create device array for data
            CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num);

            //Create arrays with data
            float[] vec1 = new float[num];

            //Copy data to device
            vec1_device.CopyToDevice(vec1);

            //Set grid and block dimensions
            addWithCuda.GridDimensions  = new dim3(8, 1, 1);
            addWithCuda.BlockDimensions = new dim3(512, 1, 1);

            //Run the kernel
            addWithCuda.Run(
                vec1_device.DevicePointer);

            //Copy data from device
            vec1_device.CopyToHost(vec1);
        }
        public void BackwardData(CudnnFilterDescriptor filter, double[] filterData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(filter != null);
            Contract.Requires(filterData != null);
            Contract.Requires(diffTensor != null);
            Contract.Requires(diffData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(gradient != null);
            Contract.Requires(gradientData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, filter, diffTensor, gradient);

            using (var filterDataGpu = new CudaDeviceVariable <double>(filterData.Length))
                using (var diffDataGpu = new CudaDeviceVariable <double>(diffData.Length))
                    using (var gradientDataGpu = new CudaDeviceVariable <double>(gradientData.Length))
                    {
                        filterDataGpu.CopyToDevice(filterData);
                        diffDataGpu.CopyToDevice(diffData);

                        Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardData(handle, filter.Handle, filterDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate));

                        gradientDataGpu.CopyToHost(gradientData);
                    }
        }
Example #15
        public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode,
                             CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData,
                             CudnnTensorDescriptor destDiffTensor, float[] destDiffData)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable <float>(srcData.Length))
                using (var srcDiffDataGpu = new CudaDeviceVariable <float>(srcDiffData.Length))
                    using (var destDiffDataGpu = new CudaDeviceVariable <float>(destDiffData.Length))
                    {
                        srcDataGpu.CopyToDevice(srcData);
                        srcDiffDataGpu.CopyToDevice(srcDiffData);

                        Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode,
                                                                             srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                             destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                        destDiffDataGpu.CopyToHost(destDiffData);
                    }
        }
        public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, float[] srcData, CudnnTensorDescriptor srcDiffTensor, float[] srcDiffData,
                                                             CudnnTensorDescriptor destTensor, float[] destData, CudnnTensorDescriptor destDiffTensor, float[] destDiffData)
        {
            Contract.Requires(pooling != null);
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, srcDiffTensor, destTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length))
            using (var srcDiffDataGpu = new CudaDeviceVariable<float>(srcDiffData.Length))
            using (var destDataGpu = new CudaDeviceVariable<float>(destData.Length))
            using (var destDiffDataGpu = new CudaDeviceVariable<float>(destDiffData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                srcDiffDataGpu.CopyToDevice(srcDiffData);
                destDataGpu.CopyToDevice(destData);

                Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle,
                                                                     srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                     destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                destDiffDataGpu.CopyToHost(destDiffData);
            }
        }
Example #17
 internal void TensorConvertToMatrix(IReadOnlyList <IDeviceMemoryPtr> matrixList, int tensorRows, int tensorColumns, int matrixRows, int matrixColumns, IDeviceMemoryPtr ret)
 {
     using (var devicePtr = new CudaDeviceVariable <CUdeviceptr>(matrixList.Count)) {
         devicePtr.CopyToDevice(matrixList.Select(m => m.DevicePointer).ToArray());
         _Use(_tensorConvertToMatrix, matrixRows, matrixColumns, k => k.Run(0, devicePtr.DevicePointer, ret.DevicePointer, tensorRows, tensorColumns, matrixRows, matrixColumns));
     }
 }
Example #18
        void InitCoordinatesAndVelocity()
        {
            m_d_PointsCoordinates = new CudaDeviceVariable <float>(3 * Target.MAX_CELLS);
            float[] m_h_pointsCoordinates = new float[3 * Target.MAX_CELLS];
            for (int c = 0; c < m_h_pointsCoordinates.Length; c++)
            {
                m_h_pointsCoordinates[c] = COORDINATES_MIN + (COORDINATES_MAX - COORDINATES_MIN) * (float)randomNumber.NextDouble();
                //m_h_pointsCoordinates[c] = 1.00f;
            }
            m_d_PointsCoordinates.CopyToDevice(m_h_pointsCoordinates);

            m_d_Velocity = new CudaDeviceVariable <float>(3 * Target.MAX_CELLS);
            float[] m_h_velocity = new float[3 * Target.MAX_CELLS];
            for (int c = 0; c < m_h_velocity.Length; c++)
            {
                m_h_velocity[c] = 0.00f;
            }
            m_d_Velocity.CopyToDevice(m_h_velocity);



            m_d_CubeOperation = new CudaDeviceVariable <float>(6 * 4 * 3);
            int[] operationMask = new int[6 * 4 * 3]
            {
                -1, -1, +1, -1, -1, -1, +1, -1, -1, +1, -1, +1,
                -1, +1, +1, -1, -1, +1, +1, -1, +1, +1, +1, +1,
                -1, +1, -1, -1, -1, -1, -1, -1, +1, -1, +1, +1,
                -1, +1, -1, -1, -1, -1, +1, -1, -1, +1, +1, -1,
                +1, +1, -1, +1, -1, -1, +1, -1, +1, +1, +1, +1,
                -1, +1, +1, -1, +1, -1, +1, +1, -1, +1, +1, +1
            };
            float[] m_h_CubeOperation = new float[6 * 4 * 3];
            for (int i = 0; i < operationMask.Length; i++)
            {
                m_h_CubeOperation[i] = (float)operationMask[i];
            }
            m_d_CubeOperation.CopyToDevice(m_h_CubeOperation);



            m_d_CubeTexCoordinates = new CudaDeviceVariable <float>(6 * 4 * 2);
            int[] texCoordinates = new int[6 * 4 * 2]
            {
                0, 0, 0, 1, 1, 1, 1, 0,
                0, 0, 0, 1, 1, 1, 1, 0,
                0, 0, 0, 1, 1, 1, 1, 0,
                1, 0, 1, 1, 0, 1, 0, 0,
                1, 0, 1, 1, 0, 1, 0, 0,
                0, 1, 0, 0, 1, 0, 1, 1
            };
            float[] m_h_CubeTexCoordinates = new float[6 * 4 * 2];
            for (int i = 0; i < texCoordinates.Length; i++)
            {
                m_h_CubeTexCoordinates[i] = (float)texCoordinates[i];
            }
            m_d_CubeTexCoordinates.CopyToDevice(m_h_CubeTexCoordinates);

            m_computeCubes2Kernel.SetConstantVariable("operationMaskConstant", m_h_CubeOperation);
            m_computeCubes2Kernel.SetConstantVariable("cubeTexCoordinatesConstant", m_h_CubeTexCoordinates);
        }
Example #19
        internal CudaDeviceVariable <CUdeviceptr> GetDeviceMemoryPtr()
        {
            var ret = new CudaDeviceVariable <CUdeviceptr>(Count);

            ret.CopyToDevice(_ptr);
            return(ret);
        }
        public void Backward(CudnnPoolingDescriptor pooling, CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData,
                             CudnnTensorDescriptor destTensor, double[] destData, CudnnTensorDescriptor destDiffTensor, double[] destDiffData)
        {
            Contract.Requires(pooling != null);
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable <double>(srcData.Length))
                using (var srcDiffDataGpu = new CudaDeviceVariable <double>(srcDiffData.Length))
                    using (var destDataGpu = new CudaDeviceVariable <double>(destData.Length))
                        using (var destDiffDataGpu = new CudaDeviceVariable <double>(destDiffData.Length))
                        {
                            srcDataGpu.CopyToDevice(srcData);
                            srcDiffDataGpu.CopyToDevice(srcDiffData);
                            destDataGpu.CopyToDevice(destData);

                            Invoke(() => CudnnNativeMethods.cudnnPoolingBackward(handle, pooling.Handle,
                                                                                 srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                                 destTensor.Handle, destDataGpu.DevicePointer, destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                            destDiffDataGpu.CopyToHost(destDiffData);
                        }
        }
Example #21
        protected override void Execute()
        {
            if (firstExec)
            {
                firstExec = false;
                m_kernel_test.SetupExecution(TextureHeight * TextureWidth);
                m_kernel_test.Run(TextureWidth, TextureHeight, VBODevicePointer);
            }

            if (Mode == BoxPlotObserverMinMaxMode.Dynamic)
            {
                MinValue = Target.Output.Host.Min();
                MaxValue = Target.Output.Host.Max();
            }

            float range = MaxValue - MinValue;

            int[] box = new int[Target.Output.Count];
            for (int i = 0; i < Target.Output.Count; i++)
            {
                box[i] = (int)((Target.Output.Host[i] - MinValue) / range * (float)(boxPlotHeight - 1));
            }

            m_box = new CudaDeviceVariable <int>(Target.Output.Count);
            m_box.CopyToDevice(box);

            for (int i = 0; i < Target.OutputRowsN; i++)
            {
                int xpos = i / rowHintV;
                int ypos = i % rowHintV;
                drawBoxPlotAtPostion(m_streams[i], i * 5, horizontalGapV + xpos * (horizontalGapV + boxPlotWidth), verticalGapV + ypos * (verticalGapV + boxPlotHeight));
            }
            Target.Output.SafeCopyToDevice();
        }
Example #22
        public IVector Column(int index)
        {
            Debug.Assert(IsValid);
            var ret = new CudaDeviceVariable <float>(_rows);

            ret.CopyToDevice(_data, index * _rows * sizeof(float), 0, _rows * sizeof(float));
            return(new GpuVector(_cuda, _rows, ret));
        }
Example #23
        public IVector Clone()
        {
            Debug.Assert(IsValid);
            var data = new CudaDeviceVariable <float>(_size);

            data.CopyToDevice(_data);
            return(new GpuVector(_cuda, _size, data));
        }
        private void assignRandoms()
        {
            var hostPhis1 = RandomGenerator.GetInstance().RandomVector(DimensionsCount * ParticlesCount, 0, 1);
            var hostPhis2 = RandomGenerator.GetInstance().RandomVector(DimensionsCount * ParticlesCount, 0, 1);

            _phis1.CopyToDevice(hostPhis1);
            _phis2.CopyToDevice(hostPhis2);
        }
Example #25
        public IMatrix ToRowMatrix(int numRows = 1)
        {
            Debug.Assert(IsValid);
            var ret = new CudaDeviceVariable <float>(_data.Size);

            ret.CopyToDevice(_data);
            return(new GpuMatrix(_cuda, 1, _size, ret));
        }
Example #26
        public IMatrix Clone()
        {
            Debug.Assert(IsValid);
            var ret = new CudaDeviceVariable <float>(_rows * _columns);

            ret.CopyToDevice(_data);
            return(new GpuMatrix(_cuda, _rows, _columns, ret));
        }
Example #27
        internal CudaDeviceVariable <float> PointwiseDivide(CudaDeviceVariable <float> a, CudaDeviceVariable <float> b, int size)
        {
            CudaDeviceVariable <float> ret = new CudaDeviceVariable <float>(size);

            ret.CopyToDevice(b);
            _Use(_pointwiseDivide, size, k => k.Run(BLOCK_DIM2 * sizeof(float), a.DevicePointer, ret.DevicePointer, size));
            return(ret);
        }
Example #28
        public void CopyFrom(IVector vector)
        {
            Debug.Assert(vector.IsValid);
            var other = (GpuVector)vector;

            Debug.Assert(other._size == _size);

            _data.CopyToDevice(other._data);
        }
Example #29
        public IVector GetColumnSegment(int columnIndex, int rowIndex, int length)
        {
            Debug.Assert(IsValid);

            var ret = new CudaDeviceVariable <float>(length);

            ret.CopyToDevice(_data, ((columnIndex * _rows) + rowIndex) * sizeof(float), 0, length * sizeof(float));

            return(new GpuVector(_cuda, length, ret));
        }
Example #30
 public DeviceMemoryPtrList(IReadOnlyList <IReadOnlyList <IDeviceMemoryPtr> > data)
 {
     _ptrList = data.Select(d => {
         var ptr = new CudaDeviceVariable <CUdeviceptr>(d.Count);
         ptr.CopyToDevice(d.Select(m => m.DevicePointer).ToArray());
         return(ptr);
     }).ToList();
     _ptrToPtrList = new CudaDeviceVariable <CUdeviceptr>(data.Count);
     _ptrToPtrList.CopyToDevice(_ptrList.Select(m => m.DevicePointer).ToArray());
 }
Example #31
        public override float Inference(CudaDeviceVariable <float> input)
        {
            _input = input;

            NPPImage_32fC1 tempConv = new NPPImage_32fC1(_tempConvolution.DevicePointer, InWidth, InHeight, InWidth * sizeof(float));

            for (int outLayer = 0; outLayer < OutChannels; outLayer++)
            {
                SizeT          offsetOut        = outLayer * OutWidth * OutHeight * sizeof(float);
                CUdeviceptr    ptrWithOffsetOut = _z.DevicePointer + offsetOut;
                NPPImage_32fC1 imgOut           = new NPPImage_32fC1(ptrWithOffsetOut, OutWidth, OutHeight, OutWidth * sizeof(float));
                imgOut.Set(0);

                for (int inLayer = 0; inLayer < InChannels; inLayer++)
                {
                    SizeT          offsetIn        = inLayer * InWidth * InHeight * sizeof(float);
                    CUdeviceptr    ptrWithOffsetIn = _input.DevicePointer + offsetIn;
                    NPPImage_32fC1 imgIn           = new NPPImage_32fC1(ptrWithOffsetIn, InWidth, InHeight, InWidth * sizeof(float));

                    imgIn.SetRoi(_filterX / 2, _filterY / 2, InWidth - _filterX + 1, InHeight - _filterY + 1);

                    SizeT offsetFilter = (outLayer * InChannels * _filterX * _filterY + inLayer * _filterX * _filterY) * sizeof(float);
                    CudaDeviceVariable <float> filter = new CudaDeviceVariable <float>(_weights.DevicePointer + offsetFilter, false, _filterX * _filterY * sizeof(float));

                    imgIn.Filter(tempConv, filter, new NppiSize(_filterX, _filterY), new NppiPoint(_filterX / 2, _filterY / 2));
                    imgOut.Add(tempConv);
                }
                imgOut.Add(bHost[outLayer]);
            }

            switch (_activation)
            {
            case Activation.None:
                _y.CopyToDevice(_z);
                break;

            case Activation.Relu:
                //_aRelu is set to 0!
                _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch);
                break;

            case Activation.PRelu:
                _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch);
                break;

            case Activation.LeakyRelu:
                _KernelPReluForward.RunSafe(_z, _aRelu, _y, _outWidth * _outHeight, _outChannels, _batch);
                break;

            default:
                break;
            }

            return(_nextLayer.Inference(_y));
        }
Example #32
 static void Test(byte[] ptxFile)
 {
     const int size = 16;
     var context = new CudaContext();
     var kernel = context.LoadKernelPTX(ptxFile, "kernel");
     var memory = context.AllocateMemory(4 * size);
     var gpuMemory = new CudaDeviceVariable<int>(memory);
     var cpuMemory = new int[size];
     for (var i = 0; i < size; i++)
         cpuMemory[i] = i - 2;
     gpuMemory.CopyToDevice(cpuMemory);
     kernel.BlockDimensions = 4;
     kernel.GridDimensions = 4;
     kernel.Run(memory);
     gpuMemory.CopyToHost(cpuMemory);
     for (var i = 0; i < size; i++)
         Console.WriteLine("{0} = {1}", i, cpuMemory[i]);
 }
        public void BackwardBias(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor destTensor, double[] destData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, destTensor);

            using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length))
            using (var destDataGpu = new CudaDeviceVariable<double>(destData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);

                Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardBias(handle, srcTensor.Handle, srcDataGpu.DevicePointer, destTensor.Handle, destDataGpu.DevicePointer, accumulate));
                destDataGpu.CopyToHost(destData);
            }
        }
Example #35
        static void Main(string[] args)
        {
            int N = 275;

            float[] h_A;
            float[] h_B;
            float[] h_C;
            float[] h_C_ref;

            CudaDeviceVariable<float> d_A;
            CudaDeviceVariable<float> d_B;
            CudaDeviceVariable<float> d_C;
            float alpha = 1.0f;
            float beta = 0.0f;
            int n2 = N * N;
            int i;
            float error_norm;
            float ref_norm;
            float diff;
            CudaBlas handle;

            /* Initialize CUBLAS */
            Console.WriteLine("simpleCUBLAS test running.");

            handle = new CudaBlas();

            /* Allocate host memory for the matrices */
            h_A = new float[n2];
            h_B = new float[n2];
            //h_C = new float[n2];
            h_C_ref = new float[n2];

            Random rand = new Random(0);
            /* Fill the matrices with test data */
            for (i = 0; i < n2; i++)
            {
                h_A[i] = (float)rand.NextDouble();
                h_B[i] = (float)rand.NextDouble();
                //h_C[i] = (float)rand.NextDouble();
            }

            /* Allocate device memory for the matrices */
            d_A = new CudaDeviceVariable<float>(n2);
            d_B = new CudaDeviceVariable<float>(n2);
            d_C = new CudaDeviceVariable<float>(n2);

            /* Initialize the device matrices with the host matrices */
            d_A.CopyToDevice(h_A);
            d_B.CopyToDevice(h_B);
            //d_C.CopyToDevice(h_C);

            /* Performs operation using plain C code */
            simple_sgemm(N, alpha, h_A, h_B, beta, h_C_ref);

            /* Performs operation using cublas */
            handle.Gemm(Operation.NonTranspose, Operation.NonTranspose, N, N, N, alpha, d_A, N, d_B, N, beta, d_C, N);

            /* Allocate host memory for reading back the result from device memory */
            h_C = d_C;

            /* Check result against reference */
            error_norm = 0;
            ref_norm = 0;

            for (i = 0; i < n2; ++i)
            {
                diff = h_C_ref[i] - h_C[i];
                error_norm += diff * diff;
                ref_norm += h_C_ref[i] * h_C_ref[i];
            }

            error_norm = (float)Math.Sqrt((double)error_norm);
            ref_norm = (float)Math.Sqrt((double)ref_norm);

            if (Math.Abs(ref_norm) < 1e-7)
            {
                Console.WriteLine("!!!! reference norm is 0");
                return;
            }

            /* Memory clean up */
            d_A.Dispose();
            d_B.Dispose();
            d_C.Dispose();

            /* Shutdown */
            handle.Dispose();

            if (error_norm / ref_norm < 1e-6f)
            {
                Console.WriteLine("simpleCUBLAS test passed.");
                return;
            }
            else
            {
                Console.WriteLine("simpleCUBLAS test failed.");
                return;
            }
        }
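simple_sgemm is called above but not shown in the snippet. A hedged sketch of a plain reference implementation consistent with that call, assuming column-major storage (as cuBLAS uses) and square N x N matrices, computing C = alpha * A * B + beta * C:

        // Reference GEMM sketch (column-major indexing: element (i, j) is stored at [j * n + i])
        static void simple_sgemm(int n, float alpha, float[] A, float[] B, float beta, float[] C)
        {
            for (int j = 0; j < n; j++)
            {
                for (int i = 0; i < n; i++)
                {
                    float sum = 0;
                    for (int k = 0; k < n; k++)
                    {
                        sum += A[k * n + i] * B[j * n + k];   // A(i, k) * B(k, j)
                    }
                    C[j * n + i] = alpha * sum + beta * C[j * n + i];
                }
            }
        }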
        public void BackwardFilter(CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor diffTensor, double[] diffData, CudnnConvolutionDescriptor convolution, CudnnFilterDescriptor gradient, double[] gradientData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(diffTensor != null);
            Contract.Requires(diffData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(gradient != null);
            Contract.Requires(gradientData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, diffTensor, gradient);

            using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length))
            using (var diffDataGpu = new CudaDeviceVariable<double>(diffData.Length))
            using (var gradientDataGpu = new CudaDeviceVariable<double>(gradientData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                diffDataGpu.CopyToDevice(diffData);

                Invoke(() => CudnnNativeMethods.cudnnConvolutionBackwardFilter(handle, srcTensor.Handle, srcDataGpu.DevicePointer, diffTensor.Handle, diffDataGpu.DevicePointer, convolution.Handle, gradient.Handle, gradientDataGpu.DevicePointer, accumulate));
                gradientDataGpu.CopyToHost(gradientData);
            }
        }
        public void Forward(CudnnTensorDescriptor srcTensor, float[] srcData, CudnnFilterDescriptor filter, float[] filterData, CudnnConvolutionDescriptor convolution, CudnnTensorDescriptor destTensor, float[] destData, CudnnAccumulateResult accumulate)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(filter != null);
            Contract.Requires(filterData != null);
            Contract.Requires(convolution != null);
            Contract.Requires(destTensor != null);
            Contract.Requires(destData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Float, srcTensor, destTensor, filter);

            using (var srcDataGpu = new CudaDeviceVariable<float>(srcData.Length))
            using (var filterDataGpu = new CudaDeviceVariable<float>(filterData.Length))
            using (var destDataGpu = new CudaDeviceVariable<float>(destData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                filterDataGpu.CopyToDevice(filterData);

                Invoke(() => CudnnNativeMethods.cudnnConvolutionForward(handle, srcTensor.Handle, srcDataGpu.DevicePointer, filter.Handle, filterDataGpu.DevicePointer, convolution.Handle, destTensor.Handle, destDataGpu.DevicePointer, accumulate));
                destDataGpu.CopyToHost(destData);
            }
        }
        public void Backward(CudnnSoftmaxAlgorithm algorithm, CudnnSoftmaxMode mode,
                             CudnnTensorDescriptor srcTensor, double[] srcData, CudnnTensorDescriptor srcDiffTensor, double[] srcDiffData,
                             CudnnTensorDescriptor destDiffTensor, double[] destDiffData)
        {
            Contract.Requires(srcTensor != null);
            Contract.Requires(srcData != null);
            Contract.Requires(srcDiffTensor != null);
            Contract.Requires(srcDiffData != null);
            Contract.Requires(destDiffTensor != null);
            Contract.Requires(destDiffData != null);

            ThrowIfNotInitialized();
            CheckIfCompatible(CudnnType.Double, srcTensor, srcDiffTensor, destDiffTensor);

            using (var srcDataGpu = new CudaDeviceVariable<double>(srcData.Length))
            using (var srcDiffDataGpu = new CudaDeviceVariable<double>(srcDiffData.Length))
            using (var destDiffDataGpu = new CudaDeviceVariable<double>(destDiffData.Length))
            {
                srcDataGpu.CopyToDevice(srcData);
                srcDiffDataGpu.CopyToDevice(srcDiffData);

                Invoke(() => CudnnNativeMethods.cudnnSoftmaxBackward(handle, algorithm, mode,
                                                                     srcTensor.Handle, srcDataGpu.DevicePointer, srcDiffTensor.Handle, srcDiffDataGpu.DevicePointer,
                                                                     destDiffTensor.Handle, destDiffDataGpu.DevicePointer));
                destDiffDataGpu.CopyToHost(destDiffData);
            }
        }
Example #39
        static void Main(string[] args)
        {
            int SIGNAL_SIZE = 50;
            int FILTER_KERNEL_SIZE = 11;

            Console.WriteLine("[simpleCUFFT] is starting...");

            var assembly = Assembly.GetExecutingAssembly();
            var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx";

            CudaContext ctx = new CudaContext(0);
            CudaKernel ComplexPointwiseMulAndScale;
            string[] liste = assembly.GetManifestResourceNames();
            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale");
            }

            // Allocate host memory for the signal
            cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplication in the reference host code...

            Random rand = new Random(0);
            // Initialize the memory for the signal
            for (int i = 0; i < SIGNAL_SIZE; ++i)
            {
                h_signal[i].real = (float)rand.NextDouble();
                h_signal[i].imag = 0;
            }

            // Allocate host memory for the filter
            cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE];

            // Initialize the memory for the filter
            for (int i = 0; i < FILTER_KERNEL_SIZE; ++i)
            {
                h_filter_kernel[i].real = (float)rand.NextDouble();
                h_filter_kernel[i].imag = 0;
            }

            // Pad signal and filter kernel
            cuFloatComplex[] h_padded_signal = null;
            cuFloatComplex[] h_padded_filter_kernel = null;
            int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE,
                                   h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE);
            int mem_size = (int)cuFloatComplex.SizeOf * new_size;

            // Allocate device memory for signal
            CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size);
            // Copy host memory to device
            d_signal.CopyToDevice(h_padded_signal);

            // Allocate device memory for filter kernel
            CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size);

            // Copy host memory to device
            d_filter_kernel.CopyToDevice(h_padded_filter_kernel);

            // CUFFT plan simple API
            CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1);

            // Transform signal and kernel
            Console.WriteLine("Transforming signal cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Forward);
            plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward);

            // Multiply the coefficients together and normalize the result
            Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>");
            ComplexPointwiseMulAndScale.BlockDimensions = 256;
            ComplexPointwiseMulAndScale.GridDimensions = 32;
            ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size);

            // Transform signal back
            Console.WriteLine("Transforming signal back cufftExecC2C");
            plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse);

            // Copy device memory to host
            cuFloatComplex[] h_convolved_signal = d_signal;

            // Allocate host memory for the convolution result
            cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE];

            // Convolve on the host
            Convolve(h_signal, SIGNAL_SIZE,
                     h_filter_kernel, FILTER_KERNEL_SIZE,
                     h_convolved_signal_ref);

            // check result
            bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f);

            //Destroy CUFFT context
            plan.Dispose();

            // cleanup memory
            d_filter_kernel.Dispose();
            d_signal.Dispose();
            ctx.Dispose();

            if (bTestResult)
            {
                Console.WriteLine("Test Passed");
            }
            else
            {
                Console.WriteLine("Test Failed");
            }
        }
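PadData, Convolve and sdkCompareL2fe are used above but not included in the snippet. As one example, a hedged sketch of a relative-L2-error check consistent with how sdkCompareL2fe is called (the real SDK helper compares raw float arrays; this version compares both complex components):

        // Hedged sketch: returns true when ||reference - data|| / ||reference|| < epsilon
        static bool sdkCompareL2fe(cuFloatComplex[] reference, cuFloatComplex[] data, float epsilon)
        {
            double errorSum = 0, refSum = 0;
            int n = Math.Min(reference.Length, data.Length);
            for (int i = 0; i < n; i++)
            {
                double diffRe = reference[i].real - data[i].real;
                double diffIm = reference[i].imag - data[i].imag;
                errorSum += diffRe * diffRe + diffIm * diffIm;
                refSum   += reference[i].real * reference[i].real + reference[i].imag * reference[i].imag;
            }
            if (refSum < 1e-7)
            {
                return false;   // reference norm too small for a meaningful relative comparison
            }
            return Math.Sqrt(errorSum) / Math.Sqrt(refSum) < epsilon;
        }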