// Verifies nppsSet_32s: fills a 1024-element device buffer with the value 75,
        // copies it back to the host and checks every element.
        // The device buffer and the pinned handle are released even when an assertion fails.
        public void nppsSet_32s_test()
        {
            int    length = 1024;
            int    value  = 75;
            IntPtr ptr    = Npps.nppsMalloc_32s(length);

            int[]    result   = new int[length];
            GCHandle gcHandle = default(GCHandle);

            try
            {
                // Pin the managed array so its address stays valid during the native copy.
                gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
                IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);
                UInt64 size     = Convert.ToUInt64(sizeof(int) * result.Length);

                NppStatus status = Npps.nppsSet_32s(value, ptr, length);

                if (status != NppStatus.NPP_SUCCESS)
                {
                    Assert.Fail(String.Format("Fail {0}", status.ToString()));
                }

                cudaError cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, ptr, size, cudaMemcpyKind.DeviceToHost);

                if (cudaStatus != cudaError.cudaSuccess)
                {
                    Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
                }

                for (int i = 0; i < result.Length; i++)
                {
                    Assert.AreEqual(value, result[i]);
                }
            }
            finally
            {
                // Assert.* throws on failure; clean up here so a failing test does not leak.
                if (gcHandle.IsAllocated)
                {
                    gcHandle.Free();
                }

                Npps.nppsFree(ptr);
            }
        }
Example #2
0
        // Registers a block of unmanaged host memory with CUDA, round-trips float data
        // through it with Marshal.Copy, then unregisters it.
        // The registration and the allocation are released even when an assertion fails.
        public void cudaHostRegister_cudaHostUnRegister_test()
        {
            var    length     = 1024 * 1024;
            var    size       = length * sizeof(float);
            IntPtr ptr        = Marshal.AllocHGlobal(size);
            bool   registered = false;

            try
            {
                // NOTE(review): this asserts that VirtualLock reports false for the buffer.
                // VirtualLock is declared outside this view; confirm the expectation is intentional.
                Assert.IsFalse(VirtualLock(ptr, (ulong)size));

                var status = CudaRuntimeApi.cudaHostRegister(ptr, (ulong)size, DriverTypes.cudaHostRegisterDefault);

                Assert.AreEqual(cudaError.cudaSuccess, status);
                registered = true;
                Console.WriteLine($"ptr : {ptr}");

                float[] test   = new float[length];
                float[] result = new float[length];

                for (int i = 0; i < length; i++)
                {
                    test[i] = Convert.ToSingle(i);
                }

                // Write the pattern into the registered memory and read it straight back.
                Marshal.Copy(test, 0, ptr, length);
                Marshal.Copy(ptr, result, 0, length);

                for (int i = 0; i < length; i++)
                {
                    Assert.AreEqual(test[i], result[i]);
                }

                status     = CudaRuntimeApi.cudaHostUnregister(ptr);
                registered = false; // do not unregister a second time in finally
                Assert.AreEqual(cudaError.cudaSuccess, status);
            }
            finally
            {
                if (registered)
                {
                    CudaRuntimeApi.cudaHostUnregister(ptr);
                }

                Marshal.FreeHGlobal(ptr);
            }
        }
Example #3
0
        // Allocates host memory through cudaHostAlloc, round-trips float data through it
        // with Marshal.Copy, then releases it with cudaFreeHost.
        // The allocation is released even when an assertion fails.
        public void cudaHostAlloc_cudaFreeHost_test()
        {
            IntPtr ptr    = IntPtr.Zero;
            var    length = 1024 * 1024;
            var    size   = length * sizeof(float);

            try
            {
                var status = CudaRuntimeApi.cudaHostAlloc(ref ptr, (ulong)size, 0);

                Assert.AreEqual(cudaError.cudaSuccess, status);
                Console.WriteLine($"ptr : {ptr}");

                float[] test   = new float[length];
                float[] result = new float[length];

                for (int i = 0; i < length; i++)
                {
                    test[i] = Convert.ToSingle(i);
                }

                Marshal.Copy(test, 0, ptr, length);
                Marshal.Copy(ptr, result, 0, length);

                for (int i = 0; i < length; i++)
                {
                    Assert.AreEqual(test[i], result[i]);
                }

                status = CudaRuntimeApi.cudaFreeHost(ptr);
                ptr    = IntPtr.Zero; // already released; skip the fallback free in finally
                Assert.AreEqual(cudaError.cudaSuccess, status);
            }
            finally
            {
                if (ptr != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFreeHost(ptr);
                }
            }
        }
Example #4
0
        // Calls cudaGetDeviceCount, expects cudaSuccess and prints the reported count.
        public void cudaGetDeviceCount_test()
        {
            int count  = 0;
            var status = CudaRuntimeApi.cudaGetDeviceCount(ref count);

            // Expected value first so a failure message reads correctly.
            Assert.AreEqual(cudaError.cudaSuccess, status);
            Console.WriteLine("cuda device count : {0}", count);
        }
        // Verifies nppiAddC_32f_C3R: adds the per-channel constants {0, 1, 2} to an
        // all-zero 256x256 three-channel float image and checks that every output pixel
        // equals the constants.
        // Device images and pinned handles are released even when an assertion fails.
        public void nppiAddC_32f_C3R_test()
        {
            NppStatus nppStatus;
            cudaError cudaStatus;
            int       width    = 256;
            int       height   = 256;
            int       channel  = 3;
            // Row stride of the tightly packed host layout. The device buffers are addressed
            // with the same stride because they are filled and read with flat cudaMemcpy calls.
            int       rowBytes = width * channel * sizeof(float);
            // Passed by ref to the allocator, as in the original call; the value it leaves
            // here is not used afterwards.
            int       stepInBytes = rowBytes;

            IntPtr d_src = Nppi.nppiMalloc_32f_C3(width, height, ref stepInBytes);
            IntPtr d_dst = Nppi.nppiMalloc_32f_C3(width, height, ref stepInBytes);

            GCHandle gchInput  = default(GCHandle);
            GCHandle gchResult = default(GCHandle);

            try
            {
                float[] input     = new float[width * height * channel];
                float[] result    = new float[width * height * channel];
                float[] aconstant = Array.ConvertAll(Enumerable.Range(0, 3).ToArray(), Convert.ToSingle);

                UInt64 size = Convert.ToUInt64(sizeof(float) * result.Length);

                gchInput = GCHandle.Alloc(input, GCHandleType.Pinned);
                IntPtr h_input = Marshal.UnsafeAddrOfPinnedArrayElement(input, 0);

                cudaStatus = CudaRuntimeApi.cudaMemcpy(d_src, h_input, size, cudaMemcpyKind.HostToDevice);
                if (cudaStatus != cudaError.cudaSuccess)
                {
                    Assert.Fail(cudaStatus.ToString());
                }

                NppiSize roi;

                roi.width  = width;
                roi.height = height; // was "width": only correct while the image is square

                nppStatus = Nppi.nppiAddC_32f_C3R(d_src, rowBytes, aconstant, d_dst, rowBytes, roi);
                if (nppStatus != NppStatus.NPP_SUCCESS)
                {
                    Assert.Fail(nppStatus.ToString());
                }

                gchResult = GCHandle.Alloc(result, GCHandleType.Pinned);
                IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);

                cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, d_dst, size, cudaMemcpyKind.DeviceToHost);
                if (cudaStatus != cudaError.cudaSuccess)
                {
                    Assert.Fail(cudaStatus.ToString());
                }

                // The input is all zeros, so each output sample equals its channel's constant.
                for (int i = 0; i < result.Length; i++)
                {
                    int aId = i % aconstant.Length;
                    Assert.AreEqual(aconstant[aId], result[i]);
                }
            }
            finally
            {
                if (gchInput.IsAllocated)
                {
                    gchInput.Free();
                }

                if (gchResult.IsAllocated)
                {
                    gchResult.Free();
                }

                Nppi.nppiFree(d_src);
                Nppi.nppiFree(d_dst);
            }
        }
Example #6
0
        // Calls cudaMemGetInfo, expects cudaSuccess and prints the reported free and total sizes.
        public void cudaMemGetInfo_test()
        {
            ulong free   = 0;
            ulong total  = 0;
            var   status = CudaRuntimeApi.cudaMemGetInfo(ref free, ref total);

            // Expected value first so a failure message reads correctly.
            Assert.AreEqual(cudaError.cudaSuccess, status);
            Console.WriteLine($"free : {free}, total : {total}");
        }
Example #7
0
        // Creates a CUDA stream and destroys it again, expecting cudaSuccess from both calls.
        public void cudaStreamCreate_cudaStreamDestroy_test()
        {
            IntPtr stream = IntPtr.Zero;
            var    status = CudaRuntimeApi.cudaStreamCreate(ref stream);

            // Expected value first so a failure message reads correctly.
            Assert.AreEqual(cudaError.cudaSuccess, status);

            status = CudaRuntimeApi.cudaStreamDestroy(stream);
            Assert.AreEqual(cudaError.cudaSuccess, status);
        }
        // Verifies nppsConvert_32f16s_Sfs: fills a float device buffer with 75, converts it
        // to 16-bit integers (round-to-nearest, scale factor 0) and checks that every
        // element read back equals 75.
        // Device buffers and the pinned handle are released even when an assertion fails.
        public void nppsConvert_32f16s_test()
        {
            NppStatus nppStatus;
            cudaError cudaStatus;
            int       length   = 1024;
            float     value    = 75.0F;
            // Same type as the result elements, so the comparison needs no implicit conversion.
            short     expected = 75;

            IntPtr d_src = Npps.nppsMalloc_32f(length);
            IntPtr d_dst = Npps.nppsMalloc_16s(length);

            short[]  result   = new short[length];
            GCHandle gcHandle = default(GCHandle);

            try
            {
                gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
                IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);
                UInt64 size     = Convert.ToUInt64(sizeof(short) * result.Length);

                nppStatus = Npps.nppsSet_32f(value, d_src, length);
                if (nppStatus != NppStatus.NPP_SUCCESS)
                {
                    Assert.Fail(String.Format("Fail {0}", nppStatus.ToString()));
                }

                nppStatus = Npps.nppsConvert_32f16s_Sfs(d_src, d_dst, length, NppRoundMode.NPP_RND_NEAR, 0);
                if (nppStatus != NppStatus.NPP_SUCCESS)
                {
                    Assert.Fail(String.Format("Fail {0}", nppStatus.ToString()));
                }

                cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, d_dst, size, cudaMemcpyKind.DeviceToHost);
                if (cudaStatus != cudaError.cudaSuccess)
                {
                    Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
                }

                for (int i = 0; i < result.Length; i++)
                {
                    Assert.AreEqual(expected, result[i]);
                }
            }
            finally
            {
                if (gcHandle.IsAllocated)
                {
                    gcHandle.Free();
                }

                Npps.nppsFree(d_src);
                Npps.nppsFree(d_dst);
            }
        }
Example #9
0
        // Enumerates the CUDA devices and, for each one, selects it, queries the driver and
        // runtime versions and the device properties, and prints a short summary.
        // Every runtime call is expected to return cudaSuccess.
        public void cudaSetDevice_cudaGetDeviceProperties_cudaDriverGetVersion_cudaRuntimeGetVersion_test()
        {
            int deviceCount = 0;
            var status      = CudaRuntimeApi.cudaGetDeviceCount(ref deviceCount);

            Assert.AreEqual(cudaError.cudaSuccess, status);

            // This function call returns 0 if there are no CUDA capable devices.
            if (deviceCount == 0)
            {
                Console.WriteLine("There are no available device(s) that support CUDA");
            }
            else
            {
                Console.WriteLine("Detected {0} CUDA Capable device(s)", deviceCount);
            }

            for (int i = 0; i < deviceCount; i++)
            {
                status = CudaRuntimeApi.cudaSetDevice(i);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                int driverVersion  = 0;
                int runtimeVersion = 0;
                status = CudaRuntimeApi.cudaDriverGetVersion(ref driverVersion);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaRuntimeGetVersion(ref runtimeVersion);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                // The result of this call was previously ignored; a failure would have
                // printed the fields of a default-constructed structure.
                var deviceProp = new cudaDeviceProp();
                status = CudaRuntimeApi.cudaGetDeviceProperties(ref deviceProp, i);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                var uuid    = HexStringFromByteArray(UnsignedBytesFromSignedBytes(deviceProp.uuid.bytes));
                var devName = Encoding.Default.GetString(UnsignedBytesFromSignedBytes(deviceProp.name)).TrimEnd('\0');
                Console.WriteLine("\nDevice {0}: \"{1}\", uuid = {2}", i, devName, uuid);
                // Versions are printed as (v / 1000).((v % 100) / 10).
                Console.WriteLine("  CUDA Driver Version / Runtime Version          {0}.{1} / {2}.{3}",
                                  driverVersion / 1000, (driverVersion % 100) / 10,
                                  runtimeVersion / 1000, (runtimeVersion % 100) / 10);
                Console.WriteLine("  CUDA Capability Major/Minor version number:    {0}.{1}",
                                  deviceProp.major, deviceProp.minor);
                Console.WriteLine(
                    "  Total amount of global memory:                 {0:0} MBytes ({1} bytes)",
                    Convert.ToSingle(deviceProp.totalGlobalMem / 1048576.0f), deviceProp.totalGlobalMem);
            }
        }
Example #10
0
        // Allocates a 2048-byte device buffer, uploads a zeroed host array, overwrites the
        // buffer with cudaMemset(5), downloads it and checks that every byte equals 5.
        // Device memory and pinned handles are released even when an assertion fails.
        public void cudaMalloc_cudaFree_cudaMemcpy_test_cudaMemset_test()
        {
            int  length    = 1024 * 2;
            byte testValue = 5;

            byte[] test   = new byte[length];
            byte[] result = new byte[length];

            IntPtr   d_ptr     = IntPtr.Zero;
            var      size      = length * sizeof(byte);
            GCHandle gchTest   = default(GCHandle);
            GCHandle gchResult = default(GCHandle);

            try
            {
                var status = CudaRuntimeApi.cudaMalloc(ref d_ptr, (ulong)size);

                Assert.AreEqual(cudaError.cudaSuccess, status);
                Console.WriteLine($"ptr : {d_ptr}");

                gchTest   = GCHandle.Alloc(test, GCHandleType.Pinned);
                gchResult = GCHandle.Alloc(result, GCHandleType.Pinned);
                IntPtr h_ptrTest   = Marshal.UnsafeAddrOfPinnedArrayElement(test, 0);
                IntPtr h_ptrResult = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);

                status = CudaRuntimeApi.cudaMemcpy(d_ptr, h_ptrTest, (ulong)size, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                status = CudaRuntimeApi.cudaMemset(d_ptr, testValue, (ulong)size);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                status = CudaRuntimeApi.cudaMemcpy(h_ptrResult, d_ptr, (ulong)size, cudaMemcpyKind.DeviceToHost);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                for (int i = 0; i < length; i++)
                {
                    Assert.AreEqual(testValue, result[i]);
                }

                status = CudaRuntimeApi.cudaFree(d_ptr);
                d_ptr  = IntPtr.Zero; // already released; skip the fallback free in finally
                Assert.AreEqual(cudaError.cudaSuccess, status);
            }
            finally
            {
                if (d_ptr != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_ptr);
                }

                if (gchTest.IsAllocated)
                {
                    gchTest.Free();
                }

                if (gchResult.IsAllocated)
                {
                    gchResult.Free();
                }
            }
        }
Example #11
0
        // Calls cudaDeviceSynchronize and expects cudaSuccess.
        public void cudaDeviceSynchronize_test()
        {
            var status = CudaRuntimeApi.cudaDeviceSynchronize();

            // Expected value first so a failure message reads correctly.
            Assert.AreEqual(cudaError.cudaSuccess, status);
        }
Example #12
0
        // Calls cudaDeviceReset and expects cudaSuccess.
        public void cudaDeviceReset_test()
        {
            var status = CudaRuntimeApi.cudaDeviceReset();

            // Expected value first so a failure message reads correctly.
            Assert.AreEqual(cudaError.cudaSuccess, status);
        }
Example #13
0
        // Verifies cublasCgemm_v2 against a host reference: computes
        // C = alpha * A * B + beta * C for random single-precision complex matrices with
        // small integer components and compares the device result element by element
        // with the same expression evaluated through Matrix<Complex32>.
        // The matrix arrays are handed to cuBLAS and to the host matrices unchanged, with
        // the row counts as leading dimensions.
        // The cuBLAS handle, device buffers and all pinned handles are released even when an
        // assertion fails.
        public void cublasCgemm_test()
        {
            int devCount = 0;
            var status   = CudaRuntimeApi.cudaGetDeviceCount(ref devCount);

            Assert.AreEqual(cudaError.cudaSuccess, status);

            int devId = 0;

            status = CudaRuntimeApi.cudaSetDevice(devId);
            Assert.AreEqual(cudaError.cudaSuccess, status);

            Random rand   = new Random();
            int    rows_a = rand.Next(2, 10);
            int    cols_a = rand.Next(2, 10);

            int rows_b = cols_a;
            int cols_b = rand.Next(2, 10);

            int rows_c  = rows_a;
            int cols_c  = cols_b;
            var A       = new float2[rows_a * cols_a];
            var B       = new float2[rows_b * cols_b];
            var C       = new float2[rows_c * cols_c];
            var resultC = new float2[rows_c * cols_c];

            // Complex32 mirrors of the inputs, used for the host reference computation.
            var cA       = new Complex32[rows_a * cols_a];
            var cB       = new Complex32[rows_b * cols_b];
            var cC       = new Complex32[rows_c * cols_c];
            var cResultC = new Complex32[rows_c * cols_c];

            for (int i = 0; i < A.Length; i++)
            {
                var real = Convert.ToSingle(rand.Next(0, 10));
                var imag = Convert.ToSingle(rand.Next(0, 10));
                A[i]  = new float2(real, imag);
                cA[i] = new Complex32(real, imag);
            }

            for (int i = 0; i < B.Length; i++)
            {
                var real = Convert.ToSingle(rand.Next(0, 10));
                var imag = Convert.ToSingle(rand.Next(0, 10));
                B[i]  = new float2(real, imag);
                cB[i] = new Complex32(real, imag);
            }

            for (int i = 0; i < C.Length; i++)
            {
                var real = Convert.ToSingle(rand.Next(0, 10));
                var imag = Convert.ToSingle(rand.Next(0, 10));
                C[i]  = new float2(real, imag);
                cC[i] = new Complex32(real, imag);
            }

            var alphaReal = Convert.ToSingle(rand.Next(0, 10));
            var alphaImag = Convert.ToSingle(rand.Next(0, 10));
            var alpha     = new float2(alphaReal, alphaImag);
            var cAlpha    = new Complex32(alphaReal, alphaImag);

            var betaReal = Convert.ToSingle(rand.Next(0, 10));
            var betaImag = Convert.ToSingle(rand.Next(0, 10));
            var beta     = new float2(betaReal, betaImag);
            var cBeta    = new Complex32(betaReal, betaImag);

            int   elementSize = Marshal.SizeOf(typeof(float2));
            ulong bytesA      = (ulong)(A.Length * elementSize);
            ulong bytesB      = (ulong)(B.Length * elementSize);
            ulong bytesC      = (ulong)(C.Length * elementSize);

            var handle = IntPtr.Zero;
            var d_a    = IntPtr.Zero;
            var d_b    = IntPtr.Zero;
            var d_c    = IntPtr.Zero;

            var gch_a       = default(GCHandle);
            var gch_b       = default(GCHandle);
            var gch_c       = default(GCHandle);
            var gch_resultC = default(GCHandle);

            try
            {
                // NOTE(review): the cuBLAS statuses are captured but not asserted because the
                // status type's success member is not visible from this file. A failed gemm
                // would most likely still surface as a result mismatch below.
                var cblasStatus = Cublas_api.cublasCreate_v2(ref handle);

                status = CudaRuntimeApi.cudaMalloc(ref d_a, bytesA);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMalloc(ref d_b, bytesB);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMalloc(ref d_c, bytesC);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                gch_a       = GCHandle.Alloc(A, GCHandleType.Pinned);
                gch_b       = GCHandle.Alloc(B, GCHandleType.Pinned);
                gch_c       = GCHandle.Alloc(C, GCHandleType.Pinned);
                gch_resultC = GCHandle.Alloc(resultC, GCHandleType.Pinned);

                var h_a       = Marshal.UnsafeAddrOfPinnedArrayElement(A, 0);
                var h_b       = Marshal.UnsafeAddrOfPinnedArrayElement(B, 0);
                var h_c       = Marshal.UnsafeAddrOfPinnedArrayElement(C, 0);
                var h_resultC = Marshal.UnsafeAddrOfPinnedArrayElement(resultC, 0);

                status = CudaRuntimeApi.cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMemcpy(d_c, h_c, bytesC, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                cblasStatus = Cublas_api.cublasCgemm_v2(
                    handle,
                    cublasOperation_t.CUBLAS_OP_N,
                    cublasOperation_t.CUBLAS_OP_N,
                    rows_a,
                    cols_b,
                    cols_a,
                    ref alpha,
                    d_a,
                    rows_a,
                    d_b,
                    rows_b,
                    ref beta,
                    d_c,
                    rows_c
                    );

                status = CudaRuntimeApi.cudaMemcpy(h_resultC, d_c, bytesC, cudaMemcpyKind.DeviceToHost);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                for (int i = 0; i < rows_c * cols_c; i++)
                {
                    cResultC[i] = new Complex32(resultC[i].X, resultC[i].Y);
                }
                var mResultC = Matrix <Complex32> .Build.Dense(rows_c, cols_c, cResultC);

                var mA = Matrix <Complex32> .Build.Dense(rows_a, cols_a, cA);

                var mB = Matrix <Complex32> .Build.Dense(rows_b, cols_b, cB);

                var mExpectedC = Matrix <Complex32> .Build.Dense(rows_c, cols_c, cC).Clone();

                mExpectedC = cAlpha * mA * mB + cBeta * mExpectedC;
                Complex32[] expected = mExpectedC.ToColumnMajorArray();

                Console.WriteLine("alpha : {0}, beta : {1}", alpha, beta);
                Console.WriteLine("A");
                Console.WriteLine(mA.ToString());
                Console.WriteLine();
                Console.WriteLine("B");
                Console.WriteLine(mB.ToString());
                Console.WriteLine();
                Console.WriteLine("resultC");
                Console.WriteLine(mResultC.ToString());
                Console.WriteLine();
                Console.WriteLine("expectedC");
                Console.WriteLine(mExpectedC.ToString());

                for (int i = 0; i < C.Length; i++)
                {
                    Assert.AreEqual(expected[i], cResultC[i]);
                }
            }
            finally
            {
                // Best-effort cleanup: nothing is asserted here so an earlier failure is not masked.
                if (handle != IntPtr.Zero)
                {
                    Cublas_api.cublasDestroy_v2(handle);
                }

                if (d_a != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_a);
                }

                if (d_b != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_b);
                }

                if (d_c != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_c);
                }

                if (gch_a.IsAllocated)
                {
                    gch_a.Free();
                }

                if (gch_b.IsAllocated)
                {
                    gch_b.Free();
                }

                if (gch_c.IsAllocated)
                {
                    gch_c.Free();
                }

                // This handle was never freed before.
                if (gch_resultC.IsAllocated)
                {
                    gch_resultC.Free();
                }
            }
        }
Example #14
0
        // Verifies cublasSgemm_v2 against a host reference: computes
        // C = alpha * A * B + beta * C (alpha = 1, beta = 0) for random matrices with small
        // integer values and compares the device result element by element with the same
        // expression evaluated through Matrix<float>.
        // The matrix arrays are handed to cuBLAS and to the host matrices unchanged, with
        // the row counts as leading dimensions.
        // The cuBLAS handle, device buffers and all pinned handles are released even when an
        // assertion fails.
        public void cublasSgemm_test()
        {
            int devCount = 0;
            var status   = CudaRuntimeApi.cudaGetDeviceCount(ref devCount);

            Assert.AreEqual(cudaError.cudaSuccess, status);

            int devId = 0;

            status = CudaRuntimeApi.cudaSetDevice(devId);
            Assert.AreEqual(cudaError.cudaSuccess, status);

            Random rand   = new Random();
            int    rows_a = rand.Next(2, 10);
            int    cols_a = rand.Next(2, 10);

            int rows_b = cols_a;
            int cols_b = rand.Next(2, 10);

            int rows_c = rows_a;
            int cols_c = cols_b;

            float alpha = 1.0F;
            float beta  = 0.0F;

            float[] A       = new float[rows_a * cols_a];
            float[] B       = new float[rows_b * cols_b];
            float[] C       = new float[rows_c * cols_c];
            float[] resultC = new float[rows_c * cols_c];

            for (int i = 0; i < A.Length; i++)
            {
                A[i] = Convert.ToSingle(rand.Next(0, 10));
            }

            for (int i = 0; i < B.Length; i++)
            {
                B[i] = Convert.ToSingle(rand.Next(0, 10));
            }

            ulong bytesA = (ulong)A.Length * sizeof(float);
            ulong bytesB = (ulong)B.Length * sizeof(float);
            ulong bytesC = (ulong)C.Length * sizeof(float);

            var handle = IntPtr.Zero;
            var d_a    = IntPtr.Zero;
            var d_b    = IntPtr.Zero;
            var d_c    = IntPtr.Zero;

            var gch_a       = default(GCHandle);
            var gch_b       = default(GCHandle);
            var gch_c       = default(GCHandle);
            var gch_resultC = default(GCHandle);

            try
            {
                // NOTE(review): the cuBLAS statuses are captured but not asserted because the
                // status type's success member is not visible from this file. A failed gemm
                // would most likely still surface as a result mismatch below.
                var cblasStatus = Cublas_api.cublasCreate_v2(ref handle);

                status = CudaRuntimeApi.cudaMalloc(ref d_a, bytesA);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMalloc(ref d_b, bytesB);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMalloc(ref d_c, bytesC);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                gch_a       = GCHandle.Alloc(A, GCHandleType.Pinned);
                gch_b       = GCHandle.Alloc(B, GCHandleType.Pinned);
                gch_c       = GCHandle.Alloc(C, GCHandleType.Pinned);
                gch_resultC = GCHandle.Alloc(resultC, GCHandleType.Pinned);

                var h_a       = Marshal.UnsafeAddrOfPinnedArrayElement(A, 0);
                var h_b       = Marshal.UnsafeAddrOfPinnedArrayElement(B, 0);
                var h_c       = Marshal.UnsafeAddrOfPinnedArrayElement(C, 0);
                var h_resultC = Marshal.UnsafeAddrOfPinnedArrayElement(resultC, 0);

                status = CudaRuntimeApi.cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);
                status = CudaRuntimeApi.cudaMemcpy(d_c, h_c, bytesC, cudaMemcpyKind.HostToDevice);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                cblasStatus = Cublas_api.cublasSgemm_v2(
                    handle,
                    cublasOperation_t.CUBLAS_OP_N,
                    cublasOperation_t.CUBLAS_OP_N,
                    rows_a,
                    cols_b,
                    cols_a,
                    ref alpha,
                    d_a,
                    rows_a,
                    d_b,
                    rows_b,
                    ref beta,
                    d_c,
                    rows_c
                    );

                status = CudaRuntimeApi.cudaMemcpy(h_resultC, d_c, bytesC, cudaMemcpyKind.DeviceToHost);
                Assert.AreEqual(cudaError.cudaSuccess, status);

                var mResultC = Matrix <float> .Build.Dense(rows_c, cols_c, resultC);

                var mA = Matrix <float> .Build.Dense(rows_a, cols_a, A);

                var mB = Matrix <float> .Build.Dense(rows_b, cols_b, B);

                var mExpectedC = Matrix <float> .Build.Dense(rows_c, cols_c, C).Clone();

                mExpectedC = alpha * mA * mB + beta * mExpectedC;
                float[] expected = mExpectedC.ToColumnMajorArray();

                Console.WriteLine("alpha : {0}, beta : {1}", alpha, beta);
                Console.WriteLine("A");
                Console.WriteLine(mA.ToString());
                Console.WriteLine();
                Console.WriteLine("B");
                Console.WriteLine(mB.ToString());
                Console.WriteLine();
                Console.WriteLine("resultC");
                Console.WriteLine(mResultC.ToString());
                Console.WriteLine();
                Console.WriteLine("expectedC");
                Console.WriteLine(mExpectedC.ToString());

                for (int i = 0; i < C.Length; i++)
                {
                    Assert.AreEqual(expected[i], resultC[i]);
                }
            }
            finally
            {
                // Best-effort cleanup: nothing is asserted here so an earlier failure is not masked.
                if (handle != IntPtr.Zero)
                {
                    Cublas_api.cublasDestroy_v2(handle);
                }

                if (d_a != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_a);
                }

                if (d_b != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_b);
                }

                if (d_c != IntPtr.Zero)
                {
                    CudaRuntimeApi.cudaFree(d_c);
                }

                if (gch_a.IsAllocated)
                {
                    gch_a.Free();
                }

                if (gch_b.IsAllocated)
                {
                    gch_b.Free();
                }

                if (gch_c.IsAllocated)
                {
                    gch_c.Free();
                }

                // This handle was never freed before.
                if (gch_resultC.IsAllocated)
                {
                    gch_resultC.Free();
                }
            }
        }
        // Verifies nppsThreshold_32f with NPP_CMP_LESS and level 10: uploads 24 repetitions
        // of the ramp 0..127, applies the threshold and checks that no output element is
        // below the level.
        // Device buffers and pinned handles are released even when an assertion fails.
        public void nppsThreshold_32f_test()
        {
            NppStatus nppStatus;
            cudaError cudaStatus;
            int       width  = 128;
            int       height = 24;
            int       length = width * height;

            float level = 10.0F;

            // The source holds floats, so it is allocated with the 32f allocator
            // (previously nppsMalloc_32s, which only matched by element size).
            IntPtr d_src = Npps.nppsMalloc_32f(length);
            IntPtr d_dst = Npps.nppsMalloc_32f(length);

            GCHandle gchInput  = default(GCHandle);
            GCHandle gchResult = default(GCHandle);

            try
            {
                float[] input  = new float[length];
                float[] result = new float[length];

                // Each "row" of the flat input is the ramp 0, 1, ..., width - 1.
                float[] line = Array.ConvertAll(Enumerable.Range(0, width).ToArray(), Convert.ToSingle);
                for (int i = 0; i < height; i++)
                {
                    Array.Copy(line, 0, input, i * width, width);
                }

                // Byte count of the float buffers (previously computed with sizeof(int)).
                UInt64 size = Convert.ToUInt64(sizeof(float) * result.Length);

                gchInput = GCHandle.Alloc(input, GCHandleType.Pinned);
                IntPtr h_input = Marshal.UnsafeAddrOfPinnedArrayElement(input, 0);

                cudaStatus = CudaRuntimeApi.cudaMemcpy(d_src, h_input, size, cudaMemcpyKind.HostToDevice);
                if (cudaStatus != cudaError.cudaSuccess)
                {
                    Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
                }

                nppStatus = Npps.nppsThreshold_32f(d_src, d_dst, length, level, NppCmpOp.NPP_CMP_LESS);
                if (nppStatus != NppStatus.NPP_SUCCESS)
                {
                    Assert.Fail(String.Format("Fail {0}", nppStatus.ToString()));
                }

                gchResult = GCHandle.Alloc(result, GCHandleType.Pinned);
                IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);

                cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, d_dst, size, cudaMemcpyKind.DeviceToHost);
                if (cudaStatus != cudaError.cudaSuccess)
                {
                    Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
                }

                for (int i = 0; i < result.Length; i++)
                {
                    if (result[i] < level)
                    {
                        Assert.Fail(String.Format("Fail. level : {0}, value :{1}", level, result[i]));
                    }
                }
            }
            finally
            {
                if (gchInput.IsAllocated)
                {
                    gchInput.Free();
                }

                if (gchResult.IsAllocated)
                {
                    gchResult.Free();
                }

                Npps.nppsFree(d_src);
                Npps.nppsFree(d_dst);
            }
        }