// Fills a device buffer of 32-bit integers with a constant through nppsSet_32s,
// copies the buffer back to the host and checks that every element holds the constant.
public void nppsSet_32s_test()
{
    int length = 1024;
    int value = 75;
    int[] result = new int[length];
    UInt64 size = Convert.ToUInt64(sizeof(int) * result.Length);

    IntPtr ptr = Npps.nppsMalloc_32s(length);
    GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        // Fail early with a clear message instead of handing a null pointer to NPP.
        if (ptr == IntPtr.Zero)
        {
            Assert.Fail("nppsMalloc_32s returned a null pointer");
        }

        IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);

        NppStatus status = Npps.nppsSet_32s(value, ptr, length);
        if (status != NppStatus.NPP_SUCCESS)
        {
            Assert.Fail(String.Format("Fail {0}", status.ToString()));
        }

        cudaError cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, ptr, size, cudaMemcpyKind.DeviceToHost);
        if (cudaStatus != cudaError.cudaSuccess)
        {
            Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
        }

        for (int i = 0; i < result.Length; i++)
        {
            Assert.AreEqual(value, result[i]);
        }
    }
    finally
    {
        // Runs even when an assertion above throws, so neither the pin nor the
        // device buffer outlives a failing test.
        gcHandle.Free();
        if (ptr != IntPtr.Zero)
        {
            Npps.nppsFree(ptr);
        }
    }
}
// Adds a per-channel constant to a zero-filled 3-channel float image with
// nppiAddC_32f_C3R and checks that every output sample equals the constant of
// its channel.
public void nppiAddC_32f_C3R_test()
{
    NppStatus nppStatus;
    cudaError cudaStatus;
    int width = 256;
    int height = 256;
    int channel = 3;

    // Row step of the densely packed host layout. The copies and the NPP call all
    // use this step, so the device buffers are addressed as packed memory.
    int packedStep = width * channel * sizeof(float);

    // nppiMalloc_32f_C3 writes its own row step into this variable; the test does
    // not read it back.
    // NOTE(review): confirm that ignoring the returned step is intended.
    int stepInBytes = packedStep;

    float[] input = new float[width * height * channel];
    float[] result = new float[width * height * channel];
    float[] aconstant = Array.ConvertAll(Enumerable.Range(0, 3).ToArray(), Convert.ToSingle);
    UInt64 size = Convert.ToUInt64(sizeof(float) * result.Length);

    IntPtr d_src = Nppi.nppiMalloc_32f_C3(width, height, ref stepInBytes);
    IntPtr d_dst = Nppi.nppiMalloc_32f_C3(width, height, ref stepInBytes);
    GCHandle gchInput = GCHandle.Alloc(input, GCHandleType.Pinned);
    GCHandle gchResult = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        if (d_src == IntPtr.Zero || d_dst == IntPtr.Zero)
        {
            Assert.Fail("nppiMalloc_32f_C3 returned a null pointer");
        }

        IntPtr h_input = Marshal.UnsafeAddrOfPinnedArrayElement(input, 0);
        cudaStatus = CudaRuntimeApi.cudaMemcpy(d_src, h_input, size, cudaMemcpyKind.HostToDevice);
        if (cudaStatus != cudaError.cudaSuccess)
        {
            Assert.Fail(cudaStatus.ToString());
        }

        NppiSize roi;
        roi.width = width;
        // The original assigned width here; it only worked because the image is square.
        roi.height = height;

        nppStatus = Nppi.nppiAddC_32f_C3R(d_src, packedStep, aconstant, d_dst, packedStep, roi);
        if (nppStatus != NppStatus.NPP_SUCCESS)
        {
            Assert.Fail(nppStatus.ToString());
        }

        IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);
        cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, d_dst, size, cudaMemcpyKind.DeviceToHost);
        if (cudaStatus != cudaError.cudaSuccess)
        {
            Assert.Fail(cudaStatus.ToString());
        }

        // The input is all zeros, so each sample must equal the constant that was
        // added to its channel (samples are interleaved channel by channel).
        for (int i = 0; i < result.Length; i++)
        {
            int aId = i % aconstant.Length;
            Assert.AreEqual(aconstant[aId], result[i]);
        }
    }
    finally
    {
        // Release pins and device memory even when an assertion above throws.
        gchInput.Free();
        gchResult.Free();
        if (d_src != IntPtr.Zero)
        {
            Nppi.nppiFree(d_src);
        }
        if (d_dst != IntPtr.Zero)
        {
            Nppi.nppiFree(d_dst);
        }
    }
}
// Fills a float device buffer with a constant, converts it to 16-bit signed
// integers with nppsConvert_32f16s_Sfs (round-to-nearest, scale factor 0) and
// checks that every converted element equals the constant.
public void nppsConvert_32f16s_test()
{
    NppStatus nppStatus;
    cudaError cudaStatus;
    int length = 1024;
    int value = 75;
    // The destination holds shorts, so compare against a short rather than a float.
    short expected = (short)value;
    short[] result = new short[length];
    UInt64 size = Convert.ToUInt64(sizeof(short) * result.Length);

    IntPtr d_src = Npps.nppsMalloc_32f(length);
    IntPtr d_dst = Npps.nppsMalloc_16s(length);
    GCHandle gcHandle = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        if (d_src == IntPtr.Zero || d_dst == IntPtr.Zero)
        {
            Assert.Fail("NPP signal allocation returned a null pointer");
        }

        IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);

        nppStatus = Npps.nppsSet_32f((float)value, d_src, length);
        if (nppStatus != NppStatus.NPP_SUCCESS)
        {
            Assert.Fail(String.Format("Fail {0}", nppStatus.ToString()));
        }

        nppStatus = Npps.nppsConvert_32f16s_Sfs(d_src, d_dst, length, NppRoundMode.NPP_RND_NEAR, 0);
        if (nppStatus != NppStatus.NPP_SUCCESS)
        {
            Assert.Fail(String.Format("Fail {0}", nppStatus.ToString()));
        }

        cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, d_dst, size, cudaMemcpyKind.DeviceToHost);
        if (cudaStatus != cudaError.cudaSuccess)
        {
            Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
        }

        for (int i = 0; i < result.Length; i++)
        {
            Assert.AreEqual(expected, result[i]);
        }
    }
    finally
    {
        // Release the pin and both device buffers even when an assertion throws.
        gcHandle.Free();
        if (d_src != IntPtr.Zero)
        {
            Npps.nppsFree(d_src);
        }
        if (d_dst != IntPtr.Zero)
        {
            Npps.nppsFree(d_dst);
        }
    }
}
// Round-trips a byte buffer through device memory: allocate, upload, overwrite
// with cudaMemset, download, verify every byte, then free.
public void cudaMalloc_cudaFree_cudaMemcpy_test_cudaMemset_test()
{
    int length = 1024 * 2;
    byte testValue = 5;
    byte[] test = new byte[length];
    byte[] result = new byte[length];
    IntPtr d_ptr = IntPtr.Zero;
    var size = length * sizeof(byte);

    // Assertions are written as (expected, actual) so failure messages read correctly.
    var status = CudaRuntimeApi.cudaMalloc(ref d_ptr, (ulong)size);
    Assert.AreEqual(cudaError.cudaSuccess, status);
    Console.WriteLine($"ptr : {d_ptr}");

    GCHandle gchTest = GCHandle.Alloc(test, GCHandleType.Pinned);
    GCHandle gchResult = GCHandle.Alloc(result, GCHandleType.Pinned);
    bool deviceFreed = false;
    try
    {
        IntPtr h_ptrTest = Marshal.UnsafeAddrOfPinnedArrayElement(test, 0);
        IntPtr h_ptrResult = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);

        status = CudaRuntimeApi.cudaMemcpy(d_ptr, h_ptrTest, (ulong)size, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        status = CudaRuntimeApi.cudaMemset(d_ptr, testValue, (ulong)size);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        status = CudaRuntimeApi.cudaMemcpy(h_ptrResult, d_ptr, (ulong)size, cudaMemcpyKind.DeviceToHost);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        for (int i = 0; i < length; i++)
        {
            Assert.AreEqual(testValue, result[i]);
        }

        // cudaFree is part of what this test verifies, so its status is asserted here.
        status = CudaRuntimeApi.cudaFree(d_ptr);
        deviceFreed = true;
        Assert.AreEqual(cudaError.cudaSuccess, status);
    }
    finally
    {
        // If an assertion fired before the checked cudaFree, still release the buffer.
        if (!deviceFreed)
        {
            CudaRuntimeApi.cudaFree(d_ptr);
        }
        gchTest.Free();
        gchResult.Free();
    }
}
// Computes C = alpha * A * B + beta * C on the device with cublasCgemm_v2 for
// randomly sized single-precision complex matrices and compares the result with
// the same expression evaluated on the host through Matrix<Complex32>.
// All operands are small integers, so the element-wise comparison is exact.
public void cublasCgemm_test()
{
    int devCount = 0;
    var status = CudaRuntimeApi.cudaGetDeviceCount(ref devCount);
    Assert.AreEqual(cudaError.cudaSuccess, status);
    Assert.IsTrue(devCount > 0, "No CUDA device available");

    int devId = 0;
    status = CudaRuntimeApi.cudaSetDevice(devId);
    Assert.AreEqual(cudaError.cudaSuccess, status);

    var handle = IntPtr.Zero;
    var cblasStatus = Cublas_api.cublasCreate_v2(ref handle);
    // NOTE(review): assumes the binding's status type converts to the native
    // cuBLAS code, where success is 0 - confirm against the binding.
    Assert.AreEqual(0, Convert.ToInt32(cblasStatus));

    // Log the seed so a failing random configuration can be reproduced.
    int seed = Environment.TickCount;
    Console.WriteLine("seed : {0}", seed);
    Random rand = new Random(seed);

    int rows_a = rand.Next(2, 10);
    int cols_a = rand.Next(2, 10);
    int rows_b = cols_a;
    int cols_b = rand.Next(2, 10);
    int rows_c = rows_a;
    int cols_c = cols_b;

    // float2 arrays are uploaded to the device; the Complex32 arrays hold the
    // same values for the host-side reference computation.
    var A = new float2[rows_a * cols_a];
    var B = new float2[rows_b * cols_b];
    var C = new float2[rows_c * cols_c];
    var resultC = new float2[rows_c * cols_c];
    var cA = new Complex32[rows_a * cols_a];
    var cB = new Complex32[rows_b * cols_b];
    var cC = new Complex32[rows_c * cols_c];
    var cResultC = new Complex32[rows_c * cols_c];

    for (int i = 0; i < A.Length; i++)
    {
        var real = Convert.ToSingle(rand.Next(0, 10));
        var imag = Convert.ToSingle(rand.Next(0, 10));
        A[i] = new float2(real, imag);
        cA[i] = new Complex32(real, imag);
    }
    for (int i = 0; i < B.Length; i++)
    {
        var real = Convert.ToSingle(rand.Next(0, 10));
        var imag = Convert.ToSingle(rand.Next(0, 10));
        B[i] = new float2(real, imag);
        cB[i] = new Complex32(real, imag);
    }
    for (int i = 0; i < C.Length; i++)
    {
        var real = Convert.ToSingle(rand.Next(0, 10));
        var imag = Convert.ToSingle(rand.Next(0, 10));
        C[i] = new float2(real, imag);
        cC[i] = new Complex32(real, imag);
    }

    var alphaReal = Convert.ToSingle(rand.Next(0, 10));
    var alphaImag = Convert.ToSingle(rand.Next(0, 10));
    var alpha = new float2(alphaReal, alphaImag);
    var cAlpha = new Complex32(alphaReal, alphaImag);
    var betaReal = Convert.ToSingle(rand.Next(0, 10));
    var betaImag = Convert.ToSingle(rand.Next(0, 10));
    var beta = new float2(betaReal, betaImag);
    var cBeta = new Complex32(betaReal, betaImag);

    ulong bytesA = (ulong)(A.Length * Marshal.SizeOf(typeof(float2)));
    ulong bytesB = (ulong)(B.Length * Marshal.SizeOf(typeof(float2)));
    ulong bytesC = (ulong)(C.Length * Marshal.SizeOf(typeof(float2)));

    var d_a = IntPtr.Zero;
    var d_b = IntPtr.Zero;
    var d_c = IntPtr.Zero;

    var gch_a = GCHandle.Alloc(A, GCHandleType.Pinned);
    var gch_b = GCHandle.Alloc(B, GCHandleType.Pinned);
    var gch_c = GCHandle.Alloc(C, GCHandleType.Pinned);
    var gch_resultC = GCHandle.Alloc(resultC, GCHandleType.Pinned);
    try
    {
        status = CudaRuntimeApi.cudaMalloc(ref d_a, bytesA);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMalloc(ref d_b, bytesB);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMalloc(ref d_c, bytesC);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        var h_a = Marshal.UnsafeAddrOfPinnedArrayElement(A, 0);
        var h_b = Marshal.UnsafeAddrOfPinnedArrayElement(B, 0);
        var h_c = Marshal.UnsafeAddrOfPinnedArrayElement(C, 0);
        var h_resultC = Marshal.UnsafeAddrOfPinnedArrayElement(resultC, 0);

        status = CudaRuntimeApi.cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMemcpy(d_c, h_c, bytesC, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        // No transposition; the leading dimensions passed are the row counts,
        // matching the column-major arrays read back by ToColumnMajorArray below.
        cblasStatus = Cublas_api.cublasCgemm_v2(
            handle,
            cublasOperation_t.CUBLAS_OP_N, cublasOperation_t.CUBLAS_OP_N,
            rows_a, cols_b, cols_a,
            ref alpha,
            d_a, rows_a,
            d_b, rows_b,
            ref beta,
            d_c, rows_c
        );
        Assert.AreEqual(0, Convert.ToInt32(cblasStatus));

        status = CudaRuntimeApi.cudaMemcpy(h_resultC, d_c, bytesC, cudaMemcpyKind.DeviceToHost);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        for (int i = 0; i < rows_c * cols_c; i++)
        {
            cResultC[i] = new Complex32(resultC[i].X, resultC[i].Y);
        }

        var mResultC = Matrix<Complex32>.Build.Dense(rows_c, cols_c, cResultC);
        var mA = Matrix<Complex32>.Build.Dense(rows_a, cols_a, cA);
        var mB = Matrix<Complex32>.Build.Dense(rows_b, cols_b, cB);
        var mExpectedC = Matrix<Complex32>.Build.Dense(rows_c, cols_c, cC).Clone();
        mExpectedC = cAlpha * mA * mB + cBeta * mExpectedC;
        Complex32[] expected = mExpectedC.ToColumnMajorArray();

        Console.WriteLine("alpha : {0}, beta : {1}", alpha, beta);
        Console.WriteLine("A");
        Console.WriteLine(mA.ToString());
        Console.WriteLine();
        Console.WriteLine("B");
        Console.WriteLine(mB.ToString());
        Console.WriteLine();
        Console.WriteLine("resultC");
        Console.WriteLine(mResultC.ToString());
        Console.WriteLine();
        Console.WriteLine("expectedC");
        Console.WriteLine(mExpectedC.ToString());

        for (int i = 0; i < C.Length; i++)
        {
            Assert.AreEqual(expected[i], cResultC[i]);
        }
    }
    finally
    {
        // Best-effort cleanup: return codes are deliberately not asserted so a
        // cleanup problem cannot hide the assertion that ended the test.
        Cublas_api.cublasDestroy_v2(handle);
        if (d_a != IntPtr.Zero)
        {
            CudaRuntimeApi.cudaFree(d_a);
        }
        if (d_b != IntPtr.Zero)
        {
            CudaRuntimeApi.cudaFree(d_b);
        }
        if (d_c != IntPtr.Zero)
        {
            CudaRuntimeApi.cudaFree(d_c);
        }
        gch_a.Free();
        gch_b.Free();
        gch_c.Free();
        // The original never released this pin.
        gch_resultC.Free();
    }
}
// Computes C = alpha * A * B + beta * C on the device with cublasSgemm_v2 for
// randomly sized float matrices (alpha = 1, beta = 0) and compares the result
// with the same expression evaluated on the host through Matrix<float>.
// All operands are small integers, so the element-wise comparison is exact.
public void cublasSgemm_test()
{
    int devCount = 0;
    var status = CudaRuntimeApi.cudaGetDeviceCount(ref devCount);
    Assert.AreEqual(cudaError.cudaSuccess, status);
    Assert.IsTrue(devCount > 0, "No CUDA device available");

    int devId = 0;
    status = CudaRuntimeApi.cudaSetDevice(devId);
    Assert.AreEqual(cudaError.cudaSuccess, status);

    var handle = IntPtr.Zero;
    var cblasStatus = Cublas_api.cublasCreate_v2(ref handle);
    // NOTE(review): assumes the binding's status type converts to the native
    // cuBLAS code, where success is 0 - confirm against the binding.
    Assert.AreEqual(0, Convert.ToInt32(cblasStatus));

    // Log the seed so a failing random configuration can be reproduced.
    int seed = Environment.TickCount;
    Console.WriteLine("seed : {0}", seed);
    Random rand = new Random(seed);

    int rows_a = rand.Next(2, 10);
    int cols_a = rand.Next(2, 10);
    int rows_b = cols_a;
    int cols_b = rand.Next(2, 10);
    int rows_c = rows_a;
    int cols_c = cols_b;
    float alpha = 1.0F;
    float beta = 0.0F;

    float[] A = new float[rows_a * cols_a];
    float[] B = new float[rows_b * cols_b];
    float[] C = new float[rows_c * cols_c];
    float[] resultC = new float[rows_c * cols_c];
    for (int i = 0; i < A.Length; i++)
    {
        A[i] = Convert.ToSingle(rand.Next(0, 10));
    }
    for (int i = 0; i < B.Length; i++)
    {
        B[i] = Convert.ToSingle(rand.Next(0, 10));
    }

    ulong bytesA = (ulong)A.Length * sizeof(float);
    ulong bytesB = (ulong)B.Length * sizeof(float);
    ulong bytesC = (ulong)C.Length * sizeof(float);

    var d_a = IntPtr.Zero;
    var d_b = IntPtr.Zero;
    var d_c = IntPtr.Zero;

    var gch_a = GCHandle.Alloc(A, GCHandleType.Pinned);
    var gch_b = GCHandle.Alloc(B, GCHandleType.Pinned);
    var gch_c = GCHandle.Alloc(C, GCHandleType.Pinned);
    var gch_resultC = GCHandle.Alloc(resultC, GCHandleType.Pinned);
    try
    {
        status = CudaRuntimeApi.cudaMalloc(ref d_a, bytesA);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMalloc(ref d_b, bytesB);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMalloc(ref d_c, bytesC);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        var h_a = Marshal.UnsafeAddrOfPinnedArrayElement(A, 0);
        var h_b = Marshal.UnsafeAddrOfPinnedArrayElement(B, 0);
        var h_c = Marshal.UnsafeAddrOfPinnedArrayElement(C, 0);
        var h_resultC = Marshal.UnsafeAddrOfPinnedArrayElement(resultC, 0);

        status = CudaRuntimeApi.cudaMemcpy(d_a, h_a, bytesA, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMemcpy(d_b, h_b, bytesB, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);
        status = CudaRuntimeApi.cudaMemcpy(d_c, h_c, bytesC, cudaMemcpyKind.HostToDevice);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        // No transposition; the leading dimensions passed are the row counts,
        // matching the column-major arrays read back by ToColumnMajorArray below.
        cblasStatus = Cublas_api.cublasSgemm_v2(
            handle,
            cublasOperation_t.CUBLAS_OP_N, cublasOperation_t.CUBLAS_OP_N,
            rows_a, cols_b, cols_a,
            ref alpha,
            d_a, rows_a,
            d_b, rows_b,
            ref beta,
            d_c, rows_c
        );
        Assert.AreEqual(0, Convert.ToInt32(cblasStatus));

        status = CudaRuntimeApi.cudaMemcpy(h_resultC, d_c, bytesC, cudaMemcpyKind.DeviceToHost);
        Assert.AreEqual(cudaError.cudaSuccess, status);

        var mResultC = Matrix<float>.Build.Dense(rows_c, cols_c, resultC);
        var mA = Matrix<float>.Build.Dense(rows_a, cols_a, A);
        var mB = Matrix<float>.Build.Dense(rows_b, cols_b, B);
        var mExpectedC = Matrix<float>.Build.Dense(rows_c, cols_c, C).Clone();
        mExpectedC = alpha * mA * mB + beta * mExpectedC;
        float[] expected = mExpectedC.ToColumnMajorArray();

        Console.WriteLine("alpha : {0}, beta : {1}", alpha, beta);
        Console.WriteLine("A");
        Console.WriteLine(mA.ToString());
        Console.WriteLine();
        Console.WriteLine("B");
        Console.WriteLine(mB.ToString());
        Console.WriteLine();
        Console.WriteLine("resultC");
        Console.WriteLine(mResultC.ToString());
        Console.WriteLine();
        Console.WriteLine("expectedC");
        Console.WriteLine(mExpectedC.ToString());

        for (int i = 0; i < C.Length; i++)
        {
            Assert.AreEqual(expected[i], resultC[i]);
        }
    }
    finally
    {
        // Best-effort cleanup: return codes are deliberately not asserted so a
        // cleanup problem cannot hide the assertion that ended the test.
        Cublas_api.cublasDestroy_v2(handle);
        if (d_a != IntPtr.Zero)
        {
            CudaRuntimeApi.cudaFree(d_a);
        }
        if (d_b != IntPtr.Zero)
        {
            CudaRuntimeApi.cudaFree(d_b);
        }
        if (d_c != IntPtr.Zero)
        {
            CudaRuntimeApi.cudaFree(d_c);
        }
        gch_a.Free();
        gch_b.Free();
        gch_c.Free();
        // The original never released this pin.
        gch_resultC.Free();
    }
}
// Uploads a float signal made of repeated 0..width-1 ramps, runs
// nppsThreshold_32f with NPP_CMP_LESS at the given level and checks that no
// output sample is below the level.
public void nppsThreshold_32f_test()
{
    NppStatus nppStatus;
    cudaError cudaStatus;
    int width = 128;
    int height = 24;
    int length = width * height;
    float level = 10.0F;

    float[] input = new float[length];
    float[] result = new float[length];
    float[] line = Array.ConvertAll(Enumerable.Range(0, width).ToArray(), Convert.ToSingle);
    for (int i = 0; i < height; i++)
    {
        Array.Copy(line, 0, input, i * width, width);
    }

    // The buffers hold floats, so size them by float (the original used sizeof(int),
    // which only matched because both types are four bytes).
    UInt64 size = Convert.ToUInt64(sizeof(float) * result.Length);

    // The source is float data; the original allocated it with the 32s allocator.
    IntPtr d_src = Npps.nppsMalloc_32f(length);
    IntPtr d_dst = Npps.nppsMalloc_32f(length);
    GCHandle gchInput = GCHandle.Alloc(input, GCHandleType.Pinned);
    GCHandle gchResult = GCHandle.Alloc(result, GCHandleType.Pinned);
    try
    {
        if (d_src == IntPtr.Zero || d_dst == IntPtr.Zero)
        {
            Assert.Fail("nppsMalloc_32f returned a null pointer");
        }

        IntPtr h_input = Marshal.UnsafeAddrOfPinnedArrayElement(input, 0);
        cudaStatus = CudaRuntimeApi.cudaMemcpy(d_src, h_input, size, cudaMemcpyKind.HostToDevice);
        if (cudaStatus != cudaError.cudaSuccess)
        {
            Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
        }

        nppStatus = Npps.nppsThreshold_32f(d_src, d_dst, length, level, NppCmpOp.NPP_CMP_LESS);
        if (nppStatus != NppStatus.NPP_SUCCESS)
        {
            Assert.Fail(String.Format("Fail {0}", nppStatus.ToString()));
        }

        IntPtr h_result = Marshal.UnsafeAddrOfPinnedArrayElement(result, 0);
        cudaStatus = CudaRuntimeApi.cudaMemcpy(h_result, d_dst, size, cudaMemcpyKind.DeviceToHost);
        if (cudaStatus != cudaError.cudaSuccess)
        {
            Assert.Fail(String.Format("Fail {0}", cudaStatus.ToString()));
        }

        for (int i = 0; i < result.Length; i++)
        {
            if (result[i] < level)
            {
                Assert.Fail(String.Format("Fail. level : {0}, value :{1}", level, result[i]));
            }
        }
    }
    finally
    {
        // Release pins and device memory even when an assertion above throws.
        gchInput.Free();
        gchResult.Free();
        if (d_src != IntPtr.Zero)
        {
            Npps.nppsFree(d_src);
        }
        if (d_dst != IntPtr.Zero)
        {
            Npps.nppsFree(d_dst);
        }
    }
}