/// <summary>
/// Uploads <c>N</c> sparse vectors to the GPU as an array of <c>SparseVecPtr</c>
/// structs, launches the <c>DotProd2</c> kernel from <c>structKernel.cubin</c>
/// and returns the <c>N</c> floats the kernel wrote to its output buffer.
/// </summary>
/// <remarks>
/// Reads the class-level settings <c>N</c>, <c>StartingIndex</c>, <c>avgElements</c>,
/// <c>stdElements</c>, <c>maxVal</c>, <c>maxIndex</c>, <c>threadsPerBlock</c>,
/// <c>blocksPerGrid</c> and <c>displayCount</c>, and overwrites <c>mainIndex</c>
/// with <c>StartingIndex</c>. Progress and timings are written to the console.
/// All device buffers and events created here are released before returning.
/// NOTE(review): the CUDA context created by <c>new CUDA(0, true)</c> is not
/// torn down here, as in the original - confirm whether the caller expects that.
/// </remarks>
/// <returns>Host copy of the kernel output, one float per vector.</returns>
private static unsafe float[] CuDotProdSparseVecStruct()
{
    uint size = (uint)(N * sizeof(SparseVecPtr));

    CUDA cuda = new CUDA(0, true);

    // load module
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

    //CUfunction structPassFunc = cuda.GetModuleFunction("DotProd");
    CUfunction structPassFunc = cuda.GetModuleFunction("DotProd2");

    SparseVecPtr[] vectors = new SparseVecPtr[N];

    // Every vector owns two device buffers (values and indices). They are
    // remembered here so they can be freed once the kernel has finished.
    CUdeviceptr[] elementBuffers = new CUdeviceptr[2 * N];

    Console.WriteLine("init and copy data");
    Stopwatch t = Stopwatch.StartNew();
    mainIndex = StartingIndex;
    for (int i = 0; i < N; i++)
    {
        vectors[i] = new SparseVecPtr();

        int vecSize = avgElements + i % stdElements;
        vectors[i].size = vecSize;

        float[] vals = Helpers.InitValues(i, vecSize, maxVal);
        int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);

        CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals);
        CUdeviceptr idxPtr = cuda.CopyHostToDevice(index);
        elementBuffers[2 * i] = valsPtr;
        elementBuffers[2 * i + 1] = idxPtr;

        // The struct carries raw device addresses; the kernel receives them
        // through the struct array uploaded below.
        vectors[i].indices = new IntPtr(idxPtr.Pointer);
        vectors[i].values = (IntPtr)valsPtr.Pointer;
    }

    float[] output = new float[N];

    // Pin the struct array only for the duration of the upload, then release
    // the handle so the garbage collector can move the array again.
    //CUdeviceptr dVectors = cuda.CopyHostToDevice(vectors);
    CUdeviceptr dVectors;
    GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
    try
    {
        IntPtr ptr = handle.AddrOfPinnedObject();
        dVectors = cuda.CopyHostToDevice(ptr, size);
    }
    finally
    {
        handle.Free();
    }

    CUdeviceptr dOutput = cuda.Allocate(output);

    Console.WriteLine("copy and init takes {0}", t.Elapsed);

    #region set cuda parameters
    cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

    // Parameters are laid out in order: vectors, output, mainIndex, N.
    int offset = 0;
    cuda.SetParameter(structPassFunc, offset, dVectors.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameterSize(structPassFunc, (uint)offset);
    #endregion

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);
    cuda.Launch(structPassFunc, blocksPerGrid, 1);
    cuda.RecordEvent(end);

    // Wait for the launch to complete before reading either timer.
    cuda.SynchronizeContext();
    //cuda.SynchronizeEvent(end);
    timer.Stop();
    float naiveTime = cuda.ElapsedTime(start, end);

    // The label says "ms", so report the stopwatch in milliseconds rather
    // than as a formatted TimeSpan.
    Console.Write("Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms",
        mainIndex, N, naiveTime, timer.Elapsed.TotalMilliseconds);

    cuda.CopyDeviceToHost(dOutput, output);

    int shown = Math.Min(displayCount, N);
    Console.WriteLine();
    for (int i = 0; i < shown; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    // Release everything created on the device by this call.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);
    cuda.Free(dVectors);
    cuda.Free(dOutput);
    for (int k = 0; k < elementBuffers.Length; k++)
    {
        cuda.Free(elementBuffers[k]);
    }

    return output;
}
/// <summary>
/// Smoke test for passing an array of <c>SparseVecPtr</c> structs to a kernel:
/// builds four two-element vectors, uploads them, launches the
/// <c>StructPass</c> kernel from <c>structKernel.cubin</c> and prints the
/// elapsed GPU time followed by the kernel's output values.
/// </summary>
/// <remarks>
/// All device buffers and events created here are released before returning.
/// NOTE(review): the CUDA context created by <c>new CUDA(0, true)</c> is not
/// torn down here, as in the original - confirm whether that is intended.
/// </remarks>
private static unsafe void CuStructPass()
{
    int N = 4;
    uint size = (uint)(N * sizeof(SparseVecPtr));

    CUDA cuda = new CUDA(0, true);

    // load module
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
    CUfunction structPassFunc = cuda.GetModuleFunction("StructPass");

    SparseVecPtr[] vectors = new SparseVecPtr[N];

    // Every vector owns two device buffers (values and indices). They are
    // remembered here so they can be freed once the kernel has finished.
    CUdeviceptr[] elementBuffers = new CUdeviceptr[2 * N];

    for (int i = 0; i < N; i++)
    {
        vectors[i] = new SparseVecPtr();
        vectors[i].size = 2;

        // Parenthesised so the modulo applies to the sum. The original
        // "(float)i + 1 % 5" parsed as i + (1 % 5); for N = 4 both forms
        // produce the same values.
        float[] vals = new float[2] { (float)((i + 1) % 5), (float)((i + 2) % 7) };
        int[] index = new int[2] { i % 5, i % 7 };

        // The struct fields hold device addresses obtained from the uploads,
        // not pinned host addresses.
        CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals);
        CUdeviceptr idxPtr = cuda.CopyHostToDevice(index);
        elementBuffers[2 * i] = valsPtr;
        elementBuffers[2 * i + 1] = idxPtr;

        vectors[i].indices = new IntPtr(idxPtr.Pointer);
        vectors[i].values = (IntPtr)valsPtr.Pointer;
    }

    float[] output = new float[N];

    // Pin the struct array only for the duration of the upload, then release
    // the handle so the garbage collector can move the array again.
    //CUdeviceptr dVectors = cuda.CopyHostToDevice(vectors);
    CUdeviceptr dVectors;
    GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
    try
    {
        IntPtr ptr = handle.AddrOfPinnedObject();
        dVectors = cuda.CopyHostToDevice(ptr, size);
    }
    finally
    {
        handle.Free();
    }

    CUdeviceptr dOutput = cuda.Allocate(output);

    int threadsPerBlock = 256;
    // Ceiling division so a partial final block is still launched.
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    //error = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1);
    cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

    // Parameters are laid out in order: vectors, output, N.
    // NOTE(review): the pointers are cast to uint while the offset advances by
    // IntPtr.Size - confirm this matches the kernel's layout in a 64-bit process.
    int offset = 0;
    cuda.SetParameter(structPassFunc, offset, (uint)dVectors.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameterSize(structPassFunc, (uint)offset);

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    cuda.RecordEvent(start);
    cuda.Launch(structPassFunc, blocksPerGrid, 1);
    cuda.RecordEvent(end);

    // Wait for the launch to complete before reading the event timer.
    cuda.SynchronizeContext();
    //cuda.SynchronizeEvent(end);
    float naiveTime = cuda.ElapsedTime(start, end);
    Console.Write("passing struct takes {0}ms", naiveTime);

    cuda.CopyDeviceToHost(dOutput, output);

    int shown = Math.Min(10, N);
    Console.WriteLine();
    for (int i = 0; i < shown; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    // Release everything created on the device by this call.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);
    cuda.Free(dVectors);
    cuda.Free(dOutput);
    for (int k = 0; k < elementBuffers.Length; k++)
    {
        cuda.Free(elementBuffers[k]);
    }
}