예제 #1
0
        /// <summary>
        /// Builds N sparse vectors on the host, uploads each vector's values and
        /// indices to the device, uploads the array of <c>SparseVecPtr</c>
        /// descriptors that point at those device buffers, launches the
        /// "DotProd2" kernel from structKernel.cubin and copies the result back.
        /// </summary>
        /// <remarks>
        /// Relies on class-level settings declared outside this method
        /// (N, StartingIndex, mainIndex, avgElements, stdElements, maxVal,
        /// maxIndex, threadsPerBlock, blocksPerGrid, displayCount).
        /// All device buffers, both CUDA events and the pinned GC handle created
        /// here are released before returning on the normal path.
        /// </remarks>
        /// <returns>Host copy of the kernel's output buffer (N floats).</returns>
        private static unsafe float[] CuDotProdSparseVecStruct()
        {
            uint size = (uint)(N * sizeof(SparseVecPtr));

            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
            //CUfunction structPassFunc = cuda.GetModuleFunction("DotProd");
            CUfunction structPassFunc = cuda.GetModuleFunction("DotProd2");

            SparseVecPtr[] vectors = new SparseVecPtr[N];

            // Remember every per-vector device buffer so it can be freed later;
            // the descriptors only keep them as raw IntPtr values.
            CUdeviceptr[] valueBuffers = new CUdeviceptr[N];
            CUdeviceptr[] indexBuffers = new CUdeviceptr[N];

            Console.WriteLine("init and copy data");
            Stopwatch t = Stopwatch.StartNew();
            mainIndex = StartingIndex;
            for (int i = 0; i < N; i++)
            {
                vectors[i] = new SparseVecPtr();

                int vecSize = avgElements + i % stdElements;
                vectors[i].size = vecSize;
                float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);

                valueBuffers[i] = cuda.CopyHostToDevice(vals);
                indexBuffers[i] = cuda.CopyHostToDevice(index);

                // The descriptor stores device addresses, not host addresses.
                vectors[i].indices = new IntPtr(indexBuffers[i].Pointer);
                vectors[i].values = (IntPtr)valueBuffers[i].Pointer;
            }

            float[] output = new float[N];

            // Pin the descriptor array only for the duration of the upload and
            // always release the handle, otherwise the array stays pinned forever.
            CUdeviceptr dVectors;
            GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
            try
            {
                dVectors = cuda.CopyHostToDevice(handle.AddrOfPinnedObject(), size);
            }
            finally
            {
                handle.Free();
            }

            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy and init takes {0}", t.Elapsed);
            #region set cuda parameters
            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            // Parameters are packed back to back: two pointers, then two 32-bit ints.
            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, dVectors.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            // Wait for the kernel and the 'end' event before reading the timings.
            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            // The label says "ms", so print milliseconds rather than the raw TimeSpan.
            Console.Write("Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed.TotalMilliseconds);

            cuda.CopyDeviceToHost(dOutput, output);

            int length = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < length; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            // Release everything that was created on the device.
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            for (int i = 0; i < N; i++)
            {
                cuda.Free(valueBuffers[i]);
                cuda.Free(indexBuffers[i]);
            }

            cuda.Free(dVectors);
            cuda.Free(dOutput);

            return output;
        }
예제 #2
0
        /// <summary>
        /// Small smoke test for passing an array of structs that contain device
        /// pointers to a kernel: builds four two-element sparse vectors, uploads
        /// them, launches the "StructPass" kernel from structKernel.cubin and
        /// prints the elapsed GPU time and the first output values.
        /// </summary>
        /// <remarks>
        /// All device buffers, both CUDA events and the pinned GC handle created
        /// here are released before returning on the normal path.
        /// </remarks>
        private static unsafe void CuStructPass()
        {
            int N = 4;

            uint size = (uint)(N * sizeof(SparseVecPtr));

            CUDA cuda = new CUDA(0, true);

            // load module
            cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
            CUfunction structPassFunc = cuda.GetModuleFunction("StructPass");

            SparseVecPtr[] vectors = new SparseVecPtr[N];

            // Remember every per-vector device buffer so it can be freed later;
            // the descriptors only keep them as raw IntPtr values.
            CUdeviceptr[] valueBuffers = new CUdeviceptr[N];
            CUdeviceptr[] indexBuffers = new CUdeviceptr[N];

            for (int i = 0; i < N; i++)
            {
                vectors[i] = new SparseVecPtr();
                vectors[i].size = 2;

                // Parenthesised explicitly: the previous "(float)i + 1 % 5" parsed as
                // i + (1 % 5). For N = 4 both forms yield the same values
                // (i + 1 and i + 2), so the uploaded data is unchanged.
                float[] vals = new float[2] { (float)((i + 1) % 5), (float)((i + 2) % 7) };

                int[] index = new int[2] { i % 5, i % 7 };

                valueBuffers[i] = cuda.CopyHostToDevice(vals);
                indexBuffers[i] = cuda.CopyHostToDevice(index);

                // The descriptor stores device addresses, not host addresses.
                vectors[i].indices = new IntPtr(indexBuffers[i].Pointer);
                vectors[i].values = (IntPtr)valueBuffers[i].Pointer;
            }

            float[] output = new float[N];

            // Pin the descriptor array only for the duration of the upload and
            // always release the handle, otherwise the array stays pinned forever.
            CUdeviceptr dVectors;
            GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned);
            try
            {
                dVectors = cuda.CopyHostToDevice(handle.AddrOfPinnedObject(), size);
            }
            finally
            {
                handle.Free();
            }

            CUdeviceptr dOutput = cuda.Allocate(output);

            int threadsPerBlock = 256;
            // Ceiling division so a partially filled last block is still launched.
            int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            // Parameters are packed back to back: two pointers, then one 32-bit int.
            // NOTE(review): the pointers are cast to uint while the offset advances by
            // IntPtr.Size; if device pointers are 64-bit in this build the cast would
            // truncate them - confirm against the CUdeviceptr.Pointer type in use.
            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, (uint)dVectors.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, (uint)dOutput.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);
            cuda.RecordEvent(end);

            // Wait for the kernel and the 'end' event before reading the timing.
            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);

            float naiveTime = cuda.ElapsedTime(start, end);
            Console.Write("passing struct takes {0}ms", naiveTime);

            cuda.CopyDeviceToHost(dOutput, output);

            int length = Math.Min(10, N);
            Console.WriteLine();
            for (int i = 0; i < length; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            // Release everything that was created on the device.
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            for (int i = 0; i < N; i++)
            {
                cuda.Free(valueBuffers[i]);
                cuda.Free(indexBuffers[i]);
            }

            cuda.Free(dVectors);
            cuda.Free(dOutput);
        }