Example #1
0
        private static float[] CuRBFCSRCached()
        {
            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            List<float> vecValsL = new List<float>(N * maxRowSize / 2);
            List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
            List<int> vecLenghtL = new List<int>(N);

            float[] vecVals;
            int[] vecIdx;
            int[] vecLenght;
            float[] selfDot = new float[N];

            maxIndex = 0;
            int vecStartIdx = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                vecValsL.AddRange(vals);

                for (int z = 0; z < vals.Length; z++)
                {
                    selfDot[i] += vals[z] * vals[z];
                }
                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                vecIdxL.AddRange(index);

                vecLenghtL.Add(vecStartIdx);
                vecStartIdx += vecSize;

            }
            //for last index
            vecLenghtL.Add(vecStartIdx);

            vecVals = vecValsL.ToArray();
            vecIdx = vecIdxL.ToArray();
            vecLenght = vecLenghtL.ToArray();

            float[] mainVec = new float[maxIndex + 1];

            for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
            {
                int idx = vecIdx[j];
                float val = vecVals[j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
            CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);
            CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot);

            //copy to texture
            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, Gamma);
            offset += sizeof(float);

            cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.Free(selfDotPtr);
            cuda.Free(vecLenghtPtr);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }
Example #2
0
        private static float[] CuDotProdEllPackTexCached()
        {
            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            float[] vecVals = new float[N * maxRowSize];
            int[] vecIdx = new int[N * maxRowSize];

            maxIndex = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                //values are column-major aligment
                for (int z = 0; z < vals.Length; z++)
                {
                    int m = z * N + i;
                    vecVals[m] = vals[z];
                }

                //Array.Copy(vals,0,vecVals,i*maxRowSize,vals.Length);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                //Array.Copy(index, 0, vecIdx, i * maxRowSize, index.Length);
                for (int z = 0; z < index.Length; z++)
                {
                    int m = z * N + i;
                    vecIdx[m] = index[z];
                }

            }

            float[] mainVec = new float[maxIndex + 1];

            for (int j = 0; j < maxRowSize; j++)
            {
                int idx = vecIdx[mainIndex + N * j];
                float val = vecVals[mainIndex + N * j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);

            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);

            //CUDAArrayDescriptor cuDesc = new CUDAArrayDescriptor();
            //cuDesc.Format = CUArrayFormat.Float;
            //cuDesc.NumChannels = 1;
            //cuDesc.Width = maxIndex+1;

            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);

            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            return output;
        }