/// <summary>
/// Builds N sparse vectors in a CSR-style layout (concatenated values, concatenated
/// column indices, and per-row start offsets with one trailing end offset), uploads
/// them to the device together with each row's sum of squared values, binds a dense
/// copy of row <c>mainIndex</c> to the module texture "texRef", launches the
/// "RBFspmv_csr_vector" kernel and prints timing plus the first results.
/// </summary>
/// <remarks>
/// Reads the class-level settings N, avgElements, stdElements, maxVal, mainIndex,
/// Gamma, threadsPerBlock, blocksPerGrid and displayCount, and overwrites maxIndex.
/// NOTE(review): the kernel presumably evaluates an RBF kernel between row
/// mainIndex and every other row - confirm against the .cu source.
/// </remarks>
/// <returns>The N floats copied back from the kernel's output buffer.</returns>
private static float[] CuRBFCSRCached()
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
    CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector");

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("init arrays");
    Stopwatch t = Stopwatch.StartNew();

    List<float> vecValsL = new List<float>(N * maxRowSize / 2);
    List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
    List<int> vecLenghtL = new List<int>(N);

    float[] vecVals;
    int[] vecIdx;
    int[] vecLenght;
    float[] selfDot = new float[N];

    maxIndex = 0;
    int vecStartIdx = 0;
    for (int i = 0; i < N; i++)
    {
        int vecSize = avgElements + i % stdElements;

        float[] vals = Helpers.InitValues(i, vecSize, maxVal);
        vecValsL.AddRange(vals);

        // selfDot[i] accumulates the sum of squares of row i.
        for (int z = 0; z < vals.Length; z++)
        {
            selfDot[i] += vals[z] * vals[z];
        }

        int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
        vecIdxL.AddRange(index);

        // Despite its name, this list stores the start offset of each row.
        vecLenghtL.Add(vecStartIdx);
        vecStartIdx += vecSize;
    }
    // for last index: the end offset of the final row
    vecLenghtL.Add(vecStartIdx);

    vecVals = vecValsL.ToArray();
    vecIdx = vecIdxL.ToArray();
    vecLenght = vecLenghtL.ToArray();

    // Dense copy of row mainIndex, indexed by column.
    float[] mainVec = new float[maxIndex + 1];
    for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
    {
        int idx = vecIdx[j];
        float val = vecVals[j];
        mainVec[idx] = val;
    }

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a running stopwatch does nothing, so reset first; otherwise the
    // copy timing below would also include the initialisation time.
    t.Reset();
    t.Start();

    CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
    CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
    CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);
    CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot);

    // copy to texture
    CUarray cuArr = cuda.CreateArray(mainVec);
    cuda.CopyHostToArray(cuArr, mainVec, 0);

    CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureArray(cuTexRef, cuArr);

    float[] output = new float[N];
    CUdeviceptr dOutput = cuda.Allocate(output);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

    // Kernel arguments are written one after another; offset tracks the byte position.
    int offset = 0;
    cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, Gamma);
    offset += sizeof(float);
    // vecStartIdx now holds the total number of stored elements.
    cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx);
    offset += sizeof(int);
    cuda.SetParameterSize(structPassFunc, (uint)offset);

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);
    cuda.Launch(structPassFunc, blocksPerGrid, 1);
    cuda.RecordEvent(end);

    cuda.SynchronizeContext();
    //cuda.SynchronizeEvent(end);
    timer.Stop();

    float naiveTime = cuda.ElapsedTime(start, end);

    // The message labels the stopwatch value as ms, so pass milliseconds rather
    // than the raw TimeSpan.
    Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed.TotalMilliseconds);

    cuda.CopyDeviceToHost(dOutput, output);

    int lenght = Math.Min(displayCount, N);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(valsPtr);
    cuda.Free(idxPtr);
    cuda.Free(dOutput);
    cuda.Free(selfDotPtr);
    cuda.Free(vecLenghtPtr);
    cuda.DestroyArray(cuArr);
    cuda.DestroyTexture(cuTexRef);
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    return output;
}
/// <summary>
/// Builds N sparse vectors in a padded, column-major layout (slot z of row i is
/// stored at z * N + i; every row is padded to maxRowSize slots), uploads values
/// and indices to the device, binds a dense copy of row <c>mainIndex</c> to the
/// module texture "texRef", launches the "DotProdEllPackCached" kernel and prints
/// timing plus the first results.
/// </summary>
/// <remarks>
/// Reads the class-level settings N, avgElements, stdElements, maxVal, mainIndex,
/// threadsPerBlock, blocksPerGrid and displayCount, and overwrites maxIndex.
/// NOTE(review): the kernel presumably computes the dot product of row mainIndex
/// with every row - confirm against the .cu source.
/// </remarks>
/// <returns>The N floats copied back from the kernel's output buffer.</returns>
private static float[] CuDotProdEllPackTexCached()
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
    CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached");

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("init arrays");
    Stopwatch t = Stopwatch.StartNew();

    float[] vecVals = new float[N * maxRowSize];
    int[] vecIdx = new int[N * maxRowSize];
    // Number of real (non-padding) entries stored for each row.
    int[] rowLength = new int[N];

    maxIndex = 0;
    for (int i = 0; i < N; i++)
    {
        int vecSize = avgElements + i % stdElements;

        // values are stored column-major: slot z of row i lives at z * N + i
        float[] vals = Helpers.InitValues(i, vecSize, maxVal);
        for (int z = 0; z < vals.Length; z++)
        {
            int m = z * N + i;
            vecVals[m] = vals[z];
        }

        int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
        for (int z = 0; z < index.Length; z++)
        {
            int m = z * N + i;
            vecIdx[m] = index[z];
        }

        rowLength[i] = Math.Min(vals.Length, index.Length);
    }

    // Dense copy of row mainIndex, indexed by column. Only the real entries are
    // walked: padding slots hold index 0 / value 0, and copying them would
    // overwrite a genuine value stored at column 0 with zero.
    float[] mainVec = new float[maxIndex + 1];
    int mainRowLength = rowLength[mainIndex];
    for (int j = 0; j < mainRowLength; j++)
    {
        int idx = vecIdx[mainIndex + N * j];
        float val = vecVals[mainIndex + N * j];
        mainVec[idx] = val;
    }

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a running stopwatch does nothing, so reset first; otherwise the
    // copy timing below would also include the initialisation time.
    t.Reset();
    t.Start();

    CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
    CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);

    // copy the dense main vector into a CUDA array and bind it to the texture
    CUarray cuArr = cuda.CreateArray(mainVec);
    cuda.CopyHostToArray(cuArr, mainVec, 0);

    CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureArray(cuTexRef, cuArr);

    float[] output = new float[N];
    CUdeviceptr dOutput = cuda.Allocate(output);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

    // Kernel arguments are written one after another; offset tracks the byte position.
    int offset = 0;
    cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameterSize(structPassFunc, (uint)offset);

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);
    cuda.Launch(structPassFunc, blocksPerGrid, 1);
    cuda.RecordEvent(end);

    cuda.SynchronizeContext();
    //cuda.SynchronizeEvent(end);
    timer.Stop();

    float naiveTime = cuda.ElapsedTime(start, end);

    // The message labels the stopwatch value as ms, so pass milliseconds rather
    // than the raw TimeSpan.
    Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed.TotalMilliseconds);

    cuda.CopyDeviceToHost(dOutput, output);

    int lenght = Math.Min(displayCount, N);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(valsPtr);
    cuda.Free(idxPtr);
    cuda.Free(dOutput);
    cuda.DestroyArray(cuArr);
    cuda.DestroyTexture(cuTexRef);
    // Release the timing events as well; they were previously left allocated.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    return output;
}